Unverified · Commit 796499fd authored by huangjiyi, committed by GitHub

move device_memory_aligment from fluid to phi (#48694)

Parent 89bd4011
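This commit removes the fluid `device_memory_aligment` library target and re-homes the helper as a header-only `phi::Alignment` in `paddle/phi/backends/device_memory_aligment.h`, moving the per-device `*MinChunkSize()` helpers into the corresponding phi backends. For call sites the change is mechanical, as the hunks below show; here is a minimal before/after sketch (the wrapper `AlignedBytes` is hypothetical, added only for illustration):

```cpp
// Before: linked against the fluid `device_memory_aligment` CMake target.
// #include "paddle/fluid/platform/device_memory_aligment.h"
// size_t offset = paddle::platform::Alignment(len * size_of_dtype, place);

// After: header-only helper from phi; no extra CMake dependency needed.
#include "paddle/phi/backends/device_memory_aligment.h"

size_t AlignedBytes(size_t len, size_t size_of_dtype, const phi::Place &place) {
  // Rounds len * size_of_dtype up to the minimum chunk size of `place`.
  return phi::Alignment(len * size_of_dtype, place);
}
```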
@@ -92,8 +92,7 @@ if(WITH_GPU)
       memory
       dynload_cuda
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   nv_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -105,7 +104,6 @@ if(WITH_GPU)
       dynload_cuda
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)
@@ -170,8 +168,7 @@ elseif(WITH_ROCM)
       memory
       dynload_cuda
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   hip_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -183,7 +180,6 @@ elseif(WITH_ROCM)
       dynload_cuda
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)
@@ -233,8 +229,7 @@ else()
       ddim
       memory
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   cc_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -245,7 +240,6 @@ else()
       memory
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)
   if(WITH_DISTRIBUTE)
......
@@ -16,9 +16,9 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 
 DEFINE_bool(skip_fused_all_reduce_check, false, "");
 DECLARE_bool(allreduce_record_one_event);
@@ -247,7 +247,7 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
   for (size_t k = 1; k < g_tensor.size(); ++k) {
     const void *cur_address = g_tensor.at(k - 1).second->data();
     int64_t len = g_tensor.at(k - 1).second->numel();
-    auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
+    auto offset = phi::Alignment(len * size_of_dtype, places_[0]);
     void *infer_next_address = reinterpret_cast<void *>(
         reinterpret_cast<uintptr_t>(cur_address) + offset);
     const void *next_address = g_tensor.at(k).second->data();
@@ -400,8 +400,7 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
             "The size of grad tensors of fused_all_reduce_op_handle "
             "must be > 0, but got %d.",
             len));
-    *numel +=
-        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
+    *numel += phi::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }
......
@@ -153,7 +153,7 @@ cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute cudnn_workspace_
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function
     lod_tensor maxouting unpooling pooling lod_rank_table context_project
-    sequence_pooling executor device_memory_aligment generator)
+    sequence_pooling executor generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve)
@@ -167,7 +167,6 @@ if(WITH_XPU)
   cc_test(beam_search_decode_op_xpu_test SRCS beam_search_decode_op_xpu_test.cc DEPS lod_tensor)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xpulib)
 endif()
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} op_version_registry)
......
@@ -19,7 +19,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
@@ -250,9 +250,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       framework::TensorCopy(
           *in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);
-      offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                context.GetPlace(),
-                                                align_size) /
+      offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                           context.GetPlace(),
+                                           align_size) /
                                 size_of_dtype
                           : len;
     }
@@ -274,9 +274,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         framework::TensorCopy(
             *out_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);
       }
-      offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                context.GetPlace(),
-                                                align_size) /
+      offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                           context.GetPlace(),
+                                           align_size) /
                                 size_of_dtype
                           : len;
     }
@@ -296,7 +296,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
             static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
         .Resize(dim);
     len = use_align
-              ? platform::Alignment(
+              ? phi::Alignment(
                     len * size_of_dtype, context.GetPlace(), align_size) /
                     size_of_dtype
               : len;
@@ -342,12 +342,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
           0,
           platform::errors::InvalidArgument(
              "The number of tensor `%s`'s elements is 0.", var_names[i]));
-      auto len = use_align ? platform::Alignment(
-                                 static_cast<size_t>(size) * size_of_dtype,
-                                 place,
-                                 align_size) /
-                                 size_of_dtype
-                           : static_cast<size_t>(size);
+      auto len = use_align
+                     ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                      place,
+                                      align_size) /
+                           size_of_dtype
+                     : static_cast<size_t>(size);
       const void *ptr =
           lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
       VLOG(4) << size << " " << len;
......
@@ -378,10 +378,6 @@ if(WITH_GPU)
       stats
       op_proto_maker
       shape_inference)
-  nv_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 elseif(WITH_ROCM)
   hip_library(
     profiler
@@ -394,10 +390,6 @@ elseif(WITH_ROCM)
       stats
       op_proto_maker
       shape_inference)
-  hip_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 else()
   cc_library(
     profiler
@@ -409,10 +401,6 @@ else()
       stats
       op_proto_maker
       shape_inference)
-  cc_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info place)
 endif()
 cc_test(
......
@@ -79,11 +79,6 @@ size_t CpuMaxAllocSize() {
   return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }
 
-size_t CpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  return 1 << 12;
-}
-
 size_t CpuMaxChunkSize() {
   // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
   // or the initial_cpu_memory_in_mb.
......
@@ -63,8 +63,7 @@ size_t CpuMaxAllocSize();
 //! Get the maximum allocation size for a machine.
 size_t CUDAPinnedMaxAllocSize();
 
-//! Get the minimum chunk size for buddy allocator.
-size_t CpuMinChunkSize();
+using phi::backends::cpu::CpuMinChunkSize;
 
 //! Get the maximum chunk size for buddy allocator.
 size_t CpuMaxChunkSize();
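Throughout this commit the fluid headers are kept as compatibility shims: each removed declaration is replaced by a `using` declaration that pulls the phi implementation into the old namespace, so existing call sites such as `paddle::platform::CpuMinChunkSize()` compile unchanged. A minimal self-contained sketch of the pattern (standalone, not the actual Paddle headers):

```cpp
#include <stddef.h>

// New home of the implementation (as in paddle/phi/backends/cpu/cpu_info.h):
namespace phi {
namespace backends {
namespace cpu {
inline size_t CpuMinChunkSize() { return 1 << 12; }  // 4 KB minimum chunk
}  // namespace cpu
}  // namespace backends
}  // namespace phi

// Fluid-side shim (as in the hunk above): the old qualified name still works.
namespace paddle {
namespace platform {
using phi::backends::cpu::CpuMinChunkSize;
}  // namespace platform
}  // namespace paddle

// Callers are unaffected: paddle::platform::CpuMinChunkSize() == 4096.
```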
......
@@ -124,11 +124,6 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
 
 size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
 
-size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t GpuMaxChunkSize() {
   size_t max_chunk_size = GpuMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
@@ -410,8 +405,8 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {
 
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id) {  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id) {
   return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
       handle, size, prop, flags);
 }
......
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
 
 namespace paddle {
 namespace platform {
@@ -81,8 +82,7 @@ size_t GpuInitAllocSize();
 //! Get the re-allocation size of current GPU device.
 size_t GpuReallocSize();
 
-//! Get the minimum chunk size for GPU buddy allocator.
-size_t GpuMinChunkSize();
+using phi::backends::gpu::GpuMinChunkSize;
 
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
@@ -140,8 +140,8 @@ gpuError_t GpuGetLastError();
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id);  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id);
 
 //! cuMemRelease with recorded info
 CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
......
@@ -226,11 +226,6 @@ size_t MLUInitAllocSize() { return MLUAllocSize(/* realloc = */ false); }
 
 size_t MLUReallocSize() { return MLUAllocSize(/* realloc = */ true); }
 
-size_t MLUMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t MLUMaxChunkSize() {
   size_t max_chunk_size = MLUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
......
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <cncl.h>
 #endif
 #include <vector>
+#include "paddle/phi/backends/mlu/mlu_info.h"
 
 namespace paddle {
@@ -89,8 +90,7 @@ size_t MLUInitAllocSize();
 //! Get the re-allocation size of current MLU device.
 size_t MLUReallocSize();
 
-//! Get the minimum chunk size for MLU buddy allocator.
-size_t MLUMinChunkSize();
+using phi::backends::mlu::MLUMinChunkSize;
 
 //! Get the maximum chunk size for MLU buddy allocator.
 size_t MLUMaxChunkSize();
......
@@ -179,14 +179,6 @@ size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); }
 
 size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); }
 
-size_t NPUMinChunkSize() {
-  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
-  // though no document specify that explicitly.
-  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
-  // details.
-  return 1 << 9;
-}
-
 size_t NPUMaxChunkSize() {
   size_t max_chunk_size = NPUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
......
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "acl/acl.h"
 #include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/phi/backends/npu/npu_info.h"
 
 namespace paddle {
 namespace platform {
@@ -69,8 +70,7 @@ size_t NPUInitAllocSize();
 //! Get the re-allocation size of current NPU device.
 size_t NPUReallocSize();
 
-//! Get the minimum chunk size for NPU buddy allocator.
-size_t NPUMinChunkSize();
+using phi::backends::npu::NPUMinChunkSize;
 
 //! Get the maximum chunk size for NPU buddy allocator.
 size_t NPUMaxChunkSize();
......
@@ -39,6 +39,13 @@
 namespace phi {
 namespace backends {
 namespace cpu {
+
+//! Get the minimum chunk size for buddy allocator.
+inline size_t CpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 4 KB.
+  return 1 << 12;
+}
+
 typedef enum {
   isa_any,
   sse42,
@@ -51,6 +58,7 @@ typedef enum {
   avx512_mic_4ops,
   avx512_bf16,
 } cpu_isa_t;  // Instruction set architecture
+
 }  // namespace cpu
 }  // namespace backends
 }  // namespace phi
@@ -12,38 +12,53 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/platform/device_memory_aligment.h"
-
-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size, const platform::Place &place, int align_size) {
+#pragma once
+#include <stddef.h>
+
+#include "paddle/phi/backends/cpu/cpu_info.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/phi/backends/npu/npu_info.h"
+#endif
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#ifdef PADDLE_WITH_MLU
+#include "paddle/phi/backends/mlu/mlu_info.h"
+#endif
+
+namespace phi {
+
+inline size_t Alignment(size_t size,
+                        const phi::Place &place,
+                        int align_size = -1) {
   size_t alignment = 0;
   if (align_size > 0) {
     alignment = align_size;
   } else {
     alignment = 1024;
-    if (platform::is_cpu_place(place)) {
-      alignment = CpuMinChunkSize();
+    if (place.GetType() == phi::AllocationType::CPU) {
+      alignment = phi::backends::cpu::CpuMinChunkSize();
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      alignment = GpuMinChunkSize();
+      alignment = phi::backends::gpu::GpuMinChunkSize();
 #elif defined(PADDLE_WITH_XPU)
       alignment = alignment;
 #elif defined(PADDLE_WITH_ASCEND_CL)
-      alignment = NPUMinChunkSize();
+      alignment = phi::backends::npu::NPUMinChunkSize();
 #elif defined(PADDLE_WITH_MLU)
-      alignment = MLUMinChunkSize();
+      alignment = phi::backends::mlu::MLUMinChunkSize();
 #else
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
+      PADDLE_THROW(phi::errors::PreconditionNotMet(
           "Fluid is not compiled with CUDA/XPU/NPU/MLU."));
 #endif
     }
   }
-  if (is_npu_place(place)) {
+  if (place.GetType() == phi::AllocationType::NPU) {
     size += 32;  // required by ascendcl
   }
   size_t remaining = size % alignment;
   return remaining == 0 ? size : size + (alignment - remaining);
 }
-}  // namespace platform
-}  // namespace paddle
+
+}  // namespace phi
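As a quick sanity check on the rounding rule implemented above (a sketch, assuming `phi::CPUPlace` from `paddle/phi/common/place.h`): with no explicit `align_size`, a CPU place pads up to `CpuMinChunkSize()` (4 KB), and a positive `align_size` overrides the per-device minimum.

```cpp
#include "paddle/phi/backends/device_memory_aligment.h"
#include "paddle/phi/common/place.h"

void AlignmentExamples() {
  phi::CPUPlace cpu;
  size_t a = phi::Alignment(4097, cpu);     // 8192: rounded up to the 4096-byte CPU chunk
  size_t b = phi::Alignment(4096, cpu);     // 4096: already aligned, returned unchanged
  size_t c = phi::Alignment(100, cpu, 64);  // 128: explicit align_size = 64 takes precedence
}
```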
@@ -67,6 +67,12 @@ const gpuDeviceProp &GetDeviceProperties(int id);
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);
 
+//! Get the minimum chunk size for GPU buddy allocator.
+inline size_t GpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst,
                     const void *src,
......
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,22 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <stddef.h>
-
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/place.h"
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/device/npu/npu_info.h"
-#endif
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
-
-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size,
-                 const platform::Place &place,
-                 int align_size = -1);
-}  // namespace platform
-}  // namespace paddle
+
+namespace phi {
+namespace backends {
+namespace mlu {
+
+//! Get the minimum chunk size for MLU buddy allocator.
+inline size_t MLUMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
+}  // namespace mlu
+}  // namespace backends
+}  // namespace phi
+
+#endif
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_ASCEND_CL
+
+namespace phi {
+namespace backends {
+namespace npu {
+
+//! Get the minimum chunk size for NPU buddy allocator.
+inline size_t NPUMinChunkSize() {
+  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
+  // though no document specify that explicitly.
+  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
+  // details.
+  return 1 << 9;
+}
+
+}  // namespace npu
+}  // namespace backends
+}  // namespace phi
+
+#endif
@@ -76,8 +76,7 @@ set(COMMON_KERNEL_DEPS
     fft
     phi_data_layout_transform
     gpc
-    utf8proc
-    device_memory_aligment)
+    utf8proc)
 
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
 if(WITH_NCCL OR WITH_RCCL)
......
@@ -20,7 +20,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 
 namespace phi {
@@ -44,8 +44,7 @@ void CheckMemoryContinueKernel(const Context &dev_ctx,
             input.at(i)->dtype()));
     const void *cur_address = input.at(i - 1)->data();
     int64_t len = input.at(i - 1)->numel();
-    auto offset =
-        paddle::platform::Alignment(len * size_of_dtype, dev_ctx.GetPlace());
+    auto offset = phi::Alignment(len * size_of_dtype, dev_ctx.GetPlace());
     void *infer_next_address = reinterpret_cast<void *>(
         reinterpret_cast<uintptr_t>(cur_address) + offset);
     const void *next_address = input.at(i)->data();
@@ -71,8 +70,8 @@ void CheckMemoryContinueKernel(const Context &dev_ctx,
             infer_next_address,
             next_address));
   }
-  numel += paddle::platform::Alignment(
-      (*input.rbegin())->numel() * size_of_dtype, dev_ctx.GetPlace());
+  numel += phi::Alignment((*input.rbegin())->numel() * size_of_dtype,
+                          dev_ctx.GetPlace());
   // reset holder, do inplace
   output->ShareBufferWith(*input.at(0));
   output->Resize({numel / size_of_dtype});
......
@@ -17,8 +17,8 @@
 #include <sstream>
 #include <vector>
 
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -72,12 +72,12 @@ void GetMemSizeAndDtype(const std::vector<const DenseTensor *> &lod_tensors,
         0,
         errors::InvalidArgument(
            "The number of `%d`-th tensor's elements is 0.", i));
-    auto len = use_align ? paddle::platform::Alignment(
-                               static_cast<size_t>(size) * size_of_dtype,
-                               place,
-                               align_size) /
-                               size_of_dtype
-                         : static_cast<size_t>(size);
+    auto len = use_align
+                   ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                    place,
+                                    align_size) /
+                         size_of_dtype
+                   : static_cast<size_t>(size);
     const void *ptr =
         lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
     VLOG(4) << size << " " << len;
@@ -206,7 +206,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
     phi::Copy(dev_ctx, *input[i], dev_ctx.GetPlace(), false, &sub_tensor);
 
     offset += use_align
-                  ? paddle::platform::Alignment(
+                  ? phi::Alignment(
                         len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                         size_of_dtype
                   : len;
@@ -224,7 +224,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
       phi::Copy(dev_ctx, *output[i], dev_ctx.GetPlace(), false, &sub_tensor);
     }
     offset += use_align
-                  ? paddle::platform::Alignment(
+                  ? phi::Alignment(
                         len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                         size_of_dtype
                   : len;
@@ -244,7 +244,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
         ->ShareDataWith(fused_output->Slice(static_cast<int64_t>(offset),
                                             static_cast<int64_t>(offset + len)))
         .Resize(dim);
-    len = use_align ? paddle::platform::Alignment(
+    len = use_align ? phi::Alignment(
                           len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                           size_of_dtype
                     : len;
......