Unverified · Commit 796499fd authored by huangjiyi, committed by GitHub

move device_memory_aligment from fluid to phi (#48694)

Parent 89bd4011
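This commit removes the fluid `device_memory_aligment` library target and re-homes the helper as a header-only `phi::Alignment` in `paddle/phi/backends/device_memory_aligment.h`, moving the per-device `*MinChunkSize()` helpers into the corresponding phi backends. For call sites the change is mechanical, as the hunks below show; here is a minimal before/after sketch (the wrapper `AlignedBytes` is hypothetical, added only for illustration):

```cpp
// Before: linked against the fluid `device_memory_aligment` CMake target.
// #include "paddle/fluid/platform/device_memory_aligment.h"
// size_t offset = paddle::platform::Alignment(len * size_of_dtype, place);

// After: header-only helper from phi; no extra CMake dependency needed.
#include "paddle/phi/backends/device_memory_aligment.h"

size_t AlignedBytes(size_t len, size_t size_of_dtype, const phi::Place &place) {
  // Rounds len * size_of_dtype up to the minimum chunk size of `place`.
  return phi::Alignment(len * size_of_dtype, place);
}
```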
@@ -92,8 +92,7 @@ if(WITH_GPU)
       memory
       dynload_cuda
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   nv_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -105,7 +104,6 @@ if(WITH_GPU)
       dynload_cuda
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)
@@ -170,8 +168,7 @@ elseif(WITH_ROCM)
       memory
       dynload_cuda
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   hip_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -183,7 +180,6 @@ elseif(WITH_ROCM)
       dynload_cuda
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)
@@ -233,8 +229,7 @@ else()
       ddim
       memory
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   cc_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -245,7 +240,6 @@ else()
       memory
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)
   if(WITH_DISTRIBUTE)
......
@@ -16,9 +16,9 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 
 DEFINE_bool(skip_fused_all_reduce_check, false, "");
 DECLARE_bool(allreduce_record_one_event);
@@ -247,7 +247,7 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
   for (size_t k = 1; k < g_tensor.size(); ++k) {
     const void *cur_address = g_tensor.at(k - 1).second->data();
     int64_t len = g_tensor.at(k - 1).second->numel();
-    auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
+    auto offset = phi::Alignment(len * size_of_dtype, places_[0]);
     void *infer_next_address = reinterpret_cast<void *>(
         reinterpret_cast<uintptr_t>(cur_address) + offset);
     const void *next_address = g_tensor.at(k).second->data();
@@ -400,8 +400,7 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
             "The size of grad tensors of fused_all_reduce_op_handle "
             "must be > 0, but got %d.",
             len));
-    *numel +=
-        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
+    *numel += phi::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }
......
@@ -153,7 +153,7 @@ cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute cudnn_workspace_
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function
     lod_tensor maxouting unpooling pooling lod_rank_table context_project
-    sequence_pooling executor device_memory_aligment generator)
+    sequence_pooling executor generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve)
@@ -167,7 +167,6 @@ if(WITH_XPU)
   cc_test(beam_search_decode_op_xpu_test SRCS beam_search_decode_op_xpu_test.cc DEPS lod_tensor)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xpulib)
 endif()
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} op_version_registry)
......
@@ -19,7 +19,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
@@ -250,9 +250,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       framework::TensorCopy(
           *in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);
-      offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                context.GetPlace(),
-                                                align_size) /
+      offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                           context.GetPlace(),
+                                           align_size) /
                                 size_of_dtype
                           : len;
     }
@@ -274,9 +274,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         framework::TensorCopy(
             *out_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);
       }
-      offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                context.GetPlace(),
-                                                align_size) /
+      offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                           context.GetPlace(),
+                                           align_size) /
                                 size_of_dtype
                           : len;
     }
@@ -296,7 +296,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
             static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
         .Resize(dim);
     len = use_align
-              ? platform::Alignment(
+              ? phi::Alignment(
                     len * size_of_dtype, context.GetPlace(), align_size) /
                     size_of_dtype
               : len;
@@ -342,12 +342,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
           0,
           platform::errors::InvalidArgument(
              "The number of tensor `%s`'s elements is 0.", var_names[i]));
-      auto len = use_align ? platform::Alignment(
-                                 static_cast<size_t>(size) * size_of_dtype,
-                                 place,
-                                 align_size) /
-                                 size_of_dtype
-                           : static_cast<size_t>(size);
+      auto len = use_align
+                     ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                      place,
+                                      align_size) /
+                           size_of_dtype
+                     : static_cast<size_t>(size);
       const void *ptr =
           lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
       VLOG(4) << size << " " << len;
......
@@ -378,10 +378,6 @@ if(WITH_GPU)
       stats
       op_proto_maker
       shape_inference)
-  nv_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 elseif(WITH_ROCM)
   hip_library(
     profiler
@@ -394,10 +390,6 @@ elseif(WITH_ROCM)
       stats
       op_proto_maker
       shape_inference)
-  hip_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 else()
   cc_library(
     profiler
@@ -409,10 +401,6 @@ else()
       stats
       op_proto_maker
       shape_inference)
-  cc_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info place)
 endif()
 cc_test(
......
@@ -79,11 +79,6 @@ size_t CpuMaxAllocSize() {
   return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }
 
-size_t CpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  return 1 << 12;
-}
-
 size_t CpuMaxChunkSize() {
   // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
   // or the initial_cpu_memory_in_mb.
......
@@ -63,8 +63,7 @@ size_t CpuMaxAllocSize();
 //! Get the maximum allocation size for a machine.
 size_t CUDAPinnedMaxAllocSize();
 
-//! Get the minimum chunk size for buddy allocator.
-size_t CpuMinChunkSize();
+using phi::backends::cpu::CpuMinChunkSize;
 
 //! Get the maximum chunk size for buddy allocator.
 size_t CpuMaxChunkSize();
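Throughout this commit the fluid headers are kept as compatibility shims: each removed declaration is replaced by a `using` declaration that pulls the phi implementation into the old namespace, so existing call sites such as `paddle::platform::CpuMinChunkSize()` compile unchanged. A minimal self-contained sketch of the pattern (standalone, not the actual Paddle headers):

```cpp
#include <stddef.h>

// New home of the implementation (as in paddle/phi/backends/cpu/cpu_info.h):
namespace phi {
namespace backends {
namespace cpu {
inline size_t CpuMinChunkSize() { return 1 << 12; }  // 4 KB minimum chunk
}  // namespace cpu
}  // namespace backends
}  // namespace phi

// Fluid-side shim (as in the hunk above): the old qualified name still works.
namespace paddle {
namespace platform {
using phi::backends::cpu::CpuMinChunkSize;
}  // namespace platform
}  // namespace paddle

// Callers are unaffected: paddle::platform::CpuMinChunkSize() == 4096.
```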
......
@@ -124,11 +124,6 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
 
 size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
 
-size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t GpuMaxChunkSize() {
   size_t max_chunk_size = GpuMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
@@ -410,8 +405,8 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {
 
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id) {  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id) {
   return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
       handle, size, prop, flags);
 }
......
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
 
 namespace paddle {
 namespace platform {
@@ -81,8 +82,7 @@ size_t GpuInitAllocSize();
 //! Get the re-allocation size of current GPU device.
 size_t GpuReallocSize();
 
-//! Get the minimum chunk size for GPU buddy allocator.
-size_t GpuMinChunkSize();
+using phi::backends::gpu::GpuMinChunkSize;
 
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
@@ -140,8 +140,8 @@ gpuError_t GpuGetLastError();
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id);  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id);
 
 //! cuMemRelease with recorded info
 CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
......
@@ -226,11 +226,6 @@ size_t MLUInitAllocSize() { return MLUAllocSize(/* realloc = */ false); }
 
 size_t MLUReallocSize() { return MLUAllocSize(/* realloc = */ true); }
 
-size_t MLUMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t MLUMaxChunkSize() {
   size_t max_chunk_size = MLUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
......
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <cncl.h>
 #endif
 #include <vector>
+#include "paddle/phi/backends/mlu/mlu_info.h"
 
 namespace paddle {
@@ -89,8 +90,7 @@ size_t MLUInitAllocSize();
 //! Get the re-allocation size of current MLU device.
 size_t MLUReallocSize();
 
-//! Get the minimum chunk size for MLU buddy allocator.
-size_t MLUMinChunkSize();
+using phi::backends::mlu::MLUMinChunkSize;
 
 //! Get the maximum chunk size for MLU buddy allocator.
 size_t MLUMaxChunkSize();
......
@@ -179,14 +179,6 @@ size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); }
 
 size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); }
 
-size_t NPUMinChunkSize() {
-  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
-  // though no document specify that explicitly.
-  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
-  // details.
-  return 1 << 9;
-}
-
 size_t NPUMaxChunkSize() {
   size_t max_chunk_size = NPUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
......
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "acl/acl.h"
 #include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/phi/backends/npu/npu_info.h"
 
 namespace paddle {
 namespace platform {
@@ -69,8 +70,7 @@ size_t NPUInitAllocSize();
 //! Get the re-allocation size of current NPU device.
 size_t NPUReallocSize();
 
-//! Get the minimum chunk size for NPU buddy allocator.
-size_t NPUMinChunkSize();
+using phi::backends::npu::NPUMinChunkSize;
 
 //! Get the maximum chunk size for NPU buddy allocator.
 size_t NPUMaxChunkSize();
......
@@ -39,6 +39,13 @@
 namespace phi {
 namespace backends {
 namespace cpu {
+
+//! Get the minimum chunk size for buddy allocator.
+inline size_t CpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 4 KB.
+  return 1 << 12;
+}
+
 typedef enum {
   isa_any,
   sse42,
@@ -51,6 +58,7 @@ typedef enum {
   avx512_mic_4ops,
   avx512_bf16,
 } cpu_isa_t;  // Instruction set architecture
+
 }  // namespace cpu
 }  // namespace backends
 }  // namespace phi
@@ -12,38 +12,53 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/platform/device_memory_aligment.h"
-
-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size, const platform::Place &place, int align_size) {
+#pragma once
+#include <stddef.h>
+
+#include "paddle/phi/backends/cpu/cpu_info.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/phi/backends/npu/npu_info.h"
+#endif
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#ifdef PADDLE_WITH_MLU
+#include "paddle/phi/backends/mlu/mlu_info.h"
+#endif
+
+namespace phi {
+
+inline size_t Alignment(size_t size,
+                        const phi::Place &place,
+                        int align_size = -1) {
   size_t alignment = 0;
   if (align_size > 0) {
     alignment = align_size;
   } else {
     alignment = 1024;
-    if (platform::is_cpu_place(place)) {
-      alignment = CpuMinChunkSize();
+    if (place.GetType() == phi::AllocationType::CPU) {
+      alignment = phi::backends::cpu::CpuMinChunkSize();
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      alignment = GpuMinChunkSize();
+      alignment = phi::backends::gpu::GpuMinChunkSize();
 #elif defined(PADDLE_WITH_XPU)
       alignment = alignment;
 #elif defined(PADDLE_WITH_ASCEND_CL)
-      alignment = NPUMinChunkSize();
+      alignment = phi::backends::npu::NPUMinChunkSize();
 #elif defined(PADDLE_WITH_MLU)
-      alignment = MLUMinChunkSize();
+      alignment = phi::backends::mlu::MLUMinChunkSize();
 #else
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
+      PADDLE_THROW(phi::errors::PreconditionNotMet(
           "Fluid is not compiled with CUDA/XPU/NPU/MLU."));
 #endif
     }
   }
-  if (is_npu_place(place)) {
+  if (place.GetType() == phi::AllocationType::NPU) {
     size += 32;  // required by ascendcl
   }
   size_t remaining = size % alignment;
   return remaining == 0 ? size : size + (alignment - remaining);
 }
-}  // namespace platform
-}  // namespace paddle
+
+}  // namespace phi
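As a quick sanity check on the rounding rule implemented above (a sketch, assuming `phi::CPUPlace` from `paddle/phi/common/place.h`): with no explicit `align_size`, a CPU place pads up to `CpuMinChunkSize()` (4 KB), and a positive `align_size` overrides the per-device minimum.

```cpp
#include "paddle/phi/backends/device_memory_aligment.h"
#include "paddle/phi/common/place.h"

void AlignmentExamples() {
  phi::CPUPlace cpu;
  size_t a = phi::Alignment(4097, cpu);     // 8192: rounded up to the 4096-byte CPU chunk
  size_t b = phi::Alignment(4096, cpu);     // 4096: already aligned, returned unchanged
  size_t c = phi::Alignment(100, cpu, 64);  // 128: explicit align_size = 64 takes precedence
}
```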
@@ -67,6 +67,12 @@ const gpuDeviceProp &GetDeviceProperties(int id);
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);
 
+//! Get the minimum chunk size for GPU buddy allocator.
+inline size_t GpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst,
                     const void *src,
......
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,22 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <stddef.h>
-
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/place.h"
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/device/npu/npu_info.h"
-#endif
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
-
-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size,
-                 const platform::Place &place,
-                 int align_size = -1);
-}  // namespace platform
-}  // namespace paddle
+
+namespace phi {
+namespace backends {
+namespace mlu {
+
+//! Get the minimum chunk size for MLU buddy allocator.
+inline size_t MLUMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
+}  // namespace mlu
+}  // namespace backends
+}  // namespace phi
+
+#endif
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_ASCEND_CL
+
+namespace phi {
+namespace backends {
+namespace npu {
+
+//! Get the minimum chunk size for NPU buddy allocator.
+inline size_t NPUMinChunkSize() {
+  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
+  // though no document specify that explicitly.
+  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
+  // details.
+  return 1 << 9;
+}
+
+}  // namespace npu
+}  // namespace backends
+}  // namespace phi
+
+#endif
@@ -76,8 +76,7 @@ set(COMMON_KERNEL_DEPS
     fft
     phi_data_layout_transform
     gpc
-    utf8proc
-    device_memory_aligment)
+    utf8proc)
 
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
 if(WITH_NCCL OR WITH_RCCL)
......
@@ -20,7 +20,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 
 namespace phi {
@@ -44,8 +44,7 @@ void CheckMemoryContinueKernel(const Context &dev_ctx,
             input.at(i)->dtype()));
     const void *cur_address = input.at(i - 1)->data();
     int64_t len = input.at(i - 1)->numel();
-    auto offset =
-        paddle::platform::Alignment(len * size_of_dtype, dev_ctx.GetPlace());
+    auto offset = phi::Alignment(len * size_of_dtype, dev_ctx.GetPlace());
     void *infer_next_address = reinterpret_cast<void *>(
         reinterpret_cast<uintptr_t>(cur_address) + offset);
     const void *next_address = input.at(i)->data();
@@ -71,8 +70,8 @@ void CheckMemoryContinueKernel(const Context &dev_ctx,
             infer_next_address,
             next_address));
   }
-  numel += paddle::platform::Alignment(
-      (*input.rbegin())->numel() * size_of_dtype, dev_ctx.GetPlace());
+  numel += phi::Alignment((*input.rbegin())->numel() * size_of_dtype,
+                          dev_ctx.GetPlace());
   // reset holder, do inplace
   output->ShareBufferWith(*input.at(0));
   output->Resize({numel / size_of_dtype});
......
@@ -17,8 +17,8 @@
 #include <sstream>
 #include <vector>
 
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -72,12 +72,12 @@ void GetMemSizeAndDtype(const std::vector<const DenseTensor *> &lod_tensors,
         0,
         errors::InvalidArgument(
            "The number of `%d`-th tensor's elements is 0.", i));
-    auto len = use_align ? paddle::platform::Alignment(
-                               static_cast<size_t>(size) * size_of_dtype,
-                               place,
-                               align_size) /
-                               size_of_dtype
-                         : static_cast<size_t>(size);
+    auto len = use_align
+                   ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                    place,
+                                    align_size) /
+                         size_of_dtype
+                   : static_cast<size_t>(size);
     const void *ptr =
         lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
     VLOG(4) << size << " " << len;
@@ -206,7 +206,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
     phi::Copy(dev_ctx, *input[i], dev_ctx.GetPlace(), false, &sub_tensor);
 
     offset += use_align
-                  ? paddle::platform::Alignment(
+                  ? phi::Alignment(
                         len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                         size_of_dtype
                   : len;
@@ -224,7 +224,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
       phi::Copy(dev_ctx, *output[i], dev_ctx.GetPlace(), false, &sub_tensor);
     }
     offset += use_align
-                  ? paddle::platform::Alignment(
+                  ? phi::Alignment(
                         len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                         size_of_dtype
                   : len;
@@ -244,7 +244,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
         ->ShareDataWith(fused_output->Slice(static_cast<int64_t>(offset),
                                             static_cast<int64_t>(offset + len)))
         .Resize(dim);
-    len = use_align ? paddle::platform::Alignment(
+    len = use_align ? phi::Alignment(
                           len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                           size_of_dtype
                     : len;
......