// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/coalesce_tensor_kernel.h"

#include <sstream>
#include <vector>

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/device_memory_aligment.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"

#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/kernels/funcs/math_function_impl.h"
#endif

namespace phi {

// Visitor that fills the fused output tensor with a constant value.
// int8_t and int16_t are rejected because SetConstant is not supported
// for them here.
template <typename Context>
struct FillConstantVisitor {
  FillConstantVisitor(const Context &dev_ctx,
                      DenseTensor *tensor,
                      const float value)
      : dev_ctx_(dev_ctx), tensor_(tensor), value_(value) {}

  template <typename T>
  void apply(typename std::enable_if<std::is_same<T, int8_t>::value ||
                                     std::is_same<T, int16_t>::value>::type * =
                 nullptr) const {
    PADDLE_THROW(errors::InvalidArgument(
        "Unsupported data type for set_constant attr"));
  }

  template <typename T>
  void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
                                       std::is_same<T, int16_t>::value)>::type
                 * = nullptr) const {
    phi::funcs::SetConstant<Context, T> set_constant;
    set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
  }

  const Context &dev_ctx_;
  DenseTensor *tensor_;
  float value_;
};

// Computes the total number of elements (including per-tensor alignment
// padding) needed to hold all input tensors in one continuous buffer.
void GetMemSizeAndDtype(const std::vector<const DenseTensor *> &lod_tensors,
                        size_t *numel,
                        const size_t &size_of_dtype,
                        const phi::Place &place,
                        const bool use_align = true,
                        const int align_size = -1) {
  *numel = 0;
  std::stringstream ss;
  ss << "alloc_space_for_vars: ";
  for (size_t i = 0; i < lod_tensors.size(); ++i) {
    auto size = lod_tensors[i]->numel();
    PADDLE_ENFORCE_GT(
        size,
        0,
        errors::InvalidArgument("The number of `%d`-th tensor's elements is 0.",
                                i));
    auto len = use_align
                   ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
                                    place,
                                    align_size) /
                         size_of_dtype
                   : static_cast<size_t>(size);
    const void *ptr =
        lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
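    // Record each tensor's raw element count and its aligned slot length
    // for debugging.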
    VLOG(4) << size << " " << len;
    ss << "input(" << i << "-th tensor) dim:(" << lod_tensors[i]->dims()
       << ") "
       << " address:" << ptr << " len: " << len << ", ";
    *numel += len;
  }
  VLOG(10) << ss.str();
}

template <typename T, typename Context>
void CoalesceTensorKernel(const Context &dev_ctx,
                          const std::vector<const DenseTensor *> &input,
                          DataType dtype,
                          bool copy_data,
                          bool set_constant,
                          bool persist_output,
                          float constant,
                          bool use_align,
                          int align_size,
                          int size_of_dtype,
                          const std::vector<int64_t> &concated_shapes,
                          const std::vector<int64_t> &concated_ranks,
                          std::vector<DenseTensor *> output,
                          DenseTensor *fused_output) {
  PADDLE_ENFORCE_GT(
      input.size(),
      static_cast<size_t>(0),
      errors::InvalidArgument("The CoalesceTensor operator has no input."));
  PADDLE_ENFORCE_EQ(input.size(),
                    output.size(),
                    errors::InvalidArgument(
                        "The number of CoalesceTensor operator's input and "
                        "output does not match, "
                        "input number is %u, output number is %u.",
                        input.size(),
                        output.size()));

  // Input & Output check: only support LoDTensor
  bool has_not_init_in_vars = false;
  for (size_t i = 0; i < input.size(); ++i) {
    PADDLE_ENFORCE_NOT_NULL(
        input[i],
        errors::InvalidArgument("The %d-th input tensor cannot be nullptr.",
                                i));
    PADDLE_ENFORCE_NOT_NULL(
        output[i],
        errors::InvalidArgument("The %d-th output tensor cannot be nullptr.",
                                i));
    if (!input[i]->IsInitialized()) {
      has_not_init_in_vars = true;
    }
  }

  if (has_not_init_in_vars) {
    PADDLE_ENFORCE_EQ(
        concated_ranks.size(),
        output.size(),
        errors::InvalidArgument("The attribute(concated_ranks) length must be "
                                "equal to the output tensor number."));
    int64_t accumulated_ranks = 0;
    for (size_t i = 0; i < input.size(); ++i) {
      phi::DDim dims(concated_shapes.data() + accumulated_ranks,
                     concated_ranks[i]);
      if (!input[i]->IsInitialized()) {
        PADDLE_ENFORCE_EQ(
            input[i],
            output[i],
            errors::InvalidArgument(
                "The %d-th output tensor and %d-th input tensor must be the "
                "same when the %d-th input tensor is not initialized.",
                i,
                i,
                i));
        output[i]->Resize(dims);
      } else {
        PADDLE_ENFORCE_EQ(
            input[i]->dims(),
            dims,
            errors::InvalidArgument(
                "The %d-th input tensor shape does not match the "
                "attribute(concated_shapes) and "
                "attribute(concated_ranks).",
                i));
      }
      accumulated_ranks += concated_ranks[i];
      PADDLE_ENFORCE_LE(
          accumulated_ranks,
          concated_shapes.size(),
          errors::InvalidArgument("The attribute(concated_shapes) and "
                                  "attribute(concated_ranks) do not match."));
    }
    PADDLE_ENFORCE_EQ(
        accumulated_ranks,
        concated_shapes.size(),
        errors::InvalidArgument("The attribute(concated_shapes) and "
                                "attribute(concated_ranks) do not match."));
  }

  // Init the output as input
  for (size_t i = 0; i < input.size(); ++i) {
    output[i]->Resize(input[i]->dims());
  }

  // Get numel and dtype
  size_t numel = 0;
  if (size_of_dtype == -1) {
    size_of_dtype = paddle::experimental::SizeOf(dtype);
  }
  GetMemSizeAndDtype(
      input, &numel, size_of_dtype, dev_ctx.GetPlace(), use_align, align_size);

  // Alloc the continuous space
  void *fused_tensor_ptr = dev_ctx.Alloc(
      &fused_output->Resize(phi::make_ddim({static_cast<int64_t>(numel)})),
      dtype);
  VLOG(10) << "Fused tensor addr " << fused_tensor_ptr;

  // Init the continuous space
  size_t offset = 0;
  if (copy_data) {
    for (size_t i = 0; i < input.size(); ++i) {
      size_t len = static_cast<size_t>(input[i]->numel());
      auto sub_tensor =
          fused_output->Slice(static_cast<int64_t>(offset),
                              static_cast<int64_t>(offset + len));
      phi::Copy(dev_ctx, *input[i], dev_ctx.GetPlace(), false, &sub_tensor);
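      // Advance the offset by the aligned length so the next tensor starts
      // at a properly aligned position inside the fused buffer.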
      offset += use_align
                    ? phi::Alignment(len * size_of_dtype,
                                     dev_ctx.GetPlace(),
                                     align_size) /
                          size_of_dtype
                    : len;
    }
  } else if (set_constant) {
    phi::VisitDataType(
        dtype, FillConstantVisitor<Context>(dev_ctx, fused_output, constant));
  } else if (persist_output) {
    for (size_t i = 0; i < output.size(); ++i) {
      size_t len = static_cast<size_t>(output[i]->numel());
      auto sub_tensor =
          fused_output->Slice(static_cast<int64_t>(offset),
                              static_cast<int64_t>(offset + len));
      // Some vars may not be persistable, or a persistable var may not be
      // initialized.
      if (output[i]->IsInitialized()) {
        phi::Copy(dev_ctx, *output[i], dev_ctx.GetPlace(), false, &sub_tensor);
      }
      offset += use_align
                    ? phi::Alignment(len * size_of_dtype,
                                     dev_ctx.GetPlace(),
                                     align_size) /
                          size_of_dtype
                    : len;
    }
  }

  // Make the outputs point to the continuous space.
  offset = 0;
  std::stringstream ss;
  ss << "alloc_space_for_vars: ";

  for (size_t i = 0; i < output.size(); ++i) {
    size_t len = static_cast<size_t>(output[i]->numel());
    auto dim = output[i]->dims();
    VLOG(4) << len << " " << dim << " " << offset;
    output[i]
        ->ShareDataWith(
            fused_output->Slice(static_cast<int64_t>(offset),
                                static_cast<int64_t>(offset + len)))
        .Resize(dim);
    len = use_align
              ? phi::Alignment(len * size_of_dtype,
                               dev_ctx.GetPlace(),
                               align_size) /
                    size_of_dtype
              : len;
    ss << "output(" << i << "-th tensor) dim:(" << dim << ")"
       << " address: " << output[i]->data() << " len: " << len << ", ";
    offset += len;
  }
  PADDLE_ENFORCE_EQ(
      static_cast<int64_t>(offset),
      fused_output->numel(),
      errors::InvalidArgument(
          "The alloc_space_for_vars's offset: %s is not equal to "
          "fused_output's numel: %s.",
          offset,
          fused_output->numel()));
  VLOG(10) << ss.str();
}

}  // namespace phi

PD_REGISTER_KERNEL(coalesce_tensor,
                   CPU,
                   ALL_LAYOUT,
                   phi::CoalesceTensorKernel,
                   int,
                   float,
                   double) {}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(coalesce_tensor,
                   GPU,
                   ALL_LAYOUT,
                   phi::CoalesceTensorKernel,
                   phi::dtype::float16,
                   int,
                   float,
                   double) {
  kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}
#endif

#ifdef PADDLE_WITH_XPU
PD_REGISTER_KERNEL(coalesce_tensor,
                   XPU,
                   ALL_LAYOUT,
                   phi::CoalesceTensorKernel,
                   phi::dtype::float16,
                   int,
                   float,
                   double) {
  kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}
#endif