// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h"

#include <fstream>

#include "llvm/Support/ErrorHandling.h"
#include "paddle/infrt/backends/host/phi_allocator.h"
#include "paddle/infrt/common/string.h"
#include "paddle/infrt/dialect/phi/data_type.h"
#include "paddle/infrt/kernel/phi/context_kernels.h"
#include "paddle/infrt/paddle/model_parser.h"
#include "paddle/infrt/paddle/scope.h"
#include "paddle/infrt/tensor/tensor_map.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/dense_tensor.h"

#ifdef INFRT_WITH_GPU
#include <cuda_runtime.h>
#endif

namespace infrt {
namespace kernel {
namespace phi {

// Creates an uninitialized CPU DenseTensor with the given dims, layout and
// precision. The `lod` attribute is currently unused.
::phi::DenseTensor CreateDenseTensor(
    const ::phi::CPUContext& context,
    host_context::Attribute<std::vector<int64_t>> dims,
    host_context::Attribute<std::vector<int64_t>> lod,
    host_context::Attribute<::infrt::LayoutType> layout,
    host_context::Attribute<::infrt::PrecisionType> precision) {
  return ::phi::DenseTensor(
      const_cast<::phi::Allocator*>(&context.GetAllocator()),
      ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()),
                             ::phi::make_ddim(dims.get()),
                             ConvertLayoutToPhi(layout.get()),
                             {}));
}

// Creates a float32 CPU DenseTensor and fills every element with `value`.
::phi::DenseTensor CreateInitedDenseTensorF32(
    const ::phi::CPUContext& context,
    host_context::Attribute<std::vector<int64_t>> dims,
    host_context::Attribute<std::vector<int64_t>> lod,
    host_context::Attribute<::infrt::LayoutType> layout,
    host_context::Attribute<float> value) {
  ::phi::DenseTensor dense_tensor(
      const_cast<::phi::Allocator*>(&context.GetAllocator()),
      ::phi::DenseTensorMeta(
          ConvertPrecisionToPhi(::infrt::PrecisionType::FLOAT32),
          ::phi::make_ddim(dims.get()),
          ConvertLayoutToPhi(layout.get()),
          {}));
  float* a_data = dense_tensor.mutable_data<float>(::phi::CPUPlace());
  for (int64_t i = 0; i < dense_tensor.numel(); ++i) {
    a_data[i] = value.get();
  }
  return dense_tensor;
}

// Creates a float32 CPU DenseTensor initialized from the host-side `values`
// attribute. The number of values must match the tensor's element count.
::phi::DenseTensor CreateHostInitedDenseTensorF32(
    const ::phi::CPUContext& context,
    host_context::Attribute<std::vector<int64_t>> dims,
    host_context::Attribute<std::vector<int64_t>> lod,
    host_context::Attribute<::infrt::LayoutType> layout,
    host_context::Attribute<std::vector<float>> values) {
  ::phi::DenseTensor dense_tensor(
      const_cast<::phi::Allocator*>(&context.GetAllocator()),
      ::phi::DenseTensorMeta(
          ConvertPrecisionToPhi(::infrt::PrecisionType::FLOAT32),
          ::phi::make_ddim(dims.get()),
          ConvertLayoutToPhi(layout.get()),
          {}));
  CHECK_EQ(dense_tensor.numel(), static_cast<int64_t>(values.get().size()));
  float* data = dense_tensor.mutable_data<float>(::phi::CPUPlace());
  for (int64_t i = 0; i < dense_tensor.numel(); ++i) {
    data[i] = values.get()[i];
  }
  return dense_tensor;
}

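// Creates an uninitialized GPU DenseTensor with the given dims, layout and
// precision, allocating through the GPU context's allocator. As with the CPU
// variant above, the `lod` attribute is currently unused.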
::phi::DenseTensor CreateGPUDenseTensor(
    const ::phi::GPUContext& context,
    host_context::Attribute<std::vector<int64_t>> dims,
    host_context::Attribute<std::vector<int64_t>> lod,
    host_context::Attribute<::infrt::LayoutType> layout,
    host_context::Attribute<::infrt::PrecisionType> precision) {
  return ::phi::DenseTensor(
      const_cast<::phi::Allocator*>(&context.GetAllocator()),
      ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()),
                             ::phi::make_ddim(dims.get()),
                             ConvertLayoutToPhi(layout.get()),
                             {}));
}

// Fills a float32 DenseTensor from the host-side `value` attribute. CPU
// tensors are written directly; GPU tensors are filled with a synchronous
// host-to-device cudaMemcpy.
void FillDenseTensorF32(::phi::DenseTensor* dense_tensor,
                        host_context::Attribute<std::vector<float>> value) {
  auto place = dense_tensor->place();
  float* a_data = dense_tensor->mutable_data<float>(place);
  if (place.GetType() == ::phi::AllocationType::CPU) {
    for (int64_t i = 0; i < dense_tensor->numel(); ++i) {
      a_data[i] = (value.get())[i];
    }
  } else if (place.GetType() == ::phi::AllocationType::GPU) {
#ifdef INFRT_WITH_GPU
    // TODO(wilber): how to set the stream parameter to copy with stream.
    cudaMemcpy(a_data,
               value.get().data(),
               sizeof(float) * value.get().size(),
               cudaMemcpyHostToDevice);
#endif
  } else {
    llvm_unreachable("other targets are not supported yet.");
  }
}

// Prints a DenseTensor's shape and elements to stdout. GPU tensors are first
// copied back to a host buffer. Only FLOAT32 and INT32 are handled.
void PrintDenseTensor(::phi::DenseTensor* dense_tensor) {
#ifndef INFRT_WITH_GPU
#define PRINT_META_DATA(PHI_DATATYPE, DTYPE)                \
  case ::phi::DataType::PHI_DATATYPE: {                     \
    auto place = dense_tensor->place();                     \
    if (place.GetType() == ::phi::AllocationType::CPU) {    \
      DTYPE* data = dense_tensor->data<DTYPE>();            \
      if (dense_tensor->numel() == 0) break;                \
      std::cout << data[0];                                 \
      for (int64_t i = 1; i < dense_tensor->numel(); i++) { \
        std::cout << "," << data[i];                        \
      }                                                     \
    }                                                       \
    break;                                                  \
  }
#else
#define PRINT_META_DATA(PHI_DATATYPE, DTYPE)                    \
  case ::phi::DataType::PHI_DATATYPE: {                         \
    auto place = dense_tensor->place();                         \
    DTYPE* data = dense_tensor->data<DTYPE>();                  \
    if (dense_tensor->numel() == 0) break;                      \
    if (place.GetType() == ::phi::AllocationType::CPU) {        \
      std::cout << data[0];                                     \
      for (int64_t i = 1; i < dense_tensor->numel(); i++) {     \
        std::cout << "," << data[i];                            \
      }                                                         \
    } else if (place.GetType() == ::phi::AllocationType::GPU) { \
      std::vector<DTYPE> host_data(dense_tensor->numel(), 0);   \
      cudaMemcpy(host_data.data(),                              \
                 data,                                          \
                 sizeof(DTYPE) * dense_tensor->numel(),         \
                 cudaMemcpyDeviceToHost);                       \
      std::cout << host_data[0];                                \
      for (int64_t i = 1; i < dense_tensor->numel(); i++) {     \
        std::cout << "," << host_data[i];                       \
      }                                                         \
    } else {                                                    \
      llvm_unreachable("other targets are not supported yet."); \
    }                                                           \
    break;                                                      \
  }
#endif

  ::phi::DDim dims = dense_tensor->dims();
  std::cout << "dense_tensor: shape=" << dims.to_str() << ","
            << " value=[";
  switch (dense_tensor->dtype()) {
    PRINT_META_DATA(FLOAT32, float);
    PRINT_META_DATA(INT32, int32_t);
    default:
      std::cout << "Error! Unsupported data type!\n";
  }
  std::cout << "]\n";
#undef PRINT_META_DATA
}

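// Loads persistable parameters from a non-combined Paddle model directory:
// the program at `file_path/__model__` is parsed to discover variable names,
// and each parameter is then deserialized from `file_path/<var_name>` into
// the returned DenseTensorMap. The `feed` and `fetch` variables and
// non-persistable variables are skipped.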
::infrt::phi::DenseTensorMap LoadParameters(const std::string& file_path) {
  std::cout << "loading params from: " << file_path << std::endl;
  ::infrt::phi::DenseTensorMap map;

  const std::string model_path = file_path + "/__model__";
  auto pb_proto_prog = paddle::LoadProgram(model_path);
  auto main_block = pb_proto_prog->blocks(0);

  ::phi::CPUContext ctx;
  auto allocator = std::make_unique<backends::CpuPhiAllocator>();
  const auto* allocator_ptr = allocator.get();
  ctx.SetAllocator(allocator_ptr);
  ctx.SetHostAllocator(allocator_ptr);
  ctx.SetZeroAllocator(allocator_ptr);
  for (auto& var : main_block.vars()) {
    if (var.name() == "feed" || var.name() == "fetch" || !var.persistable())
      continue;
    std::string param_path = file_path + "/" + var.name();
    std::ifstream param_file(param_path, std::ios::binary);
    switch (var.type().type()) {
      case ::paddle::framework::proto::VarType_Type_LOD_TENSOR: {
        std::unique_ptr<::phi::DenseTensor> tensor{
            std::make_unique<::phi::DenseTensor>()};
        ::infrt::paddle::DeserializeFromStream(param_file, tensor.get(), ctx);
        map.SetDenseTensor(var.name(), std::move(tensor));
      } break;
      default: {
        LOG(WARNING) << "Var `" << var.name() << "` type `"
                     << static_cast<int>(var.type().type())
                     << "` is not supported yet.";
      }
    }
  }
  return map;
}

::infrt::phi::DenseTensorMap LoadParams(
    host_context::Attribute<std::string> path) {
  return LoadParameters(path.get());
}

// Loads persistable parameters from a combined Paddle model: variable names
// are collected from the program at `model_path`, then all tensors are
// deserialized sequentially from the single file at `params_path`.
::infrt::phi::DenseTensorMap LoadCombinedParameters(
    const std::string& model_path, const std::string& params_path) {
  ::infrt::phi::DenseTensorMap map;

  auto pb_proto_prog = paddle::LoadProgram(model_path);
  auto main_block = pb_proto_prog->blocks(0);

  std::ifstream param_file(params_path, std::ios::binary);

  std::set<std::string> tmp;
  for (auto& var : main_block.vars()) {
    if (var.name() == "feed" || var.name() == "fetch" || !var.persistable()) {
      continue;
    }
    if (var.type().type() ==
        ::paddle::framework::proto::VarType_Type_LOD_TENSOR) {
      tmp.emplace(var.name());
    } else {
      llvm_unreachable("the tensor type is illegal.");
    }
  }

  ::phi::CPUContext ctx;
  auto allocator = std::make_unique<backends::CpuPhiAllocator>();
  const auto* allocator_ptr = allocator.get();
  ctx.SetAllocator(allocator_ptr);
  ctx.SetHostAllocator(allocator_ptr);
  ctx.SetZeroAllocator(allocator_ptr);
  for (auto& var : tmp) {
    std::unique_ptr<::phi::DenseTensor> tensor{
        std::make_unique<::phi::DenseTensor>()};
    ::infrt::paddle::DeserializeFromStream(param_file, tensor.get(), ctx);
    map.SetDenseTensor(var, std::move(tensor));
  }

  return map;
}

::infrt::phi::DenseTensorMap LoadCombinedParams(
    host_context::Attribute<std::string> model_path,
    host_context::Attribute<std::string> params_path) {
  return LoadCombinedParameters(model_path.get(), params_path.get());
}

// Returns a copy of the named tensor from the map; aborts if it is missing.
::phi::DenseTensor TensorMapGetTensor(
    const ::infrt::phi::DenseTensorMap& map,
    host_context::Attribute<std::string> name) {
  auto* tensor = map.GetDenseTensor(name.get());
  CHECK(tensor);
  return *tensor;
}

int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map) {
  return map.size();
}

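// GPU-only helpers: SizeOfDataType maps a phi::DataType to its size in bytes,
// and GpuMemCpy copies a DenseTensor between host and device. Note that
// GpuMemCpy only (re)allocates `output` when the source has more elements
// than the destination, and only the device-to-host path synchronizes the
// stream before returning.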
#ifdef INFRT_WITH_GPU
inline size_t SizeOfDataType(::phi::DataType data_type) {
  switch (data_type) {
    case ::phi::DataType::BOOL:
    case ::phi::DataType::UINT8:
    case ::phi::DataType::INT8:
      return 1;
    case ::phi::DataType::BFLOAT16:
    case ::phi::DataType::FLOAT16:
    case ::phi::DataType::INT16:
    case ::phi::DataType::UINT16:
      return 2;
    case ::phi::DataType::FLOAT32:
    case ::phi::DataType::INT32:
    case ::phi::DataType::UINT32:
      return 4;
    case ::phi::DataType::FLOAT64:
    case ::phi::DataType::INT64:
    case ::phi::DataType::UINT64:
    case ::phi::DataType::COMPLEX64:
      return 8;
    case ::phi::DataType::COMPLEX128:
      return 16;
    case ::phi::DataType::UNDEFINED:
      return 0;
    default:
      llvm_unreachable("should not reach here");
      return 0;
  }
  return 0;
}

void GpuMemCpy(const ::phi::DenseTensor& input,
               const ::phi::GPUContext& context,
               bool d2h,
               ::phi::DenseTensor* output) {
  if (d2h) {
    CHECK(input.place().GetType() == ::phi::AllocationType::GPU);

    // TODO(wilber): Just a trick to avoid malloc.
    if (input.numel() > output->numel()) {
      // TODO(wilber): Use pinned memory.
      output->Resize(input.dims());
      context.HostAlloc(
          output, input.dtype(), input.numel() * SizeOfDataType(input.dtype()));
    }
    cudaMemcpyAsync(output->data(),
                    input.data(),
                    SizeOfDataType(input.dtype()) * input.numel(),
                    cudaMemcpyDeviceToHost,
                    context.stream());
    // TODO(wilber): Ir add sync op.
    cudaStreamSynchronize(context.stream());
  } else {
    // h2d
    CHECK(input.place().GetType() == ::phi::AllocationType::CPU ||
          input.place().GetType() == ::phi::AllocationType::GPUPINNED);
    if (input.numel() > output->numel()) {
      output->Resize(input.dims());
      context.Alloc(output,
                    input.dtype(),
                    input.numel() * SizeOfDataType(input.dtype()),
                    false);
    } else {
      output->Resize(input.dims());
    }
    // TODO(wilber): Add sync op and stream.
    cudaMemcpyAsync(output->data(),
                    input.data(),
                    SizeOfDataType(input.dtype()) * input.numel(),
                    cudaMemcpyHostToDevice,
                    context.stream());
  }
}
#endif

}  // namespace phi
}  // namespace kernel
}  // namespace infrt