// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle_infer {
namespace contrib {

using paddle::PaddleDType;

void* TensorUtils::CudaMallocPinnedMemory(size_t size) {
#if defined(PADDLE_WITH_CUDA)
  void* ptr = nullptr;
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&ptr, size));
  return ptr;
#else
  return nullptr;
#endif
}

void TensorUtils::CudaFreePinnedMemory(void* ptr) {
#if defined(PADDLE_WITH_CUDA)
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr));
#endif
}

void TensorUtils::CopyTensorImpl(Tensor* p_dst, const Tensor& src,
                                 void* exec_stream, CallbackFunc cb,
                                 void* cb_params) {
  Tensor& dst = *p_dst;
  dst.Reshape(src.shape());
  PADDLE_ENFORCE(
      src.place() == PlaceType::kCPU || src.place() == PlaceType::kGPU,
      paddle::platform::errors::InvalidArgument(
          "CopyTensor only supports PlaceType kCPU/kGPU now."));
  PADDLE_ENFORCE(
      dst.place() == PlaceType::kCPU || dst.place() == PlaceType::kGPU,
      paddle::platform::errors::InvalidArgument(
          "CopyTensor only supports PlaceType kCPU/kGPU now."));

  // Copy to CPU: gpu => cpu or cpu => cpu.
  if (dst.place() == PlaceType::kCPU) {
    switch (src.type()) {
      case PaddleDType::INT32:
        src.CopyToCpuImpl(dst.mutable_data<int32_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::INT64:
        src.CopyToCpuImpl(dst.mutable_data<int64_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::FLOAT32:
        src.CopyToCpuImpl(dst.mutable_data<float>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::UINT8:
        src.CopyToCpuImpl(dst.mutable_data<uint8_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::INT8:
        src.CopyToCpuImpl(dst.mutable_data<int8_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::FLOAT16:
        src.CopyToCpuImpl(
            dst.mutable_data<paddle::platform::float16>(PlaceType::kCPU),
            exec_stream, cb, cb_params);
        break;
      default:
        PADDLE_THROW(paddle::platform::errors::Unimplemented(
            "Only INT32, INT64, UINT8, INT8, FLOAT16 and FLOAT32 are "
            "supported in Tensor. Other data types are not implemented."));
    }
    // Copy to GPU: gpu => gpu or cpu => gpu.
  } else {
#if defined(PADDLE_WITH_CUDA)
    void* dst_data = nullptr;
    void* src_data = nullptr;
    size_t data_len = 0;
    int data_size = 0;
    PlaceType src_place;
    switch (src.type()) {
      case PaddleDType::INT32:
        dst_data =
            static_cast<void*>(dst.mutable_data<int32_t>(PlaceType::kGPU));
        src_data =
            static_cast<void*>(src.data<int32_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int32_t);
        break;
      case PaddleDType::INT64:
        dst_data =
            static_cast<void*>(dst.mutable_data<int64_t>(PlaceType::kGPU));
        src_data =
            static_cast<void*>(src.data<int64_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int64_t);
        break;
      case PaddleDType::FLOAT32:
        dst_data = static_cast<void*>(dst.mutable_data<float>(PlaceType::kGPU));
        src_data = static_cast<void*>(src.data<float>(&src_place, &data_size));
        data_len = data_size * sizeof(float);
        break;
      case PaddleDType::UINT8:
        dst_data =
            static_cast<void*>(dst.mutable_data<uint8_t>(PlaceType::kGPU));
        src_data =
            static_cast<void*>(src.data<uint8_t>(&src_place, &data_size));
        data_len = data_size * sizeof(uint8_t);
        break;
      case PaddleDType::INT8:
        dst_data =
            static_cast<void*>(dst.mutable_data<int8_t>(PlaceType::kGPU));
        src_data = static_cast<void*>(src.data<int8_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int8_t);
        break;
      case PaddleDType::FLOAT16:
        dst_data = static_cast<void*>(
            dst.mutable_data<paddle::platform::float16>(PlaceType::kGPU));
        src_data = static_cast<void*>(
            src.data<paddle::platform::float16>(&src_place, &data_size));
        data_len = data_size * sizeof(paddle::platform::float16);
        break;
      default:
        PADDLE_THROW(paddle::platform::errors::Unimplemented(
            "Only INT32, INT64, UINT8, INT8, FLOAT16 and FLOAT32 are "
            "supported in Tensor. Other data types are not implemented."));
    }

    paddle::platform::DeviceContextPool& pool =
        paddle::platform::DeviceContextPool::Instance();
    paddle::platform::CUDAPlace gpu_place(dst.device_);
    auto* dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>(
        pool.Get(gpu_place));

    if (src.place() == PlaceType::kCPU) {
      paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
                           paddle::platform::CPUPlace(), src_data, data_len,
                           dev_ctx->stream());
    } else {
      // Use the source tensor's device id so device-to-device copies are
      // issued against the correct GPU.
      paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
                           paddle::platform::CUDAPlace(src.device_), src_data,
                           data_len, dev_ctx->stream());
    }

    if (nullptr != exec_stream) {
      // Report the stream used for the copy back to the caller, who is then
      // responsible for synchronizing it.
      *(static_cast<cudaStream_t*>(exec_stream)) = dev_ctx->stream();
    } else if (cb) {
      // Enqueue the user callback on the same stream, after the copy.
      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
    } else {
      // Neither a stream nor a callback was supplied: block until the copy
      // has finished.
      cudaStreamSynchronize(dev_ctx->stream());
    }
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not copy tensor to GPU CUDA place because paddle is not compiled "
        "with CUDA."));
#endif
  }
  return;
}

void TensorUtils::CopyTensor(Tensor* p_dst, const Tensor& src) {
  CopyTensorImpl(p_dst, src, nullptr, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
                                  void* exec_stream) {
  CopyTensorImpl(p_dst, src, exec_stream, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
                                  CallbackFunc cb, void* cb_params) {
  CopyTensorImpl(p_dst, src, nullptr, cb, cb_params);
}

}  // namespace contrib
}  // namespace paddle_infer
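
// Usage sketch (illustrative only, not part of this translation unit).
// It assumes `predictor` is a valid paddle_infer::Predictor that has already
// run, and `dst` is another Tensor handle owned by the caller (for example an
// input handle of a second predictor); both names are assumptions here.
//
//   using paddle_infer::contrib::TensorUtils;
//
//   auto src = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
//
//   // Blocking copy: returns once the data is in *dst.
//   TensorUtils::CopyTensor(dst.get(), *src);
//
//   // Non-blocking copy to a GPU tensor: the CUDA stream used for the copy
//   // is reported back through `stream`, so the caller chooses when to sync.
//   cudaStream_t stream;
//   TensorUtils::CopyTensorAsync(dst.get(), *src, &stream);
//   cudaStreamSynchronize(stream);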