diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index a15a1cd84b14094c6ea95f94ffaaf31f4a790376..9c7e5c6b27e68ee10be5f8b56d6de4aea4524078 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -13,6 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/analysis_predictor.h"
+#if defined(PADDLE_WITH_CUDA)
+#include <cuda_runtime.h>
+#endif
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <thread>  // NOLINT
@@ -405,4 +408,83 @@ TEST(Predictor, Run) {
   predictor->TryShrinkMemory();
 }
 
+TEST(Tensor, CpuShareExternalData) {
+  Config config;
+  config.SetModel(FLAGS_dirname);
+
+  auto predictor = CreatePredictor(config);
+
+  auto w0 = predictor->GetInputHandle("firstw");
+  auto w1 = predictor->GetInputHandle("secondw");
+  auto w2 = predictor->GetInputHandle("thirdw");
+  auto w3 = predictor->GetInputHandle("forthw");
+
+  std::vector<std::vector<int64_t>> input_data(4, {0, 1, 2, 3});
+  w0->ShareExternalData(input_data[0].data(), {4, 1}, PlaceType::kCPU);
+  w1->ShareExternalData(input_data[1].data(), {4, 1}, PlaceType::kCPU);
+  w2->ShareExternalData(input_data[2].data(), {4, 1}, PlaceType::kCPU);
+  w3->ShareExternalData(input_data[3].data(), {4, 1}, PlaceType::kCPU);
+
+  auto out = predictor->GetOutputHandle("fc_1.tmp_2");
+  auto out_shape = out->shape();
+  std::vector<float> out_data;
+  out_data.resize(std::accumulate(out_shape.begin(), out_shape.end(), 1,
+                                  std::multiplies<int>()));
+  out->ShareExternalData(out_data.data(), out_shape, PlaceType::kCPU);
+
+  predictor->Run();
+
+  PlaceType place;
+  int size = 0;
+  out->data<float>(&place, &size);
+  LOG(INFO) << "output size: " << size / sizeof(float);
+  predictor->TryShrinkMemory();
+}
+
+#if defined(PADDLE_WITH_CUDA)
+TEST(Tensor, GpuShareExternalData) {
+  Config config;
+  config.SetModel(FLAGS_dirname);
+  config.EnableUseGpu(100, 0);
+
+  auto predictor = CreatePredictor(config);
+
+  auto w0 = predictor->GetInputHandle("firstw");
+  auto w1 = predictor->GetInputHandle("secondw");
+  auto w2 = predictor->GetInputHandle("thirdw");
+  auto w3 = predictor->GetInputHandle("forthw");
+
+  std::vector<std::vector<int64_t>> input_data(4, {0, 1, 2, 3});
+  std::vector<int64_t *> input_gpu(4, nullptr);
+
+  for (size_t i = 0; i < 4; ++i) {
+    cudaMalloc(reinterpret_cast<void **>(&input_gpu[i]), 4 * sizeof(int64_t));
+    cudaMemcpy(input_gpu[i], input_data[i].data(), 4 * sizeof(int64_t),
+               cudaMemcpyHostToDevice);
+  }
+
+  w0->ShareExternalData(input_gpu[0], {4, 1}, PlaceType::kGPU);
+  w1->ShareExternalData(input_gpu[1], {4, 1}, PlaceType::kGPU);
+  w2->ShareExternalData(input_gpu[2], {4, 1}, PlaceType::kGPU);
+  w3->ShareExternalData(input_gpu[3], {4, 1}, PlaceType::kGPU);
+
+  auto out = predictor->GetOutputHandle("fc_1.tmp_2");
+  auto out_shape = out->shape();
+  float *out_data = nullptr;
+  // out_size is already in bytes, so pass it to cudaMalloc as-is.
+  auto out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1,
+                                  std::multiplies<int>()) *
+                  sizeof(float);
+  cudaMalloc(reinterpret_cast<void **>(&out_data), out_size);
+  out->ShareExternalData(out_data, out_shape, PlaceType::kGPU);
+
+  predictor->Run();
+
+  PlaceType place;
+  int size = 0;
+  out->data<float>(&place, &size);
+  LOG(INFO) << "output size: " << size / sizeof(float);
+  predictor->TryShrinkMemory();
+}
+#endif
+
 }  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index 1d09b01f8f852f2bb7f668d0e2b4ee3250c9cc64..18b1d09f0e8a7c4be9862991060a4706ee7cde7e 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -21,6 +21,7 @@
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/core/allocator.h"
 
 namespace paddle_infer {
 
@@ -205,6 +206,73 @@ void Tensor::CopyFromCpu(const T *data) {
   }
 }
 
+template <typename T>
+struct DataTypeInfo;
+
+template <>
+struct DataTypeInfo<float> {
+  paddle::experimental::DataType TYPE = paddle::experimental::DataType::FLOAT32;
+};
+
+template <>
+struct DataTypeInfo<float16> {
+  paddle::experimental::DataType TYPE = paddle::experimental::DataType::FLOAT16;
+};
+
+template <>
+struct DataTypeInfo<int64_t> {
+  paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT64;
+};
+
+template <>
+struct DataTypeInfo<int8_t> {
+  paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT8;
+};
+
+template <>
+struct DataTypeInfo<uint8_t> {
+  paddle::experimental::DataType TYPE = paddle::experimental::DataType::UINT8;
+};
+
+template <>
+struct DataTypeInfo<int32_t> {
+  paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT32;
+};
+
+paddle::experimental::DataLayout LayoutConvert(DataLayout layout) {
+  PADDLE_ENFORCE_EQ(
+      layout, DataLayout::kNCHW,
+      paddle::platform::errors::InvalidArgument("Only NCHW is supported now."));
+  return paddle::experimental::DataLayout::NCHW;
+}
+
+template <typename T>
+void Tensor::ShareExternalData(const T *data, const std::vector<int> &shape,
+                               PlaceType place, DataLayout layout) {
+  EAGER_GET_TENSOR(paddle::framework::LoDTensor);
+  size_t size =
+      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
+      sizeof(T);
+  phi::DenseTensorMeta meta(DataTypeInfo<T>().TYPE, phi::make_ddim(shape),
+                            LayoutConvert(layout));
+  if (place == PlaceType::kCPU) {
+    phi::DenseTensor dtensor(
+        std::make_shared<phi::Allocation>(const_cast<T *>(data), size,
+                                          paddle::platform::CPUPlace()),
+        meta);
+    *tensor = std::move(dtensor);
+  } else if (place == PlaceType::kGPU) {
+    phi::DenseTensor dtensor(
+        std::make_shared<phi::Allocation>(const_cast<T *>(data), size,
+                                          paddle::platform::CUDAPlace(device_)),
+        meta);
+    *tensor = std::move(dtensor);
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "PlaceType must be PlaceType::kCPU or PlaceType::kGPU."));
+  }
+}
+
 void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) {
   EAGER_GET_TENSOR(paddle_infer::Strings);
   PADDLE_ENFORCE_GE(tensor->size(), 0,
@@ -334,6 +402,25 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
 
+template PD_INFER_DECL void Tensor::ShareExternalData<float>(
+    const float *data, const std::vector<int> &shape, PlaceType place,
+    DataLayout layout);
+template PD_INFER_DECL void Tensor::ShareExternalData<int64_t>(
+    const int64_t *data, const std::vector<int> &shape, PlaceType place,
+    DataLayout layout);
+template PD_INFER_DECL void Tensor::ShareExternalData<int32_t>(
+    const int32_t *data, const std::vector<int> &shape, PlaceType place,
+    DataLayout layout);
+template PD_INFER_DECL void Tensor::ShareExternalData<uint8_t>(
+    const uint8_t *data, const std::vector<int> &shape, PlaceType place,
+    DataLayout layout);
+template PD_INFER_DECL void Tensor::ShareExternalData<int8_t>(
+    const int8_t *data, const std::vector<int> &shape, PlaceType place,
+    DataLayout layout);
+template PD_INFER_DECL void Tensor::ShareExternalData<float16>(
+    const float16 *data, const std::vector<int> &shape, PlaceType place,
+    DataLayout layout);
+
 template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
 template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data) const;
 template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index 81eecbb2c1480499b81556c48d021a8ff8929899..5a98d109aed79cc5bcefdc01b47a166bdf9c01d9 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -47,6 +47,8 @@ enum DataType {
 
 enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU };
 
+enum class DataLayout { kUNK = -1, kAny, kNHWC, kNCHW };
+
 /// \brief Represents an n-dimensional array of values.
 /// The Tensor is used to store the input or output of the network.
 /// Zero copy means that the tensor supports direct copy of host or device data
@@ -92,6 +94,17 @@ class PD_INFER_DECL Tensor {
   template <typename T>
   void CopyFromCpu(const T* data);
 
+  /// \brief Share the external data buffer with this tensor without copying.
+  /// It is usually used to set the tensor data.
+  /// \param data Pointer to the external data that the tensor will share.
+  /// \param shape The shape of the data.
+  /// \param place The place (device) where the data resides.
+  /// \param layout The layout of the data. Only NCHW is supported now.
+  template <typename T>
+  void ShareExternalData(const T* data, const std::vector<int>& shape,
+                         PlaceType place,
+                         DataLayout layout = DataLayout::kNCHW);
+
   /// \brief Experimental interface.
   /// It's usually used to set the input tensor data with Strings data type.
   /// \param data The pointer of the data, from which the tensor will copy.
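
For reference, a minimal usage sketch of the new API (not part of the diff). The model path, the tensor name "x", and the input shape are placeholders for illustration; the public header name follows the usual Paddle Inference demos. Note that ShareExternalData only borrows the buffer, so it must stay alive until Run() has finished:

  #include <cuda_runtime.h>
  #include <vector>
  #include "paddle_inference_api.h"

  int main() {
    paddle_infer::Config config;
    config.SetModel("./model_dir");  // placeholder model path
    config.EnableUseGpu(100, 0);
    auto predictor = paddle_infer::CreatePredictor(config);

    // Stage the input on the device once; the tensor borrows the buffer
    // instead of copying it (zero copy).
    std::vector<float> host(1 * 3 * 224 * 224, 0.f);
    float* device_buf = nullptr;
    cudaMalloc(reinterpret_cast<void**>(&device_buf),
               host.size() * sizeof(float));
    cudaMemcpy(device_buf, host.data(), host.size() * sizeof(float),
               cudaMemcpyHostToDevice);

    auto input = predictor->GetInputHandle("x");  // placeholder name
    input->ShareExternalData(device_buf, {1, 3, 224, 224},
                             paddle_infer::PlaceType::kGPU);

    predictor->Run();

    // The tensor only borrowed the buffer, so free it after Run().
    cudaFree(device_buf);
    return 0;
  }

Compared with CopyFromCpu, this skips one host-to-device copy per inference when the data is already resident on the GPU.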