From ac33c0caa503970b21c9dc58f7bee88231f9eff7 Mon Sep 17 00:00:00 2001
From: Shang Zhizhou
Date: Thu, 26 Aug 2021 14:59:36 +0800
Subject: [PATCH] Add copy from tensor (#34406)

* add api
* temp save
* revert
* copytocpu async ok
* fix style
* copy sync ok
* fix compile error
* fix compile error
* api done
* update python async api
* fix compile
* remove async python api; add c++ async unittest
* remove python async api
* update unittest
* update unittest
* add C++ unittest for copytensor
* add unittest
* update namespace utils to class TensorUtils
* add unittest
* update unittest
* update unittest
* update code style
* update code style
* update unittest
---
 cmake/configure.cmake                              |   4 +
 paddle/fluid/inference/api/CMakeLists.txt          |   5 +-
 .../inference/api/details/zero_copy_tensor.cc      |  73 +++-
 .../inference/api/paddle_infer_contrib.cc          | 190 ++++++++++
 .../inference/api/paddle_infer_contrib.h           |  40 +++
 paddle/fluid/inference/api/paddle_tensor.h         |  39 ++-
 .../fluid/inference/tests/api/CMakeLists.txt       |   5 +
 .../paddle_infer_api_copy_tensor_tester.cc         | 329 ++++++++++++++++++
 paddle/fluid/pybind/inference_api.cc               |   8 +
 python/paddle/inference/contrib/__init__.py        |  13 +
 .../inference/contrib/utils/__init__.py            |  15 +
 11 files changed, 710 insertions(+), 11 deletions(-)
 create mode 100644 paddle/fluid/inference/api/paddle_infer_contrib.cc
 create mode 100644 paddle/fluid/inference/api/paddle_infer_contrib.h
 create mode 100644 paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc
 create mode 100644 python/paddle/inference/contrib/__init__.py
 create mode 100644 python/paddle/inference/contrib/utils/__init__.py

diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 458ab992c2..3a7f269eaa 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -20,6 +20,10 @@ if(WITH_TESTING)
   add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
 
+if(WITH_INFERENCE_API_TEST)
+  add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
+endif(WITH_INFERENCE_API_TEST)
+
 if(NOT WITH_PROFILER)
   add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 9e49dea9e6..3bf5cc262e 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -28,14 +28,15 @@ if(WITH_MKLDNN)
 endif()
 
 cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer)
+cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 
 if(WITH_CRYPTO)
   cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-    analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
+    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
 else()
   cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-    analysis_config zero_copy_tensor trainer_desc_proto custom_operator)
+    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
 endif()
 
 if(WIN32)
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index ff167aa7cf..5cb8725b4d 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -121,6 +121,8 @@ DataType Tensor::type() const {
   return DataType::FLOAT32;
 }
 
+PlaceType Tensor::place() const { return place_; }
+
 template <typename T>
 void Tensor::CopyFromCpu(const T *data) {
   EAGER_GET_TENSOR;
@@ -185,7 +187,8 @@ void Tensor::CopyFromCpu(const T *data) {
 }
 
 template <typename T>
-void Tensor::CopyToCpu(T *data) {
+void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
+                           void *cb_params) const {
   EAGER_GET_TENSOR;
   auto ele_num = tensor->numel();
   auto *t_data = tensor->data<T>();
@@ -222,7 +225,16 @@ void Tensor::CopyToCpu(T *data) {
 #ifdef PADDLE_WITH_HIP
     hipStreamSynchronize(dev_ctx->stream());
 #else
-    cudaStreamSynchronize(dev_ctx->stream());
+    // async, return stream
+    if (nullptr != exec_stream) {
+      *(static_cast<cudaStream_t *>(exec_stream)) = dev_ctx->stream();
+      // async with callback
+    } else if (cb) {
+      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
+      // sync
+    } else {
+      cudaStreamSynchronize(dev_ctx->stream());
+    }
 #endif
 #else
     PADDLE_THROW(paddle::platform::errors::Unavailable(
@@ -261,6 +273,22 @@ void Tensor::CopyToCpu(T *data) {
         "The analysis predictor supports CPU, GPU, NPU and XPU now."));
   }
 }
+
+template <typename T>
+void Tensor::CopyToCpu(T *data) const {
+  CopyToCpuImpl(data, nullptr, nullptr, nullptr);
+}
+
+template <typename T>
+void Tensor::CopyToCpuAsync(T *data, void *exec_stream) const {
+  CopyToCpuImpl(data, exec_stream, nullptr, nullptr);
+}
+
+template <typename T>
+void Tensor::CopyToCpuAsync(T *data, CallbackFunc cb, void *cb_params) const {
+  CopyToCpuImpl(data, nullptr, cb, cb_params);
+}
+
 template PD_INFER_DECL void Tensor::CopyFromCpu<float>(const float *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int64_t>(const int64_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
@@ -268,12 +296,38 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
 
-template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;
+
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
+    float *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
+    int64_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
+    int32_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
+    uint8_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
+    int8_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
+    float16 *data, void *exec_stream) const;
+
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
+    float *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
+    int64_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
+    int32_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
+    uint8_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
+    int8_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
+    float16 *data, CallbackFunc cb, void *cb_params) const;
 
 template PD_INFER_DECL float *Tensor::data<float>(PlaceType *place,
                                                   int *size) const;
@@ -285,12 +339,15 @@ template PD_INFER_DECL uint8_t *Tensor::data<uint8_t>(PlaceType *place,
                                                       int *size) const;
 template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
                                                     int *size) const;
+template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
+                                                      int *size) const;
 
 template PD_INFER_DECL float *Tensor::mutable_data<float>(PlaceType place);
 template PD_INFER_DECL int64_t *Tensor::mutable_data<int64_t>(PlaceType place);
 template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
 template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
 template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
+template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
 
 Tensor::Tensor(void *scope) : scope_{scope} {
   PADDLE_ENFORCE_NOT_NULL(scope_,
diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc
new file mode 100644
index 0000000000..aad1c3fa6f
--- /dev/null
+++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc
@@ -0,0 +1,190 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace paddle_infer {
+namespace contrib {
+
+using paddle::PaddleDType;
+
+void* TensorUtils::CudaMallocPinnedMemory(size_t size) {
+#if defined(PADDLE_WITH_CUDA)
+  void* ptr = nullptr;
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&ptr, size));
+  return ptr;
+#else
+  return nullptr;
+#endif
+}
+
+void TensorUtils::CudaFreePinnedMemory(void* ptr) {
+#if defined(PADDLE_WITH_CUDA)
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr));
+#endif
+}
+
+void TensorUtils::CopyTensorImpl(Tensor* p_dst, const Tensor& src,
+                                 void* exec_stream, CallbackFunc cb,
+                                 void* cb_params) {
+  Tensor& dst = *p_dst;
+  dst.Reshape(src.shape());
+  PADDLE_ENFORCE(
+      src.place() == PlaceType::kCPU || src.place() == PlaceType::kGPU,
+      paddle::platform::errors::InvalidArgument(
+          "CopyTensor only supports PlaceType kCPU/kGPU now."));
+  PADDLE_ENFORCE(
+      dst.place() == PlaceType::kCPU || dst.place() == PlaceType::kGPU,
+      paddle::platform::errors::InvalidArgument(
+          "CopyTensor only supports PlaceType kCPU/kGPU now."));
+  // copy to cpu, gpu => cpu or cpu => cpu
+  if (dst.place() == PlaceType::kCPU) {
+    switch (src.type()) {
+      case PaddleDType::INT32:
+        src.CopyToCpuImpl(dst.mutable_data<int32_t>(PlaceType::kCPU),
+                          exec_stream, cb, cb_params);
+        break;
+      case PaddleDType::INT64:
+        src.CopyToCpuImpl(dst.mutable_data<int64_t>(PlaceType::kCPU),
+                          exec_stream, cb, cb_params);
+        break;
+      case PaddleDType::FLOAT32:
+        src.CopyToCpuImpl(dst.mutable_data<float>(PlaceType::kCPU), exec_stream,
+                          cb, cb_params);
+        break;
+      case PaddleDType::UINT8:
+        src.CopyToCpuImpl(dst.mutable_data<uint8_t>(PlaceType::kCPU),
+                          exec_stream, cb, cb_params);
+        break;
+      case PaddleDType::INT8:
+        src.CopyToCpuImpl(dst.mutable_data<int8_t>(PlaceType::kCPU),
+                          exec_stream, cb, cb_params);
+        break;
+      case PaddleDType::FLOAT16:
+        src.CopyToCpuImpl(
+            dst.mutable_data<paddle::platform::float16>(PlaceType::kCPU),
+            exec_stream, cb, cb_params);
+        break;
+      default:
+        PADDLE_THROW(paddle::platform::errors::Unimplemented(
+            "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
+            "FLOAT32 are supported in Tensor. Other types are not implemented."));
+    }
+    // gpu => gpu or cpu => gpu
+  } else {
+#if defined(PADDLE_WITH_CUDA)
+    void* dst_data = nullptr;
+    void* src_data = nullptr;
+    size_t data_len = 0;
+    int data_size = 0;
+    PlaceType src_place;
+    switch (src.type()) {
+      case PaddleDType::INT32:
+        dst_data =
+            static_cast<void*>(dst.mutable_data<int32_t>(PlaceType::kGPU));
+        src_data =
+            static_cast<void*>(src.data<int32_t>(&src_place, &data_size));
+        data_len = data_size * sizeof(int32_t);
+        break;
+      case PaddleDType::INT64:
+        dst_data =
+            static_cast<void*>(dst.mutable_data<int64_t>(PlaceType::kGPU));
+        src_data =
+            static_cast<void*>(src.data<int64_t>(&src_place, &data_size));
+        data_len = data_size * sizeof(int64_t);
+        break;
+      case PaddleDType::FLOAT32:
+        dst_data = static_cast<void*>(dst.mutable_data<float>(PlaceType::kGPU));
+        src_data = static_cast<void*>(src.data<float>(&src_place, &data_size));
+        data_len = data_size * sizeof(float);
+        break;
+      case PaddleDType::UINT8:
+        dst_data =
+            static_cast<void*>(dst.mutable_data<uint8_t>(PlaceType::kGPU));
+        src_data =
+            static_cast<void*>(src.data<uint8_t>(&src_place, &data_size));
+        data_len = data_size * sizeof(uint8_t);
+        break;
+      case PaddleDType::INT8:
+        dst_data =
+            static_cast<void*>(dst.mutable_data<int8_t>(PlaceType::kGPU));
+        src_data = static_cast<void*>(src.data<int8_t>(&src_place, &data_size));
+        data_len = data_size * sizeof(int8_t);
+        break;
+      case PaddleDType::FLOAT16:
+        dst_data = static_cast<void*>(
+            dst.mutable_data<paddle::platform::float16>(PlaceType::kGPU));
+        src_data = static_cast<void*>(
+            src.data<paddle::platform::float16>(&src_place, &data_size));
+        data_len = data_size * 2;
+        break;
+      default:
+        PADDLE_THROW(paddle::platform::errors::Unimplemented(
+            "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
+            "FLOAT32 are supported in Tensor. Other types are not implemented."));
+    }
+
+    paddle::platform::DeviceContextPool& pool =
+        paddle::platform::DeviceContextPool::Instance();
+    paddle::platform::CUDAPlace gpu_place(dst.device_);
+    auto* dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>(
+        pool.Get(gpu_place));
+
+    if (src.place() == PlaceType::kCPU) {
+      paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
+                           paddle::platform::CPUPlace(), src_data, data_len,
+                           dev_ctx->stream());
+    } else {
+      paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
+                           paddle::platform::CUDAPlace(), src_data, data_len,
+                           dev_ctx->stream());
+    }
+
+    if (nullptr != exec_stream) {
+      *(static_cast<cudaStream_t*>(exec_stream)) = dev_ctx->stream();
+    } else if (cb) {
+      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
+    } else {
+      cudaStreamSynchronize(dev_ctx->stream());
+    }
+#else
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "Can not copy tensor to GPU CUDA place because paddle is not compiled "
+        "with CUDA."));
+#endif
+  }
+  return;
+}
+
+void TensorUtils::CopyTensor(Tensor* p_dst, const Tensor& src) {
+  CopyTensorImpl(p_dst, src, nullptr, nullptr, nullptr);
+}
+
+void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
+                                  void* exec_stream) {
+  CopyTensorImpl(p_dst, src, exec_stream, nullptr, nullptr);
+}
+
+void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
+                                  CallbackFunc cb, void* cb_params) {
+  CopyTensorImpl(p_dst, src, nullptr, cb, cb_params);
+}
+
+}  // namespace contrib
+}  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.h b/paddle/fluid/inference/api/paddle_infer_contrib.h
new file mode 100644
index 0000000000..7d35567e43
--- /dev/null
+++ b/paddle/fluid/inference/api/paddle_infer_contrib.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace paddle_infer {
+namespace contrib {
+
+class TensorUtils {
+ public:
+  static void* CudaMallocPinnedMemory(size_t size);
+  static void CudaFreePinnedMemory(void* mem);
+
+  static void CopyTensor(Tensor* p_dst, const Tensor& src);
+  static void CopyTensorAsync(Tensor* p_dst, const Tensor& src,
+                              void* exec_stream);
+  static void CopyTensorAsync(Tensor* p_dst, const Tensor& src, CallbackFunc cb,
+                              void* cb_params);
+
+ private:
+  static void CopyTensorImpl(Tensor* p_dst, const Tensor& src,
+                             void* exec_stream, CallbackFunc cb,
+                             void* cb_params);
+};
+
+}  // namespace contrib
+}  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index 1f813d52ef..f6dce74c30 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -18,6 +18,16 @@
 namespace paddle_infer {
 
+typedef void (*CallbackFunc)(void*);
+
+#if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
+class InferApiTesterUtils;
+#endif
+
+namespace contrib {
+class TensorUtils;
+}
+
 /// \brief Paddle data type.
 enum DataType {
   FLOAT32,
@@ -72,7 +82,21 @@ class PD_INFER_DECL Tensor {
   /// It's usually used to get the output tensor data.
   /// \param[out] data The tensor will copy the data to the address.
   template <typename T>
-  void CopyToCpu(T* data);
+  void CopyToCpu(T* data) const;
+
+  /// \brief Copy the tensor data to the host memory asynchronously.
+  /// \param[out] data The tensor will copy the data to the address.
+  /// \param[out] exec_stream The tensor will execute the copy in this stream
+  /// (only a GPU CUDA stream is supported now).
+  template <typename T>
+  void CopyToCpuAsync(T* data, void* exec_stream) const;
+
+  /// \brief Copy the tensor data to the host memory asynchronously.
+  /// \param[out] data The tensor will copy the data to the address.
+  /// \param[out] cb Callback function cb(cb_params) will be executed on the
+  /// host after all currently enqueued items in the stream have completed.
+  template <typename T>
+  void CopyToCpuAsync(T* data, CallbackFunc cb, void* cb_params) const;
+
   /// \brief Return the shape of the Tensor.
   std::vector<int> shape() const;
 
@@ -92,12 +116,20 @@
   /// \return The data type of the tensor.
   DataType type() const;
 
+  /// \brief Return the place type of the tensor.
+  /// \return The place type of the tensor.
+  PlaceType place() const;
+
  protected:
   explicit Tensor(void* scope);
   void* FindTensor() const;
   void SetPlace(PlaceType place, int device = -1);
   void SetName(const std::string& name);
 
+  template <typename T>
+  void CopyToCpuImpl(T* data, void* stream = nullptr, CallbackFunc cb = nullptr,
+                     void* cb_params = nullptr) const;
+
   std::string name_;
   // The corresponding tensor pointer inside Paddle workspace is cached for
   // performance.
@@ -107,6 +139,11 @@ class PD_INFER_DECL Tensor {
   void* scope_{nullptr};
   PlaceType place_;
   int device_;
+
+  friend class paddle_infer::contrib::TensorUtils;
+#if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
+  friend class paddle_infer::InferApiTesterUtils;
+#endif
 };
 
 }  // namespace paddle_infer
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index f0eb0d1fa6..bb47558c69 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -682,6 +682,11 @@ if(WITH_GPU)
   inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc
      EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
      ARGS --infer_model=${RESNET50_MODEL_DIR})
+
+  inference_analysis_test(paddle_infer_api_copy_tensor_tester SRCS paddle_infer_api_copy_tensor_tester.cc
+     EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+     ARGS --infer_model=${RESNET50_MODEL_DIR})
+  set_tests_properties(paddle_infer_api_copy_tensor_tester PROPERTIES TIMEOUT 30)
 endif()
 
 if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc
new file mode 100644
index 0000000000..2be69781c4
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc
@@ -0,0 +1,329 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <cstring>
+#include <numeric>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
+#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace paddle_infer {
+
+class InferApiTesterUtils {
+ public:
+  static std::unique_ptr<Tensor> CreateInferTensorForTest(
+      const std::string &name, PlaceType place, void *p_scope) {
+    auto var = static_cast<paddle::framework::Scope *>(p_scope)->Var(name);
+    var->GetMutable<paddle::framework::LoDTensor>();
+    std::unique_ptr<Tensor> res(new Tensor(p_scope));
+    res->input_or_output_ = true;
+    res->SetName(name);
+    res->SetPlace(place, 0 /*device id*/);
+    return res;
+  }
+};
+
+TEST(Tensor, copy_to_cpu_async_stream) {
+  LOG(INFO) << GetVersion();
+  UpdateDllFlag("conv_workspace_size_limit", "4000");
+  std::string model_dir = FLAGS_infer_model + "/model";
+  Config config;
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.EnableUseGpu(100, 0);
+
+  auto predictor = CreatePredictor(config);
+  auto pred_clone = predictor->Clone();
+
+  std::vector<int> in_shape = {1, 3, 318, 318};
+  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
+                               [](int &a, int &b) { return a * b; });
+
+  std::vector<float> input(in_num, 1.0);
+
+  const auto &input_names = predictor->GetInputNames();
+  auto input_tensor = predictor->GetInputHandle(input_names[0]);
+
+  input_tensor->Reshape(in_shape);
+  input_tensor->CopyFromCpu(input.data());
+
+  predictor->Run();
+
+  const auto &output_names = predictor->GetOutputNames();
+  auto output_tensor = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_tensor->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+
+  float *out_data = static_cast<float *>(
+      contrib::TensorUtils::CudaMallocPinnedMemory(sizeof(float) * out_num));
+  memset(out_data, 0, sizeof(float) * out_num);
+  std::vector<float> correct_out_data = {
+      127.78,   1.07353,  -229.42, 1127.28, -177.365,
+      -292.412, -271.614, 466.054, 540.436, -214.223,
+  };
+
+  for (int i = 0; i < 100; i++) {
+    predictor->Run();
+  }
+
+  cudaStream_t stream;
+  output_tensor->CopyToCpuAsync(out_data, static_cast<void *>(&stream));
+
+  // sync
+  cudaStreamSynchronize(stream);
+
+  for (int i = 0; i < 10; i++) {
+    EXPECT_NEAR(out_data[i] / correct_out_data[i], 1.0, 1e-3);
+  }
+  contrib::TensorUtils::CudaFreePinnedMemory(static_cast<void *>(out_data));
+}
+
+TEST(Tensor, copy_to_cpu_async_callback) {
+  LOG(INFO) << GetVersion();
+  UpdateDllFlag("conv_workspace_size_limit", "4000");
+  std::string model_dir = FLAGS_infer_model + "/model";
+  Config config;
+  config.SetModel(model_dir + "/model", model_dir + "/params");
+  config.EnableUseGpu(100, 0);
+
+  auto predictor = CreatePredictor(config);
+  auto pred_clone = predictor->Clone();
+
+  std::vector<int> in_shape = {1, 3, 318, 318};
+  int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
+                               [](int &a, int &b) { return a * b; });
+
+  std::vector<float> input(in_num, 1.0);
+
+  const auto &input_names = predictor->GetInputNames();
+  auto input_tensor = predictor->GetInputHandle(input_names[0]);
+
+  input_tensor->Reshape(in_shape);
+  input_tensor->CopyFromCpu(input.data());
+
+  predictor->Run();
+
+  const auto &output_names = predictor->GetOutputNames();
+  auto output_tensor = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_tensor->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+
+  float *out_data = static_cast<float *>(
+      contrib::TensorUtils::CudaMallocPinnedMemory(sizeof(float) * out_num));
+  memset(out_data, 0, sizeof(float) * out_num);
+
+  for (int i = 0; i < 100; i++) {
+    predictor->Run();
+  }
+
+  output_tensor->CopyToCpuAsync(
+      out_data,
+      [](void *cb_params) {
+        float *data = static_cast<float *>(cb_params);
+        std::vector<float> correct_out_data = {
+            127.78,   1.07353,  -229.42, 1127.28, -177.365,
+            -292.412, -271.614, 466.054, 540.436, -214.223,
+        };
+        for (int i = 0; i < 10; i++) {
+          EXPECT_NEAR(data[i] / correct_out_data[i], 1.0, 1e-3);
+        }
+      },
+      static_cast<void *>(out_data));
+
+  cudaDeviceSynchronize();
+  contrib::TensorUtils::CudaFreePinnedMemory(static_cast<void *>(out_data));
+}
+
+template <typename T>
+static void test_copy_tensor(PlaceType src_place, PlaceType dst_place) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", src_place, static_cast<void *>(&scope));
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", dst_place, static_cast<void *>(&scope));
+  std::vector<T> data_src(6, 1);
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+
+  std::vector<T> data_dst(4, 2);
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+
+  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
+
+  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+
+  std::vector<T> data_check(6, 3);
+  tensor_dst->CopyToCpu(static_cast<T *>(data_check.data()));
+
+  for (int i = 0; i < 6; i++) {
+    EXPECT_NEAR(data_check[i], 1, 1e-5);
+  }
+}
+
+TEST(CopyTensor, float32) {
+  test_copy_tensor<float>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<float>(PlaceType::kCPU, PlaceType::kGPU);
+  test_copy_tensor<float>(PlaceType::kGPU, PlaceType::kGPU);
+}
+
+TEST(CopyTensor, int32) {
+  test_copy_tensor<int32_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<int32_t>(PlaceType::kGPU, PlaceType::kGPU);
+}
+
+TEST(CopyTensor, int64) {
+  test_copy_tensor<int64_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<int64_t>(PlaceType::kGPU, PlaceType::kGPU);
+}
+
+TEST(CopyTensor, int8) {
+  test_copy_tensor<int8_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<int8_t>(PlaceType::kGPU, PlaceType::kGPU);
+}
+
+TEST(CopyTensor, uint8) {
+  test_copy_tensor<uint8_t>(PlaceType::kCPU, PlaceType::kCPU);
+  test_copy_tensor<uint8_t>(PlaceType::kGPU, PlaceType::kGPU);
+}
+
+TEST(CopyTensor, float16) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_src", PlaceType::kCPU, static_cast<void *>(&scope));
+  auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
+      "tensor_dst", PlaceType::kCPU, static_cast<void *>(&scope));
+
+  using paddle::platform::float16;
+  std::vector<float16> data_src(6, float16(1.0));
+  tensor_src->Reshape({2, 3});
+  tensor_src->CopyFromCpu(data_src.data());
+
+  std::vector<float16> data_dst(4, float16(2.0));
+  tensor_dst->Reshape({2, 2});
+  tensor_dst->CopyFromCpu(data_dst.data());
+
+  paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
+
+  EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
+  EXPECT_EQ(tensor_dst->shape()[0], 2);
+  EXPECT_EQ(tensor_dst->shape()[1], 3);
+
+  std::vector<float16> data_check(6, float16(1.0));
+  tensor_dst->CopyToCpu(data_check.data());
+
+  for (int i = 0; i < 6; i++) {
+    EXPECT_TRUE(data_check[i] == float16(1.0));
+  }
+}
+
+TEST(CopyTensor, float16_gpu) {
+  paddle::framework::Scope scope;
+  auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
"tensor_src", PlaceType::kGPU, static_cast(&scope)); + auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest( + "tensor_dst", PlaceType::kGPU, static_cast(&scope)); + + using paddle::platform::float16; + std::vector data_src(6, float16(1.0)); + tensor_src->Reshape({2, 3}); + tensor_src->CopyFromCpu(data_src.data()); + + std::vector data_dst(4, float16(2.0)); + tensor_dst->Reshape({2, 2}); + tensor_dst->CopyFromCpu(data_dst.data()); + + paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src); + + EXPECT_EQ(tensor_dst->shape().size(), (size_t)2); + EXPECT_EQ(tensor_dst->shape()[0], 2); + EXPECT_EQ(tensor_dst->shape()[1], 3); + + std::vector data_check(6, float16(1.0)); + tensor_dst->CopyToCpu(data_check.data()); + + for (int i = 0; i < 6; i++) { + EXPECT_TRUE(data_check[i] == float16(1.0)); + } +} + +TEST(CopyTensor, async_stream) { + paddle::framework::Scope scope; + auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest( + "tensor_src", PlaceType::kGPU, static_cast(&scope)); + auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest( + "tensor_dst", PlaceType::kGPU, static_cast(&scope)); + + std::vector data_src(6, 1.0); + tensor_src->Reshape({2, 3}); + tensor_src->CopyFromCpu(data_src.data()); + + std::vector data_dst(4, 2.0); + tensor_dst->Reshape({2, 2}); + tensor_dst->CopyFromCpu(data_dst.data()); + + cudaStream_t stream; + paddle_infer::contrib::TensorUtils::CopyTensorAsync( + tensor_dst.get(), *tensor_src, static_cast(&stream)); + + EXPECT_EQ(tensor_dst->shape().size(), (size_t)2); + EXPECT_EQ(tensor_dst->shape()[0], 2); + EXPECT_EQ(tensor_dst->shape()[1], 3); + + cudaStreamSynchronize(stream); + + std::vector data_check(6, 1.0); + tensor_dst->CopyToCpu(data_check.data()); + + for (int i = 0; i < 6; i++) { + EXPECT_NEAR(data_check[i], static_cast(1.0), 1e-5); + } +} + +TEST(CopyTensor, async_callback) { + paddle::framework::Scope scope; + auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest( + "tensor_src", PlaceType::kCPU, static_cast(&scope)); + auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest( + "tensor_dst", PlaceType::kGPU, static_cast(&scope)); + + std::vector data_src(6, 1.0); + tensor_src->Reshape({2, 3}); + tensor_src->CopyFromCpu(data_src.data()); + + std::vector data_dst(4, 2.0); + tensor_dst->Reshape({2, 2}); + tensor_dst->CopyFromCpu(data_dst.data()); + + paddle_infer::contrib::TensorUtils::CopyTensorAsync( + tensor_dst.get(), *tensor_src, + [](void *cb_params) { + Tensor *tensor = static_cast(cb_params); + EXPECT_EQ(tensor->shape().size(), (size_t)2); + EXPECT_EQ(tensor->shape()[0], 2); + EXPECT_EQ(tensor->shape()[1], 3); + }, + static_cast(&(*tensor_dst))); + + cudaDeviceSynchronize(); +} + +} // namespace paddle_infer diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index b7cf907b5d..6b3c150a0b 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -28,6 +28,7 @@ #include #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_infer_contrib.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/inference/utils/io_utils.h" @@ -286,6 +287,12 @@ py::bytes SerializePDTensorToBytes(PaddleTensor &tensor) { // NOLINT paddle::inference::SerializePDTensorToStream(&ss, 
   return static_cast<py::bytes>(ss.str());
 }
+
+void CopyPaddleInferTensor(paddle_infer::Tensor &dst,
+                           const paddle_infer::Tensor &src) {
+  return paddle_infer::contrib::TensorUtils::CopyTensor(&dst, src);
+}
+
 }  // namespace
 
 void BindInferenceApi(py::module *m) {
@@ -317,6 +324,7 @@ void BindInferenceApi(py::module *m) {
         new paddle_infer::Predictor(config));
     return std::move(pred);
   });
+  m->def("copy_tensor", &CopyPaddleInferTensor);
   m->def("paddle_dtype_size", &paddle::PaddleDtypeSize);
   m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes);
   m->def("get_version", &paddle_infer::GetVersion);
diff --git a/python/paddle/inference/contrib/__init__.py b/python/paddle/inference/contrib/__init__.py
new file mode 100644
index 0000000000..6f0ea85344
--- /dev/null
+++ b/python/paddle/inference/contrib/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/inference/contrib/utils/__init__.py b/python/paddle/inference/contrib/utils/__init__.py
new file mode 100644
index 0000000000..5a52525049
--- /dev/null
+++ b/python/paddle/inference/contrib/utils/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....fluid.core import copy_tensor  # noqa: F401
-- 
GitLab
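
For reference, a minimal usage sketch of the C++ API added by this patch, assembled from the unit tests above; it is an illustration, not part of the commit. The model path is a placeholder and a CUDA-enabled Paddle Inference build is assumed.

#include <cuda_runtime.h>
#include <vector>
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  // Hypothetical model location; replace with a real inference model.
  paddle_infer::Config config;
  config.SetModel("./model/model", "./model/params");
  config.EnableUseGpu(100, 0);
  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed an all-ones input, mirroring the tests in this patch.
  auto input = predictor->GetInputHandle(predictor->GetInputNames()[0]);
  std::vector<int> shape = {1, 3, 318, 318};
  std::vector<float> data(1 * 3 * 318 * 318, 1.0f);
  input->Reshape(shape);
  input->CopyFromCpu(data.data());
  predictor->Run();

  // Copy the output asynchronously into pinned host memory; the CUDA stream
  // used for the copy is returned through the void* argument and must be
  // synchronized before the host buffer is read.
  auto output = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
  int out_num = 1;
  for (int d : output->shape()) out_num *= d;
  float *out = static_cast<float *>(
      paddle_infer::contrib::TensorUtils::CudaMallocPinnedMemory(
          sizeof(float) * out_num));
  cudaStream_t stream;
  output->CopyToCpuAsync(out, static_cast<void *>(&stream));
  cudaStreamSynchronize(stream);

  paddle_infer::contrib::TensorUtils::CudaFreePinnedMemory(
      static_cast<void *>(out));
  return 0;
}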