未验证 提交 ac33c0ca 编写于 作者: S Shang Zhizhou 提交者: GitHub

Add copy from tensor (#34406)

* add api

* temp save

* revert

* copytocpu async ok

* fix style

* copy sync ok

* fix compile error

* fix compile error

* api done

* update python async api

* fix compile

* remove async python api; add c++ async unittest

* remove python async api

* update unittest

* update unittest

* add C++ unittest for copytensor

* add unittest

* update namespace utils to class TensorUtils

* add unittest

* update unittest

* update unittest

* update code style

* update code style

* update unittest
上级 223c01fd
......@@ -20,6 +20,10 @@ if(WITH_TESTING)
add_definitions(-DPADDLE_WITH_TESTING)
endif(WITH_TESTING)
if(WITH_INFERENCE_API_TEST)
add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
endif(WITH_INFERENCE_API_TEST)
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
......
......@@ -28,14 +28,15 @@ if(WITH_MKLDNN)
endif()
cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer)
cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
if(WITH_CRYPTO)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
else()
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
analysis_config zero_copy_tensor trainer_desc_proto custom_operator)
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
endif()
if(WIN32)
......
......@@ -121,6 +121,8 @@ DataType Tensor::type() const {
return DataType::FLOAT32;
}
PlaceType Tensor::place() const { return place_; }
template <typename T>
void Tensor::CopyFromCpu(const T *data) {
EAGER_GET_TENSOR;
......@@ -185,7 +187,8 @@ void Tensor::CopyFromCpu(const T *data) {
}
template <typename T>
void Tensor::CopyToCpu(T *data) {
void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
void *cb_params) const {
EAGER_GET_TENSOR;
auto ele_num = tensor->numel();
auto *t_data = tensor->data<T>();
......@@ -222,7 +225,16 @@ void Tensor::CopyToCpu(T *data) {
#ifdef PADDLE_WITH_HIP
hipStreamSynchronize(dev_ctx->stream());
#else
// async, return stream
if (nullptr != exec_stream) {
*(static_cast<cudaStream_t *>(exec_stream)) = dev_ctx->stream();
// async with callback
} else if (cb) {
cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
// sync
} else {
cudaStreamSynchronize(dev_ctx->stream());
}
#endif
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
......@@ -261,6 +273,22 @@ void Tensor::CopyToCpu(T *data) {
"The analysis predictor supports CPU, GPU, NPU and XPU now."));
}
}
template <typename T>
void Tensor::CopyToCpu(T *data) const {
CopyToCpuImpl<T>(data, nullptr, nullptr, nullptr);
}
template <typename T>
void Tensor::CopyToCpuAsync(T *data, void *exec_stream) const {
CopyToCpuImpl<T>(data, exec_stream, nullptr, nullptr);
}
template <typename T>
void Tensor::CopyToCpuAsync(T *data, CallbackFunc cb, void *cb_params) const {
CopyToCpuImpl<T>(data, nullptr, cb, cb_params);
}
template PD_INFER_DECL void Tensor::CopyFromCpu<float>(const float *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int64_t>(const int64_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
......@@ -268,12 +296,38 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data);
template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data);
template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
float *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
int64_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
int32_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
uint8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
float *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
int64_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
int32_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
uint8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL float *Tensor::data<float>(PlaceType *place,
int *size) const;
......@@ -285,12 +339,15 @@ template PD_INFER_DECL uint8_t *Tensor::data<uint8_t>(PlaceType *place,
int *size) const;
template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
int *size) const;
template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
int *size) const;
template PD_INFER_DECL float *Tensor::mutable_data<float>(PlaceType place);
template PD_INFER_DECL int64_t *Tensor::mutable_data<int64_t>(PlaceType place);
template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
Tensor::Tensor(void *scope) : scope_{scope} {
PADDLE_ENFORCE_NOT_NULL(scope_,
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle_infer {
namespace contrib {
using paddle::PaddleDType;
void* TensorUtils::CudaMallocPinnedMemory(size_t size) {
#if defined(PADDLE_WITH_CUDA)
void* ptr = nullptr;
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&ptr, size));
return ptr;
#else
return nullptr;
#endif
}
void TensorUtils::CudaFreePinnedMemory(void* ptr) {
#if defined(PADDLE_WITH_CUDA)
PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr));
#endif
}
void TensorUtils::CopyTensorImpl(Tensor* p_dst, const Tensor& src,
void* exec_stream, CallbackFunc cb,
void* cb_params) {
Tensor& dst = *p_dst;
dst.Reshape(src.shape());
PADDLE_ENFORCE(
src.place() == PlaceType::kCPU || src.place() == PlaceType::kGPU,
paddle::platform::errors::InvalidArgument(
"CopyTensor only support PlaceType kCPU/kGPU now."));
PADDLE_ENFORCE(
dst.place() == PlaceType::kCPU || dst.place() == PlaceType::kGPU,
paddle::platform::errors::InvalidArgument(
"CopyTensor only support PlaceType kCPU/kGPU now."));
// copy to cpu, gpu => cpu or cpu => cpu
if (dst.place() == PlaceType::kCPU) {
switch (src.type()) {
case PaddleDType::INT32:
src.CopyToCpuImpl(dst.mutable_data<int32_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::INT64:
src.CopyToCpuImpl(dst.mutable_data<int64_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::FLOAT32:
src.CopyToCpuImpl(dst.mutable_data<float>(PlaceType::kCPU), exec_stream,
cb, cb_params);
break;
case PaddleDType::UINT8:
src.CopyToCpuImpl(dst.mutable_data<uint8_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::INT8:
src.CopyToCpuImpl(dst.mutable_data<int8_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::FLOAT16:
src.CopyToCpuImpl(
dst.mutable_data<paddle::platform::float16>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, FLOAT16 and "
"FLOAT32 is supported in Tensor. Others not implements"));
}
// gpu => gpu or cpu => gpu
} else {
#if defined(PADDLE_WITH_CUDA)
void* dst_data = nullptr;
void* src_data = nullptr;
size_t data_len = 0;
int data_size = 0;
PlaceType src_place;
switch (src.type()) {
case PaddleDType::INT32:
dst_data =
static_cast<void*>(dst.mutable_data<int32_t>(PlaceType::kGPU));
src_data =
static_cast<void*>(src.data<int32_t>(&src_place, &data_size));
data_len = data_size * sizeof(int32_t);
break;
case PaddleDType::INT64:
dst_data =
static_cast<void*>(dst.mutable_data<int64_t>(PlaceType::kGPU));
src_data =
static_cast<void*>(src.data<int64_t>(&src_place, &data_size));
data_len = data_size * sizeof(int64_t);
break;
case PaddleDType::FLOAT32:
dst_data = static_cast<void*>(dst.mutable_data<float>(PlaceType::kGPU));
src_data = static_cast<void*>(src.data<float>(&src_place, &data_size));
data_len = data_size * sizeof(float);
break;
case PaddleDType::UINT8:
dst_data =
static_cast<void*>(dst.mutable_data<uint8_t>(PlaceType::kGPU));
src_data =
static_cast<void*>(src.data<uint8_t>(&src_place, &data_size));
data_len = data_size * sizeof(uint8_t);
break;
case PaddleDType::INT8:
dst_data =
static_cast<void*>(dst.mutable_data<int8_t>(PlaceType::kGPU));
src_data = static_cast<void*>(src.data<int8_t>(&src_place, &data_size));
data_len = data_size * sizeof(int8_t);
break;
case PaddleDType::FLOAT16:
dst_data = static_cast<void*>(
dst.mutable_data<paddle::platform::float16>(PlaceType::kGPU));
src_data = static_cast<void*>(
src.data<paddle::platform::float16>(&src_place, &data_size));
data_len = data_size * 2;
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, FLOAT16 and "
"FLOAT32 is supported in Tensor. Others not implements"));
}
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::CUDAPlace gpu_place(dst.device_);
auto* dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>(
pool.Get(gpu_place));
if (src.place() == PlaceType::kCPU) {
paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
paddle::platform::CPUPlace(), src_data, data_len,
dev_ctx->stream());
} else {
paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
paddle::platform::CUDAPlace(), src_data, data_len,
dev_ctx->stream());
}
if (nullptr != exec_stream) {
*(static_cast<cudaStream_t*>(exec_stream)) = dev_ctx->stream();
} else if (cb) {
cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
} else {
cudaStreamSynchronize(dev_ctx->stream());
}
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not copy tensor to GPU CUDA place because paddle is not compiled "
"with CUDA."));
#endif
}
return;
}
void TensorUtils::CopyTensor(Tensor* p_dst, const Tensor& src) {
CopyTensorImpl(p_dst, src, nullptr, nullptr, nullptr);
}
void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
void* exec_stream) {
CopyTensorImpl(p_dst, src, exec_stream, nullptr, nullptr);
}
void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
CallbackFunc cb, void* cb_params) {
CopyTensorImpl(p_dst, src, nullptr, cb, cb_params);
}
} // namespace contrib
} // namespace paddle_infer
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle_infer {
namespace contrib {
class TensorUtils {
public:
static void* CudaMallocPinnedMemory(size_t size);
static void CudaFreePinnedMemory(void* mem);
static void CopyTensor(Tensor* p_dst, const Tensor& src);
static void CopyTensorAsync(Tensor* p_dst, const Tensor& src,
void* exec_stream);
static void CopyTensorAsync(Tensor* p_dst, const Tensor& src, CallbackFunc cb,
void* cb_params);
private:
static void CopyTensorImpl(Tensor* p_dst, const Tensor& src,
void* exec_stream, CallbackFunc cb,
void* cb_params);
};
} // namespace contrib
} // namespace paddle_infer
......@@ -18,6 +18,16 @@
namespace paddle_infer {
typedef void (*CallbackFunc)(void*);
#if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
class InferApiTesterUtils;
#endif
namespace contrib {
class TensorUtils;
}
/// \brief Paddle data type.
enum DataType {
FLOAT32,
......@@ -72,7 +82,21 @@ class PD_INFER_DECL Tensor {
/// It's usually used to get the output tensor data.
/// \param[out] data The tensor will copy the data to the address.
template <typename T>
void CopyToCpu(T* data);
void CopyToCpu(T* data) const;
/// \brief Copy the tensor data to the host memory asynchronously.
/// \param[out] data The tensor will copy the data to the address.
/// \param[out] exec_stream The tensor will excute copy in this stream(Only
/// GPU CUDA stream suppported now).
template <typename T>
void CopyToCpuAsync(T* data, void* exec_stream) const;
/// \brief Copy the tensor data to the host memory asynchronously.
/// \param[out] data The tensor will copy the data to the address.
/// \param[out] cb Callback function cb(cb_params) will be executed on the
/// host after all currently enqueued items in the stream have completed .
template <typename T>
void CopyToCpuAsync(T* data, CallbackFunc cb, void* cb_params) const;
/// \brief Return the shape of the Tensor.
std::vector<int> shape() const;
......@@ -92,12 +116,20 @@ class PD_INFER_DECL Tensor {
/// \return The data type of the tensor.
DataType type() const;
/// \brief Return the place type of the tensor.
/// \return The place type of the tensor.
PlaceType place() const;
protected:
explicit Tensor(void* scope);
void* FindTensor() const;
void SetPlace(PlaceType place, int device = -1);
void SetName(const std::string& name);
template <typename T>
void CopyToCpuImpl(T* data, void* stream = nullptr, CallbackFunc cb = nullptr,
void* cb_params = nullptr) const;
std::string name_;
// The corresponding tensor pointer inside Paddle workspace is cached for
// performance.
......@@ -107,6 +139,11 @@ class PD_INFER_DECL Tensor {
void* scope_{nullptr};
PlaceType place_;
int device_;
friend class paddle_infer::contrib::TensorUtils;
#if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
friend class paddle_infer::InferApiTesterUtils;
#endif
};
} // namespace paddle_infer
......@@ -682,6 +682,11 @@ if(WITH_GPU)
inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RESNET50_MODEL_DIR})
inference_analysis_test(paddle_infer_api_copy_tensor_tester SRCS paddle_infer_api_copy_tensor_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RESNET50_MODEL_DIR})
set_tests_properties(paddle_infer_api_copy_tensor_tester PROPERTIES TIMEOUT 30)
endif()
if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <cstring>
#include <numeric>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle_infer {
class InferApiTesterUtils {
public:
static std::unique_ptr<Tensor> CreateInferTensorForTest(
const std::string &name, PlaceType place, void *p_scope) {
auto var = static_cast<paddle::framework::Scope *>(p_scope)->Var(name);
var->GetMutable<paddle::framework::LoDTensor>();
std::unique_ptr<Tensor> res(new Tensor(p_scope));
res->input_or_output_ = true;
res->SetName(name);
res->SetPlace(place, 0 /*device id*/);
return res;
}
};
TEST(Tensor, copy_to_cpu_async_stream) {
LOG(INFO) << GetVersion();
UpdateDllFlag("conv_workspace_size_limit", "4000");
std::string model_dir = FLAGS_infer_model + "/model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableUseGpu(100, 0);
auto predictor = CreatePredictor(config);
auto pred_clone = predictor->Clone();
std::vector<int> in_shape = {1, 3, 318, 318};
int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
[](int &a, int &b) { return a * b; });
std::vector<float> input(in_num, 1.0);
const auto &input_names = predictor->GetInputNames();
auto input_tensor = predictor->GetInputHandle(input_names[0]);
input_tensor->Reshape(in_shape);
input_tensor->CopyFromCpu(input.data());
predictor->Run();
const auto &output_names = predictor->GetOutputNames();
auto output_tensor = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_tensor->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
float *out_data = static_cast<float *>(
contrib::TensorUtils::CudaMallocPinnedMemory(sizeof(float) * out_num));
memset(out_data, 0, sizeof(float) * out_num);
std::vector<float> correct_out_data = {
127.78, 1.07353, -229.42, 1127.28, -177.365,
-292.412, -271.614, 466.054, 540.436, -214.223,
};
for (int i = 0; i < 100; i++) {
predictor->Run();
}
cudaStream_t stream;
output_tensor->CopyToCpuAsync(out_data, static_cast<void *>(&stream));
// sync
cudaStreamSynchronize(stream);
for (int i = 0; i < 10; i++) {
EXPECT_NEAR(out_data[i] / correct_out_data[i], 1.0, 1e-3);
}
contrib::TensorUtils::CudaFreePinnedMemory(static_cast<void *>(out_data));
}
TEST(Tensor, copy_to_cpu_async_callback) {
LOG(INFO) << GetVersion();
UpdateDllFlag("conv_workspace_size_limit", "4000");
std::string model_dir = FLAGS_infer_model + "/model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableUseGpu(100, 0);
auto predictor = CreatePredictor(config);
auto pred_clone = predictor->Clone();
std::vector<int> in_shape = {1, 3, 318, 318};
int in_num = std::accumulate(in_shape.begin(), in_shape.end(), 1,
[](int &a, int &b) { return a * b; });
std::vector<float> input(in_num, 1.0);
const auto &input_names = predictor->GetInputNames();
auto input_tensor = predictor->GetInputHandle(input_names[0]);
input_tensor->Reshape(in_shape);
input_tensor->CopyFromCpu(input.data());
predictor->Run();
const auto &output_names = predictor->GetOutputNames();
auto output_tensor = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_tensor->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
float *out_data = static_cast<float *>(
contrib::TensorUtils::CudaMallocPinnedMemory(sizeof(float) * out_num));
memset(out_data, 0, sizeof(float) * out_num);
for (int i = 0; i < 100; i++) {
predictor->Run();
}
output_tensor->CopyToCpuAsync(
out_data,
[](void *cb_params) {
float *data = static_cast<float *>(cb_params);
std::vector<float> correct_out_data = {
127.78, 1.07353, -229.42, 1127.28, -177.365,
-292.412, -271.614, 466.054, 540.436, -214.223,
};
for (int i = 0; i < 10; i++) {
EXPECT_NEAR(data[i] / correct_out_data[i], 1.0, 1e-3);
}
},
static_cast<void *>(out_data));
cudaDeviceSynchronize();
contrib::TensorUtils::CudaFreePinnedMemory(static_cast<void *>(out_data));
}
template <class DTYPE>
static void test_copy_tensor(PlaceType src_place, PlaceType dst_place) {
paddle::framework::Scope scope;
auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
"tensor_src", src_place, static_cast<void *>(&scope));
auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
"tensor_dst", dst_place, static_cast<void *>(&scope));
std::vector<DTYPE> data_src(6, 1);
tensor_src->Reshape({2, 3});
tensor_src->CopyFromCpu(data_src.data());
std::vector<DTYPE> data_dst(4, 2);
tensor_dst->Reshape({2, 2});
tensor_dst->CopyFromCpu(data_dst.data());
paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
EXPECT_EQ(tensor_dst->shape()[0], 2);
EXPECT_EQ(tensor_dst->shape()[1], 3);
std::vector<DTYPE> data_check(6, 3);
tensor_dst->CopyToCpu<DTYPE>(static_cast<DTYPE *>(data_check.data()));
for (int i = 0; i < 6; i++) {
EXPECT_NEAR(data_check[i], 1, 1e-5);
}
}
TEST(CopyTensor, float32) {
test_copy_tensor<float>(PlaceType::kCPU, PlaceType::kCPU);
test_copy_tensor<float>(PlaceType::kCPU, PlaceType::kGPU);
test_copy_tensor<float>(PlaceType::kGPU, PlaceType::kGPU);
}
TEST(CopyTensor, int32) {
test_copy_tensor<int32_t>(PlaceType::kCPU, PlaceType::kCPU);
test_copy_tensor<int32_t>(PlaceType::kGPU, PlaceType::kGPU);
}
TEST(CopyTensor, int64) {
test_copy_tensor<int64_t>(PlaceType::kCPU, PlaceType::kCPU);
test_copy_tensor<int64_t>(PlaceType::kGPU, PlaceType::kGPU);
}
TEST(CopyTensor, int8) {
test_copy_tensor<int8_t>(PlaceType::kCPU, PlaceType::kCPU);
test_copy_tensor<int8_t>(PlaceType::kGPU, PlaceType::kGPU);
}
TEST(CopyTensor, uint8) {
test_copy_tensor<uint8_t>(PlaceType::kCPU, PlaceType::kCPU);
test_copy_tensor<uint8_t>(PlaceType::kGPU, PlaceType::kGPU);
}
TEST(CopyTensor, float16) {
paddle::framework::Scope scope;
auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
"tensor_src", PlaceType::kCPU, static_cast<void *>(&scope));
auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
"tensor_dst", PlaceType::kCPU, static_cast<void *>(&scope));
using paddle::platform::float16;
std::vector<float16> data_src(6, float16(1.0));
tensor_src->Reshape({2, 3});
tensor_src->CopyFromCpu(data_src.data());
std::vector<float16> data_dst(4, float16(2.0));
tensor_dst->Reshape({2, 2});
tensor_dst->CopyFromCpu(data_dst.data());
paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
EXPECT_EQ(tensor_dst->shape()[0], 2);
EXPECT_EQ(tensor_dst->shape()[1], 3);
std::vector<float16> data_check(6, float16(1.0));
tensor_dst->CopyToCpu<float16>(data_check.data());
for (int i = 0; i < 6; i++) {
EXPECT_TRUE(data_check[i] == float16(1.0));
}
}
TEST(CopyTensor, float16_gpu) {
paddle::framework::Scope scope;
auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
"tensor_src", PlaceType::kGPU, static_cast<void *>(&scope));
auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
"tensor_dst", PlaceType::kGPU, static_cast<void *>(&scope));
using paddle::platform::float16;
std::vector<float16> data_src(6, float16(1.0));
tensor_src->Reshape({2, 3});
tensor_src->CopyFromCpu(data_src.data());
std::vector<float16> data_dst(4, float16(2.0));
tensor_dst->Reshape({2, 2});
tensor_dst->CopyFromCpu(data_dst.data());
paddle_infer::contrib::TensorUtils::CopyTensor(tensor_dst.get(), *tensor_src);
EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
EXPECT_EQ(tensor_dst->shape()[0], 2);
EXPECT_EQ(tensor_dst->shape()[1], 3);
std::vector<float16> data_check(6, float16(1.0));
tensor_dst->CopyToCpu<float16>(data_check.data());
for (int i = 0; i < 6; i++) {
EXPECT_TRUE(data_check[i] == float16(1.0));
}
}
TEST(CopyTensor, async_stream) {
paddle::framework::Scope scope;
auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
"tensor_src", PlaceType::kGPU, static_cast<void *>(&scope));
auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
"tensor_dst", PlaceType::kGPU, static_cast<void *>(&scope));
std::vector<float> data_src(6, 1.0);
tensor_src->Reshape({2, 3});
tensor_src->CopyFromCpu(data_src.data());
std::vector<float> data_dst(4, 2.0);
tensor_dst->Reshape({2, 2});
tensor_dst->CopyFromCpu(data_dst.data());
cudaStream_t stream;
paddle_infer::contrib::TensorUtils::CopyTensorAsync(
tensor_dst.get(), *tensor_src, static_cast<void *>(&stream));
EXPECT_EQ(tensor_dst->shape().size(), (size_t)2);
EXPECT_EQ(tensor_dst->shape()[0], 2);
EXPECT_EQ(tensor_dst->shape()[1], 3);
cudaStreamSynchronize(stream);
std::vector<float> data_check(6, 1.0);
tensor_dst->CopyToCpu<float>(data_check.data());
for (int i = 0; i < 6; i++) {
EXPECT_NEAR(data_check[i], static_cast<float>(1.0), 1e-5);
}
}
TEST(CopyTensor, async_callback) {
paddle::framework::Scope scope;
auto tensor_src = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
"tensor_src", PlaceType::kCPU, static_cast<void *>(&scope));
auto tensor_dst = paddle_infer::InferApiTesterUtils::CreateInferTensorForTest(
"tensor_dst", PlaceType::kGPU, static_cast<void *>(&scope));
std::vector<float> data_src(6, 1.0);
tensor_src->Reshape({2, 3});
tensor_src->CopyFromCpu(data_src.data());
std::vector<float> data_dst(4, 2.0);
tensor_dst->Reshape({2, 2});
tensor_dst->CopyFromCpu(data_dst.data());
paddle_infer::contrib::TensorUtils::CopyTensorAsync(
tensor_dst.get(), *tensor_src,
[](void *cb_params) {
Tensor *tensor = static_cast<Tensor *>(cb_params);
EXPECT_EQ(tensor->shape().size(), (size_t)2);
EXPECT_EQ(tensor->shape()[0], 2);
EXPECT_EQ(tensor->shape()[1], 3);
},
static_cast<void *>(&(*tensor_dst)));
cudaDeviceSynchronize();
}
} // namespace paddle_infer
......@@ -28,6 +28,7 @@
#include <vector>
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include "paddle/fluid/inference/utils/io_utils.h"
......@@ -286,6 +287,12 @@ py::bytes SerializePDTensorToBytes(PaddleTensor &tensor) { // NOLINT
paddle::inference::SerializePDTensorToStream(&ss, tensor);
return static_cast<py::bytes>(ss.str());
}
void CopyPaddleInferTensor(paddle_infer::Tensor &dst,
const paddle_infer::Tensor &src) {
return paddle_infer::contrib::TensorUtils::CopyTensor(&dst, src);
}
} // namespace
void BindInferenceApi(py::module *m) {
......@@ -317,6 +324,7 @@ void BindInferenceApi(py::module *m) {
new paddle_infer::Predictor(config));
return std::move(pred);
});
m->def("copy_tensor", &CopyPaddleInferTensor);
m->def("paddle_dtype_size", &paddle::PaddleDtypeSize);
m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes);
m->def("get_version", &paddle_infer::GetVersion);
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ....fluid.core import copy_tensor # noqa: F401
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册