Unverified commit 61591afe, authored by heliqi and committed by GitHub

[Inference]Fix the ort Backend multiple input bug (#43621)

* fix ort backend multiple inputs bug

* fix ort backend multiple inputs bug

* fix ort backend multiple inputs bug

* fix ort backend multiple inputs bug

* code format

* code format
Parent commit: 75144f13
...@@ -52,8 +52,9 @@ else() ...@@ -52,8 +52,9 @@ else()
) )
endif() endif()
include_directories(${ONNXRUNTIME_INC_DIR} # For ONNXRUNTIME code to include internal headers.
)# For ONNXRUNTIME code to include internal headers. include_directories(${ONNXRUNTIME_INC_DIR})
if(WIN32) if(WIN32)
set(ONNXRUNTIME_SOURCE_LIB set(ONNXRUNTIME_SOURCE_LIB
"${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll"
......
...@@ -34,15 +34,11 @@ set(PADDLE2ONNX_INC_DIR ...@@ -34,15 +34,11 @@ set(PADDLE2ONNX_INC_DIR
set(PADDLE2ONNX_LIB_DIR set(PADDLE2ONNX_LIB_DIR
"${PADDLE2ONNX_INSTALL_DIR}/lib" "${PADDLE2ONNX_INSTALL_DIR}/lib"
CACHE PATH "onnxruntime lib directory." FORCE) CACHE PATH "onnxruntime lib directory." FORCE)
set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_LIB_DIR}")
"${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}")
include_directories(${PADDLE2ONNX_INC_DIR} # For PADDLE2ONNX code to include internal headers.
)# For PADDLE2ONNX code to include internal headers. include_directories(${PADDLE2ONNX_INC_DIR})
if(WIN32) if(WIN32)
set(PADDLE2ONNX_SOURCE_LIB
"${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.dylib"
CACHE FILEPATH "Paddle2ONNX source library." FORCE)
set(PADDLE2ONNX_LIB set(PADDLE2ONNX_LIB
"${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.dll" "${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.dll"
CACHE FILEPATH "paddle2onnx library." FORCE) CACHE FILEPATH "paddle2onnx library." FORCE)
...@@ -50,9 +46,6 @@ if(WIN32) ...@@ -50,9 +46,6 @@ if(WIN32)
"${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.lib" "${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.lib"
CACHE FILEPATH "paddle2onnx compile library." FORCE) CACHE FILEPATH "paddle2onnx compile library." FORCE)
elseif(APPLE) elseif(APPLE)
set(PADDLE2ONNX_SOURCE_LIB
"${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.dylib"
CACHE FILEPATH "Paddle2ONNX source library." FORCE)
set(PADDLE2ONNX_LIB set(PADDLE2ONNX_LIB
"${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib" "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib"
CACHE FILEPATH "PADDLE2ONNX library." FORCE) CACHE FILEPATH "PADDLE2ONNX library." FORCE)
...@@ -60,9 +53,6 @@ elseif(APPLE) ...@@ -60,9 +53,6 @@ elseif(APPLE)
"${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib" "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib"
CACHE FILEPATH "paddle2onnx compile library." FORCE) CACHE FILEPATH "paddle2onnx compile library." FORCE)
else() else()
set(PADDLE2ONNX_SOURCE_LIB
"${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.so"
CACHE FILEPATH "Paddle2ONNX source library." FORCE)
set(PADDLE2ONNX_LIB set(PADDLE2ONNX_LIB
"${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.so" "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.so"
CACHE FILEPATH "PADDLE2ONNX library." FORCE) CACHE FILEPATH "PADDLE2ONNX library." FORCE)
......
...@@ -13,17 +13,19 @@ See the License for the specific language governing permissions and ...@@ -13,17 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
/* /*
* This file contains demo of mobilenet for tensorrt. * This file contains demo of mobilenet for onnxruntime backend.
*/ */
#include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files. #include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files.
#include <algorithm>
#include <numeric>
#include <vector> #include <vector>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "utils.h" // NOLINT #include "utils.h" // NOLINT
DEFINE_string(modeldir, "", "Directory of the inference model."); DEFINE_string(modeldir, "", "Directory of the inference model.");
DEFINE_string(data, "", "path of data");
namespace paddle { namespace paddle {
namespace demo { namespace demo {
...@@ -39,8 +41,21 @@ void Main() { ...@@ -39,8 +41,21 @@ void Main() {
auto predictor = paddle_infer::CreatePredictor(config); auto predictor = paddle_infer::CreatePredictor(config);
// Inference. // Inference.
LOG(INFO) << "--- prepare input data ----";
std::vector<int> input_shape = {1, 3, 224, 224}; std::vector<int> input_shape = {1, 3, 224, 224};
std::vector<float> input_data(1 * 3 * 224 * 224, 1.0); std::vector<float> input_data;
std::string line;
std::ifstream file(FLAGS_data);
std::getline(file, line);
file.close();
std::vector<std::string> data_strs;
split(line, ' ', &data_strs);
int input_num = 0;
for (auto& d : data_strs) {
input_num += 1;
input_data.push_back(std::stof(d));
}
std::vector<float> out_data; std::vector<float> out_data;
out_data.resize(1000); out_data.resize(1000);
auto input_names = predictor->GetInputNames(); auto input_names = predictor->GetInputNames();
...@@ -53,7 +68,19 @@ void Main() { ...@@ -53,7 +68,19 @@ void Main() {
predictor->Run(); predictor->Run();
output_tensor->CopyToCpu(out_data.data()); output_tensor->CopyToCpu(out_data.data());
VLOG(3) << "output.size " << out_data.size(); std::vector<int> out_index(out_data.size());
std::iota(out_index.begin(), out_index.end(), 0);
std::sort(
out_index.begin(), out_index.end(), [&out_data](int index1, int index2) {
return out_data[index1] > out_data[index2];
});
LOG(INFO) << "output.size " << out_data.size()
<< " max_index:" << out_index[0];
CHECK_EQ(out_data.size(), 1000);
int max_index = out_index[0];
CHECK_EQ(max_index, 13);
float max_score = out_data[max_index];
CHECK_LE(fabs(max_score - 0.99981), 1e-4);
} }
} // namespace demo } // namespace demo
......
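The updated demo no longer just logs the output size: it reads the input tensor from a data file, ranks the 1000 class scores, and checks that the top-1 index and score match the expected values. Below is a minimal sketch of that ranking step; `ArgMax` is a hypothetical helper written for illustration, not part of the Paddle API.

```cpp
#include <algorithm>
#include <numeric>
#include <vector>

// Return the index of the largest score (the top-1 class), or -1 if empty.
// Same std::iota + std::sort pattern the updated demo uses.
int ArgMax(const std::vector<float>& scores) {
  std::vector<int> index(scores.size());
  std::iota(index.begin(), index.end(), 0);  // fill with 0, 1, 2, ...
  std::sort(index.begin(), index.end(),
            [&scores](int a, int b) { return scores[a] > scores[b]; });
  return index.empty() ? -1 : index.front();
}
```

For a pure top-1 check, `std::max_element` would be enough; the full index sort mirrors the demo, which keeps the whole ranking around before comparing the leading index and score.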
...@@ -52,15 +52,17 @@ if [ $7 == ON ]; then ...@@ -52,15 +52,17 @@ if [ $7 == ON ]; then
mkdir -p MobileNetV2 mkdir -p MobileNetV2
cd MobileNetV2 cd MobileNetV2
if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then
echo "MobileNetV2.inference.model.tar.gz has been downloaded." rm -rf MobileNetV2.inference.model.tar.gz
else fi
# echo "MobileNetV2.inference.model.tar.gz has been downloaded."
# else
if [ $WIN_DETECT != "" ]; then if [ $WIN_DETECT != "" ]; then
wget -q -Y off http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz wget -q -Y off http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
else else
wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
fi fi
tar xzf *.tar.gz tar xzf *.tar.gz
fi # fi
cd .. cd ..
fi fi
...@@ -265,7 +267,8 @@ for WITH_STATIC_LIB in ON OFF; do ...@@ -265,7 +267,8 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
make -j$(nproc) make -j$(nproc)
./onnxruntime_mobilenet_demo \ ./onnxruntime_mobilenet_demo \
--modeldir=$DATA_DIR/MobileNetV2/MobileNetV2 --modeldir=$DATA_DIR/MobileNetV2/MobileNetV2 \
--data=$DATA_DIR/MobileNetV2/MobileNetV2/data.txt
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "onnxruntime_mobilenet_demo runs failed " >> ${current_dir}/test_summary.txt echo "onnxruntime_mobilenet_demo runs failed " >> ${current_dir}/test_summary.txt
EXIT_CODE=1 EXIT_CODE=1
......
...@@ -40,17 +40,20 @@ void Tensor::Reshape(const std::vector<int> &shape) { ...@@ -40,17 +40,20 @@ void Tensor::Reshape(const std::vector<int> &shape) {
#endif #endif
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
name_.empty(), false, name_.empty(),
false,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can " "Need to SetName first, so that the corresponding tensor can "
"be retrieved.")); "be retrieved."));
PADDLE_ENFORCE_EQ(input_or_output_, true, PADDLE_ENFORCE_EQ(input_or_output_,
true,
paddle::platform::errors::PermissionDenied( paddle::platform::errors::PermissionDenied(
"Can't reshape the output tensor, it is readonly")); "Can't reshape the output tensor, it is readonly"));
auto *scope = static_cast<paddle::framework::Scope *>(scope_); auto *scope = static_cast<paddle::framework::Scope *>(scope_);
auto *var = scope->FindVar(name_); auto *var = scope->FindVar(name_);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet( var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_)); "No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<paddle::framework::LoDTensor>(); auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize(phi::make_ddim(shape)); tensor->Resize(phi::make_ddim(shape));
...@@ -58,17 +61,20 @@ void Tensor::Reshape(const std::vector<int> &shape) { ...@@ -58,17 +61,20 @@ void Tensor::Reshape(const std::vector<int> &shape) {
void Tensor::ReshapeStrings(const size_t &shape) { void Tensor::ReshapeStrings(const size_t &shape) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
name_.empty(), false, name_.empty(),
false,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can " "Need to SetName first, so that the corresponding tensor can "
"be retrieved.")); "be retrieved."));
PADDLE_ENFORCE_EQ(input_or_output_, true, PADDLE_ENFORCE_EQ(input_or_output_,
true,
paddle::platform::errors::PermissionDenied( paddle::platform::errors::PermissionDenied(
"Can't reshape the output tensor, it is readonly")); "Can't reshape the output tensor, it is readonly"));
auto *scope = static_cast<paddle::framework::Scope *>(scope_); auto *scope = static_cast<paddle::framework::Scope *>(scope_);
auto *var = scope->FindVar(name_); auto *var = scope->FindVar(name_);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet( var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_)); "No tensor called [%s] in the runtime scope", name_));
paddle_infer::Strings *tensor = var->GetMutable<paddle_infer::Strings>(); paddle_infer::Strings *tensor = var->GetMutable<paddle_infer::Strings>();
tensor->resize(shape); tensor->resize(shape);
...@@ -84,7 +90,8 @@ template <typename T> ...@@ -84,7 +90,8 @@ template <typename T>
T *Tensor::mutable_data(PlaceType place) { T *Tensor::mutable_data(PlaceType place) {
EAGER_GET_TENSOR(paddle::framework::LoDTensor); EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
tensor->numel(), 0, tensor->numel(),
0,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"You should call Tensor::Reshape(const std::vector<int> " "You should call Tensor::Reshape(const std::vector<int> "
"&shape)" "&shape)"
...@@ -97,8 +104,9 @@ T *Tensor::mutable_data(PlaceType place) { ...@@ -97,8 +104,9 @@ T *Tensor::mutable_data(PlaceType place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::CUDAPlace gpu_place(device_); paddle::platform::CUDAPlace gpu_place(device_);
auto *dev_ctxs = reinterpret_cast<const std::map< auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place, std::shared_future<std::unique_ptr<phi::DeviceContext>>> phi::Place,
*>(device_contexs_); std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
device_contexs_);
auto *dev_ctx = auto *dev_ctx =
static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get()); static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get());
return dev_ctx->Alloc<T>(tensor, tensor->numel() * sizeof(T)); return dev_ctx->Alloc<T>(tensor, tensor->numel() * sizeof(T));
...@@ -179,7 +187,8 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -179,7 +187,8 @@ void Tensor::CopyFromCpu(const T *data) {
#endif #endif
EAGER_GET_TENSOR(paddle::framework::LoDTensor); EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_GE(tensor->numel(), 0, PADDLE_ENFORCE_GE(tensor->numel(),
0,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"You should call Tensor::Reshape(const " "You should call Tensor::Reshape(const "
"std::vector<int> &shape)" "std::vector<int> &shape)"
...@@ -194,14 +203,18 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -194,14 +203,18 @@ void Tensor::CopyFromCpu(const T *data) {
paddle::platform::CUDAPlace gpu_place(device_); paddle::platform::CUDAPlace gpu_place(device_);
auto *dev_ctxs = reinterpret_cast<const std::map< auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place, std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>( phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
device_contexs_); device_contexs_);
auto *dev_ctx = auto *dev_ctx =
static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get()); static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get());
auto *t_data = dev_ctx->Alloc<T>(tensor, tensor->numel() * sizeof(T)); auto *t_data = dev_ctx->Alloc<T>(tensor, tensor->numel() * sizeof(T));
paddle::memory::Copy(gpu_place, static_cast<void *>(t_data), paddle::memory::Copy(gpu_place,
paddle::platform::CPUPlace(), data, ele_size, static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size,
dev_ctx->stream()); dev_ctx->stream());
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
...@@ -212,8 +225,11 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -212,8 +225,11 @@ void Tensor::CopyFromCpu(const T *data) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
paddle::platform::XPUPlace xpu_place(device_); paddle::platform::XPUPlace xpu_place(device_);
auto *t_data = tensor->mutable_data<T>(xpu_place); auto *t_data = tensor->mutable_data<T>(xpu_place);
paddle::memory::Copy(xpu_place, static_cast<void *>(t_data), paddle::memory::Copy(xpu_place,
paddle::platform::CPUPlace(), data, ele_size); static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size);
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with XPU place because paddle is not compiled " "Can not create tensor with XPU place because paddle is not compiled "
...@@ -227,8 +243,11 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -227,8 +243,11 @@ void Tensor::CopyFromCpu(const T *data) {
auto *t_data = tensor->mutable_data<T>(npu_place); auto *t_data = tensor->mutable_data<T>(npu_place);
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place)); pool.Get(npu_place));
paddle::memory::Copy(npu_place, static_cast<void *>(t_data), paddle::memory::Copy(npu_place,
paddle::platform::CPUPlace(), data, ele_size, static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size,
dev_ctx->stream()); dev_ctx->stream());
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
...@@ -246,8 +265,11 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -246,8 +265,11 @@ void Tensor::CopyFromCpu(const T *data) {
auto *t_data = tensor->mutable_data<T>(custom_place); auto *t_data = tensor->mutable_data<T>(custom_place);
auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>(
pool.Get(custom_place)); pool.Get(custom_place));
paddle::memory::Copy(custom_place, static_cast<void *>(t_data), paddle::memory::Copy(custom_place,
paddle::platform::CPUPlace(), data, ele_size, static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size,
dev_ctx->stream()); dev_ctx->stream());
#else #else
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(paddle::platform::errors::InvalidArgument(
...@@ -291,30 +313,33 @@ struct DataTypeInfo<int32_t> { ...@@ -291,30 +313,33 @@ struct DataTypeInfo<int32_t> {
paddle::experimental::DataLayout LayoutConvert(DataLayout layout) { paddle::experimental::DataLayout LayoutConvert(DataLayout layout) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
layout, DataLayout::kNCHW, layout,
DataLayout::kNCHW,
paddle::platform::errors::InvalidArgument("Only NCHW is supported now.")); paddle::platform::errors::InvalidArgument("Only NCHW is supported now."));
return paddle::experimental::DataLayout::NCHW; return paddle::experimental::DataLayout::NCHW;
} }
template <typename T> template <typename T>
void Tensor::ShareExternalData(const T *data, const std::vector<int> &shape, void Tensor::ShareExternalData(const T *data,
PlaceType place, DataLayout layout) { const std::vector<int> &shape,
PlaceType place,
DataLayout layout) {
EAGER_GET_TENSOR(paddle::framework::LoDTensor) EAGER_GET_TENSOR(paddle::framework::LoDTensor)
size_t size = size_t size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) * std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T); sizeof(T);
phi::DenseTensorMeta meta(DataTypeInfo<T>().TYPE, phi::make_ddim(shape), phi::DenseTensorMeta meta(
LayoutConvert(layout)); DataTypeInfo<T>().TYPE, phi::make_ddim(shape), LayoutConvert(layout));
if (place == PlaceType::kCPU) { if (place == PlaceType::kCPU) {
phi::DenseTensor dtensor( phi::DenseTensor dtensor(
std::make_shared<phi::Allocation>(const_cast<T *>(data), size, std::make_shared<phi::Allocation>(
paddle::platform::CPUPlace()), const_cast<T *>(data), size, paddle::platform::CPUPlace()),
meta); meta);
*tensor = std::move(dtensor); *tensor = std::move(dtensor);
} else if (place == PlaceType::kGPU) { } else if (place == PlaceType::kGPU) {
phi::DenseTensor dtensor( phi::DenseTensor dtensor(
std::make_shared<phi::Allocation>(const_cast<T *>(data), size, std::make_shared<phi::Allocation>(
paddle::platform::CUDAPlace(device_)), const_cast<T *>(data), size, paddle::platform::CUDAPlace(device_)),
meta); meta);
*tensor = std::move(dtensor); *tensor = std::move(dtensor);
} else { } else {
...@@ -325,7 +350,8 @@ void Tensor::ShareExternalData(const T *data, const std::vector<int> &shape, ...@@ -325,7 +350,8 @@ void Tensor::ShareExternalData(const T *data, const std::vector<int> &shape,
void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) {
EAGER_GET_TENSOR(paddle_infer::Strings); EAGER_GET_TENSOR(paddle_infer::Strings);
PADDLE_ENFORCE_GE(tensor->size(), 0, PADDLE_ENFORCE_GE(tensor->size(),
0,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"You should call Tensor::Reshape(const " "You should call Tensor::Reshape(const "
"std::size_t &shape)function before copying" "std::size_t &shape)function before copying"
...@@ -334,7 +360,9 @@ void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { ...@@ -334,7 +360,9 @@ void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) {
} }
template <typename T> template <typename T>
void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, void Tensor::CopyToCpuImpl(T *data,
void *exec_stream,
CallbackFunc cb,
void *cb_params) const { void *cb_params) const {
EAGER_GET_TENSOR(paddle::framework::LoDTensor); EAGER_GET_TENSOR(paddle::framework::LoDTensor);
auto ele_num = tensor->numel(); auto ele_num = tensor->numel();
...@@ -344,7 +372,8 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -344,7 +372,8 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
paddle::framework::Tensor out; paddle::framework::Tensor out;
auto mem_allocation = auto mem_allocation =
std::make_shared<paddle::memory::allocation::Allocation>( std::make_shared<paddle::memory::allocation::Allocation>(
static_cast<void *>(data), ele_num * sizeof(T), static_cast<void *>(data),
ele_num * sizeof(T),
paddle::platform::CPUPlace()); paddle::platform::CPUPlace());
out.ResetHolder(mem_allocation); out.ResetHolder(mem_allocation);
...@@ -355,7 +384,10 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -355,7 +384,10 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
tensor->layout(), tensor->layout(),
paddle::platform::MKLDNNDeviceContext::tls() paddle::platform::MKLDNNDeviceContext::tls()
.get_cur_paddle_data_layout(), .get_cur_paddle_data_layout(),
*tensor, &out, paddle::platform::CPUPlace(), true); *tensor,
&out,
paddle::platform::CPUPlace(),
true);
else else
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T)); std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
#else #else
...@@ -373,13 +405,17 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -373,13 +405,17 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto gpu_place = t_place; auto gpu_place = t_place;
auto *dev_ctxs = reinterpret_cast<const std::map< auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place, std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>( phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
device_contexs_); device_contexs_);
auto *dev_ctx = auto *dev_ctx =
static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get()); static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get());
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), gpu_place, t_data, static_cast<void *>(data),
ele_num * sizeof(T), dev_ctx->stream()); gpu_place,
t_data,
ele_num * sizeof(T),
dev_ctx->stream());
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipStreamSynchronize(dev_ctx->stream()); hipStreamSynchronize(dev_ctx->stream());
#else #else
...@@ -403,7 +439,9 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -403,7 +439,9 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
auto xpu_place = t_place; auto xpu_place = t_place;
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), xpu_place, t_data, static_cast<void *>(data),
xpu_place,
t_data,
ele_num * sizeof(T)); ele_num * sizeof(T));
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
...@@ -418,8 +456,11 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -418,8 +456,11 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place)); pool.Get(npu_place));
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), npu_place, t_data, static_cast<void *>(data),
ele_num * sizeof(T), dev_ctx->stream()); npu_place,
t_data,
ele_num * sizeof(T),
dev_ctx->stream());
paddle::platform::NPUStreamSync(dev_ctx->stream()); paddle::platform::NPUStreamSync(dev_ctx->stream());
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
...@@ -434,8 +475,11 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, ...@@ -434,8 +475,11 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>( auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>(
pool.Get(custom_place)); pool.Get(custom_place));
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), custom_place, t_data, static_cast<void *>(data),
ele_num * sizeof(T), dev_ctx->stream()); custom_place,
t_data,
ele_num * sizeof(T),
dev_ctx->stream());
// TODO(wangran16): sync_stream // TODO(wangran16): sync_stream
#else #else
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(paddle::platform::errors::InvalidArgument(
...@@ -474,22 +518,34 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data); ...@@ -474,22 +518,34 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data); template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
template PD_INFER_DECL void Tensor::ShareExternalData<float>( template PD_INFER_DECL void Tensor::ShareExternalData<float>(
const float *data, const std::vector<int> &shape, PlaceType place, const float *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<int64_t>( template PD_INFER_DECL void Tensor::ShareExternalData<int64_t>(
const int64_t *data, const std::vector<int> &shape, PlaceType place, const int64_t *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<int32_t>( template PD_INFER_DECL void Tensor::ShareExternalData<int32_t>(
const int32_t *data, const std::vector<int> &shape, PlaceType place, const int32_t *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<uint8_t>( template PD_INFER_DECL void Tensor::ShareExternalData<uint8_t>(
const uint8_t *data, const std::vector<int> &shape, PlaceType place, const uint8_t *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<int8_t>( template PD_INFER_DECL void Tensor::ShareExternalData<int8_t>(
const int8_t *data, const std::vector<int> &shape, PlaceType place, const int8_t *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<float16>( template PD_INFER_DECL void Tensor::ShareExternalData<float16>(
const float16 *data, const std::vector<int> &shape, PlaceType place, const float16 *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout); DataLayout layout);
template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const; template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
...@@ -566,14 +622,16 @@ Tensor::Tensor(void *scope, const void *device_contexts) ...@@ -566,14 +622,16 @@ Tensor::Tensor(void *scope, const void *device_contexts)
template <typename T> template <typename T>
void *Tensor::FindTensor() const { void *Tensor::FindTensor() const {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
name_.empty(), false, name_.empty(),
false,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can " "Need to SetName first, so that the corresponding tensor can "
"be retrieved.")); "be retrieved."));
auto *scope = static_cast<paddle::framework::Scope *>(scope_); auto *scope = static_cast<paddle::framework::Scope *>(scope_);
auto *var = scope->FindVar(name_); auto *var = scope->FindVar(name_);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet( var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_)); "No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<T>(); auto *tensor = var->GetMutable<T>();
return tensor; return tensor;
...@@ -602,7 +660,8 @@ std::vector<int> Tensor::shape() const { ...@@ -602,7 +660,8 @@ std::vector<int> Tensor::shape() const {
#endif #endif
EAGER_GET_TENSOR(paddle::framework::LoDTensor); EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
tensor_, paddle::platform::errors::PreconditionNotMet( tensor_,
paddle::platform::errors::PreconditionNotMet(
"Not found tensor called %s in the scope", name_)); "Not found tensor called %s in the scope", name_));
// mkldnn may does layout transform internally, so need to reorder before // mkldnn may does layout transform internally, so need to reorder before
// return // return
...@@ -668,40 +727,65 @@ void Tensor::SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding) { ...@@ -668,40 +727,65 @@ void Tensor::SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding) {
binding_ = binding; binding_ = binding;
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float *data, void Tensor::SetOrtBuffer(const std::shared_ptr<std::vector<int8_t>> buffer) {
size_t size, const int64_t *shape, size_t shape_len) { buffer_ = buffer;
return Ort::Value::CreateTensor<float>(memory_info, data, size, shape, }
shape_len);
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
float *data,
size_t size,
const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor<float>(
memory_info, data, size, shape, shape_len);
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int64_t *data, Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
size_t size, const int64_t *shape, size_t shape_len) { int64_t *data,
return Ort::Value::CreateTensor<int64_t>(memory_info, data, size, shape, size_t size,
shape_len); const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor<int64_t>(
memory_info, data, size, shape, shape_len);
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int32_t *data, Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
size_t size, const int64_t *shape, size_t shape_len) { int32_t *data,
return Ort::Value::CreateTensor<int32_t>(memory_info, data, size, shape, size_t size,
shape_len); const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor<int32_t>(
memory_info, data, size, shape, shape_len);
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, uint8_t *data, Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
size_t size, const int64_t *shape, size_t shape_len) { uint8_t *data,
return Ort::Value::CreateTensor<uint8_t>(memory_info, data, size, shape, size_t size,
shape_len); const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor<uint8_t>(
memory_info, data, size, shape, shape_len);
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int8_t *data, Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
size_t size, const int64_t *shape, size_t shape_len) { int8_t *data,
return Ort::Value::CreateTensor<int8_t>(memory_info, data, size, shape, size_t size,
shape_len); const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor<int8_t>(
memory_info, data, size, shape, shape_len);
} }
Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float16 *data, Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
size_t size, const int64_t *shape, size_t shape_len) { float16 *data,
return Ort::Value::CreateTensor(memory_info, static_cast<void *>(data), size_t size,
size * sizeof(float16), shape, shape_len, const int64_t *shape,
size_t shape_len) {
return Ort::Value::CreateTensor(memory_info,
static_cast<void *>(data),
size * sizeof(float16),
shape,
shape_len,
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16); ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
} }
...@@ -712,15 +796,16 @@ void Tensor::ORTCopyFromCpu(const T *data) { ...@@ -712,15 +796,16 @@ void Tensor::ORTCopyFromCpu(const T *data) {
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"input tensor [%s] no binding ptr", name_)); "input tensor [%s] no binding ptr", name_));
const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda";
Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, device_, Ort::MemoryInfo memory_info(
OrtMemTypeDefault); device_name, OrtDeviceAllocator, device_, OrtMemTypeDefault);
size_t size = std::accumulate(begin(shape_), end(shape_), 1UL, size_t size = std::accumulate(
std::multiplies<size_t>()); begin(shape_), end(shape_), 1UL, std::multiplies<size_t>());
auto buffer = buffer_.lock();
size_t buffer_size = size * sizeof(T); size_t buffer_size = size * sizeof(T);
if (buffer_size > buffer_.size()) { if (buffer_size > buffer->size()) {
buffer_.resize(buffer_size); buffer->resize(buffer_size);
} }
std::memcpy(static_cast<void *>(buffer_.data()), data, buffer_size); std::memcpy(static_cast<void *>(buffer->data()), data, buffer_size);
auto onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; auto onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
if (std::is_same<T, float>::value) { if (std::is_same<T, float>::value) {
...@@ -737,18 +822,18 @@ void Tensor::ORTCopyFromCpu(const T *data) { ...@@ -737,18 +822,18 @@ void Tensor::ORTCopyFromCpu(const T *data) {
onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8;
} else if (std::is_same<T, float16>::value) { } else if (std::is_same<T, float16>::value) {
onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
} } else {
if (onnx_dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED) {
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Found undefined data type for onnxruntime, only supports " "Found undefined data type for onnxruntime, only supports "
"float16/float32/float64/int8/uint8/int32/int64.")); "float16/float32/float64/int8/uint8/int32/int64."));
} }
auto ort_value = auto ort_value = Ort::Value::CreateTensor(memory_info,
Ort::Value::CreateTensor(memory_info, buffer_.data(), buffer_size, buffer->data(),
shape_.data(), shape_.size(), onnx_dtype); buffer_size,
shape_.data(),
shape_.size(),
onnx_dtype);
binding->BindInput(name_.c_str(), ort_value); binding->BindInput(name_.c_str(), ort_value);
} }
...@@ -793,21 +878,24 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, ...@@ -793,21 +878,24 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t,
cudaStream_t stream) { cudaStream_t stream) {
if (t->tensor_ == nullptr) { if (t->tensor_ == nullptr) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
t->name_.empty(), false, t->name_.empty(),
false,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can " "Need to SetName first, so that the corresponding tensor can "
"be retrieved.")); "be retrieved."));
auto *scope = static_cast<paddle::framework::Scope *>(t->scope_); auto *scope = static_cast<paddle::framework::Scope *>(t->scope_);
auto *var = scope->FindVar(t->name_); auto *var = scope->FindVar(t->name_);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet( var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", t->name_)); "No tensor called [%s] in the runtime scope", t->name_));
auto *tensor = var->GetMutable<paddle::framework::LoDTensor>(); auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
t->tensor_ = tensor; t->tensor_ = tensor;
} }
auto *tensor = static_cast<paddle::framework::LoDTensor *>(t->tensor_); auto *tensor = static_cast<paddle::framework::LoDTensor *>(t->tensor_);
PADDLE_ENFORCE_GE(tensor->numel(), 0, PADDLE_ENFORCE_GE(tensor->numel(),
0,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"You should call Tensor::Reshape(const " "You should call Tensor::Reshape(const "
"std::vector<int> &shape)" "std::vector<int> &shape)"
...@@ -820,8 +908,12 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, ...@@ -820,8 +908,12 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::CUDAPlace gpu_place(t->device_); paddle::platform::CUDAPlace gpu_place(t->device_);
auto *t_data = tensor->mutable_data<T>(gpu_place); auto *t_data = tensor->mutable_data<T>(gpu_place);
paddle::memory::Copy(gpu_place, static_cast<void *>(t_data), paddle::memory::Copy(gpu_place,
paddle::platform::CPUPlace(), data, ele_size, stream); static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size,
stream);
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with CUDA place because paddle is not compiled " "Can not create tensor with CUDA place because paddle is not compiled "
...@@ -834,18 +926,21 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, ...@@ -834,18 +926,21 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t,
} }
template <typename T> template <typename T>
void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data, void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t,
T *data,
cudaStream_t stream) { cudaStream_t stream) {
if (t->tensor_ == nullptr) { if (t->tensor_ == nullptr) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
t->name_.empty(), false, t->name_.empty(),
false,
paddle::platform::errors::PreconditionNotMet( paddle::platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can " "Need to SetName first, so that the corresponding tensor can "
"be retrieved.")); "be retrieved."));
auto *scope = static_cast<paddle::framework::Scope *>(t->scope_); auto *scope = static_cast<paddle::framework::Scope *>(t->scope_);
auto *var = scope->FindVar(t->name_); auto *var = scope->FindVar(t->name_);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet( var,
paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", t->name_)); "No tensor called [%s] in the runtime scope", t->name_));
auto *tensor = var->GetMutable<paddle::framework::LoDTensor>(); auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
t->tensor_ = tensor; t->tensor_ = tensor;
...@@ -859,7 +954,8 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data, ...@@ -859,7 +954,8 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data,
paddle::framework::Tensor out; paddle::framework::Tensor out;
auto mem_allocation = auto mem_allocation =
std::make_shared<paddle::memory::allocation::Allocation>( std::make_shared<paddle::memory::allocation::Allocation>(
static_cast<void *>(data), ele_num * sizeof(T), static_cast<void *>(data),
ele_num * sizeof(T),
paddle::platform::CPUPlace()); paddle::platform::CPUPlace());
out.ResetHolder(mem_allocation); out.ResetHolder(mem_allocation);
...@@ -870,7 +966,10 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data, ...@@ -870,7 +966,10 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data,
tensor->layout(), tensor->layout(),
paddle::platform::MKLDNNDeviceContext::tls() paddle::platform::MKLDNNDeviceContext::tls()
.get_cur_paddle_data_layout(), .get_cur_paddle_data_layout(),
*tensor, &out, paddle::platform::CPUPlace(), true); *tensor,
&out,
paddle::platform::CPUPlace(),
true);
else else
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T)); std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
#else #else
...@@ -879,8 +978,11 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data, ...@@ -879,8 +978,11 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data,
} else if (t->place_ == PlaceType::kGPU) { } else if (t->place_ == PlaceType::kGPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::memory::Copy(paddle::platform::CPUPlace(), paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), t_place, t_data, static_cast<void *>(data),
ele_num * sizeof(T), stream); t_place,
t_data,
ele_num * sizeof(T),
stream);
#else #else
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with CUDA place because paddle is not compiled " "Can not create tensor with CUDA place because paddle is not compiled "
......
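Most of this file is mechanical reformatting of `PADDLE_ENFORCE_*` and `paddle::memory::Copy` argument lists; the substantive change is in `Tensor::ORTCopyFromCpu`, where the tensor no longer owns its staging buffer as a plain `std::vector<int8_t>` but holds a `std::weak_ptr` to a buffer owned by the predictor and locks it before copying. A minimal sketch of that pattern follows, with simplified, hypothetical type names (`OrtTensorSketch` is not the real `Tensor` class) and the `Ort::Value` creation omitted.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <vector>

// Simplified stand-in for the inference Tensor; only the ORT staging-buffer
// handling is shown.
struct OrtTensorSketch {
  std::weak_ptr<std::vector<int8_t>> buffer_;  // owned by the predictor

  template <typename T>
  void CopyFromCpu(const T* data, size_t numel) {
    auto buffer = buffer_.lock();  // take shared ownership for the copy
    if (!buffer) return;           // predictor (and its buffers) already gone
    const size_t bytes = numel * sizeof(T);
    if (bytes > buffer->size()) buffer->resize(bytes);
    std::memcpy(buffer->data(), data, bytes);
    // ... an Ort::Value would be created over buffer->data() and bound here ...
  }
};
```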
...@@ -71,13 +71,16 @@ bool CheckConvertToONNX(const AnalysisConfig &config) { ...@@ -71,13 +71,16 @@ bool CheckConvertToONNX(const AnalysisConfig &config) {
} else if (config.prog_file().empty() || config.params_file().empty()) { } else if (config.prog_file().empty() || config.params_file().empty()) {
LOG(ERROR) << string::Sprintf( LOG(ERROR) << string::Sprintf(
"not valid model path '%s' or program path '%s' or params path '%s'.", "not valid model path '%s' or program path '%s' or params path '%s'.",
config.model_dir(), config.prog_file(), config.params_file()); config.model_dir(),
config.prog_file(),
config.params_file());
return false; return false;
} }
if (config.model_from_memory()) { if (config.model_from_memory()) {
return paddle2onnx::IsExportable( return paddle2onnx::IsExportable(config.prog_file().data(),
config.prog_file().data(), config.prog_file().size(), config.prog_file().size(),
config.params_file().data(), config.params_file().size()); config.params_file().data(),
config.params_file().size());
} else { } else {
return paddle2onnx::IsExportable(config.prog_file().c_str(), return paddle2onnx::IsExportable(config.prog_file().c_str(),
config.params_file().c_str()); config.params_file().c_str());
...@@ -98,12 +101,17 @@ bool ONNXRuntimePredictor::Init() { ...@@ -98,12 +101,17 @@ bool ONNXRuntimePredictor::Init() {
char *onnx_proto = nullptr; char *onnx_proto = nullptr;
int out_size; int out_size;
if (config_.model_from_memory()) { if (config_.model_from_memory()) {
paddle2onnx::Export(config_.prog_file().data(), config_.prog_file().size(), paddle2onnx::Export(config_.prog_file().data(),
config_.prog_file().size(),
config_.params_file().data(), config_.params_file().data(),
config_.params_file().size(), &onnx_proto, &out_size); config_.params_file().size(),
&onnx_proto,
&out_size);
} else { } else {
paddle2onnx::Export(config_.prog_file().c_str(), paddle2onnx::Export(config_.prog_file().c_str(),
config_.params_file().c_str(), &onnx_proto, &out_size); config_.params_file().c_str(),
&onnx_proto,
&out_size);
} }
Ort::SessionOptions session_options; Ort::SessionOptions session_options;
...@@ -134,8 +142,8 @@ bool ONNXRuntimePredictor::Init() { ...@@ -134,8 +142,8 @@ bool ONNXRuntimePredictor::Init() {
session_ = {env_, onnx_proto, static_cast<size_t>(out_size), session_options}; session_ = {env_, onnx_proto, static_cast<size_t>(out_size), session_options};
binding_ = std::make_shared<Ort::IoBinding>(session_); binding_ = std::make_shared<Ort::IoBinding>(session_);
Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, Ort::MemoryInfo memory_info(
place_.GetDeviceId(), OrtMemTypeDefault); device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault);
Ort::Allocator allocator(session_, memory_info); Ort::Allocator allocator(session_, memory_info);
size_t n_inputs = session_.GetInputCount(); size_t n_inputs = session_.GetInputCount();
...@@ -160,8 +168,10 @@ bool ONNXRuntimePredictor::Init() { ...@@ -160,8 +168,10 @@ bool ONNXRuntimePredictor::Init() {
type_info.GetTensorTypeAndShapeInfo().GetElementType(); type_info.GetTensorTypeAndShapeInfo().GetElementType();
output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type});
Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, Ort::MemoryInfo out_memory_info(device_name,
place_.GetDeviceId(), OrtMemTypeDefault); OrtDeviceAllocator,
place_.GetDeviceId(),
OrtMemTypeDefault);
binding_->BindOutput(output_name, out_memory_info); binding_->BindOutput(output_name, out_memory_info);
allocator.Free(output_name); allocator.Free(output_name);
...@@ -181,7 +191,8 @@ CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>( ...@@ -181,7 +191,8 @@ CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>(
} }
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
config.is_valid(), true, config.is_valid(),
true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Note: Each config can only be used for one predictor.")); "Note: Each config can only be used for one predictor."));
...@@ -238,7 +249,8 @@ bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name, ...@@ -238,7 +249,8 @@ bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name,
std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor( std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
const std::string &name) { const std::string &name) {
PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true, PADDLE_ENFORCE_EQ(FindONNXDesc(name, true),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"The in variable named %s is not found in the " "The in variable named %s is not found in the "
"ONNXPredictor.", "ONNXPredictor.",
...@@ -254,12 +266,21 @@ std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor( ...@@ -254,12 +266,21 @@ std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
} }
res->SetOrtMark(true); res->SetOrtMark(true);
res->SetOrtBinding(binding_); res->SetOrtBinding(binding_);
auto iter = input_buffers_.find(name);
if (iter == input_buffers_.end()) {
std::vector<int8_t> i_vector;
input_buffers_[name] = std::make_shared<std::vector<int8_t>>(i_vector);
res->SetOrtBuffer(input_buffers_[name]);
} else {
res->SetOrtBuffer(iter->second);
}
return res; return res;
} }
std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor( std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
const std::string &name) { const std::string &name) {
PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true, PADDLE_ENFORCE_EQ(FindONNXDesc(name, false),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"The out variable named %s is not found in the " "The out variable named %s is not found in the "
"ONNXPredictor.", "ONNXPredictor.",
...@@ -296,8 +317,10 @@ bool ONNXRuntimePredictor::ZeroCopyRun() { ...@@ -296,8 +317,10 @@ bool ONNXRuntimePredictor::ZeroCopyRun() {
try { try {
const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda";
for (auto output : output_desc_) { for (auto output : output_desc_) {
Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, Ort::MemoryInfo out_memory_info(device_name,
place_.GetDeviceId(), OrtMemTypeDefault); OrtDeviceAllocator,
place_.GetDeviceId(),
OrtMemTypeDefault);
binding_->BindOutput(output.name.c_str(), out_memory_info); binding_->BindOutput(output.name.c_str(), out_memory_info);
} }
session_.Run({}, *(binding_.get())); session_.Run({}, *(binding_.get()));
...@@ -330,8 +353,9 @@ const void *ONNXRuntimePredictor::GetDeviceContexts() const { ...@@ -330,8 +353,9 @@ const void *ONNXRuntimePredictor::GetDeviceContexts() const {
paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
const auto &dev_ctxs = pool.device_contexts(); const auto &dev_ctxs = pool.device_contexts();
return &const_cast<std::map< return &const_cast<
phi::Place, std::shared_future<std::unique_ptr<phi::DeviceContext>>> &>( std::map<phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> &>(
dev_ctxs); dev_ctxs);
} }
......
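`GetInputTensor` is where the multiple-input bug is actually fixed: each input name now gets its own staging buffer from the new `input_buffers_` map, so binding a second input no longer overwrites the data of the first. The sketch below shows that per-name lookup with hypothetical names (`OrtPredictorSketch`, `BufferFor`); it is an illustration of the pattern, not the full `ONNXRuntimePredictor` class.

```cpp
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Simplified stand-in for the predictor side of the fix.
class OrtPredictorSketch {
 public:
  // Hand out one staging buffer per input name and reuse it across runs.
  std::shared_ptr<std::vector<int8_t>> BufferFor(const std::string& name) {
    auto iter = input_buffers_.find(name);
    if (iter != input_buffers_.end()) return iter->second;
    auto buffer = std::make_shared<std::vector<int8_t>>();
    input_buffers_[name] = buffer;  // predictor keeps every buffer alive
    return buffer;
  }

 private:
  // One buffer per input instead of a single buffer shared by all inputs.
  std::map<std::string, std::shared_ptr<std::vector<int8_t>>> input_buffers_;
};
```

Keeping the `shared_ptr` owners in the predictor and handing the tensors only `weak_ptr`s ties the buffers' lifetime to the predictor rather than to individual `ZeroCopyTensor` objects, which matches the header change from `std::vector<int8_t> buffer_` to `std::weak_ptr<std::vector<int8_t>> buffer_` further below.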
...@@ -202,6 +202,7 @@ class ONNXRuntimePredictor : public PaddlePredictor { ...@@ -202,6 +202,7 @@ class ONNXRuntimePredictor : public PaddlePredictor {
platform::Place place_; platform::Place place_;
std::vector<ONNXDesc> input_desc_; std::vector<ONNXDesc> input_desc_;
std::vector<ONNXDesc> output_desc_; std::vector<ONNXDesc> output_desc_;
std::map<std::string, std::shared_ptr<std::vector<int8_t>>> input_buffers_;
int predictor_id_; int predictor_id_;
// Some more detailed tests, they are made the friends of the predictor, so that // Some more detailed tests, they are made the friends of the predictor, so that
......
...@@ -110,7 +110,8 @@ class PD_INFER_DECL Tensor { ...@@ -110,7 +110,8 @@ class PD_INFER_DECL Tensor {
/// \param place The place of data. /// \param place The place of data.
/// \param layout The layout of data. Only NCHW is supported now. /// \param layout The layout of data. Only NCHW is supported now.
template <typename T> template <typename T>
void ShareExternalData(const T* data, const std::vector<int>& shape, void ShareExternalData(const T* data,
const std::vector<int>& shape,
PlaceType place, PlaceType place,
DataLayout layout = DataLayout::kNCHW); DataLayout layout = DataLayout::kNCHW);
...@@ -171,7 +172,9 @@ class PD_INFER_DECL Tensor { ...@@ -171,7 +172,9 @@ class PD_INFER_DECL Tensor {
void SetName(const std::string& name); void SetName(const std::string& name);
template <typename T> template <typename T>
void CopyToCpuImpl(T* data, void* stream = nullptr, CallbackFunc cb = nullptr, void CopyToCpuImpl(T* data,
void* stream = nullptr,
CallbackFunc cb = nullptr,
void* cb_params = nullptr) const; void* cb_params = nullptr) const;
std::string name_; std::string name_;
...@@ -188,7 +191,7 @@ class PD_INFER_DECL Tensor { ...@@ -188,7 +191,7 @@ class PD_INFER_DECL Tensor {
#ifdef PADDLE_WITH_ONNXRUNTIME #ifdef PADDLE_WITH_ONNXRUNTIME
bool is_ort_tensor_{false}; bool is_ort_tensor_{false};
std::vector<int64_t> shape_; std::vector<int64_t> shape_;
std::vector<int8_t> buffer_; std::weak_ptr<std::vector<int8_t>> buffer_;
std::weak_ptr<Ort::IoBinding> binding_; std::weak_ptr<Ort::IoBinding> binding_;
int idx_{-1}; int idx_{-1};
...@@ -196,6 +199,8 @@ class PD_INFER_DECL Tensor { ...@@ -196,6 +199,8 @@ class PD_INFER_DECL Tensor {
void SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding); void SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding);
void SetOrtBuffer(const std::shared_ptr<std::vector<int8_t>> buffer);
template <typename T> template <typename T>
void ORTCopyFromCpu(const T* data); void ORTCopyFromCpu(const T* data);
......
...@@ -384,12 +384,12 @@ if(WITH_PYTHON) ...@@ -384,12 +384,12 @@ if(WITH_PYTHON)
set(PADDLE2ONNX_PYBIND_OUT set(PADDLE2ONNX_PYBIND_OUT
${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.dylib) ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.dylib)
set(ONNXRUNTIME_PYBIND_OUT set(ONNXRUNTIME_PYBIND_OUT
${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.dylib) ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.1.10.0.dylib)
else() else()
set(PADDLE2ONNX_PYBIND_OUT set(PADDLE2ONNX_PYBIND_OUT
${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.so) ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.so)
set(ONNXRUNTIME_PYBIND_OUT set(ONNXRUNTIME_PYBIND_OUT
${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.so) ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.so.1.10.0)
endif() endif()
add_custom_command( add_custom_command(
......