Commit bcd67bdd authored by nhzlx

add assert for GetOutput

Parent 7382f986
@@ -139,11 +139,11 @@ class TRTConvertValidation {
     cudaStreamSynchronize(*engine_->stream());

     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
-    const size_t output_space_size = 200;
+    const size_t output_space_size = 2000;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       std::vector<float> fluid_out;
       std::vector<float> trt_out(output_space_size);
-      engine_->GetOutputInCPU(output, &trt_out[0]);
+      engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
       cudaStreamSynchronize(*engine_->stream());

       auto* var = scope_.FindVar(output);
......
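The validator over-allocates a 2000-float scratch buffer because it cannot know each op's output size up front; the new third argument lets the engine assert that the real output actually fits before copying. Downstream, the helper compares the TensorRT result against the fluid reference element by element — a minimal sketch of that check, assuming the usual tolerance pattern (the exact tolerance is not shown in this diff):

    // fluid_out holds the reference result fetched from scope_;
    // trt_out holds what GetOutputInCPU copied back.
    for (size_t i = 0; i < fluid_out.size(); ++i) {
      EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-5);
    }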
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0
@@ -149,7 +149,8 @@ void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
   return buffer(name).buffer;
 }

-void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst) {
+void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
+                                    size_t max_size) {
   // determine data size
   auto *output = TensorRTEngine::GetITensor(name);
   nvinfer1::Dims dims = output->getDimensions();
@@ -161,6 +162,7 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst) {
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
   PADDLE_ENFORCE_LE(dst_size, it->second);
+  PADDLE_ENFORCE_GE(max_size, dst_size);
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
   PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
@@ -168,7 +170,8 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst) {
                     0);
 }

-void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst) {
+void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
+                                    size_t max_size) {
   // determine data size
   auto *output = TensorRTEngine::GetITensor(name);
@@ -180,6 +183,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst) {
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
   PADDLE_ENFORCE_LE(dst_size, it->second);
+  PADDLE_ENFORCE_GE(max_size, dst_size);
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
......
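Both getters now apply the same guard before the copy: the engine-side output size dst_size must not exceed the caller-declared capacity max_size. A standalone sketch of that pattern, independent of PaddlePaddle's PADDLE_ENFORCE machinery (all names here are illustrative, not the engine's API):

    #include <cassert>
    #include <cstddef>
    #include <cuda_runtime.h>

    // Copy an engine output into a caller-provided buffer, refusing to
    // overrun it -- the same contract PADDLE_ENFORCE_GE(max_size, dst_size)
    // enforces above.
    void copy_output(const void* src_gpu, size_t dst_size, void* dst,
                     size_t max_size, cudaStream_t stream) {
      assert(src_gpu != nullptr);      // "buffer should be allocated before"
      assert(dst_size <= max_size);    // capacity check added by this commit
      cudaError_t err = cudaMemcpyAsync(dst, src_gpu, dst_size,
                                        cudaMemcpyDeviceToHost, stream);
      assert(err == cudaSuccess);
    }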
@@ -106,10 +106,10 @@ class TensorRTEngine : public EngineBase {
   // Return the output's GPU memory address without copy.
   void* GetOutputInGPU(const std::string& name);
   // Copy data into dst inside the GPU device.
-  void GetOutputInGPU(const std::string& name, void* dst);
+  void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
   // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
   // to CPU.
-  void GetOutputInCPU(const std::string& name, void* dst);
+  void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
   // Fill an ITensor into map itensor_map_.
   void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
   // Get an ITensor called name.
......
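With the header change, every consumer must state its buffer's capacity up front. A hypothetical caller (the engine pointer, tensor name, and element_count are assumptions for illustration); judging by the call sites below, the capacity is passed in bytes:

    // Size the host buffer from the known element count, then hand the
    // engine both the pointer and the buffer's capacity in bytes.
    std::vector<float> host_out(element_count);
    engine->GetOutputInCPU("y", host_out.data(),
                           host_out.size() * sizeof(float));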
@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) {
   LOG(INFO) << "to get output";
   float y_cpu;
-  engine_->GetOutputInCPU("y", &y_cpu);
+  engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));

   LOG(INFO) << "to checkout output";
   ASSERT_EQ(y_cpu, x_v * 2 + 3);
@@ -108,7 +108,7 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   ASSERT_EQ(dims.nbDims, 3);
   ASSERT_EQ(dims.d[0], 2);
   ASSERT_EQ(dims.d[1], 1);
-  engine_->GetOutputInCPU("y", &y_cpu[0]);
+  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
   ASSERT_EQ(y_cpu[0], 4.5);
   ASSERT_EQ(y_cpu[1], 14.5);
 }
@@ -141,7 +141,7 @@ TEST_F(TensorRTEngineTest, test_conv2d_temp) {
   LOG(INFO) << "to get output";
   float* y_cpu = new float[18];
-  engine_->GetOutputInCPU("y", &y_cpu[0]);
+  engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));

   ASSERT_EQ(y_cpu[0], 4.0);
   ASSERT_EQ(y_cpu[1], 6.0);
 }
......
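Each test now states the destination's capacity as element count × sizeof(float). A hedged sketch of the misuse the new assert is designed to catch — before this commit, fetching the 18-float conv2d output into an undersized buffer would silently write past its end; now the PADDLE_ENFORCE_GE in GetOutputInCPU fires instead:

    float small[2];  // hypothetical undersized destination
    // dst_size (18 floats) > max_size (2 floats): the engine now aborts
    // with an enforce failure rather than corrupting memory.
    engine_->GetOutputInCPU("y", &small[0], 2 * sizeof(float));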
@@ -100,8 +100,11 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       // tensor.
       // if (platform::is_cpu_place(fluid_t->place())) {
       // TODO(Superjomn) change this float to dtype size.
-      engine->GetOutputInCPU(
-          y, fluid_t->mutable_data<float>(platform::CPUPlace()));
+      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
+                  FLAGS_tensorrt_engine_batch_size;
+      engine->GetOutputInCPU(y,
+                             fluid_t->mutable_data<float>(platform::CPUPlace()),
+                             size * sizeof(float));
       //} else {
       //  engine->GetOutputInGPU(
       //      y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
......
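The kernel derives the bound from the output's shape: AccuDims folds dims.d[0..nbDims) into a per-sample element count, which is then scaled by the batch size and sizeof(float). A sketch of that arithmetic with a stand-in for AccuDims (the real helper lives in inference::analysis; this reimplementation is an assumption about its behavior):

    #include <cstddef>
    #include <functional>
    #include <numeric>

    // Fold an nvinfer1-style dims array into a total element count.
    size_t accu_dims(const int* d, int n) {
      return std::accumulate(d, d + n, static_cast<size_t>(1),
                             std::multiplies<size_t>());
    }

    // e.g. dims {2, 1, 3} at batch size 4:
    //   accu_dims -> 6 elements per sample, 24 in total, 96 bytes.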