Commit 89dcb0bd, authored by Luo Tao

refine EngineIOConverter, and use io_convert in test_trt_activation_op

Parent 9a98a572
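In summary: EngineInputConverter becomes EngineIOConverter, gaining a second operator() and a ConvertOutput entry point for the engine-to-Fluid direction; the registration macro moves inside the namespaces; and test_trt_activation_op now feeds and drains the engine through the converter instead of SetInputFromCPU/GetOutputInCPU. A minimal sketch of the resulting round trip (illustrative tensor and stream setup; assumes a built engine with bindings "X" and "Out" and the default converter registered):

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  framework::LoDTensor x, out;  // x filled and sized to the "X" binding,
                                // out sized to the "Out" binding
  size_t size = x.memory_size();
  // Fluid LoDTensor -> TensorRT device buffer.
  EngineIOConverter::ConvertInput("relu", x, engine->buffer("X"), size,
                                  &stream);
  engine->Execute(1);
  // TensorRT device buffer -> Fluid LoDTensor.
  EngineIOConverter::ConvertOutput("relu", engine->buffer("Out"), &out, size,
                                   &stream);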
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
 nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
-nv_test(test_trt_activation_op SRCS test_activation_op.cc ${ENGINE_FILE} activation_op.cc
+nv_test(test_trt_activation_op SRCS test_activation_op.cc io_converter.cc ${ENGINE_FILE} activation_op.cc
         DEPS ${FLUID_CORE_MODULES} activation_op)
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
paddle/fluid/inference/tensorrt/convert/io_converter.cc
@@ -23,26 +23,42 @@ namespace tensorrt {
 using platform::is_gpu_place;
 using platform::is_cpu_place;
 
-class DefaultInputConverter : public EngineInputConverter {
+class DefaultIOConverter : public EngineIOConverter {
  public:
-  DefaultInputConverter() {}
+  DefaultIOConverter() {}
   // NOTE out is GPU memory.
   virtual void operator()(const LoDTensor& in, void* out,
                           size_t max_size) override {
     PADDLE_ENFORCE(out != nullptr);
-    PADDLE_ENFORCE_LE(in.memory_size(), max_size);
+    PADDLE_ENFORCE(stream_ != nullptr);
     const auto& place = in.place();
+    size_t size = in.memory_size();
+    PADDLE_ENFORCE_LE(size, max_size);
     if (is_cpu_place(place)) {
-      PADDLE_ENFORCE(stream_ != nullptr);
-      PADDLE_ENFORCE_EQ(0,
-                        cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
-                                        cudaMemcpyHostToDevice, *stream_));
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
+                                           cudaMemcpyHostToDevice, *stream_));
     } else if (is_gpu_place(place)) {
-      PADDLE_ENFORCE_EQ(0,
-                        cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
-                                        cudaMemcpyHostToHost, *stream_));
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
+                                           cudaMemcpyHostToHost, *stream_));
+    } else {
+      PADDLE_THROW("Unknown device for converter");
+    }
+    cudaStreamSynchronize(*stream_);
+  }
+  // NOTE in is GPU memory.
+  virtual void operator()(const void* in, LoDTensor* out,
+                          size_t max_size) override {
+    PADDLE_ENFORCE(in != nullptr);
+    PADDLE_ENFORCE(stream_ != nullptr);
+    const auto& place = out->place();
+    size_t size = out->memory_size();
+    PADDLE_ENFORCE_LE(size, max_size);
+    if (is_cpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
+                                           cudaMemcpyDeviceToHost, *stream_));
+    } else if (is_gpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
+                                           cudaMemcpyHostToHost, *stream_));
     } else {
       PADDLE_THROW("Unknown device for converter");
     }
@@ -50,7 +66,7 @@ class DefaultInputConverter : public EngineInputConverter {
   }
 };
 
-REGISTER_TENSORRT_INPUT_CONVERTER(default, DefaultInputConverter);
+REGISTER_TENSORRT_IO_CONVERTER(default, DefaultIOConverter);
 
 }  // namespace tensorrt
 }  // namespace inference
paddle/fluid/inference/tensorrt/convert/io_converter.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
+#include <string>
 #include <unordered_map>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/utils/singleton.h"
@@ -25,43 +26,57 @@ namespace tensorrt {
 using framework::LoDTensor;
 
 /*
- * Convert Input from Fluid to an Engine.
- * TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in
- * most cases just need to copy the data.
+ * Convert Input from Fluid to TensorRT Engine.
+ * Convert Output from TensorRT Engine to Fluid.
+ *
+ * Note that TensorRT's ITensor follows row major, NCHW. Fluid is also row
+ * major, so in the default case just need to copy the data.
  */
-class EngineInputConverter {
+class EngineIOConverter {
  public:
-  EngineInputConverter() {}
+  EngineIOConverter() {}
 
   virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
+  virtual void operator()(const void* in, LoDTensor* out, size_t max_size) {}
 
   void SetStream(cudaStream_t* stream) { stream_ = stream; }
 
-  static void Run(const std::string& in_op_type, const LoDTensor& in, void* out,
-                  size_t max_size, cudaStream_t* stream) {
+  static void ConvertInput(const std::string& op_type, const LoDTensor& in,
+                           void* out, size_t max_size, cudaStream_t* stream) {
     PADDLE_ENFORCE(stream != nullptr);
-    auto* converter = Registry<EngineInputConverter>::Lookup(
-        in_op_type, "default" /* default_type */);
+    auto* converter = Registry<EngineIOConverter>::Lookup(
+        op_type, "default" /* default_type */);
     PADDLE_ENFORCE_NOT_NULL(converter);
     converter->SetStream(stream);
     (*converter)(in, out, max_size);
   }
 
-  virtual ~EngineInputConverter() {}
+  static void ConvertOutput(const std::string& op_type, const void* in,
+                            LoDTensor* out, size_t max_size,
+                            cudaStream_t* stream) {
+    PADDLE_ENFORCE(stream != nullptr);
+    auto* converter = Registry<EngineIOConverter>::Lookup(
+        op_type, "default" /* default_type */);
+    PADDLE_ENFORCE_NOT_NULL(converter);
+    converter->SetStream(stream);
+    (*converter)(in, out, max_size);
+  }
+
+  virtual ~EngineIOConverter() {}
 
  protected:
   cudaStream_t* stream_{nullptr};
 };
 
+#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__)        \
+  struct trt_io_##op_type__##_converter {                             \
+    trt_io_##op_type__##_converter() {                                \
+      Registry<EngineIOConverter>::Register<Converter__>(#op_type__); \
+    }                                                                 \
+  };                                                                  \
+  trt_io_##op_type__##_converter trt_io_##op_type__##_converter__;
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
-
-#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__) \
-  struct trt_input_##in_op_type__##_converter {                      \
-    trt_input_##in_op_type__##_converter() {                         \
-      ::paddle::inference::Registry<EngineInputConverter>::Register< \
-          Converter__>(#in_op_type__);                               \
-    }                                                                \
-  };                                                                 \
-  trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__;
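For context, a hypothetical sketch of how an op-specific converter would plug into this registry (SoftmaxIOConverter and the softmax op_type are illustrative, not part of this commit); ops whose layout already matches TensorRT's row-major NCHW simply fall back to the default converter:

  // Hypothetical converter for one op_type; only ops whose memory layout
  // differs from a plain row-major copy need their own.
  class SoftmaxIOConverter : public EngineIOConverter {
   public:
    void operator()(const LoDTensor& in, void* out, size_t max_size) override {
      // op-specific Fluid -> TensorRT packing would go here
    }
    void operator()(const void* in, LoDTensor* out, size_t max_size) override {
      // op-specific TensorRT -> Fluid unpacking would go here
    }
  };
  // Instantiates a static registrar object, after which
  // Registry<EngineIOConverter>::Lookup("softmax", "default") returns it.
  REGISTER_TENSORRT_IO_CONVERTER(softmax, SoftmaxIOConverter);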
paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
@@ -26,7 +27,7 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-void Compare(float input, float expect) {
+void Compare(const std::string op_type, float input, float expect) {
   framework::Scope scope;
   platform::CUDAPlace place;
   platform::CUDADeviceContext ctx(place);
@@ -35,6 +36,7 @@ void Compare(float input, float expect) {
   auto x_var = scope.Var("X");
   auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
   x_tensor->Resize({1, 1});
+  x_tensor->mutable_data<float>(place);
   std::vector<float> init;
   init.push_back(input);
   framework::TensorFromVector(init, ctx, x_tensor);
@@ -45,14 +47,15 @@ void Compare(float input, float expect) {
   out_tensor->mutable_data<float>(place);
 
   framework::OpDesc op_desc;
-  op_desc.SetType("relu");
+  op_desc.SetType(op_type);
   op_desc.SetInput("X", {"X"});
   op_desc.SetOutput("Out", {"Out"});
 
-  auto relu_op = framework::OpRegistry::CreateOp(op_desc);
+  auto op = framework::OpRegistry::CreateOp(op_desc);
 
   // run fluid op
-  relu_op->Run(scope, place);
+  op->Run(scope, place);
+  // get fluid output
   std::vector<float> out1;
   framework::TensorToVector(*out_tensor, ctx, &out1);
 
@@ -63,21 +66,27 @@ void Compare(float input, float expect) {
   engine->InitNetwork();
   engine->DeclareInput("X", nvinfer1::DataType::kFLOAT,
                        nvinfer1::DimsCHW{1, 1, 1});
-
+  // convert op
   OpConverter op_converter;
   op_converter.ConvertOp(op_desc, engine);
 
   engine->DeclareOutput("Out");
   engine->FreezeNetwork();
-  engine->SetInputFromCPU("X", &input, 1 * sizeof(float));
 
-  // run tensorrt op
+  // convert LoDTensor to ITensor
+  size_t size = x_tensor->memory_size();
+  EngineIOConverter::ConvertInput(op_type, *x_tensor, engine->buffer("X"),
+                                  size, &stream);
+  // run tensorrt op
   engine->Execute(1);
 
-  float out2;
-  engine->GetOutputInCPU("Out", &out2, 1 * sizeof(float));
-
-  ASSERT_EQ(out1[0], out2);
+  // convert ITensor to LoDTensor
+  EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out"), out_tensor,
+                                   size, &stream);
+  // get tensorrt output
+  std::vector<float> out2;
+  framework::TensorToVector(*out_tensor, ctx, &out2);
+  // compare
+  ASSERT_EQ(out1[0], out2[0]);
   ASSERT_EQ(out1[0], expect);
 
   delete engine;
@@ -85,8 +94,8 @@ void Compare(float input, float expect) {
 }
 
 TEST(OpConverter, ConvertRelu) {
-  Compare(1, 1);   // relu(1) = 1
-  Compare(-5, 0);  // relu(-5) = 0
+  Compare("relu", 1, 1);   // relu(1) = 1
+  Compare("relu", -5, 0);  // relu(-5) = 0
 }
 
 }  // namespace tensorrt
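Because Compare is now parameterized on the op type, the same harness extends to any activation whose converter is registered; a hypothetical follow-up (no sigmoid converter exists in this commit):

  TEST(OpConverter, ConvertSigmoid) {
    Compare("sigmoid", 0, 0.5);  // sigmoid(0) = 0.5
  }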
paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
@@ -12,40 +12,63 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gtest/gtest.h>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
 
-#include <gtest/gtest.h>
-
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-class EngineInputConverterTester : public ::testing::Test {
- public:
-  void SetUp() override { tensor.Resize({10, 10}); }
-
-  framework::LoDTensor tensor;
-};
+void IOConverterTester(const platform::DeviceContext& ctx) {
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
 
-TEST_F(EngineInputConverterTester, DefaultCPU) {
-  void* buffer;
-  tensor.mutable_data<float>(platform::CPUPlace());
-  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
+  // init fluid in_tensor
+  framework::LoDTensor in_tensor;
+  in_tensor.Resize({10, 10});
+  auto place = ctx.GetPlace();
+  in_tensor.mutable_data<float>(place);
+  std::vector<float> init;
+  for (int64_t i = 0; i < 10 * 10; ++i) {
+    init.push_back(i);
+  }
+  framework::TensorFromVector(init, ctx, &in_tensor);
 
-  cudaStream_t stream;
-  EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(),
-                            &stream);
-}
+  // init tensorrt buffer
+  void* buffer;
+  size_t size = in_tensor.memory_size();
+  ASSERT_EQ(cudaMalloc(&buffer, size), 0);
 
-TEST_F(EngineInputConverterTester, DefaultGPU) {
-  void* buffer;
-  tensor.mutable_data<float>(platform::CUDAPlace());
-  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
+  // convert fluid in_tensor to tensorrt buffer
+  EngineIOConverter::ConvertInput("test", in_tensor, buffer, size, &stream);
 
-  cudaStream_t stream;
-  EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(),
-                            &stream);
+  // convert tensorrt buffer to fluid out_tensor
+  framework::LoDTensor out_tensor;
+  out_tensor.Resize({10, 10});
+  out_tensor.mutable_data<float>(place);
+  EngineIOConverter::ConvertOutput("test", buffer, &out_tensor, size, &stream);
+
+  // compare in_tensor and out_tensor
+  std::vector<float> result;
+  framework::TensorToVector(out_tensor, ctx, &result);
+  EXPECT_EQ(init.size(), result.size());
+  for (size_t i = 0; i < init.size(); i++) {
+    EXPECT_EQ(init[i], result[i]);
+  }
+  cudaStreamDestroy(stream);
+}
+
+TEST(EngineIOConverterTester, DefaultCPU) {
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  IOConverterTester(ctx);
+}
+
+TEST(EngineIOConverterTester, DefaultGPU) {
+  platform::CUDAPlace place;
+  platform::CUDADeviceContext ctx(place);
+  IOConverterTester(ctx);
 }
 
 }  // namespace tensorrt
paddle/fluid/inference/tensorrt/engine.cc
@@ -138,7 +138,6 @@ void*& TensorRTEngine::buffer(const std::string& name) {
 
 void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
                                      size_t size) {
   void* buf = buffer(name);
-  cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_);
   PADDLE_ENFORCE_EQ(
       0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_));
 }
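This hunk fixes a double copy: the old code issued cudaMemcpyAsync once unchecked and then a second time inside PADDLE_ENFORCE_EQ, transferring the data twice and discarding the first call's error code. The surviving form runs the CUDA call exactly once, inside the check, matching the idiom used in io_converter.cc above:

  // Enqueue the copy once; a non-zero cudaError_t now fails fast
  // instead of being silently dropped.
  PADDLE_ENFORCE_EQ(
      0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_));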