Unverified commit 51ecd933, authored by Yuanle Liu, committed by GitHub

[Inference] clean unused code/targets to reduce inference .so volume (PART I) (#53762)

* remove prelu and lookup_table plugin, adjust .h include location

* clean code and adjust some .h

* update
Parent commit db407bf0
@@ -107,11 +107,7 @@ void AttentionLSTMFusePass::FindWhileOp(Graph* graph) const {
   gpd.mutable_pattern()->NewNode(
       [&](Node* n) { return fused_external_ops.count(n->id()); }, "while");
-  if (!graph->Has(kGraphvizMarkedNodeAttr)) {
-    graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
-  }
-  auto& marked_nodes =
-      graph->Get<GraphVizPass::marked_nodes_t>(kGraphvizMarkedNodeAttr);
+  auto& marked_nodes = GetMarkedNodes(graph);
   auto handle = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
......
@@ -14,6 +14,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include <fstream>
+#include <ostream>
 #include <string>
 #include "paddle/fluid/framework/ir/graph_helper.h"
......
@@ -14,12 +14,7 @@ limitations under the License. */
 #pragma once
-#include <fstream>
-#include <map>
-#include <memory>
-#include <string>
 #include <unordered_set>
-#include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
......
@@ -15,7 +15,6 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <string>
-#include <vector>
 #include "paddle/fluid/inference/analysis/passes/passes.h"
 #include "paddle/fluid/string/pretty_log.h"
......
@@ -35,12 +35,10 @@ limitations under the License. */
  * phase in the inference service.
  */
-#include <string>
 #include <vector>
 #include "gflags/gflags.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/flags.h"
 namespace paddle {
 namespace inference {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace inference {
namespace analysis {
enum class Device { CPU, GPU };
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <memory>
#include "paddle/fluid/inference/analysis/dot.h"
namespace paddle {
namespace inference {
namespace analysis {
class DotTester : public ::testing::Test {
protected:
void SetUp() override {
std::vector<Dot::Attr> attrs({{"title", "hello"}});
dot.reset(new Dot(attrs));
dot->AddNode("a", {Dot::Attr{"shape", "box"}, Dot::Attr("color", "blue")});
dot->AddNode("b", {});
dot->AddNode("c", {});
dot->AddEdge("a", "b", {});
dot->AddEdge("b", "c", {});
dot->AddEdge("a", "c", {});
}
std::unique_ptr<Dot> dot;
};
TEST_F(DotTester, Build) {
auto codes = dot->Build();
// Output the DOT language code; the generated code is too long to compare
// as a string.
//
// The output is
//
// digraph G {
// title="hello"
// node_1
// node_2
// node_0[label="a" shape="box" color="blue"]
// node_0->node_1
// node_1->node_2
// node_0->node_2
// } // end G
LOG(INFO) << '\n' << codes;
}
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "gflags/gflags.h"
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
DECLARE_string(IA_graphviz_log_root);
DECLARE_string(IA_output_storage_path);
DECLARE_bool(IA_enable_ir);
@@ -28,6 +28,7 @@
 #include "paddle/fluid/framework/ir/subgraph_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
 #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
......
@@ -21,7 +21,6 @@
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 namespace paddle {
......
@@ -14,16 +14,9 @@
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
-#include <memory>
-#include <utility>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h"
-#endif
 namespace paddle {
 namespace inference {
 namespace analysis {
......
@@ -33,10 +33,6 @@ class IrAnalysisPass : public AnalysisPass {
   void CollectFusionStatis(Argument* argument);
-  void ReadCalibrationInfo(
-      Argument* argument,
-      std::unordered_map<std::string, std::vector<float>>* var_quant_scales);
   std::string repr() const override;
 };
......
@@ -14,7 +14,6 @@
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
-#include <memory>
 #include <string>
 #include "paddle/fluid/framework/executor.h"
......
@@ -17,15 +17,15 @@
 #include <cstdlib>
 #include <string>
 #include <unordered_set>
-#include <vector>
-#include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/phi/common/data_type.h"
-#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/fluid/platform/place.h"
 DEFINE_bool(
     custom_model_save_cpu,
......
@@ -15,12 +15,9 @@
 #pragma once
 #include <string>
-#include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/platform/place.h"
 namespace paddle {
 namespace inference {
......
@@ -15,7 +15,9 @@
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include <string>
+#include <unordered_set>
 #include <utility>
+#include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
......
@@ -15,12 +15,9 @@
 #pragma once
 #include <string>
 #include <unordered_map>
-#include <unordered_set>
 #include <utility>
-#include <vector>
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/phi/backends/dynload/port.h"
 namespace paddle {
 namespace framework {
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
if(APPLE)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif()
......
# Embed Paddle Inference in Your Application
Paddle Inference offers APIs in both C and C++.
You can easily deploy a model trained by Paddle by following the steps below:
1. Optimize the native model;
2. Write some code for deployment.
## The APIs
All the released APIs are located in the `paddle_inference_api.h` header file.
The stable APIs are wrapped in `namespace paddle`; the unstable ones are kept in `namespace paddle::contrib`.
## Write some code
Read `paddle_inference_api.h` for more information.
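As an illustration of step 2, here is a minimal sketch of a deployment program against the native engine; the model directory and the input name `"x"` are placeholders rather than values taken from these docs.
```c++
#include <vector>

#include "paddle_inference_api.h"

int main() {
  // Configure a native (CPU) predictor; "path/to/model" is a placeholder.
  paddle::NativeConfig config;
  config.model_dir = "path/to/model";
  config.use_gpu = false;

  // The factory returns the predictor; the caller owns it.
  auto predictor = paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);

  // Wrap one int64 input of shape [4, 1]; a PaddleBuf constructed this way
  // only references the external buffer `data`.
  int64_t data[4] = {1, 2, 3, 4};
  paddle::PaddleTensor input;
  input.name = "x";
  input.shape = std::vector<int>({4, 1});
  input.data = paddle::PaddleBuf(data, sizeof(data));
  input.dtype = paddle::PaddleDType::INT64;

  // Run inference and collect the outputs.
  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({input}, &outputs);
  return 0;
}
```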
# Inference High-level APIs
This document describes the high-level inference APIs, which one can use to quickly deploy a Paddle model in an application.
The APIs are declared in a single header, `paddle_inference_api.h`; two libraries, `libpaddle_inference.so` and `libpaddle_inference_io.so`, are needed for deployment.
## PaddleTensor
We provide the `PaddleTensor` data structure to give a general tensor interface.
The definition is
```c++
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
```
The data is stored in a contiguous block of memory, `PaddleBuf`, and a `PaddleDType` specifies the tensor's data type.
The `name` field specifies the name of an input variable,
which is important when there are multiple inputs and you need to distinguish which variable to set.
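For illustration, a small sketch of filling a `PaddleTensor`; the variable name `"x"`, the shape, and the use of `FLOAT32` are placeholders for whatever the deployed model actually expects.
```c++
#include <vector>

#include "paddle_inference_api.h"

// Wrap four float values as an input tensor of shape [2, 2]. A PaddleBuf
// constructed from an external pointer only references that memory, so
// `buffer` must stay alive for as long as the tensor is used.
paddle::PaddleTensor MakeInput(float* buffer) {
  paddle::PaddleTensor tensor;
  tensor.name = "x";  // placeholder input variable name
  tensor.shape = std::vector<int>({2, 2});
  tensor.data = paddle::PaddleBuf(buffer, 4 * sizeof(float));
  tensor.dtype = paddle::PaddleDType::FLOAT32;
  return tensor;
}
```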
## engine
The inference APIs have two different underlying engines:
- the native engine
- the tensorrt engine
The native engine, which consists of the native operators and framework, takes a native Paddle model
as input and supports any model trained by Paddle.
```c++
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops.
};
```
## PaddlePredictor and how to create one
The main interface is `PaddlePredictor`, which has the following methods:
- `bool Run(const std::vector<PaddleTensor>& inputs, std::vector<PaddleTensor>* output_data)`
  - takes the inputs and fills `output_data` with the prediction results.
- `Clone`, which clones a predictor from an existing one, with the model parameters shared.
There is a factory method to create a predictor, and the user takes ownership of the returned object.
```c++
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
```
By specifying the engine kind and config, one can get a specific implementation.
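A hedged sketch of engine selection and cloning follows; the config fields mirror the native-engine example elsewhere in these docs, and whether the TensorRT mixed engine accepts the same config type is not specified here.
```c++
#include <vector>

#include "paddle_inference_api.h"

void Demo(const std::vector<paddle::PaddleTensor>& inputs) {
  paddle::NativeConfig config;
  config.model_dir = "path/to/model";  // placeholder
  config.use_gpu = false;

  // Explicitly request the native engine; kNative is also the default value
  // of the second template parameter.
  auto predictor =
      paddle::CreatePaddlePredictor<paddle::NativeConfig,
                                    paddle::PaddleEngineKind::kNative>(config);

  // Switching the second template argument to
  // paddle::PaddleEngineKind::kAutoMixedTensorRT selects the TensorRT mixed
  // engine; the config type it expects may differ and is not covered here.

  // Clone creates a second predictor that shares the model parameters with
  // the original one.
  auto cloned = predictor->Clone();

  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run(inputs, &outputs);
  cloned->Run(inputs, &outputs);
}
```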
## Reference
- [paddle_inference_api.h](./paddle_inference_api.h)
- [some demos](./demo_ci)
# Paddle Inference API
To make inference deployment simpler, Fluid provides a set of high-level APIs that hide the different underlying optimized implementations.
The inference library contains:
- the header file `paddle_inference_api.h`, which defines all the interfaces
- the library files `libpaddle_inference.so/.a` (Linux/Mac) and `libpaddle_inference.lib/paddle_inference.dll` (Windows)
The main API concepts are introduced in detail below.
## PaddleTensor
`PaddleTensor` defines the basic input/output data format for inference. Its definition is
```c++
struct PaddleTensor {
  std::string name;  // variable name.
  std::vector<int> shape;
  PaddleBuf data;  // blob of data.
  PaddleDType dtype;
};
```
- `name` specifies the name of the corresponding variable in the model (currently unused, but it will be enabled once arbitrary targets are supported)
- `shape` is the shape of the tensor
- `data` holds the data in a contiguous memory block, `PaddleBuf`, which can either wrap external data or `malloc` its own memory; see the related definitions in the header file for details
- `dtype` is the data type of the tensor
## engine
The high-level API is backed by several optimized implementations, which we call engines. There are currently two engines:
- the native engine, composed of Paddle's native forward operators, which naturally supports any model trained by Paddle
- the TensorRT mixed engine, which integrates [TensorRT](https://developer.nvidia.com/tensorrt) at the subgraph level; it supports all Paddle models and automatically offloads parts of the computation graph to TensorRT for acceleration (WIP)
They are selected via
```c++
enum class PaddleEngineKind {
  kNative = 0,        // Use the native Fluid facility.
  kAutoMixedTensorRT  // Automatically mixing TensorRT with the Fluid ops.
};
```
## Inference deployment workflow
Overall, deployment consists of the following steps:
1. Create a `PaddlePredictor` with a suitable config;
2. Create the input `PaddleTensor`s and pass them to the `PaddlePredictor`;
3. Fetch the output `PaddleTensor`s and read out the results.
The following demonstrates a complete run with a simple model; some detail code is omitted.
```c++
#include "paddle_inference_api.h"
// Create a config and set the relevant options
paddle::NativeConfig config;
config.model_dir = "xxx";
config.use_gpu = false;
// Create a native PaddlePredictor
auto predictor =
    paddle::CreatePaddlePredictor<paddle::NativeConfig, paddle::PaddleEngineKind::kNative>(config);
// Create the input tensor
int64_t data[4] = {1, 2, 3, 4};
paddle::PaddleTensor tensor{.name = "",
                            .shape = std::vector<int>({4, 1}),
                            .data = paddle::PaddleBuf(data, sizeof(data)),
                            .dtype = paddle::PaddleDType::INT64};
// Create the output tensors; their memory can be reused across runs
std::vector<paddle::PaddleTensor> outputs;
// Run inference
CHECK(predictor->Run({tensor}, &outputs));
// Read the outputs ...
```
At build time, link against `libpaddle_inference.a/.so` (Linux/Mac) or `libpaddle_inference.lib/paddle_inference.dll` (Windows).
## Detailed code references
- [inference demos](./demo_ci)
- [single-threaded and multi-threaded examples](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/test_api_impl.cc)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
namespace paddle {
namespace inference {
struct Buffer;
enum class DeviceType { UNK = -1, CPU, GPU };
/*
* EngineBase is the base class of all inference engines. An inference engine
* takes a Paddle program as input, and outputs the result in fluid Tensor
* format. It can be used to optimize the performance of computation sub-blocks,
* for example, by breaking the original block down into sub-blocks and
* executing each sub-block in a different engine.
*
* For example:
* At inference time, most of the ResNet50 model can be put into a subgraph and
* run on a TensorRT engine.
*
* There are several engines such as TensorRT and other frameworks, so an
* EngineBase is put forward to give a unified interface for all the
* different engine implementations.
*/
class EngineBase {
public:
using DescType = ::paddle::framework::proto::BlockDesc;
// Build the model and do some preparation, for example, in TensorRT, run
// createInferBuilder, buildCudaEngine.
virtual void Build(const DescType& paddle_model) = 0;
// Execute the engine, that will run the inference network.
virtual void Execute(int batch_size) = 0;
virtual ~EngineBase() {}
}; // class EngineBase
struct Buffer {
void* buffer{nullptr}; // buffer should be allocated only once.
size_t max_size; // buffer allocated space.
size_t size; // data size.
DeviceType device{DeviceType::UNK}; // tells which device this buffer is on.
};
} // namespace inference
} // namespace paddle
@@ -99,7 +99,6 @@ list(
     merge_layernorm_op.cc
     skip_merge_layernorm_op.cc
     generic_and_custom_plugin_creater.cc
-    fused_lookup_tables_op.cc
     lookup_table_op.cc
     elementwiseadd_transpose_op.cc
     skip_groupnorm_act_op.cc
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/utils.h"
#include "paddle/fluid/inference/tensorrt/plugin/lookup_table.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class FusedLookupTablesOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope,
bool test_mode) override {
if (!engine_->with_dynamic_shape()) {
PADDLE_THROW(
platform::errors::Fatal("lookup_table_op must with dynamic shape"));
}
framework::OpDesc op_desc(op, nullptr);
auto ids_name = op_desc.Input("Ids").front();
auto w_name = op_desc.Input("W").front();
auto output_name = op_desc.Output("Out").front();
bool enable_int8 = op_desc.HasAttr("enable_int8");
std::vector<nvinfer1::ITensor*> plugin_inputs;
auto ids_dims = engine_->GetITensor(ids_name)->getDimensions();
if (ids_dims.d[ids_dims.nbDims - 1] == 1) {
nvinfer1::Dims new_ids_dims;
new_ids_dims.nbDims = ids_dims.nbDims - 1;
for (int i = 0; i < ids_dims.nbDims - 1; i++) {
new_ids_dims.d[i] = 0;
}
auto* reshape_layer = TRT_ENGINE_ADD_LAYER(
engine_, Shuffle, *(engine_->GetITensor(ids_name)));
reshape_layer->setReshapeDimensions(new_ids_dims);
reshape_layer->setName(
("lookup_table: Shuffle (Output: " + output_name + ")").c_str());
plugin_inputs.push_back(reshape_layer->getOutput(0));
} else {
plugin_inputs.push_back(engine_->GetITensor(ids_name));
}
TensorRTEngine::Weight weight;
auto* w_var = scope.FindVar(w_name);
auto* w_tensor = w_var->GetMutable<phi::DenseTensor>();
auto w_dims = w_tensor->dims();
weight = engine_->GetTrtWeight(w_name, *w_tensor);
auto weight_size = phi::product(w_dims);
int output_fp16;
if (engine_->precision() == AnalysisConfig::Precision::kFloat32) {
output_fp16 = 0;
} else {
output_fp16 = 1;
}
int32_t weight_width = static_cast<int32_t>(w_dims[1]);
std::vector<nvinfer1::PluginField> fields;
fields.emplace_back("lookup_table_weight",
weight.get().values,
GetPluginFieldType(weight.get().type),
static_cast<int32_t>(weight_size));
fields.emplace_back("lookup_table_weight_width",
&weight_width,
nvinfer1::PluginFieldType::kINT32,
1);
fields.emplace_back(
"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1);
nvinfer1::PluginFieldCollection* plugin_ptr =
static_cast<nvinfer1::PluginFieldCollection*>(
malloc(sizeof(*plugin_ptr) +
fields.size() * sizeof(nvinfer1::PluginField)));
plugin_ptr->nbFields = static_cast<int>(fields.size());
plugin_ptr->fields = fields.data();
auto creator =
GetPluginRegistry()->getPluginCreator("LookupTablePluginDynamic", "1");
auto plugin_obj =
creator->createPlugin("LookupTablePluginDynamic", plugin_ptr);
auto plugin_layer = engine_->network()->addPluginV2(
plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);
plugin_layer->setName(
("lookup_table: (Output: " + output_name + ")").c_str());
engine_->SetITensor(output_name, plugin_layer->getOutput(0));
free(plugin_ptr);
if (enable_int8) {
float out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), out_scale);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// NOTE(liuyuanle): We will remove the implementation here later. Ref to
// tensorrt/convert/lookup_table_op.cc.
// REGISTER_TRT_OP_CONVERTER(lookup_table, FusedLookupTablesOpConverter);
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
 namespace paddle {
 namespace inference {
......
@@ -31,7 +31,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
-#include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
......
@@ -4,7 +4,6 @@ list(
     trt_plugin.cc
     split_op_plugin.cu
     elementwise_op_plugin.cu
-    prelu_op_plugin.cu
     gelu_op_plugin.cu
     pool_op_plugin.cu
     swish_op_plugin.cu
@@ -40,7 +39,6 @@ list(
     preln_groupnorm_act_op_plugin.cu
     elementwiseadd_transpose_op_plugin.cu
     generic_plugin.cu
-    lookup_table.cu
     many_emb_layernorm_plugin.cu
     many_emb_layernorm_kernel.cu)
@@ -57,11 +55,7 @@ endif()
 nv_library(
   tensorrt_plugin
   SRCS ${TRT_FILES}
-  DEPS enforce
-       tensorrt_engine
-       prelu
-       tensor
-       bert_encoder_functor
+  DEPS enforce tensorrt_engine tensor bert_encoder_functor
        tensorrt_dynamic_shape_infermeta_factory
        tensorrt_plugin_arg_mapping_context)
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/inference/tensorrt/plugin/lookup_table.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
char const* PLUGINVERSION{"1"};
char const* LOOKUPTABLEPLUGINNAME{"LookupTablePluginDynamic"};
template <typename T, unsigned TPB>
__global__ void lookup_table_kernel(int weight_height,
int32_t const* inputIds,
T const* wordEmb,
int32_t const wordSize,
T* output) {
// 1. lookup word and token of the block
// blockIdx.x = position in the sequence
// blockIdx.y = batch
// gridDim.x = S
// gridDim.y = B
__shared__ int wordId;
int32_t const seqPos = blockIdx.x + blockIdx.y * gridDim.x;
if (threadIdx.x == 0) {
wordId = inputIds[seqPos];
}
__syncthreads();
// 2. load word embeddings and add them together
// offset into embeddings is given by wordId * hidden_size
int32_t const woffset = wordId * weight_height;
// the output offset is given by b * (S*hidden_size) + s * hidden_size
int32_t const outOffset = seqPos * weight_height;
if (wordId >= 0 && wordId < wordSize) {
for (int it = threadIdx.x; it < weight_height; it += TPB) {
T const w(wordEmb[woffset + it]);
output[outOffset + it] = w;
}
} else {
printf(
"Error!!!!!!(LookupTablePlugin): ID cannot be lookup "
"table: ID < 0 or ID > max ");
return;
}
}
template <typename T>
int lookup_table(cudaStream_t stream,
int weight_height,
int B,
int S,
int32_t const* inputIds,
T const* wordEmb,
int32_t const wordSize,
T* output) {
constexpr int tpb = 256;
dim3 const grid(S, B, 1);
dim3 const block(tpb, 1, 1);
lookup_table_kernel<T, tpb><<<grid, block, 0, stream>>>(
weight_height, inputIds, wordEmb, wordSize, output);
return 0;
}
// Static class fields initialization
nvinfer1::PluginFieldCollection LookupTablePluginDynamicCreator::mFC{};
std::vector<nvinfer1::PluginField>
LookupTablePluginDynamicCreator::mPluginAttributes;
LookupTablePluginDynamic::LookupTablePluginDynamic(
nvinfer1::DataType const type,
void* weight_dev,
int32_t weight_size,
int32_t width)
: mType(type),
mWeightDev(weight_dev),
mWeightSize(weight_size),
mWeightWidth(width) {}
LookupTablePluginDynamic::LookupTablePluginDynamic(void const* data,
size_t length) {
// Deserialize in the same order as serialization
deserialize_value(&data, &length, &mType);
deserialize_value(&data, &length, &mWeightSize);
deserialize_value(&data, &length, &mWeightWidth);
char const* d = static_cast<char const*>(data);
cudaMalloc(&mWeightDev, mWeightSize * getElementSize(mType));
cudaMemcpy(mWeightDev,
d,
mWeightSize * getElementSize(mType),
cudaMemcpyHostToDevice);
}
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt* LookupTablePluginDynamic::clone()
const noexcept {
auto p = new LookupTablePluginDynamic(
mType, mWeightDev, mWeightSize, mWeightWidth);
p->setPluginNamespace(mNamespace.c_str());
return p;
}
nvinfer1::DimsExprs LookupTablePluginDynamic::getOutputDimensions(
int32_t outputIndex,
nvinfer1::DimsExprs const* inputs,
int32_t nbInputs,
nvinfer1::IExprBuilder& exprBuilder) noexcept {
nvinfer1::DimsExprs ret;
ret.nbDims = inputs[0].nbDims + 1;
for (int i = 0; i < inputs[0].nbDims; ++i) {
ret.d[i] = inputs[0].d[i];
}
ret.d[inputs[0].nbDims] = exprBuilder.constant(mWeightWidth);
return ret;
}
bool LookupTablePluginDynamic::supportsFormatCombination(
int32_t pos,
nvinfer1::PluginTensorDesc const* inOut,
int32_t nbInputs,
int32_t nbOutputs) noexcept {
nvinfer1::PluginTensorDesc const& desc = inOut[pos];
if (desc.format != nvinfer1::TensorFormat::kLINEAR) {
return false;
}
if (pos == 0) {
return desc.type == nvinfer1::DataType::kINT32;
}
if (pos == 1) {
if (mType == nvinfer1::DataType::kFLOAT) {
return desc.type == nvinfer1::DataType::kFLOAT;
} else {
return desc.type == nvinfer1::DataType::kHALF;
}
}
}
void LookupTablePluginDynamic::configurePlugin(
nvinfer1::DynamicPluginTensorDesc const* inputs,
int32_t nbInputs,
nvinfer1::DynamicPluginTensorDesc const* outputs,
int32_t nbOutputs) noexcept {}
size_t LookupTablePluginDynamic::getWorkspaceSize(
nvinfer1::PluginTensorDesc const* inputs,
int32_t nbInputs,
nvinfer1::PluginTensorDesc const* outputs,
int32_t nbOutputs) const noexcept {
return 0;
}
int32_t LookupTablePluginDynamic::enqueue(
nvinfer1::PluginTensorDesc const* inputDesc,
nvinfer1::PluginTensorDesc const* outputDesc,
void const* const* inputs,
void* const* outputs,
void* workspace,
cudaStream_t stream) noexcept {
int32_t const batchSize = inputDesc->dims.d[0];
int32_t S;
if (inputDesc->dims.nbDims == 1) {
S = 1;
} else {
S = inputDesc->dims.d[1];
}
int32_t mWeightHeight = mWeightSize / mWeightWidth;
int32_t status = STATUS_FAILURE;
auto const inputIds = static_cast<int32_t const*>(inputs[0]);
if (mType == nvinfer1::DataType::kFLOAT) {
auto output = static_cast<float*>(outputs[0]);
auto const Weight = static_cast<const float*>(mWeightDev);
status = lookup_table<float>(stream,
static_cast<int32_t>(mWeightWidth),
batchSize,
S,
inputIds,
Weight,
mWeightHeight,
output);
} else if (mType == nvinfer1::DataType::kHALF) {
auto output = static_cast<half*>(outputs[0]);
auto const Weight = static_cast<const half*>(mWeightDev);
status = lookup_table<half>(stream,
static_cast<int32_t>(mWeightWidth),
batchSize,
S,
inputIds,
Weight,
mWeightHeight,
output);
}
return status;
}
// IPluginV2Ext Methods
nvinfer1::DataType LookupTablePluginDynamic::getOutputDataType(
int32_t index,
nvinfer1::DataType const* inputTypes,
int32_t nbInputs) const noexcept {
if (index == 0) {
assert(mType == nvinfer1::DataType::kHALF ||
mType == nvinfer1::DataType::kFLOAT);
return mType;
}
}
// IPluginV2 Methods
char const* LookupTablePluginDynamic::getPluginType() const noexcept {
return LOOKUPTABLEPLUGINNAME;
}
char const* LookupTablePluginDynamic::getPluginVersion() const noexcept {
return PLUGINVERSION;
}
int32_t LookupTablePluginDynamic::getNbOutputs() const noexcept { return 1; }
int32_t LookupTablePluginDynamic::initialize() noexcept { return 0; }
void LookupTablePluginDynamic::terminate() noexcept { cudaFree(mWeightDev); }
size_t LookupTablePluginDynamic::getSerializationSize() const noexcept {
size_t const wordSize = getElementSize(mType);
return sizeof(mType) //
+ sizeof(mWeightSize) //
+ sizeof(mWeightWidth) //
+ wordSize * mWeightSize; //
}
void LookupTablePluginDynamic::serialize(void* buffer) const noexcept {
serialize_value(&buffer, mType);
serialize_value(&buffer, mWeightSize);
serialize_value(&buffer, mWeightWidth);
char* d = static_cast<char*>(buffer);
size_t const wordSize = getElementSize(mType);
serFromDev(&d, static_cast<char*>(mWeightDev), mWeightSize * wordSize);
}
void LookupTablePluginDynamic::destroy() noexcept {
// This gets called when the network containing plugin is destroyed
delete this;
}
void LookupTablePluginDynamic::setPluginNamespace(
char const* libNamespace) noexcept {
mNamespace = libNamespace;
}
char const* LookupTablePluginDynamic::getPluginNamespace() const noexcept {
return mNamespace.c_str();
}
LookupTablePluginDynamicCreator::LookupTablePluginDynamicCreator() {}
char const* LookupTablePluginDynamicCreator::getPluginName() const noexcept {
return LOOKUPTABLEPLUGINNAME;
}
char const* LookupTablePluginDynamicCreator::getPluginVersion() const noexcept {
return PLUGINVERSION;
}
nvinfer1::PluginFieldCollection const*
LookupTablePluginDynamicCreator::getFieldNames() noexcept {
return &mFC;
}
bool initializeFields(nvinfer1::PluginFieldCollection const* fc,
nvinfer1::Weights* weight,
int32_t& mWeightWidth) { // NOLINT
bool output_fp16 = false;
for (int32_t i = 0; i < fc->nbFields; i++) {
std::string field_name(fc->fields[i].name);
if (field_name.compare("lookup_table_weight") == 0) {
weight->values = fc->fields[i].data;
weight->count = fc->fields[i].length;
weight->type = fieldTypeToDataType(fc->fields[i].type);
}
if (field_name.compare("lookup_table_weight_width") == 0) {
assert(fc->fields[i].type == nvinfer1::PluginFieldType::kINT32);
mWeightWidth = const_cast<int32_t*>(
static_cast<int32_t const*>(fc->fields[i].data))[0]; // NOLINT
}
if (field_name.compare("output_fp16") == 0) {
assert(fc->fields[i].type == nvinfer1::PluginFieldType::kINT32);
output_fp16 = static_cast<int32_t const*>(fc->fields[i].data)[0] != 0;
}
}
return output_fp16;
}
nvinfer1::IPluginV2* LookupTablePluginDynamicCreator::createPlugin(
char const* name, const nvinfer1::PluginFieldCollection* fc) noexcept {
nvinfer1::Weights weight;
int32_t mWeightWidth;
bool output_fp16 = initializeFields(fc, &weight, mWeightWidth);
nvinfer1::DataType type;
if (output_fp16) {
type = nvinfer1::DataType::kHALF;
} else {
type = nvinfer1::DataType::kFLOAT;
}
WeightsWithOwnership mWeight;
mWeight.convertAndCopy(weight, type);
void* cudaMem{nullptr};
cudaMalloc(&cudaMem, getWeightsSize(mWeight, type));
cudaMemcpy(cudaMem,
mWeight.values,
getWeightsSize(mWeight, type),
cudaMemcpyHostToDevice);
LookupTablePluginDynamic* p =
new LookupTablePluginDynamic(type, cudaMem, mWeight.count, mWeightWidth);
return p;
}
nvinfer1::IPluginV2* LookupTablePluginDynamicCreator::deserializePlugin(
char const* name, void const* serialData, size_t serialLength) noexcept {
return new LookupTablePluginDynamic(serialData, serialLength);
}
void LookupTablePluginDynamicCreator::setPluginNamespace(
char const* libNamespace) noexcept {
mNamespace = libNamespace;
}
char const* LookupTablePluginDynamicCreator::getPluginNamespace()
const noexcept {
return mNamespace.c_str();
}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
#include "NvInferPlugin.h"
#include "NvInferRuntime.h"
#include "paddle/fluid/inference/tensorrt/plugin/common/bertCommon.h"
#include "paddle/fluid/inference/tensorrt/plugin/common/serialize.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class LookupTablePluginDynamic : public nvinfer1::IPluginV2DynamicExt {
public:
LookupTablePluginDynamic(nvinfer1::DataType const type,
void* weight_dev,
int32_t weight_size,
int32_t width);
LookupTablePluginDynamic(void const* data, size_t length);
// It doesn't make sense to make EmbLayerNormVarSeqlenPlugin without
// arguments, so we delete default constructor.
LookupTablePluginDynamic() = delete;
// IPluginV2DynamicExt Methods
bool supportsFormatCombination(int32_t pos,
nvinfer1::PluginTensorDesc const* inOut,
int32_t nbInputs,
int32_t nbOutputs) noexcept override;
size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs,
int32_t nbInputs,
nvinfer1::PluginTensorDesc const* outputs,
int32_t nbOutputs) const noexcept override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(
int32_t index,
nvinfer1::DataType const* inputTypes,
int32_t nbInputs) const noexcept override;
// IPluginV2 Methods
nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
nvinfer1::DimsExprs getOutputDimensions(
int32_t outputIndex,
const nvinfer1::DimsExprs* inputs,
int32_t nbInputs,
nvinfer1::IExprBuilder& exprBuilder) noexcept override;
void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in,
int32_t nbInputs,
nvinfer1::DynamicPluginTensorDesc const* out,
int32_t nbOutputs) noexcept override;
char const* getPluginType() const noexcept override;
int32_t getNbOutputs() const noexcept override;
size_t getSerializationSize() const noexcept override;
void serialize(void* buffer) const noexcept override;
void destroy() noexcept override;
char const* getPluginNamespace() const noexcept override;
void setPluginNamespace(char const* pluginNamespace) noexcept override;
int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc,
nvinfer1::PluginTensorDesc const* outputDesc,
void const* const* inputs,
void* const* outputs,
void* workspace,
cudaStream_t stream) noexcept override;
int32_t initialize() noexcept override;
void terminate() noexcept override;
char const* getPluginVersion() const noexcept override;
protected:
std::string mNamespace;
nvinfer1::DataType mType;
void* mWeightDev{nullptr};
int32_t mWeightSize;
int32_t mWeightWidth;
};
class LookupTablePluginDynamicCreator : public nvinfer1::IPluginCreator {
public:
LookupTablePluginDynamicCreator();
char const* getPluginName() const noexcept override;
const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override;
void setPluginNamespace(char const* pluginNamespace) noexcept override;
char const* getPluginNamespace() const noexcept override;
nvinfer1::IPluginV2* createPlugin(
char const* name,
const nvinfer1::PluginFieldCollection* fc) noexcept override;
char const* getPluginVersion() const noexcept override;
nvinfer1::IPluginV2* deserializePlugin(char const* name,
void const* serialData,
size_t serialLength) noexcept override;
protected:
static nvinfer1::PluginFieldCollection mFC;
static std::vector<nvinfer1::PluginField> mPluginAttributes;
std::string mNamespace;
};
REGISTER_TRT_PLUGIN_V2(LookupTablePluginDynamicCreator);
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <cassert>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
#include "paddle/fluid/operators/math/prelu.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
int PReluPlugin::initialize() TRT_NOEXCEPT {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_,
weight_.data(),
weight_.size() * sizeof(float),
cudaMemcpyHostToDevice);
return 0;
}
void PReluPlugin::terminate() TRT_NOEXCEPT {
if (p_gpu_weight_) {
cudaFree(p_gpu_weight_);
p_gpu_weight_ = nullptr;
}
}
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims,
int nbInputs) TRT_NOEXCEPT {
assert(nbInputs == 1);
assert(index < this->getNbOutputs());
nvinfer1::Dims const &input_dims = inputDims[0];
nvinfer1::Dims output_dims = input_dims;
return output_dims;
}
int PReluPlugin::enqueue(int batch_size,
const void *const *inputs,
#if IS_TRT_VERSION_LT(8000)
void **outputs,
void *workspace,
cudaStream_t stream) {
#else
void *const *outputs,
void *workspace,
cudaStream_t stream) TRT_NOEXCEPT {
#endif
// input dims is CHW.
const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]);
// const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
const float *alpha = p_gpu_weight_;
float *const output = reinterpret_cast<float *const *>(outputs)[0];
int numel = 1;
for (int i = 0; i < input_dims.nbDims; i++) {
numel *= input_dims.d[i];
}
if (mode_ == "channel") {
bool channel_last = data_format_ == "NHWC";
operators::math::PreluChannelWiseDirectCUDAFunctor<float>
prelu_channel_wise;
prelu_channel_wise(stream,
input,
alpha,
output,
input_dims.d[0],
input_dims.d[1],
channel_last,
numel);
} else if (mode_ == "element") {
operators::math::PreluElementWiseDirectCUDAFunctor<float>
prelu_element_wise;
prelu_element_wise(stream, input, alpha, output, input_dims.d[0], numel);
} else {
operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
prelu_scalar(stream, input, alpha, output, numel);
}
return cudaGetLastError() != cudaSuccess;
}
#if IS_TRT_VERSION_GE(6000)
void PReluPluginDynamic::terminate() TRT_NOEXCEPT {
if (p_gpu_weight_) {
cudaFree(p_gpu_weight_);
}
}
int PReluPluginDynamic::initialize() TRT_NOEXCEPT {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_,
weight_.data(),
weight_.size() * sizeof(float),
cudaMemcpyHostToDevice);
return 0;
}
PReluPluginDynamic::PReluPluginDynamic(void const *serialData,
size_t serialLength) {
DeserializeValue(&serialData, &serialLength, &weight_);
const char *prelu_mode;
DeserializeValue(&serialData, &serialLength, &prelu_mode);
mode_ = std::string(prelu_mode);
}
size_t PReluPluginDynamic::getSerializationSize() const TRT_NOEXCEPT {
return SerializedSize(mode_.c_str()) + SerializedSize(weight_);
}
void PReluPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT {
SerializeValue(&buffer, weight_);
SerializeValue(&buffer, mode_.c_str());
}
nvinfer1::DimsExprs PReluPluginDynamic::getOutputDimensions(
int output_index,
const nvinfer1::DimsExprs *inputs,
int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT {
return inputs[0];
}
bool PReluPluginDynamic::supportsFormatCombination(
int pos,
const nvinfer1::PluginTensorDesc *in_out,
int nb_inputs,
int nb_outputs) TRT_NOEXCEPT {
PADDLE_ENFORCE_NOT_NULL(
in_out,
platform::errors::InvalidArgument(
"The input of swish plugin shoule not be nullptr."));
PADDLE_ENFORCE_LT(
pos,
nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos,
nb_inputs + nb_outputs));
(in_out && pos < (nb_inputs + nb_outputs));
return ((in_out[pos].type == nvinfer1::DataType::kFLOAT) &&
in_out[pos].format == nvinfer1::PluginFormat::kLINEAR);
}
nvinfer1::DataType PReluPluginDynamic::getOutputDataType(
int index,
const nvinfer1::DataType *input_types,
int nb_inputs) const TRT_NOEXCEPT {
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The PRelu Plugin only has one input, so the "
"index value should be 0, but get %d.",
index));
PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT),
true,
platform::errors::InvalidArgument(
"The input type should be half or float"));
return input_types[0];
}
int PReluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc,
const void *const *inputs,
void *const *outputs,
void *workspace,
cudaStream_t stream) TRT_NOEXCEPT {
auto input_dims = input_desc[0].dims;
const float *alpha = p_gpu_weight_;
const float *input = static_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
int numel = 1;
for (int i = 0; i < input_dims.nbDims; i++) {
numel *= input_dims.d[i];
}
if (mode_ == "channel") {
bool channel_last = data_format_ == "NHWC";
operators::math::PreluChannelWiseDirectCUDAFunctor<float>
prelu_channel_wise;
prelu_channel_wise(stream,
input,
alpha,
output,
input_dims.d[0],
input_dims.d[1],
channel_last,
numel);
} else if (mode_ == "element") {
operators::math::PreluElementWiseDirectCUDAFunctor<float>
prelu_element_wise;
prelu_element_wise(stream, input, alpha, output, input_dims.d[0], numel);
} else {
operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
prelu_scalar(stream, input, alpha, output, numel);
}
return cudaGetLastError() != cudaSuccess;
}
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class PReluPlugin : public PluginTensorRT {
std::vector<float> weight_;
float* p_gpu_weight_;
std::string mode_;
std::string data_format_;
public:
size_t getSerializationSize() const TRT_NOEXCEPT override {
return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
SerializedSize(data_format_.c_str()) + SerializedSize(weight_);
}
// TRT will call this func when we need to serialize the configuration of
// tensorrt.
// It should not be called by users.
void serialize(void* buffer) const TRT_NOEXCEPT override {
serializeBase(buffer);
SerializeValue(&buffer, weight_);
SerializeValue(&buffer, mode_.c_str());
SerializeValue(&buffer, data_format_.c_str());
}
PReluPlugin(const float* weight,
const int weight_num,
std::string const& mode,
std::string const& data_format)
: mode_(mode), data_format_(data_format) {
weight_.resize(weight_num);
std::copy(weight, weight + weight_num, weight_.data());
}
// It was used for tensorrt deserialization.
// It should not be called by users.
PReluPlugin(void const* serialData, size_t serialLength) {
deserializeBase(serialData, serialLength);
DeserializeValue(&serialData, &serialLength, &weight_);
const char* prelu_mode;
DeserializeValue(&serialData, &serialLength, &prelu_mode);
mode_ = std::string(prelu_mode);
const char* prelu_data_format;
DeserializeValue(&serialData, &serialLength, &prelu_data_format);
data_format_ = std::string(prelu_data_format);
}
~PReluPlugin() {}
int initialize() TRT_NOEXCEPT override;
void terminate() TRT_NOEXCEPT override;
PReluPlugin* clone() const TRT_NOEXCEPT override {
auto* ptr =
new PReluPlugin(weight_.data(), weight_.size(), mode_, data_format_);
ptr->p_gpu_weight_ = p_gpu_weight_;
return ptr;
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "prelu_plugin";
}
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims* inputs,
int nbInputDims) TRT_NOEXCEPT override;
#if IS_TRT_VERSION_LT(8000)
int enqueue(int batchSize,
const void* const* inputs,
void** outputs,
#else
int enqueue(int batchSize,
const void* const* inputs,
void* const* outputs,
#endif
void* workspace,
cudaStream_t stream) TRT_NOEXCEPT override;
};
class PReluPluginCreator : public TensorRTPluginCreator {
public:
const char* getPluginName() const TRT_NOEXCEPT override {
return "prelu_plugin";
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length)
TRT_NOEXCEPT override {
return new PReluPlugin(serial_data, serial_length);
}
};
REGISTER_TRT_PLUGIN_V2(PReluPluginCreator);
#if IS_TRT_VERSION_GE(6000)
class PReluPluginDynamic : public DynamicPluginTensorRT {
public:
PReluPluginDynamic(const float* weight,
const int weight_num,
std::string const& mode,
std::string const& data_format)
: mode_(mode), data_format_(data_format) {
weight_.resize(weight_num);
std::copy(weight, weight + weight_num, weight_.data());
}
PReluPluginDynamic(void const* serialData, size_t serialLength);
~PReluPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
auto ptr = new PReluPluginDynamic(
weight_.data(), weight_.size(), mode_, data_format_);
ptr->p_gpu_weight_ = p_gpu_weight_;
return ptr;
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "prelu_plugin_dynamic";
}
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
int initialize() TRT_NOEXCEPT override;
void terminate() TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void* buffer) const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int output_index,
const nvinfer1::DimsExprs* inputs,
int nb_inputs,
nvinfer1::IExprBuilder& expr_builder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) TRT_NOEXCEPT override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs,
void* const* outputs,
void* workspace,
cudaStream_t stream) TRT_NOEXCEPT override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const
TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override { delete this; }
private:
std::vector<float> weight_;
float* p_gpu_weight_;
std::string mode_;
std::string data_format_;
};
#endif
class PReluPluginDynamicCreator : public TensorRTPluginCreator {
public:
const char* getPluginName() const TRT_NOEXCEPT override {
return "prelu_plugin_dynamic";
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length)
TRT_NOEXCEPT override {
return new PReluPluginDynamic(serial_data, serial_length);
}
};
REGISTER_TRT_PLUGIN_V2(PReluPluginDynamicCreator);
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle