Unverified commit fa06d9c3, authored by W Wangzheee, committed by GitHub

fix_multihead (#45429)

Parent a5e9ccda
@@ -291,333 +291,342 @@ class MultiheadMatMulOpConverter : public OpConverter {
              plugin_inputs.data(), plugin_inputs.size(), *plugin);
          layer = plugin_layer;
        }
-       }
+       } else {
          if (input_dims.d[1] <= 384 && !bias_qk_attr &&
              engine_->precision() != AnalysisConfig::Precision::kFloat32) {
            /*
             * input_dims.d[0]: batch(-1)
             * input_dims.d[1]: length:256
             * input_dims.d[2]: hidden_size:768
             input
               |[b,256,768]
               |
             shuffle               weight   bias
               |[b,256,768,1,1]    |        |
               |___________________|________|
               |
               fc
               |[b,256,2304,1,1]
               |
             shuffle             mask(fake)  pos   max_length
               |[b*256,2304,1,1] |           |     |
               |                 |           |     |
               |_________________|___________|_____|
               |
               MHA
               |[b*256,768]
               |
             shuffle
               |[b, 256, 768]
               |
               out
            */
            nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT,
                                     static_cast<void*>(weight_data),
                                     static_cast<int32_t>(weight_t->numel())};
            nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT,
                                   static_cast<void*>(bias_data),
                                   static_cast<int32_t>(bias_t->numel())};
            /*** transpose the weight and bias ***/
            int head_size = hidden_out / head_number;
            // [3, head_number, head_size, hidden_in] -> [head_number, 3,
            // head_size, hidden_in]
            auto transpose_weight_v2 = [](const float* src,
                                          float* dst,
                                          int three,
                                          int head_number,
                                          int head_size,
                                          int hidden_in) {
              const int HH = head_size * hidden_in;
              for (int i = 0; i < three; ++i) {
                for (int n = 0; n < head_number; ++n) {
                  for (int hh = 0; hh < HH; ++hh) {
                    dst[n * three * HH + i * HH + hh] =
                        src[i * head_number * HH + n * HH + hh];
                  }
                }
              }
            };
            // [3, head_number, head_size] -> [head_number, 3, head_size]
            auto transpose_bias_v2 =
                [](const float* src, float* dst, int N, int H) {
                  for (int i = 0; i < 3; ++i) {
                    for (int n = 0; n < N; ++n) {
                      for (int h = 0; h < H; ++h) {
                        dst[n * 3 * H + i * H + h] =
                            src[i * N * H + n * H + h];
                      }
                    }
                  }
                };
            memcpy(weight_data_tmp.data(),
                   weight_data,
                   weight_t->numel() * sizeof(float));
            transpose_weight_v2(weight_data_tmp.data(),
                                weight_data,
                                three,
                                head_number,
                                head_size,
                                hidden_in);

            std::vector<float> bias_data_tmp;
            bias_data_tmp.reserve(bias_t->numel());
            memcpy(bias_data_tmp.data(),
                   bias_data,
                   bias_t->numel() * sizeof(float));
            transpose_bias_v2(
                bias_data_tmp.data(), bias_data, head_number, head_size);
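The two lambdas above are pure index permutations. A standalone sketch (tiny hypothetical sizes, not part of this diff) that checks the [3, head_number, head_size, hidden_in] -> [head_number, 3, head_size, hidden_in] layout swap:

    // Standalone check of the transpose_weight_v2 index math on a tiny
    // [three=3, head_number=2, head_size=1, hidden_in=2] tensor.
    #include <cassert>
    #include <vector>

    int main() {
      const int three = 3, head_number = 2, head_size = 1, hidden_in = 2;
      const int HH = head_size * hidden_in;
      // src[i][n][hh] = 100 * i + 10 * n + hh, laid out as [3, N, S*I].
      std::vector<float> src(three * head_number * HH), dst(src.size());
      for (int i = 0; i < three; ++i)
        for (int n = 0; n < head_number; ++n)
          for (int hh = 0; hh < HH; ++hh)
            src[i * head_number * HH + n * HH + hh] = 100 * i + 10 * n + hh;
      // Same loop nest as transpose_weight_v2: [3, N, S*I] -> [N, 3, S*I].
      for (int i = 0; i < three; ++i)
        for (int n = 0; n < head_number; ++n)
          for (int hh = 0; hh < HH; ++hh)
            dst[n * three * HH + i * HH + hh] =
                src[i * head_number * HH + n * HH + hh];
      // After the permutation, dst[n][i][hh] must equal src[i][n][hh].
      for (int n = 0; n < head_number; ++n)
        for (int i = 0; i < three; ++i)
          for (int hh = 0; hh < HH; ++hh)
            assert(dst[n * three * HH + i * HH + hh] == 100 * i + 10 * n + hh);
      return 0;
    }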
            // add shuffle for FullyConnected layer
            std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
            nvinfer1::ITensor* input_shape_tensor = Shape(input);
            for (int i = 0; i < 5; i++) {
              reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
            }
            for (int i = 0; i < 3; i++) {
              reshape_before_fc_shape_tensor[i] =
                  GetEleTensorOfShape(input_shape_tensor, i);
            }
            auto* reshape_before_fc_layer =
                TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
            reshape_before_fc_layer->setInput(
                1, *Concat(reshape_before_fc_shape_tensor));
            reshape_before_fc_layer->setName(
                ("shuffle_before_fc_multihead_matmul(Output: " + output_name +
                 ")")
                    .c_str());
            // add fc layer
            nvinfer1::ILayer* fc_layer = nullptr;
            fc_layer =
                TRT_ENGINE_ADD_LAYER(engine_,
                                     FullyConnected,
                                     *reshape_before_fc_layer->getOutput(0),
                                     n,
                                     weight,
                                     bias);
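TensorRT's FullyConnected layer contracts the innermost three dimensions, which is why the Shuffle pads [b, s, hidden] with two trailing 1s. A host-side sketch of the shape bookkeeping (hypothetical helper, not converter code):

    // [b, s, hidden] -> [b, s, hidden, 1, 1] before the fc layer; the fused
    // QKV projection then yields [b, s, 3 * hidden, 1, 1].
    #include <cstdio>
    #include <vector>

    std::vector<int> pad_to_5d(const std::vector<int>& shape) {
      std::vector<int> out(5, 1);                        // five dims, default 1
      for (size_t i = 0; i < 3; ++i) out[i] = shape[i];  // keep b, s, hidden
      return out;
    }

    int main() {
      std::vector<int> in = {-1, 256, 768};    // batch(-1), length, hidden
      std::vector<int> fc_in = pad_to_5d(in);  // {-1, 256, 768, 1, 1}
      std::vector<int> fc_out = fc_in;
      fc_out[2] = 3 * 768;                     // fused Q,K,V: 2304 channels
      for (int d : fc_out) printf("%d ", d);   // -1 256 2304 1 1
      printf("\n");
      return 0;
    }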
            // add shuffle for CustomQKVToContextPluginDynamic layer
            auto* reshape_after_fc_layer =
                TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0));
            std::vector<nvinfer1::ITensor*> mha_input_tensor_shape;
            mha_input_tensor_shape.push_back(Add1DConstantLayer(-1));
            mha_input_tensor_shape.push_back(
                Add1DConstantLayer(hidden_out * 3));  // Q,K,V
            mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
            mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
            reshape_after_fc_layer->setInput(1, *Concat(mha_input_tensor_shape));
            reshape_after_fc_layer->setName(
                ("shuffle_after_fc_multihead_matmul(Output: " + output_name +
                 ")")
                    .c_str());
            // add mha_plugin
            auto creator = GetPluginRegistry()->getPluginCreator(
                "CustomQKVToContextPluginDynamic", "2");
            assert(creator != nullptr);
            // set the attributes of mha_plugin
            int type = static_cast<int>(nvinfer1::DataType::kHALF);
            int var_seqlen = 1;
            bool has_mask = true;
            std::vector<nvinfer1::PluginField> fields{
                {"hidden_size", &hidden_out, nvinfer1::PluginFieldType::kINT32, 1},
                {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1},
                {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1},
                {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1},
                {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}};
            nvinfer1::PluginFieldCollection* plugin_collection =
                static_cast<nvinfer1::PluginFieldCollection*>(
                    malloc(sizeof(*plugin_collection) +
                           fields.size() *
                               sizeof(nvinfer1::PluginField)));  // remember to free
            plugin_collection->nbFields = static_cast<int>(fields.size());
            plugin_collection->fields = fields.data();
            auto plugin = creator->createPlugin("CustomQKVToContextPluginDynamic",
                                                plugin_collection);
            free(plugin_collection);
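The collection only carries nbFields and a pointer into the fields vector, and the immediate free above shows createPlugin is done with it once the call returns. A sketch of an equivalent stack-based variant, reusing the fields and creator from the code above (an assumption about style, not a different API):

    // Stack-allocated PluginFieldCollection; no malloc/free pair needed.
    nvinfer1::PluginFieldCollection fc_collection{};
    fc_collection.nbFields = static_cast<int32_t>(fields.size());
    fc_collection.fields = fields.data();  // points into the existing vector
    auto plugin2 =
        creator->createPlugin("CustomQKVToContextPluginDynamic", &fc_collection);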
            // set inputs
            std::vector<nvinfer1::ITensor*> plugin_inputs;
            // input_0 for plugin
            plugin_inputs.emplace_back(reshape_after_fc_layer->getOutput(0));
            // input_1(fake) for plugin
            std::vector<int> mask = {1};
            nvinfer1::ITensor* mask_tensor = Add1DConstantLayer(mask);
            plugin_inputs.emplace_back(mask_tensor);
            // input_2 for plugin
            std::vector<int> pos_id = {0};
            int max_batch = 500;
            for (int i = 1; i < max_batch; i++) {
              pos_id.push_back(i);
            }
            nvinfer1::ITensor* fake_pos_id_tensor = Add1DConstantLayer(pos_id);
            nvinfer1::ITensor* length_tensor =
                GetEleTensorOfShape(input_shape_tensor, 1);
            auto pos_id_layer =
                TRT_ENGINE_ADD_LAYER(engine_,
                                     ElementWise,
                                     *fake_pos_id_tensor,
                                     *length_tensor,
                                     nvinfer1::ElementWiseOperation::kPROD);
            // size = batch + 1;
            nvinfer1::ITensor* batch_tensor =
                GetEleTensorOfShape(input_shape_tensor, 0);
            std::vector<int> const_data = {1};
            nvinfer1::ITensor* const_tensor = Add1DConstantLayer(const_data);
            auto size_layer =
                TRT_ENGINE_ADD_LAYER(engine_,
                                     ElementWise,
                                     *batch_tensor,
                                     *const_tensor,
                                     nvinfer1::ElementWiseOperation::kSUM);
            // get size(batch + 1) data from pos_id_tensor
            nvinfer1::Dims start;
            nvinfer1::Dims stride;
            nvinfer1::Dims size;
            start.nbDims = 1;
            stride.nbDims = 1;
            size.nbDims = 1;
            start.d[0] = 0;
            stride.d[0] = 1;
            size.d[0] = 1;
            auto* slice_pos_layer = TRT_ENGINE_ADD_LAYER(
                engine_, Slice, *pos_id_layer->getOutput(0), start, size, stride);
            slice_pos_layer->setInput(2, *size_layer->getOutput(0));
            plugin_inputs.emplace_back(slice_pos_layer->getOutput(0));
            // input_3 for plugin
            std::vector<int> data(500, 1);
            nvinfer1::ITensor* fake_max_seqlen_tensor = Add1DConstantLayer(data);
            auto* slice_max_layer = TRT_ENGINE_ADD_LAYER(
                engine_, Slice, *fake_max_seqlen_tensor, start, size, stride);
            slice_max_layer->setInput(2, *length_tensor);
            plugin_inputs.emplace_back(slice_max_layer->getOutput(0));
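In the var-seqlen layout, input_2 carries cumulative sequence offsets and input_3 is a dummy tensor whose length is read as the maximum sequence length. A host-side sketch of what the two Slice layers evaluate to at runtime (hypothetical batch and seq_len):

    // The kPROD + Slice subgraph above, evaluated on the host for
    // batch = 3, seq_len = 256 (hypothetical values).
    #include <cstdio>
    #include <vector>

    int main() {
      const int batch = 3, seq_len = 256;
      // pos_id constant [0, 1, ..., 499] scaled by seq_len, sliced to
      // batch + 1 entries: cumulative offsets of the padded sequences.
      std::vector<int> offsets;
      for (int i = 0; i <= batch; ++i) offsets.push_back(i * seq_len);
      for (int v : offsets) printf("%d ", v);  // 0 256 512 768
      printf("\n");
      // input_3 is an all-ones tensor sliced to length seq_len; only its
      // length matters to the plugin.
      std::vector<int> max_seqlen(seq_len, 1);
      printf("max_seqlen tensor length: %zu\n", max_seqlen.size());
      return 0;
    }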
            // plugin_layer
            auto plugin_layer = engine_->network()->addPluginV2(
                plugin_inputs.data(), plugin_inputs.size(), *plugin);
            // add shuffle
            auto* reshape_after_mha_layer = TRT_ENGINE_ADD_LAYER(
                engine_, Shuffle, *plugin_layer->getOutput(0));
            std::vector<nvinfer1::ITensor*> reshape_tensor;
            reshape_tensor.push_back(batch_tensor);
            reshape_tensor.push_back(length_tensor);
            reshape_tensor.push_back(Add1DConstantLayer(-1));
            reshape_after_mha_layer->setInput(1, *Concat(reshape_tensor));
            reshape_after_mha_layer->setName(
                ("shuffle_last_multihead_matmul(Output: " + output_name + ")")
                    .c_str());
            // return
            layer = reshape_after_mha_layer;
          } else {
            PADDLE_ENFORCE_EQ(
                input->getDimensions().nbDims,
                3,
                platform::errors::InvalidArgument(
                    "The Input dim of the MultiheadMatMul should be 3, "
                    "but it's (%d) now.",
                    input->getDimensions().nbDims));
            // transpose weight_data from m * n to n * m
            auto* input_bias_qk =
                engine_->GetITensor(op_desc.Input("BiasQK").front());
            TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                          static_cast<void*>(weight_data),
                                          static_cast<size_t>(weight_t->numel())};
            weight.dims.assign({n, m});
            TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
                                        static_cast<void*>(bias_data),
                                        static_cast<size_t>(bias_t->numel())};
            // add shuffle before fc
            std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
            nvinfer1::ITensor* input_shape_tensor = Shape(input);
            for (int i = 0; i < 5; i++) {
              reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
            }
            for (int i = 0; i < 3; i++) {
              reshape_before_fc_shape_tensor[i] =
                  GetEleTensorOfShape(input_shape_tensor, i);
            }
            auto* reshape_before_fc_layer =
                TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
            if (op_desc.HasAttr("Input_scale")) {
              engine_->SetTensorDynamicRange(
                  reshape_before_fc_layer->getOutput(0), in_scale);
            }
            reshape_before_fc_layer->setInput(
                1, *Concat(reshape_before_fc_shape_tensor));
            reshape_before_fc_layer->setName(
                ("shuffle_before_multihead_mamul(Output: " + output_name + ")")
                    .c_str());
            // add layer fc
            nvinfer1::ILayer* fc_layer = nullptr;
            if (op_desc.HasAttr("Input_scale")) {
              nvinfer1::DimsHW nv_ksize(1, 1);
              fc_layer =
                  TRT_ENGINE_ADD_LAYER(engine_,
                                       Convolution,
                                       *reshape_before_fc_layer->getOutput(0),
                                       n,
                                       nv_ksize,
                                       weight.get(),
                                       bias.get());
            } else {
              fc_layer =
                  TRT_ENGINE_ADD_LAYER(engine_,
                                       FullyConnected,
                                       *reshape_before_fc_layer->getOutput(0),
                                       n,
                                       weight.get(),
                                       bias.get());
            }
            if (op_desc.HasAttr("fc_out_threshold")) {
              PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"),
                                true,
                                platform::errors::InvalidArgument(
                                    "must have out threshold in multihead layers "
                                    "in int8 mode"));
              float out_scale =
                  PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
              engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
            }
            fc_layer->setName(
                ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
            // no need to add shuffle after fc, just change it in
            // QkvToContextPluginDynamic
            // add qkv to context
            int head_size = hidden_out / head_number;
            float scale = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha"));
            std::vector<nvinfer1::ITensor*> plugin_inputs;
            plugin_inputs.push_back(fc_layer->getOutput(0));
            plugin_inputs.push_back(input_bias_qk);
            bool with_fp16 =
                engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
            if (engine_->precision() == AnalysisConfig::Precision::kInt8) {
              with_fp16 = true;
            }
            plugin::DynamicPluginTensorRT* plugin =
                new plugin::QkvToContextPluginDynamic(
                    hidden_in, head_number, head_size, scale, with_fp16);
            layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin);
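For a BERT-base sized model the derived quantities work out as below; the sizes are hypothetical and the alpha formula is an assumption (scale arrives via the op attribute "alpha", typically 1/sqrt(head_size)):

    // Hypothetical BERT-base numbers, for orientation only.
    #include <cmath>
    #include <cstdio>

    int main() {
      const int hidden_in = 768, hidden_out = 768, head_number = 12;
      const int head_size = hidden_out / head_number;  // 64
      const int qkv_width = 3 * hidden_out;            // 2304, fused Q/K/V width
      const float alpha =
          1.0f / std::sqrt(static_cast<float>(head_size));  // 0.125
      printf("head_size=%d qkv_width=%d alpha=%.3f\n", head_size, qkv_width,
             alpha);
      return 0;
    }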
          }
+       }
      } else {
        PADDLE_THROW(platform::errors::Fatal(
......