Unverified commit fa06d9c3, authored by Wangzheee, committed by GitHub

fix_multihead (#45429)

Parent a5e9ccda
@@ -291,333 +291,342 @@ class MultiheadMatMulOpConverter : public OpConverter {
plugin_inputs.data(), plugin_inputs.size(), *plugin);
layer = plugin_layer;
}
    } else {
      if (input_dims.d[1] <= 384 && !bias_qk_attr &&
          engine_->precision() != AnalysisConfig::Precision::kFloat32) {
        /*
         * input_dims.d[0]: batch(-1)
         * input_dims.d[1]: length:256
         * input_dims.d[2]: hidden_size:768
         input
           |[b,256,768]
           |
         shuffle                weight   bias
           |[b,256,768,1,1]       |        |
           |______________________|________|
           |
           fc
           |[b,256,2304,1,1]
           |
         shuffle           mask(fake)   pos    max_length
           |[b*256,2304,1,1]   |         |         |
           |                   |         |         |
           |___________________|_________|_________|
           |
           MHA
           |[b*256,768]
           |
         shuffle
           |[b, 256, 768]
           |
           out
         */
        nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT,
                                 static_cast<void*>(weight_data),
                                 static_cast<int32_t>(weight_t->numel())};
        nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT,
                               static_cast<void*>(bias_data),
                               static_cast<int32_t>(bias_t->numel())};
        /*** transpose the weight and bias ***/
        int head_size = hidden_out / head_number;
        // [3, head_number, head_size, hidden_in] -> [head_number, 3,
        // head_size, hidden_in]
        auto transpose_weight_v2 = [](const float* src,
                                      float* dst,
                                      int three,
                                      int head_number,
                                      int head_size,
                                      int hidden_in) {
          const int HH = head_size * hidden_in;
          for (int i = 0; i < three; ++i) {
            for (int n = 0; n < head_number; ++n) {
              for (int hh = 0; hh < HH; ++hh) {
                dst[n * three * HH + i * HH + hh] =
                    src[i * head_number * HH + n * HH + hh];
              }
            }
          }
        };
        // [3, head_number, head_size] -> [head_number, 3, head_size]
        auto transpose_bias_v2 =
            [](const float* src, float* dst, int N, int H) {
              for (int i = 0; i < 3; ++i) {
                for (int n = 0; n < N; ++n) {
                  for (int h = 0; h < H; ++h) {
                    dst[n * 3 * H + i * H + h] = src[i * N * H + n * H + h];
                  }
                }
              }
            };
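        // Sanity check of the interleave for head_number = 2: source rows
        // [Q0, Q1, K0, K1, V0, V1] land as [Q0, K0, V0, Q1, K1, V1], i.e.
        // each head's Q/K/V slices become contiguous, which is the
        // packed-QKV channel order the fused var-seqlen MHA kernel expects.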
memcpy(weight_data_tmp.data(),
weight_data,
weight_t->numel() * sizeof(float));
transpose_weight_v2(weight_data_tmp.data(),
weight_data,
three,
head_number,
head_size,
hidden_in);
std::vector<float> bias_data_tmp;
        bias_data_tmp.resize(bias_t->numel());  // resize (not reserve): the memcpy below fills numel() elements
memcpy(
bias_data_tmp.data(), bias_data, bias_t->numel() * sizeof(float));
transpose_bias_v2(
bias_data_tmp.data(), bias_data, head_number, head_size);
// add shuffle for FullyConnected layer
std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
nvinfer1::ITensor* input_shape_tensor = Shape(input);
for (int i = 0; i < 5; i++) {
reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
}
for (int i = 0; i < 3; i++) {
reshape_before_fc_shape_tensor[i] =
GetEleTensorOfShape(input_shape_tensor, i);
}
auto* reshape_before_fc_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
reshape_before_fc_layer->setInput(
1, *Concat(reshape_before_fc_shape_tensor));
reshape_before_fc_layer->setName(
("shuffle_before_fc_multihead_matmul(Output: " + output_name + ")")
.c_str());
// add fc layer
nvinfer1::ILayer* fc_layer = nullptr;
fc_layer = TRT_ENGINE_ADD_LAYER(engine_,
FullyConnected,
*reshape_before_fc_layer->getOutput(0),
n,
weight,
bias);
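        // This single FC computes the Q, K and V projections in one pass
        // (n = 3 * hidden_out, e.g. 2304 channels for hidden 768 as in the
        // diagram above); the plugin splits the channels per head later.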
// add shuffle for CustomQKVToContextPluginDynamic layer
auto* reshape_after_fc_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0));
std::vector<nvinfer1::ITensor*> mha_input_tensor_shape;
mha_input_tensor_shape.push_back(Add1DConstantLayer(-1));
mha_input_tensor_shape.push_back(
Add1DConstantLayer(hidden_out * 3)); // Q,K,V
mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
reshape_after_fc_layer->setInput(1, *Concat(mha_input_tensor_shape));
reshape_after_fc_layer->setName(
("shuffle_after_fc_multihead_matmul(Output: " + output_name + ")")
.c_str());
// add mha_plugin
auto creator = GetPluginRegistry()->getPluginCreator(
"CustomQKVToContextPluginDynamic", "2");
assert(creator != nullptr);
// set the attributes of mha_plugin
int type = static_cast<int>(nvinfer1::DataType::kHALF);
int var_seqlen = 1;
bool has_mask = true;
std::vector<nvinfer1::PluginField> fields{
{"hidden_size", &hidden_out, nvinfer1::PluginFieldType::kINT32, 1},
{"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1},
{"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1},
{"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1},
{"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}};
nvinfer1::PluginFieldCollection* plugin_collection =
static_cast<nvinfer1::PluginFieldCollection*>(
malloc(sizeof(*plugin_collection) +
fields.size() *
sizeof(nvinfer1::PluginField))); // remember to free
plugin_collection->nbFields = static_cast<int>(fields.size());
plugin_collection->fields = fields.data();
auto plugin = creator->createPlugin("CustomQKVToContextPluginDynamic",
plugin_collection);
free(plugin_collection);
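        // Inputs for version "2" of the plugin, assembled below:
        //   input_0: packed QKV, [b*seq, 3*hidden, 1, 1]
        //   input_1: mask (a placeholder constant in this path)
        //   input_2: pos_id, cumulative sequence offsets (batch + 1 entries)
        //   input_3: a tensor whose length carries the max sequence length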
// set inputs
std::vector<nvinfer1::ITensor*> plugin_inputs;
// input_0 for plugin
plugin_inputs.emplace_back(reshape_after_fc_layer->getOutput(0));
// input_1(fake) for plugin
std::vector<int> mask = {1};
nvinfer1::ITensor* mask_tensor = Add1DConstantLayer(mask);
plugin_inputs.emplace_back(mask_tensor);
// input_2 for plugin
std::vector<int> pos_id = {0};
int max_batch = 500;
for (int i = 1; i < max_batch; i++) {
pos_id.push_back(i);
}
nvinfer1::ITensor* fake_pos_id_tensor = Add1DConstantLayer(pos_id);
nvinfer1::ITensor* length_tensor =
GetEleTensorOfShape(input_shape_tensor, 1);
auto pos_id_layer =
TRT_ENGINE_ADD_LAYER(engine_,
ElementWise,
*fake_pos_id_tensor,
*length_tensor,
nvinfer1::ElementWiseOperation::kPROD);
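        // fake_pos_id_tensor is the constant [0, 1, ..., 499]; scaling it
        // by the runtime length S gives [0, S, 2S, ...]. Since every
        // sequence in the padded batch shares the same length, these are
        // the cumulative offsets the plugin needs; the slice below trims
        // them to batch + 1 entries.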
// size = batch + 1;
nvinfer1::ITensor* batch_tensor =
GetEleTensorOfShape(input_shape_tensor, 0);
std::vector<int> const_data = {1};
nvinfer1::ITensor* const_tensor = Add1DConstantLayer(const_data);
auto size_layer =
TRT_ENGINE_ADD_LAYER(engine_,
ElementWise,
*batch_tensor,
*const_tensor,
nvinfer1::ElementWiseOperation::kSUM);
// get size(batch + 1) data from pos_id_tensor
nvinfer1::Dims start;
nvinfer1::Dims stride;
nvinfer1::Dims size;
start.nbDims = 1;
stride.nbDims = 1;
size.nbDims = 1;
start.d[0] = 0;
stride.d[0] = 1;
size.d[0] = 1;
auto* slice_pos_layer = TRT_ENGINE_ADD_LAYER(
engine_, Slice, *pos_id_layer->getOutput(0), start, size, stride);
slice_pos_layer->setInput(2, *size_layer->getOutput(0));
plugin_inputs.emplace_back(slice_pos_layer->getOutput(0));
// input_3 for plugin
std::vector<int> data(500, 1);
nvinfer1::ITensor* fake_max_seqlen_tensor = Add1DConstantLayer(data);
auto* slice_max_layer = TRT_ENGINE_ADD_LAYER(
engine_, Slice, *fake_max_seqlen_tensor, start, size, stride);
slice_max_layer->setInput(2, *length_tensor);
plugin_inputs.emplace_back(slice_max_layer->getOutput(0));
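        // Only the length of this slice matters: the all-ones constant is
        // cut to S elements so the plugin can read the max sequence length
        // off the tensor shape; the values themselves should never be
        // consumed.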
// plugin_layer
auto plugin_layer = engine_->network()->addPluginV2(
plugin_inputs.data(), plugin_inputs.size(), *plugin);
// add shuffle
auto* reshape_after_mha_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *plugin_layer->getOutput(0));
std::vector<nvinfer1::ITensor*> reshape_tensor;
reshape_tensor.push_back(batch_tensor);
reshape_tensor.push_back(length_tensor);
reshape_tensor.push_back(Add1DConstantLayer(-1));
reshape_after_mha_layer->setInput(1, *Concat(reshape_tensor));
reshape_after_mha_layer->setName(
("shuffle_last_multihead_matmul(Output: " + output_name + ")")
.c_str());
// return
layer = reshape_after_mha_layer;
} else {
PADDLE_ENFORCE_EQ(
input->getDimensions().nbDims,
3,
platform::errors::InvalidArgument(
"The Input dim of the MultiheadMatMul should be 3, "
"but it's (%d) now.",
input->getDimensions().nbDims));
// transpose weight_data from m * n to n * m
auto* input_bias_qk =
engine_->GetITensor(op_desc.Input("BiasQK").front());
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<size_t>(weight_t->numel())};
weight.dims.assign({n, m});
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
static_cast<void*>(bias_data),
static_cast<size_t>(bias_t->numel())};
// add shuffle before fc
std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
nvinfer1::ITensor* input_shape_tensor = Shape(input);
for (int i = 0; i < 5; i++) {
reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
}
for (int i = 0; i < 3; i++) {
reshape_before_fc_shape_tensor[i] =
GetEleTensorOfShape(input_shape_tensor, i);
}
auto* reshape_before_fc_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
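        // If the op carries an input quantization scale, stamp it onto the
        // shuffle output so the int8 dynamic range survives the reshape.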
if (op_desc.HasAttr("Input_scale")) {
engine_->SetTensorDynamicRange(reshape_before_fc_layer->getOutput(0),
in_scale);
}
reshape_before_fc_layer->setInput(
1, *Concat(reshape_before_fc_shape_tensor));
reshape_before_fc_layer->setName(
("shuffle_before_multihead_mamul(Output: " + output_name + ")")
.c_str());
// add layer fc
nvinfer1::ILayer* fc_layer = nullptr;
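        // With an Input_scale (int8 mode) the projection is built as a 1x1
        // Convolution instead of FullyConnected; on the reshaped
        // [b, s, hidden, 1, 1] input the two are equivalent, and the conv
        // form presumably lets TensorRT select int8 kernels.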
if (op_desc.HasAttr("Input_scale")) {
nvinfer1::DimsHW nv_ksize(1, 1);
fc_layer =
TRT_ENGINE_ADD_LAYER(engine_,
Convolution,
*reshape_before_fc_layer->getOutput(0),
n,
nv_ksize,
weight.get(),
bias.get());
} else {
fc_layer =
TRT_ENGINE_ADD_LAYER(engine_,
FullyConnected,
*reshape_before_fc_layer->getOutput(0),
n,
weight.get(),
bias.get());
}
if (op_desc.HasAttr("fc_out_threshold")) {
          PADDLE_ENFORCE_EQ(
              op_desc.HasAttr("fc_out_threshold"),
              true,
              platform::errors::InvalidArgument(
                  "must have out threshold in multihead layers in int8 mode"));
float out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
}
fc_layer->setName(
("multihead_mamul_fc(Output: " + output_name + ")").c_str());
"The Input dim of the MultiheadMatMul should be 3, "
"but it's (%d) now.",
input->getDimensions().nbDims));
// transpose weight_data from m * n to n * m
auto* input_bias_qk =
engine_->GetITensor(op_desc.Input("BiasQK").front());
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<size_t>(weight_t->numel())};
weight.dims.assign({n, m});
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
static_cast<void*>(bias_data),
static_cast<size_t>(bias_t->numel())};
// add shuffle before fc
std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
nvinfer1::ITensor* input_shape_tensor = Shape(input);
for (int i = 0; i < 5; i++) {
reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
}
for (int i = 0; i < 3; i++) {
reshape_before_fc_shape_tensor[i] =
GetEleTensorOfShape(input_shape_tensor, i);
}
auto* reshape_before_fc_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
if (op_desc.HasAttr("Input_scale")) {
engine_->SetTensorDynamicRange(
reshape_before_fc_layer->getOutput(0), in_scale);
}
reshape_before_fc_layer->setInput(
1, *Concat(reshape_before_fc_shape_tensor));
reshape_before_fc_layer->setName(
("shuffle_before_multihead_mamul(Output: " + output_name + ")")
.c_str());
// add layer fc
nvinfer1::ILayer* fc_layer = nullptr;
if (op_desc.HasAttr("Input_scale")) {
nvinfer1::DimsHW nv_ksize(1, 1);
fc_layer =
TRT_ENGINE_ADD_LAYER(engine_,
Convolution,
*reshape_before_fc_layer->getOutput(0),
n,
nv_ksize,
weight.get(),
bias.get());
} else {
fc_layer =
TRT_ENGINE_ADD_LAYER(engine_,
FullyConnected,
*reshape_before_fc_layer->getOutput(0),
n,
weight.get(),
bias.get());
}
if (op_desc.HasAttr("fc_out_threshold")) {
PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"),
true,
platform::errors::InvalidArgument(
"must have out threshold in multihead layers "
"in int8 mode"));
float out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
}
fc_layer->setName(
("multihead_mamul_fc(Output: " + output_name + ")").c_str());
        // no need to add shuffle after fc, just change it in
        // QkvToContextPluginDynamic
        // add qkv to context
        int head_size = hidden_out / head_number;
        float scale = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha"));
        std::vector<nvinfer1::ITensor*> plugin_inputs;
        plugin_inputs.push_back(fc_layer->getOutput(0));
        plugin_inputs.push_back(input_bias_qk);
        bool with_fp16 =
            engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
        if (engine_->precision() == AnalysisConfig::Precision::kInt8) {
          with_fp16 = true;
        }
        plugin::DynamicPluginTensorRT* plugin =
            new plugin::QkvToContextPluginDynamic(
                hidden_in, head_number, head_size, scale, with_fp16);
        layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin);
      }
    }
} else {
PADDLE_THROW(platform::errors::Fatal(
......