Unverified commit ba82757e, authored by Shang Zhizhou, committed by GitHub

fix ernie fc shape error (#31803)

* fix conflict

* fix compile error

* cherry-pick #31316

* Refine cudnn softmax (#25757)

* refine cudnn softmax

* Trt elementwise plugin serialize (#31587)

* add serialize unittest

* fix element_op trt plugin serialize bug

* remove PassVersionChecker.IsCompatible

* fix unittest
Co-authored-by: Pei Yang <peiyang@baidu.com>
Co-authored-by: GaoWei8 <53294385+GaoWei8@users.noreply.github.com>
Parent commit: 28c36d86
@@ -147,11 +147,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   std::set<std::string> output_names;
   std::set<std::string> output_names_with_id;
-  std::vector<int> origin_output_dims;
+  std::map<std::string, int> origin_name_output_dims;
   for (auto *x : node->outputs) {
     output_names.insert(x->Name());
     output_names_with_id.insert(x->Name() + std::to_string(x->id()));
-    origin_output_dims.push_back(x->Var()->GetShape().size());
+    origin_name_output_dims[x->Name()] = x->Var()->GetShape().size();
   }
   std::unordered_map<std::string, std::string> output_name_map;
@@ -195,9 +195,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // output_mapping help us copy the data from the renamed ITensor
   // to Tensor.
   std::vector<std::string> output_mapping;
+  std::vector<int> renamed_output_dims;
   for (auto name : output_names) {
     PADDLE_ENFORCE(output_name_map.count(name) != 0);
     output_mapping.push_back(output_name_map[name]);
+    renamed_output_dims.push_back(origin_name_output_dims[name]);
   }
   PADDLE_ENFORCE(!output_mapping.empty());
   PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
@@ -217,7 +219,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
   op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
   op_desc->SetAttr("output_name_mapping", output_mapping);
-  op_desc->SetAttr("origin_output_dims", origin_output_dims);
+  op_desc->SetAttr("origin_output_dims", renamed_output_dims);
   op_desc->SetAttr("parameters", params);
   // we record all inputs' shapes in attr to check if they are consistent
...
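Note on the hunk above: `output_names` is a `std::set<std::string>`, so the op attributes are emitted in name order, while the old `origin_output_dims` vector was filled in graph-node order; keying the ranks by name keeps `origin_output_dims` aligned with `output_name_mapping`. A self-contained toy (the tensor names and ranks are made up, not from the patch) showing the misalignment the old code could produce:

```cpp
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Hypothetical subgraph outputs in graph-node order: "mask" (rank 3) is
  // visited before "logits" (rank 2).
  std::vector<std::pair<std::string, int>> graph_outputs = {{"mask", 3},
                                                            {"logits", 2}};

  std::set<std::string> output_names;
  std::vector<int> dims_in_graph_order;     // old: pushed while visiting nodes
  std::map<std::string, int> dims_by_name;  // new: keyed by output name
  for (const auto& out : graph_outputs) {
    output_names.insert(out.first);
    dims_in_graph_order.push_back(out.second);
    dims_by_name[out.first] = out.second;
  }

  // The attribute is written while iterating the name-ordered set, so the old
  // vector pairs "logits" with rank 3 and "mask" with rank 2.
  size_t i = 0;
  for (const auto& name : output_names) {
    std::cout << name << ": old=" << dims_in_graph_order[i++]
              << " new=" << dims_by_name[name] << "\n";
  }
  return 0;
}
```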
@@ -133,7 +133,69 @@ class FcOpConverter : public OpConverter {
                                   static_cast<size_t>(bias_num)};

     if (engine_->with_dynamic_shape()) {
-      regist_fc(X, n_output, weight, bias);
+      // not NCHW layout, but NLP layout with added 'x 1 x 1'
+      auto x_dim = X->getDimensions();
+      if (x_dim.nbDims == 3 || x_dim.nbDims == 2) {
+        auto output_name = op_desc.Output("Out").front();
+        // add shuffle before fc
+        nvinfer1::Dims reshape_before_fc_dim;
+        reshape_before_fc_dim.nbDims = x_dim.nbDims + 2;
+        for (int i = 0; i < x_dim.nbDims; i++) {
+          reshape_before_fc_dim.d[i] = 0;
+        }
+        reshape_before_fc_dim.d[x_dim.nbDims] = 1;
+        reshape_before_fc_dim.d[x_dim.nbDims + 1] = 1;
+        auto* reshape_before_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
+        reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
+        reshape_before_fc_layer->setName(
+            ("shuffle_before_fc(Output: " + output_name + ")").c_str());
+        // add fc layer
+        auto* fc_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0),
+            n_output, weight.get(), bias.get());
+        fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str());
+        // add shuffle after fc
+        nvinfer1::Dims reshape_after_fc_dim;
+        if (x_dim.nbDims == 3) {
+          if (x_num_col_dims == 2) {
+            reshape_after_fc_dim.nbDims = 3;
+            reshape_after_fc_dim.d[0] = 0;
+            reshape_after_fc_dim.d[1] = 0;
+            reshape_after_fc_dim.d[2] = 0;
+          } else {
+            reshape_after_fc_dim.nbDims = 2;
+            reshape_after_fc_dim.d[0] = 0;
+            auto dim = fc_layer->getOutput(0)->getDimensions();
+            reshape_after_fc_dim.d[1] = dim.d[1] * dim.d[2];
+          }
+          // x_dim.nbDims == 2
+        } else {
+          reshape_after_fc_dim.nbDims = 2;
+          reshape_after_fc_dim.d[0] = 0;
+          reshape_after_fc_dim.d[1] = 0;
+        }
+        auto* reshape_after_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0));
+        reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim);
+        if (activation_type == "relu") {
+          reshape_after_fc_layer->setName(
+              ("shuffle_after_fc(Output: " + output_name + ")").c_str());
+          nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER(
+              engine_, Activation, *(reshape_after_fc_layer->getOutput(0)),
+              nvinfer1::ActivationType::kRELU);
+          RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle",
+                                   {output_name}, test_mode);
+        } else {
+          RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc",
+                                   {output_name}, test_mode);
+        }
+      } else {
+        regist_fc(X, n_output, weight, bias);
+      }
       return;
     }
     // in order to handle situations in NLP models(input dims < 3,
@@ -143,12 +205,6 @@ class FcOpConverter : public OpConverter {
     auto input_d = X->getDimensions().d;
     int reshape_dim3[3] = {0};
     int reshape_dim4[4] = {0};
-    PADDLE_ENFORCE_EQ(
-        x_num_col_dims == 1 || x_num_col_dims == 2, true,
-        platform::errors::InvalidArgument(
-            "Wrong x_num_col_dims param of op mul. Paddle-TRT FC converter "
-            "expects x_num_col_dims is either 1 or 2, but got %d",
-            x_num_col_dims));
     PADDLE_ENFORCE_LE(x_num_col_dims, input_dims,
                       platform::errors::InvalidArgument(
                           "Params and input dims mismatch. Paddle-TRT FC "
...
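Note on the hunk above: in dynamic-shape mode the converter now keeps 2-D/3-D (NLP-style) FC inputs on TensorRT by padding two trailing unit dimensions before FullyConnected, which multiplies over the last three axes, and collapsing them again afterwards. A minimal sketch of that dimension bookkeeping (not the converter itself; in an IShuffleLayer reshape a 0 entry means "copy this dimension from the input", so no concrete batch or sequence length is needed at build time):

```cpp
#include <cstdio>
#include <vector>

// Dims for the shuffle inserted before FullyConnected: keep the original dims
// (0 = copy from input) and append two unit dims.
std::vector<int> ReshapeBeforeFc(int nb_dims) {
  std::vector<int> d(nb_dims + 2, 0);
  d[nb_dims] = 1;
  d[nb_dims + 1] = 1;
  return d;
}

// Dims for the shuffle after FC (simplified: the 3-D, x_num_col_dims == 1 case
// in the real converter folds dim1 * dim2 instead of using zeros).
std::vector<int> ReshapeAfterFc(int nb_dims, int x_num_col_dims) {
  if (nb_dims == 3 && x_num_col_dims == 2) return {0, 0, 0};
  return {0, 0};
}

int main() {
  // An ERNIE-like 3-D input [batch, seq, hidden] with x_num_col_dims == 2:
  // [B, S, C] -> [B, S, C, 1, 1] -> FullyConnected -> [B, S, n_output, 1, 1]
  // -> [B, S, n_output].
  auto before = ReshapeBeforeFc(3);
  auto after = ReshapeAfterFc(3, 2);
  printf("before-FC rank: %zu, after-FC rank: %zu\n", before.size(),
         after.size());
  return 0;
}
```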
@@ -8,8 +8,8 @@ http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See
+the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
@@ -28,7 +28,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
                "network structure";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
-    // Shouble be a 5 dims tensor.
     auto* input = engine_->GetITensor(op_desc.Input("Input").front());

     // fc weights and fc bias
@@ -69,6 +68,7 @@ class MultiheadMatMulOpConverter : public OpConverter {
     int head_number = boost::get<int>(op_desc.GetAttr("head_number"));

     nvinfer1::ILayer* layer = nullptr;
+    auto output_name = op_desc.Output("Out")[0];

     if (engine_->with_dynamic_shape()) {
       if (engine_->use_oss()) {
@@ -170,6 +170,12 @@ class MultiheadMatMulOpConverter : public OpConverter {
             plugin_inputs.data(), plugin_inputs.size(), *plugin);
         layer = plugin_layer;
       } else {
+        PADDLE_ENFORCE_EQ(
+            input->getDimensions().nbDims, 3,
+            platform::errors::InvalidArgument(
+                "The Input dim of the MultiheadMatMul should be 3, "
+                "but it's (%d) now.",
+                input->getDimensions().nbDims));
         // transpose weight_data from m * n to n * m
         auto* input_bias_qk =
             engine_->GetITensor(op_desc.Input("BiasQK").front());
@@ -183,15 +189,37 @@ class MultiheadMatMulOpConverter : public OpConverter {
                                       static_cast<void*>(bias_data),
                                       static_cast<size_t>(bias_t->numel())};

-        auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
-                                              n, weight.get(), bias.get());
-        auto* fc_out = fc_layer->getOutput(0);
+        // add shuffle before fc
+        nvinfer1::Dims reshape_before_fc_dim;
+        reshape_before_fc_dim.nbDims = 5;
+        reshape_before_fc_dim.d[0] = 0;
+        reshape_before_fc_dim.d[1] = 0;
+        reshape_before_fc_dim.d[2] = 0;
+        reshape_before_fc_dim.d[3] = 1;
+        reshape_before_fc_dim.d[4] = 1;
+        auto* reshape_before_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+        reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
+        reshape_before_fc_layer->setName(
+            ("shuffle_before_multihead_mamul(Output: " + output_name + ")")
+                .c_str());
+        // add layer fc
+        auto* fc_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n,
+            weight.get(), bias.get());
+        fc_layer->setName(
+            ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
+        // no need to add shuffle after fc, just change it in
+        // QkvToContextPluginDynamic
         // add qkv to context
         int head_size = hidden_out / head_number;
         float scale = boost::get<float>(op_desc.GetAttr("alpha"));

         std::vector<nvinfer1::ITensor*> plugin_inputs;
-        plugin_inputs.push_back(fc_out);
+        plugin_inputs.push_back(fc_layer->getOutput(0));
         plugin_inputs.push_back(input_bias_qk);
         bool with_fp16 =
             engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
@@ -207,7 +235,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
         "You can use the config.SetTRTDynamicShapeInfo(...) interface to set "
         "the shape information to run the dynamic shape mode."));
   }
-  auto output_name = op_desc.Output("Out")[0];
   RreplenishLayerAndOutput(layer, "multihead_matmul", {output_name},
                            test_mode);
#else
...
@@ -48,6 +48,8 @@ class ScaleOpConverter : public OpConverter {
       return tmp_data;
     };

+    int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0;
+
     float* bias_ptr = create_weights(bias, "bias");
     float* scale_ptr = create_weights(scale, "scale");
@@ -60,19 +62,22 @@ class ScaleOpConverter : public OpConverter {
     nvinfer1::ILayer* layer = nullptr;

     auto input_dim = input->getDimensions();
-    PADDLE_ENFORCE_GE(input_dim.nbDims, 3,
-                      platform::errors::Fatal(
-                          "Paddle-TRT scale mode only support dimension >= 3"));

     nvinfer1::IShuffleLayer* expand_layer = nullptr;
     nvinfer1::IShuffleLayer* squeeze_layer = nullptr;

-    if (input_dim.nbDims == 3) {
-      // TensorRT scale layer is not supporting input dims < 4 when using
-      // explicit batch
+    if (input_dim.nbDims < 3 + dynamic_shape_offset) {
+      nvinfer1::Dims expand_shape;
+      expand_shape.nbDims = 3 + dynamic_shape_offset;
+      for (int i = 0; i < 3 + dynamic_shape_offset; i++) {
+        if (i < input_dim.nbDims) {
+          expand_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i];
+        } else {
+          expand_shape.d[i] = 1;
+        }
+      }
       expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-      nvinfer1::Dims4 target_shape(0, 0, 0, 1);  // expand 1 dims
-      expand_layer->setReshapeDimensions(target_shape);
+      expand_layer->setReshapeDimensions(expand_shape);
       input = expand_layer->getOutput(0);
     }
@@ -94,13 +99,15 @@ class ScaleOpConverter : public OpConverter {
     PADDLE_ENFORCE_EQ(layer != nullptr, true,
                       platform::errors::Fatal("Create scale layer failed."));

-    if (input_dim.nbDims == 3) {
-      // TensorRT scale layer is not supporting input dims < 4 when using
-      // explicit batch
+    if (input_dim.nbDims < 3 + dynamic_shape_offset) {
+      nvinfer1::Dims squeeze_shape;
+      squeeze_shape.nbDims = input_dim.nbDims;
+      for (int i = 0; i < squeeze_shape.nbDims; i++) {
+        squeeze_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i];
+      }
       squeeze_layer =
           TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0)));
-      nvinfer1::Dims3 target_shape(0, 0, 0);  // expand 1 dims
-      squeeze_layer->setReshapeDimensions(target_shape);
+      squeeze_layer->setReshapeDimensions(squeeze_shape);
       layer = static_cast<nvinfer1::ILayer*>(squeeze_layer);
     }
     RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode);
...
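Note on the hunk above: instead of rejecting inputs with fewer than 3 (+1 when the batch dimension is explicit) dims, the scale converter now reshapes up before the scale layer and squeezes back afterwards. A small sketch of the padding rule under assumed shapes (negative extents stand for dynamic dimensions and become 0, i.e. "copy from input", in the reshape):

```cpp
#include <cstdio>
#include <vector>

// Pad a shape up to 3 (+1 when the batch dim is explicit) dims with trailing
// ones; unknown (negative) extents are encoded as 0 so the shuffle copies
// them from the runtime input.
std::vector<int> ExpandForScale(const std::vector<int>& in,
                                bool dynamic_shape) {
  const int target = 3 + (dynamic_shape ? 1 : 0);
  std::vector<int> out;
  for (int i = 0; i < target; ++i) {
    if (i < static_cast<int>(in.size()))
      out.push_back(in[i] < 0 ? 0 : in[i]);
    else
      out.push_back(1);
  }
  return out;
}

int main() {
  // A hypothetical 2-D dynamic-shape input [-1, 512] becomes {0, 512, 1, 1};
  // after the scale layer the squeeze shuffle restores the original rank.
  for (int d : ExpandForScale({-1, 512}, /*dynamic_shape=*/true))
    printf("%d ", d);
  printf("\n");
  return 0;
}
```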
@@ -50,6 +50,7 @@ class SoftMaxOpConverter : public OpConverter {
     uint32_t axes = std::max(0, input_dims - 3);
     // TODO(cryoco): Poor workaround. Fix padded dims problem when TRT layers
     // support Nd.
+    // Tips: Dynammic shape alreay fixes.
     int padded_dims = 0;
     int explicit_batch = 0;
     if (engine_->with_dynamic_shape()) explicit_batch = 1;
@@ -61,16 +62,16 @@ class SoftMaxOpConverter : public OpConverter {
       }
     }
     if (!engine_->with_dynamic_shape()) {
-      if (axis == -1) {
-        axes = input_dims - 1 - padded_dims;
+      if (axis < 0) {
+        axes = input_dims + axis - padded_dims;
       } else {
-        axes = axis;
+        axes = axis - 1;
       }
     } else {
-      if (axis == -1) {
-        axes = input_dims - 1 - padded_dims;
+      if (axis < 0) {
+        axes = input_dims + axis;
       } else {
-        axes = axis + 1;
+        axes = axis;
       }
     }
     layer->setAxes(1 << axes);
...
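Note on the hunk above: TensorRT's softmax layer takes a bitmask of axes rather than a single axis attribute, and the batch dimension is implicit in static-shape mode but explicit in dynamic-shape mode, which is why the two branches differ. A hedged sketch of the remapping (it mirrors the new branch; the padded_dims bookkeeping is simplified):

```cpp
#include <cstdint>
#include <cstdio>

// Convert a Paddle softmax `axis` into the bitmask passed to setAxes().
uint32_t SoftmaxAxesMask(int axis, int input_dims, bool dynamic_shape,
                         int padded_dims = 0) {
  uint32_t axes;
  if (!dynamic_shape) {
    // implicit batch: TensorRT dims are shifted by one w.r.t. Paddle dims
    axes = axis < 0 ? input_dims + axis - padded_dims : axis - 1;
  } else {
    // explicit batch: Paddle and TensorRT dims line up
    axes = axis < 0 ? input_dims + axis : axis;
  }
  return 1u << axes;  // what the converter hands to layer->setAxes()
}

int main() {
  // e.g. a 3-D [batch, seq, hidden] input in dynamic-shape mode with axis = -1
  // resolves to bit 2 (the hidden dimension).
  printf("0x%x\n", SoftmaxAxesMask(-1, 3, /*dynamic_shape=*/true));
  return 0;
}
```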
@@ -131,6 +131,18 @@ bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc,
         }
       }
     }
+    if (op_type == "fc" || op_type == "mul") {
+      const int x_num_col_dims =
+          desc.HasAttr("x_num_col_dims")
+              ? boost::get<int>(desc.GetAttr("x_num_col_dims"))
+              : (desc.HasAttr("in_num_col_dims")
+                     ? boost::get<int>(desc.GetAttr("in_num_col_dims"))
+                     : 1);
+      if (x_num_col_dims != 1 && x_num_col_dims != 2) {
+        return false;
+      }
+    }
     if ((*teller)(op_type, desc, use_no_calib_int8)) return true;
   }
   return false;
...
@@ -127,9 +127,14 @@ int ElementWisePlugin::enqueue(int batch_size, const void *const *inputs,
 int ElementwisePluginDynamic::initialize() { return 0; }

-size_t ElementwisePluginDynamic::getSerializationSize() const { return 0; }
+size_t ElementwisePluginDynamic::getSerializationSize() const {
+  return SerializedSize(type_.c_str()) + SerializedSize(axis_);
+}

-void ElementwisePluginDynamic::serialize(void *buffer) const {}
+void ElementwisePluginDynamic::serialize(void *buffer) const {
+  SerializeValue(&buffer, type_.c_str());
+  SerializeValue(&buffer, axis_);
+}

 nvinfer1::DimsExprs ElementwisePluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
...
@@ -92,7 +92,12 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT {
  public:
  explicit ElementwisePluginDynamic(const std::string& type, int axis)
      : type_(type), axis_(axis) {}
-  ElementwisePluginDynamic(void const* serialData, size_t serialLength) {}
+  ElementwisePluginDynamic(void const* serialData, size_t serialLength) {
+    const char* elementwise_type;
+    DeserializeValue(&serialData, &serialLength, &elementwise_type);
+    type_ = std::string(elementwise_type);
+    DeserializeValue(&serialData, &serialLength, &axis_);
+  }
  nvinfer1::IPluginV2DynamicExt* clone() const override {
    return new ElementwisePluginDynamic(type_, axis_);
  }
@@ -138,6 +143,46 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT {
  std::string type_;
  int axis_;
};
+
+class ElementwisePluginV2Creator : public nvinfer1::IPluginCreator {
+ public:
+  ElementwisePluginV2Creator() {}
+  const char* getPluginName() const override { return "elementwise_plugin"; }
+
+  const char* getPluginVersion() const override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() override {
+    return &field_collection_;
+  }
+
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
+    return nullptr;
+  }
+
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serial_data,
+                                         size_t serial_length) override {
+    auto plugin = new ElementwisePluginDynamic(serial_data, serial_length);
+    return plugin;
+  }
+
+  void setPluginNamespace(const char* lib_namespace) override {
+    plugin_namespace_ = lib_namespace;
+  }
+
+  const char* getPluginNamespace() const override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;
+  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
+  std::vector<nvinfer1::PluginField> plugin_attributes_;
+};
+
+REGISTER_TRT_PLUGIN_V2(ElementwisePluginV2Creator);
+
 #endif

 }  // namespace plugin
...
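Note on the two elementwise-plugin hunks above: with the previously empty serialize()/getSerializationSize() bodies, an engine reloaded from disk rebuilt ElementwisePluginDynamic without its type_ and axis_ fields; the new serialization code plus the registered ElementwisePluginV2Creator fix that. The toy program below uses hand-rolled helpers, not Paddle's SerializedSize/SerializeValue, purely to illustrate the length-prefixed round trip the plugin now performs:

```cpp
#include <cassert>
#include <cstring>
#include <string>
#include <vector>

// Toy stand-ins for SerializeValue: a length-prefixed string followed by a
// raw int, mimicking what serialize() writes for type_ and axis_.
static void PutString(std::vector<char>* buf, const std::string& s) {
  int n = static_cast<int>(s.size());
  const char* p = reinterpret_cast<const char*>(&n);
  buf->insert(buf->end(), p, p + sizeof(n));
  buf->insert(buf->end(), s.begin(), s.end());
}
static void PutInt(std::vector<char>* buf, int v) {
  const char* p = reinterpret_cast<const char*>(&v);
  buf->insert(buf->end(), p, p + sizeof(v));
}

int main() {
  // "serialize": what the engine file now carries for this plugin.
  std::vector<char> buffer;
  PutString(&buffer, "add");  // type_
  PutInt(&buffer, -1);        // axis_

  // "deserialize": what the new (const void*, size_t) constructor reads back.
  const char* p = buffer.data();
  int len = 0;
  std::memcpy(&len, p, sizeof(len));
  p += sizeof(len);
  std::string type(p, p + len);
  p += len;
  int axis = 0;
  std::memcpy(&axis, p, sizeof(axis));

  assert(type == "add" && axis == -1);  // fields survive the engine reload
  return 0;
}
```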
@@ -182,12 +182,10 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
                         "but it's (%d)",
                         output_index));
   nvinfer1::DimsExprs ret;
-  ret.nbDims = 5;
+  ret.nbDims = 3;
   ret.d[0] = inputs[0].d[0];
   ret.d[1] = inputs[0].d[1];
   ret.d[2] = expr_builder.constant(hidden_size_);
-  ret.d[3] = expr_builder.constant(1);
-  ret.d[4] = expr_builder.constant(1);
   return ret;
 }
...
@@ -158,12 +158,10 @@ nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions(
                         "it has (%d) inputs",
                         nb_inputs));
   nvinfer1::DimsExprs ret;
-  ret.nbDims = 5;
+  ret.nbDims = 3;
   ret.d[0] = inputs[0].d[0];
   ret.d[1] = inputs[0].d[1];
   ret.d[2] = expr_builder.constant(head_size_ * head_number_);
-  ret.d[3] = expr_builder.constant(1);
-  ret.d[4] = expr_builder.constant(1);
   return ret;
 }
...
@@ -43,11 +43,6 @@ int SkipLayerNormPluginDynamic::initialize() {
 nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
     nvinfer1::IExprBuilder &expr_builder) {
-  PADDLE_ENFORCE_EQ(
-      inputs[0].nbDims, 5,
-      platform::errors::InvalidArgument(
-          "The Input dim of the SkipLayernorm should be 5, but it's (%d) now.",
-          inputs[0].nbDims));
   return inputs[0];
 }
...
@@ -62,6 +62,8 @@ nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions(
   output.d[1] = one;
   output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB,
                                        *inputs[1].d[0], *one);
+  // remove padding 1
+  output.nbDims -= 2;

   return output;
 }
...
@@ -126,7 +126,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
   run(config, &out_data);

   for (size_t i = 0; i < out_data.size(); i++) {
-    EXPECT_NEAR(result[i], out_data[i], 1e-4);
+    EXPECT_NEAR(result[i], out_data[i], 3e-3);
   }
 }
...
@@ -12,60 +12,90 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/softmax_op.h"
+#include "paddle/fluid/platform/cudnn_desc.h"
+#include "paddle/fluid/platform/cudnn_helper.h"

 namespace paddle {
 namespace operators {

+using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
+using DataLayout = platform::DataLayout;
 using Tensor = framework::Tensor;

+static inline int SizeOutAxis(const int axis, DDim dims) {
+  int size = 1;
+  for (int i = axis + 1; i < dims.size(); i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
 template <typename T>
 class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<Tensor>("X");
-    auto* Out = context.Output<Tensor>("Out");
-
-    // allocate memory on device.
-    Out->mutable_data<T>(context.GetPlace());
-
-    auto dims = X->dims();
-    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
-    framework::LoDTensor flattened_x;
-    framework::LoDTensor flattened_out;
-    flattened_x.ShareDataWith(*X).Resize(flattened_dims);
-    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
-
-    math::SoftmaxCUDNNFunctor<T>()(
-        context.template device_context<platform::CUDADeviceContext>(),
-        &flattened_x, &flattened_out);
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    auto* out_data = out->data<T>();
+
+    auto dims = x->dims();
+    const int rank = dims.size();
+    const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
+    const int dim = dims[axis];
+    const int N = SizeToAxis(axis, dims);
+    const int D = SizeOutAxis(axis, dims);
+
+    ScopedTensorDescriptor desc;
+    std::vector<int> tensor_dims = {N, dim, D, 1};
+    DataLayout layout = DataLayout::kNCHW;
+    cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
+
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
+                                 : CUDNN_SOFTMAX_MODE_CHANNEL;
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward(
+        handle, CUDNN_SOFTMAX_ACCURATE, mode,
+        platform::CudnnDataType<T>::kOne(), desc_, x->data<T>(),
+        platform::CudnnDataType<T>::kZero(), desc_, out_data));
   }
 };

 template <typename T>
 class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* Out = context.Input<Tensor>("Out");
-    auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
-
-    // allocate memory on device.
-    dX->mutable_data<T>(context.GetPlace());
-
-    auto dims = Out->dims();
-    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
-    framework::LoDTensor flattened_out;
-    framework::LoDTensor flattened_d_out;
-    framework::LoDTensor flattened_d_x;
-    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
-    flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
-    flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
-
-    math::SoftmaxGradCUDNNFunctor<T>()(
-        context.template device_context<platform::CUDADeviceContext>(),
-        &flattened_out, &flattened_d_out, &flattened_d_x);
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    dx->mutable_data<T>(ctx.GetPlace());
+    auto* dx_data = dx->data<T>();
+
+    auto dims = out->dims();
+    const int rank = dims.size();
+    const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
+    const int dim = dims[axis];
+    const int N = SizeToAxis(axis, dims);
+    const int D = SizeOutAxis(axis, dims);
+
+    ScopedTensorDescriptor desc;
+    std::vector<int> tensor_dims = {N, dim, D, 1};
+    DataLayout layout = DataLayout::kNCHW;
+    cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
+
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
+                                 : CUDNN_SOFTMAX_MODE_CHANNEL;
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
+        handle, CUDNN_SOFTMAX_ACCURATE, mode,
+        platform::CudnnDataType<T>::kOne(), desc_, out->data<T>(), desc_,
+        dout->data<T>(), platform::CudnnDataType<T>::kZero(), desc_, dx_data));
   }
 };
...
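Note on the hunk above: the refined kernels fold any rank and axis into the 4-D {N, dim, D, 1} descriptor that cuDNN expects (N collapses the leading dims, dim is the softmax axis, D collapses the trailing dims), choosing CUDNN_SOFTMAX_MODE_INSTANCE for the last axis and CUDNN_SOFTMAX_MODE_CHANNEL otherwise; this is what lets the axis restriction be dropped in the next hunk. A small standalone sketch of the folding (independent of Paddle's DDim/CanonicalAxis helpers):

```cpp
#include <cstdio>
#include <vector>

struct FoldedDims {
  int N, dim, D;
};

// Fold an arbitrary shape/axis into the {N, dim, D} triple described to cuDNN
// as an {N, dim, D, 1} NCHW tensor.
FoldedDims Fold(const std::vector<int>& dims, int axis) {
  if (axis < 0) axis += static_cast<int>(dims.size());  // canonicalize axis
  FoldedDims f{1, dims[axis], 1};
  for (int i = 0; i < axis; ++i) f.N *= dims[i];
  for (int i = axis + 1; i < static_cast<int>(dims.size()); ++i) f.D *= dims[i];
  return f;
}

int main() {
  // e.g. the [2, 3, 4, 5] test shape with axis = 1 becomes {N=2, dim=3, D=20},
  // i.e. a {2, 3, 20, 1} descriptor handled in CHANNEL mode.
  auto f = Fold({2, 3, 4, 5}, 1);
  printf("N=%d dim=%d D=%d\n", f.N, f.dim, f.D);
  return 0;
}
```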
@@ -53,13 +53,6 @@ class SoftmaxOp : public framework::OperatorWithKernel {
                           "Attr(axis) value should be in range [-R, R-1], "
                           "R is the rank of Input(X)."));

-    auto use_cudnn = ctx->Attrs().Get<bool>("use_cudnn");
-    if (axis != rank_x - 1 && axis != -1) {
-      PADDLE_ENFORCE_EQ(use_cudnn, false,
-                        platform::errors::InvalidArgument(
-                            "CUDNN kernel only support axis as -1."));
-    }
-
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig


class TRTScaleTest(InferencePassTest):
    def setUp(self):
        with fluid.program_guard(self.main_program, self.startup_program):
            data = fluid.data(name="data", shape=[-1, 512], dtype="float32")
            scale_out = self.append_scale(data)
            out = fluid.layers.batch_norm(scale_out, is_test=True)

        self.feeds = {"data": np.random.random([1, 512]).astype("float32"), }
        self.enable_trt = True
        self.trt_parameters = TRTScaleTest.TensorRTParam(
            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
        self.fetch_list = [out]

    def append_scale(self, data):
        return fluid.layers.scale(
            x=data, scale=2.0, bias=-1.0, bias_after_scale=False)

    def test_check_output(self):
        if core.is_compiled_with_cuda():
            use_gpu = True
            self.check_output_with_option(use_gpu, flatten=True)


if __name__ == "__main__":
    unittest.main()
@@ -654,6 +654,56 @@ class TensorRTSubgraphPassElementwiseMulTest(
         return fluid.layers.elementwise_mul(x=data1, y=data2)


+class TensorRTSubgraphPassElementwiseSerializeTest(
+        TensorRTSubgraphPassElementwiseTest):
+    def setUp(self):
+        super(TensorRTSubgraphPassElementwiseSerializeTest, self).setUp()
+        self.trt_parameters = TensorRTSubgraphPassElementwiseTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False)
+
+    def test_check_output(self):
+        if os.path.exists(self.path + "_opt_cache"):
+            shutil.rmtree(self.path + "_opt_cache")
+        super(TensorRTSubgraphPassElementwiseSerializeTest,
+              self).test_check_output()
+
+
+class TensorRTSubgraphPassElementwiseBroadcastDynamicTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data1 = fluid.data(
+                name="data1", shape=[-1, 3, 64, 64], dtype="float32")
+            data2 = fluid.data(name="data2", shape=[64, 64], dtype="float32")
+            eltwise_out = self.append_eltwise(data1, data2)
+            out = fluid.layers.batch_norm(eltwise_out, is_test=True)
+
+        self.feeds = {
+            "data1": np.random.random([1, 3, 64, 64]).astype("float32"),
+            "data2": np.random.random([64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassElementwiseBroadcastDynamicTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False)
+        self.dynamic_shape_params = TensorRTSubgraphPassElementwiseBroadcastDynamicTest.DynamicShapeParam(
+            {
+                'data1': [1, 3, 8, 64],
+                'data2': [8, 64]
+            }, {
+                'data1': [1, 3, 512, 64],
+                'data2': [512, 64]
+            }, {
+                'data1': [1, 3, 256, 64],
+                'data2': [256, 64]
+            }, False)
+        self.fetch_list = [out]
+
+    def append_eltwise(self, data1, data2):
+        return fluid.layers.elementwise_add(x=data1, y=data2)
+
+    def test_check_output(self):
+        if os.path.exists(self.path + "_opt_cache"):
+            shutil.rmtree(self.path + "_opt_cache")
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
+
+
 class TensorRTSubgraphPassShuffleChannelTest(InferencePassTest):
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
...
@@ -153,16 +153,103 @@ class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp):
         return [2, 3, 4, 5]


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp3(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+    def get_axis(self):
+        return 0
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp4(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+    def get_axis(self):
+        return 1
+
+
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp5(TestSoftmaxCUDNNOp):
     def get_x_shape(self):
         return [2, 3, 4, 5]

+    def get_axis(self):
+        return 2
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp6(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
     def get_axis(self):
         return 3


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp7(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp8(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+
+    def get_axis(self):
+        return 0
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp9(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+
+    def get_axis(self):
+        return 1
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp10(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+
+    def get_axis(self):
+        return 2
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp11(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+
+    def get_axis(self):
+        return 3
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp12(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5, 6]
+
+    def get_axis(self):
+        return 4
+
+
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxFP16Op(TestSoftmaxOp):
...