Unverified · Commit 16590799 authored by Wangzheee, committed by GitHub

[Paddle-Inference] Matmul_int8_convert: tensor*tensor (#37285)

* matmul_convert_int8

* matmul_convert_int8

* matmulconvert_int8

* Matmul_int8_convert: tensor*tensor

* Matmul_int8_convert: tensor*tensor

* Matmul_int8_convert: tensor*tensor
Parent 025053b4
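For orientation (not part of the diff): the converter below takes the new INT8 plugin path only when TensorRT INT8 precision is enabled on the inference config. A minimal sketch using the paddle_infer C++ API; the include path, model file names, and shapes are placeholders:

#include <map>
#include <string>
#include <vector>
#include "paddle_inference_api.h"  // placeholder include path

int main() {
  paddle_infer::Config config("model.pdmodel", "model.pdiparams");
  config.EnableUseGpu(100 /* initial GPU memory in MB */, 0 /* device id */);
  // kInt8 precision plus the op's "support_int8" attribute route quantized
  // matmul through MatmulPlugin (static shape) or MatmulPluginDynamic
  // (dynamic shape) instead of the native MatrixMultiply layer.
  config.EnableTensorRtEngine(1 << 30 /* workspace */, 32 /* max batch */,
                              0 /* min subgraph size */,
                              paddle_infer::PrecisionType::kInt8,
                              false /* use_static */,
                              false /* use_calib_mode */);
  // Dynamic-shape setup mirroring the new Python test at the bottom.
  std::map<std::string, std::vector<int>> min_shape{{"data", {1, 28, 28}}};
  std::map<std::string, std::vector<int>> max_shape{{"data", {4, 28, 28}}};
  std::map<std::string, std::vector<int>> opt_shape{{"data", {3, 28, 28}}};
  config.SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);
  auto predictor = paddle_infer::CreatePredictor(config);
  return predictor != nullptr ? 0 : 1;
}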
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h"
namespace paddle {
namespace framework {
......@@ -35,16 +36,26 @@ class MatMulOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(3) << "convert a fluid matmul op to tensorrt mul layer without bias";
VLOG(3) << "convert a fluid matmul op to tensorrt matmul layer ";
framework::OpDesc op_desc(op, nullptr);
nvinfer1::ILayer* layer = nullptr;
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
nvinfer1::Dims dims_x = input1->getDimensions();
nvinfer1::Dims dims_y = input2->getDimensions();
bool transpose_X = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_X"));
bool transpose_Y = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_Y"));
auto output_name = op_desc.Output("Out")[0];
float alpha = 1;
if (op_desc.HasAttr("alpha")) {
float alpha_tem = BOOST_GET_CONST(float, op_desc.GetAttr("alpha"));
alpha = alpha_tem;
}
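// matmul computes Out = alpha * op(X) * op(Y), where op() is the optional
// transpose selected by transpose_X / transpose_Y; alpha defaults to 1 when
// the attribute is absent.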
nvinfer1::MatrixOperation matrix_operation_X =
transpose_X ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
......@@ -52,82 +63,122 @@ class MatMulOpConverter : public OpConverter {
transpose_Y ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
auto* layer =
TRT_ENGINE_ADD_LAYER(engine_, MatrixMultiply, *input1,
matrix_operation_X, *input2, matrix_operation_Y);
float alpha = BOOST_GET_CONST(float, op_desc.GetAttr("alpha"));
auto output_name = op_desc.Output("Out")[0];
if (fabs(alpha - 1.0) < std::numeric_limits<float>::epsilon()) {
engine_->SetITensor(output_name, layer->getOutput(0));
} else {
// IScaleLayer requires the input must have at least
// three dimensions in static shape mode and at least
// four dimensions in dynamic shape mode.
auto* matmul_out = layer->getOutput(0);
nvinfer1::Dims out_shape = matmul_out->getDimensions();
const int out_dims = out_shape.nbDims;
bool need_change_dim = false;
if (op_desc.HasAttr("support_int8") &&
engine_->precision() == AnalysisConfig::Precision::kInt8) {
if (engine_->with_dynamic_shape()) {
if (out_dims == 3) {
need_change_dim = true;
}
VLOG(3) << "Convert a fluid matmul_op_int8_dynamic to TensorRT "
"MatmulPluginLayer";
plugin::MatmulPluginDynamic* plugin =
new plugin::MatmulPluginDynamic(transpose_X, transpose_Y, alpha);
std::vector<nvinfer1::ITensor*> inputs{input1, input2};
layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin);
RreplenishLayerAndOutput(layer, "matmul_op_int8_dynamic", {output_name},
test_mode);
} else {
if (out_dims == 2) {
need_change_dim = true;
}
VLOG(3) << "Convert a fluid matmul_op_int8_static to TensorRT "
"MatmulPluginLayer";
plugin::MatmulPlugin* plugin = new plugin::MatmulPlugin(
dims_x, dims_y, transpose_X, transpose_Y, alpha);
std::vector<nvinfer1::ITensor*> inputs{input1, input2};
layer = engine_->AddPluginV2IOExt(inputs.data(), inputs.size(), plugin);
RreplenishLayerAndOutput(layer, "matmul_op_int8_static", {output_name},
test_mode);
}
if (need_change_dim) {
nvinfer1::Dims reshape_dim;
reshape_dim.nbDims = out_dims + 1;
reshape_dim.d[out_dims] = 1;
for (int i = 0; i < out_dims; i++) {
reshape_dim.d[i] = out_shape.d[i];
} else {
VLOG(3) << "Convert a fluid matmul_op_float to TensorRT ";
layer =
TRT_ENGINE_ADD_LAYER(engine_, MatrixMultiply, *input1,
matrix_operation_X, *input2, matrix_operation_Y);
if (alpha == 1) {
RreplenishLayerAndOutput(layer, "matmul_op_float_no_alpha",
{output_name}, test_mode);
} else {
layer->setName(
("matmul_op_float_has_alpha: MatrixMultiplyLayer (Output: " +
output_name + ")")
.c_str());
// IScaleLayer requires the input must have at least
// three dimensions in static shape mode and at least
// four dimensions in dynamic shape mode.
auto* matmul_out = layer->getOutput(0);
nvinfer1::Dims out_shape = matmul_out->getDimensions();
const int out_dims = out_shape.nbDims;
bool need_change_dim = false;
if (engine_->with_dynamic_shape()) {
if (out_dims == 3) {
need_change_dim = true;
}
} else {
if (out_dims == 2) {
need_change_dim = true;
}
}
auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *matmul_out);
reshape_layer->setReshapeDimensions(reshape_dim);
matmul_out = reshape_layer->getOutput(0);
}
if (need_change_dim) {
nvinfer1::Dims reshape_dim;
reshape_dim.nbDims = out_dims + 1;
reshape_dim.d[out_dims] = 1;
for (int i = 0; i < out_dims; i++) {
reshape_dim.d[i] = out_shape.d[i];
}
auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *matmul_out);
reshape_layer->setReshapeDimensions(reshape_dim);
matmul_out = reshape_layer->getOutput(0);
reshape_layer->setName(("matmul_op_float_has_alpha_reshape_before: "
"ShuffleLayer (Output: " +
output_name + ")")
.c_str());
}
auto create_weights = [&](float data, const std::string& type) -> float* {
std::unique_ptr<framework::Tensor> tmp_tensor(new framework::Tensor());
tmp_tensor->Resize({1});
auto* tmp_data = tmp_tensor->mutable_data<float>(platform::CPUPlace());
tmp_data[0] = data;
engine_->SetWeights(output_name + "_add_scale_op_" + type,
std::move(tmp_tensor));
return tmp_data;
};
float* alpha_data = create_weights(alpha, "alpha");
float* shift_data = create_weights(0.0, "shift");
float* power_data = create_weights(1.0, "power");
TensorRTEngine::Weight nv_alpha{nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data), 1};
TensorRTEngine::Weight nv_shift{nvinfer1::DataType::kFLOAT,
static_cast<void*>(shift_data), 1};
TensorRTEngine::Weight nv_power{nvinfer1::DataType::kFLOAT,
static_cast<void*>(power_data), 1};
auto* scale_layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *matmul_out, nvinfer1::ScaleMode::kUNIFORM,
nv_shift.get(), nv_alpha.get(), nv_power.get());
auto* scale_out = scale_layer->getOutput(0);
if (need_change_dim) {
auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scale_out);
reshape_layer->setReshapeDimensions(out_shape);
scale_out = reshape_layer->getOutput(0);
auto create_weights = [&](float data,
const std::string& type) -> float* {
std::unique_ptr<framework::Tensor> tmp_tensor(
new framework::Tensor());
tmp_tensor->Resize({1});
auto* tmp_data =
tmp_tensor->mutable_data<float>(platform::CPUPlace());
tmp_data[0] = data;
engine_->SetWeights(output_name + "_add_scale_op_" + type,
std::move(tmp_tensor));
return tmp_data;
};
float* alpha_data = create_weights(alpha, "alpha");
float* shift_data = create_weights(0.0, "shift");
float* power_data = create_weights(1.0, "power");
TensorRTEngine::Weight nv_alpha{nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data), 1};
TensorRTEngine::Weight nv_shift{nvinfer1::DataType::kFLOAT,
static_cast<void*>(shift_data), 1};
TensorRTEngine::Weight nv_power{nvinfer1::DataType::kFLOAT,
static_cast<void*>(power_data), 1};
auto* scale_layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *matmul_out, nvinfer1::ScaleMode::kUNIFORM,
nv_shift.get(), nv_alpha.get(), nv_power.get());
auto* scale_out = scale_layer->getOutput(0);
scale_layer->setName(
("matmul_op_float_has_alpha: ScaleLayer (Output: " + output_name +
")")
.c_str());
if (need_change_dim) {
auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scale_out);
reshape_layer->setReshapeDimensions(out_shape);
scale_out = reshape_layer->getOutput(0);
reshape_layer->setName(("matmul_op_float_has_alpha_reshape_after: "
"ShuffleLayer (Output: " +
output_name + ")")
.c_str());
}
engine_->SetITensor(output_name, scale_out);
if (test_mode) { // the test framework cannot determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
engine_->SetITensor(output_name, scale_out);
}
if (test_mode) { // the test framework cannot determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
};
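A note on the float path above: TensorRT's IScaleLayer in kUNIFORM mode computes out = (in * scale + shift)^power elementwise, so with shift = 0, power = 1, and scale = alpha it realizes Out = alpha * (op(X) * op(Y)). Because the layer requires at least three dimensions in static-shape mode (four in dynamic-shape mode), a 2-D or 3-D matmul result is first reshaped with a trailing dimension of 1, scaled, and then reshaped back to out_shape.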
......
......@@ -12,7 +12,8 @@ nv_library(tensorrt_plugin
mish_op_plugin.cu
pool3d_op_plugin.cu
deformable_conv_op_plugin.cu
DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
matmul_op_int8_plugin.cu
DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS
paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cassert>
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/platform/dynload/cublasLt.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class MatmulPlugin : public nvinfer1::IPluginV2IOExt {
public:
MatmulPlugin(nvinfer1::Dims const& dims_x, nvinfer1::Dims const& dims_y,
bool transA, bool transB, float alpha)
: dims_x_(dims_x),
dims_y_(dims_y),
transB_(transA),
transA_(transB),
alpha_(alpha) {}
MatmulPlugin(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &dims_x_);
DeserializeValue(&serial_data, &serial_length, &dims_y_);
DeserializeValue(&serial_data, &serial_length, &transB_);
DeserializeValue(&serial_data, &serial_length, &transA_);
DeserializeValue(&serial_data, &serial_length, &alpha_);
DeserializeValue(&serial_data, &serial_length, &alpha_scale_);
DeserializeValue(&serial_data, &serial_length, &alpha_one_);
DeserializeValue(&serial_data, &serial_length, &alpha_zero_);
DeserializeValue(&serial_data, &serial_length, &batch_);
DeserializeValue(&serial_data, &serial_length, &k_);
DeserializeValue(&serial_data, &serial_length, &m_);
DeserializeValue(&serial_data, &serial_length, &n_);
DeserializeValue(&serial_data, &serial_length, &cublas_);
DeserializeValue(&serial_data, &serial_length, &type_);
DeserializeValue(&serial_data, &serial_length, &Adesc_);
DeserializeValue(&serial_data, &serial_length, &Bdesc_);
DeserializeValue(&serial_data, &serial_length, &Cdesc_);
DeserializeValue(&serial_data, &serial_length, &AtransformDesc_);
DeserializeValue(&serial_data, &serial_length, &BtransformDesc_);
DeserializeValue(&serial_data, &serial_length, &CtransformDesc_);
DeserializeValue(&serial_data, &serial_length, &Atransform_);
DeserializeValue(&serial_data, &serial_length, &Btransform_);
DeserializeValue(&serial_data, &serial_length, &Ctransform_);
DeserializeValue(&serial_data, &serial_length, &transformDescT_);
DeserializeValue(&serial_data, &serial_length, &transformDescN_);
DeserializeValue(&serial_data, &serial_length, &matmulDesc_);
}
virtual bool isOutputBroadcastAcrossBatch(
int32_t output_index, const bool* input_is_broadcasted,
int32_t nb_inputs) const TRT_NOEXCEPT {
return false;
}
virtual bool canBroadcastInputAcrossBatch(int32_t input_index) const
TRT_NOEXCEPT {
return false;
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
size_t getWorkspaceSize(int) const TRT_NOEXCEPT override { return 0; }
void setPluginNamespace(const char* plugin_namespace) TRT_NOEXCEPT override {
name_space_ = plugin_namespace;
}
nvinfer1::IPluginV2IOExt* clone() const TRT_NOEXCEPT override {
MatmulPlugin* ptr =
new MatmulPlugin(dims_x_, dims_y_, transB_, transA_, alpha_);
ptr->setPluginNamespace(this->getPluginNamespace());
ptr->batch_ = batch_;
ptr->k_ = k_;
ptr->m_ = m_;
ptr->n_ = n_;
ptr->alpha_scale_ = alpha_scale_;
ptr->alpha_one_ = alpha_one_;
ptr->alpha_zero_ = alpha_zero_;
ptr->cublas_ = cublas_;
ptr->type_ = type_;
ptr->Adesc_ = Adesc_;
ptr->Bdesc_ = Bdesc_;
ptr->Cdesc_ = Cdesc_;
ptr->AtransformDesc_ = AtransformDesc_;
ptr->BtransformDesc_ = BtransformDesc_;
ptr->CtransformDesc_ = CtransformDesc_;
ptr->Atransform_ = Atransform_;
ptr->Btransform_ = Btransform_;
ptr->Ctransform_ = Ctransform_;
ptr->transformDescT_ = transformDescT_;
ptr->transformDescN_ = transformDescN_;
ptr->matmulDesc_ = matmulDesc_;
return ptr;
}
const char* getPluginNamespace() const TRT_NOEXCEPT override {
return name_space_.c_str();
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "matmul_int8_plugin";
}
nvinfer1::DataType getOutputDataType(
int index, const nvinfer1::DataType* input_types,
int nb_inputs) const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims* input_dims,
int num_inputs) TRT_NOEXCEPT override;
bool supportsFormatCombination(int32_t pos,
nvinfer1::PluginTensorDesc const* inOut,
int32_t nbInputs,
int32_t nbOutputs) const TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::PluginTensorDesc* in, int32_t nbInputs,
const nvinfer1::PluginTensorDesc* out,
int32_t nbOutputs) TRT_NOEXCEPT override;
/*
bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format)
const TRT_NOEXCEPT override;
*/
int initialize() TRT_NOEXCEPT { return 0; }
void terminate() TRT_NOEXCEPT;
#if IS_TRT_VERSION_LT(8000)
int enqueue(int batch_size, const void* const* inputs, void** outputs,
#else
int enqueue(int batch_size, const void* const* inputs, void* const* outputs,
#endif
void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override { delete this; }
void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator)
TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
protected:
nvinfer1::Dims dims_x_;
nvinfer1::Dims dims_y_;
bool transB_;
bool transA_;
float alpha_;
void *alpha_scale_{nullptr}, *alpha_one_{nullptr}, *alpha_zero_{nullptr};
int batch_;
uint64_t k_;
uint64_t m_;
uint64_t n_;
cublasLtHandle_t cublas_{nullptr};
nvinfer1::DataType type_;
cublasLtMatrixLayout_t Adesc_{nullptr}, Bdesc_{nullptr}, Cdesc_{nullptr};
cublasLtMatrixLayout_t AtransformDesc_{nullptr}, BtransformDesc_{nullptr},
CtransformDesc_{nullptr};
int8_t *Atransform_{nullptr}, *Btransform_{nullptr}, *Ctransform_{nullptr};
cublasLtMatrixTransformDesc_t transformDescT_{nullptr},
transformDescN_{nullptr};
cublasLtMatmulDesc_t matmulDesc_{nullptr};
std::string name_space_;
size_t getSerializationSize() const TRT_NOEXCEPT override {
return SerializedSize(dims_x_) + SerializedSize(dims_y_) +
SerializedSize(transB_) + SerializedSize(transA_) +
SerializedSize(alpha_) + SerializedSize(alpha_scale_) +
SerializedSize(alpha_one_) + SerializedSize(alpha_zero_) +
SerializedSize(batch_) + SerializedSize(k_) + SerializedSize(m_) +
SerializedSize(n_) + SerializedSize(cublas_) +
SerializedSize(type_) + SerializedSize(Adesc_) +
SerializedSize(Bdesc_) + SerializedSize(Cdesc_) +
SerializedSize(AtransformDesc_) + SerializedSize(BtransformDesc_) +
SerializedSize(CtransformDesc_) + SerializedSize(Atransform_) +
SerializedSize(Btransform_) + SerializedSize(Ctransform_) +
SerializedSize(transformDescT_) + SerializedSize(transformDescN_) +
SerializedSize(matmulDesc_);
}
void serialize(void* buffer) const TRT_NOEXCEPT override {
SerializeValue(&buffer, dims_x_);
SerializeValue(&buffer, dims_y_);
SerializeValue(&buffer, transB_);
SerializeValue(&buffer, transA_);
SerializeValue(&buffer, alpha_);
SerializeValue(&buffer, alpha_scale_);
SerializeValue(&buffer, alpha_one_);
SerializeValue(&buffer, alpha_zero_);
SerializeValue(&buffer, batch_);
SerializeValue(&buffer, k_);
SerializeValue(&buffer, m_);
SerializeValue(&buffer, n_);
SerializeValue(&buffer, cublas_);
SerializeValue(&buffer, type_);
SerializeValue(&buffer, Adesc_);
SerializeValue(&buffer, Bdesc_);
SerializeValue(&buffer, Cdesc_);
SerializeValue(&buffer, AtransformDesc_);
SerializeValue(&buffer, BtransformDesc_);
SerializeValue(&buffer, CtransformDesc_);
SerializeValue(&buffer, Atransform_);
SerializeValue(&buffer, Btransform_);
SerializeValue(&buffer, Ctransform_);
SerializeValue(&buffer, transformDescT_);
SerializeValue(&buffer, transformDescN_);
SerializeValue(&buffer, matmulDesc_);
}
};
class MatmulPluginCreator : public nvinfer1::IPluginCreator {
public:
MatmulPluginCreator() {}
const char* getPluginName() const TRT_NOEXCEPT override {
return "matmul_int8_plugin";
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override {
return &field_collection_;
}
nvinfer1::IPluginV2IOExt* createPlugin(
const char* name,
const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override {
return nullptr;
}
nvinfer1::IPluginV2IOExt* deserializePlugin(
const char* name, void const* serial_data,
size_t serial_length) TRT_NOEXCEPT override {
MatmulPlugin* obj = new MatmulPlugin(serial_data, serial_length);
obj->setPluginNamespace(name);
return obj;
}
void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const TRT_NOEXCEPT override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(MatmulPluginCreator);
#if IS_TRT_VERSION_GE(6000)
class MatmulPluginDynamic : public DynamicPluginTensorRT {
public:
MatmulPluginDynamic(bool transA, bool transB, float alpha)
: transB_(transA), transA_(transB), alpha_(alpha) {}
MatmulPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &transB_);
DeserializeValue(&serial_data, &serial_length, &transA_);
DeserializeValue(&serial_data, &serial_length, &alpha_);
DeserializeValue(&serial_data, &serial_length, &alpha_scale_);
DeserializeValue(&serial_data, &serial_length, &alpha_one_);
DeserializeValue(&serial_data, &serial_length, &alpha_zero_);
DeserializeValue(&serial_data, &serial_length, &cublas_);
DeserializeValue(&serial_data, &serial_length, &Atransform_);
DeserializeValue(&serial_data, &serial_length, &Btransform_);
DeserializeValue(&serial_data, &serial_length, &Ctransform_);
DeserializeValue(&serial_data, &serial_length, &type_);
}
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
MatmulPluginDynamic* ptr =
new MatmulPluginDynamic(transB_, transA_, alpha_);
ptr->setPluginNamespace(this->getPluginNamespace());
ptr->alpha_scale_ = alpha_scale_;
ptr->alpha_one_ = alpha_one_;
ptr->alpha_zero_ = alpha_zero_;
ptr->cublas_ = cublas_;
ptr->Atransform_ = Atransform_;
ptr->Btransform_ = Btransform_;
ptr->Ctransform_ = Ctransform_;
ptr->type_ = type_;
return ptr;
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "matmul_int8_dynamic_plugin";
}
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
int initialize() TRT_NOEXCEPT { return 0; }
void terminate() TRT_NOEXCEPT;
nvinfer1::DimsExprs getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT override {
return 0;
}
void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator)
TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) TRT_NOEXCEPT override;
nvinfer1::DataType getOutputDataType(
int index, const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override { delete this; }
protected:
bool transB_;
bool transA_;
float alpha_;
void *alpha_scale_{nullptr}, *alpha_one_{nullptr}, *alpha_zero_{nullptr};
cublasLtHandle_t cublas_{nullptr};
nvinfer1::DataType type_;
int8_t *Atransform_{nullptr}, *Btransform_{nullptr}, *Ctransform_{nullptr};
std::string name_space_;
size_t getSerializationSize() const TRT_NOEXCEPT override {
return SerializedSize(transB_) + SerializedSize(transA_) +
SerializedSize(alpha_) + SerializedSize(alpha_scale_) +
SerializedSize(alpha_one_) + SerializedSize(alpha_zero_) +
SerializedSize(Atransform_) + SerializedSize(Btransform_) +
SerializedSize(Ctransform_) + SerializedSize(cublas_) +
SerializedSize(type_);
}
void serialize(void* buffer) const TRT_NOEXCEPT override {
SerializeValue(&buffer, transB_);
SerializeValue(&buffer, transA_);
SerializeValue(&buffer, alpha_);
SerializeValue(&buffer, alpha_scale_);
SerializeValue(&buffer, alpha_one_);
SerializeValue(&buffer, alpha_zero_);
SerializeValue(&buffer, Atransform_);
SerializeValue(&buffer, Btransform_);
SerializeValue(&buffer, Ctransform_);
SerializeValue(&buffer, cublas_);
SerializeValue(&buffer, type_);
}
};
class MatmulPluginDynamicCreator : public nvinfer1::IPluginCreator {
public:
MatmulPluginDynamicCreator() {}
const char* getPluginName() const TRT_NOEXCEPT override {
return "matmul_int8_dynamic_plugin";
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(const char* name,
const nvinfer1::PluginFieldCollection* fc)
TRT_NOEXCEPT override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(
const char* name, void const* serial_data,
size_t serial_length) TRT_NOEXCEPT override {
MatmulPluginDynamic* obj =
new MatmulPluginDynamic(serial_data, serial_length);
obj->setPluginNamespace(name);
return obj;
}
void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const TRT_NOEXCEPT override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(MatmulPluginDynamicCreator);
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
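A sketch of how the creators registered above are exercised when a serialized engine is reloaded; the logger and plan path are placeholders, and the two-argument deserializeCudaEngine assumes a TensorRT 8-style API:

#include <fstream>
#include <iterator>
#include <string>
#include <vector>
#include <NvInfer.h>

class SilentLogger : public nvinfer1::ILogger {
  void log(Severity, const char*) noexcept override {}
};

nvinfer1::ICudaEngine* LoadEngine(const std::string& plan_path) {
  std::ifstream file(plan_path, std::ios::binary);
  std::vector<char> blob((std::istreambuf_iterator<char>(file)),
                         std::istreambuf_iterator<char>());
  static SilentLogger logger;
  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
  // TensorRT looks up MatmulPluginCreator / MatmulPluginDynamicCreator in the
  // registry populated by REGISTER_TRT_PLUGIN_V2 (keyed by plugin type and
  // version "1") and calls deserializePlugin() with the bytes produced by
  // serialize() above.
  return runtime->deserializeCudaEngine(blob.data(), blob.size());
}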
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
if (NOT WITH_NV_JETSON)
list(APPEND CUDA_SRCS nvjpeg.cc)
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cublasLt.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cublasLt_dso_flag;
void *cublasLt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUBLASLT_BLAS_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cublasLt.h>
#include <cuda.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cublasLt_dso_flag;
extern void *cublasLt_dso_handle;
/**
* The following macro definition generates a struct
* (one per function) that dynamically loads the
* corresponding cublasLt routine via operator overloading.
*
* note: the libraries are dynamically linked by default
*/
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = \
paddle::platform::dynload::GetCublasLtDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
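// For illustration: DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(cublasLtCreate)
// expands to a DynLoad__cublasLtCreate functor whose operator() dlopen()s
// libcublasLt.so exactly once (guarded by cublasLt_dso_flag), resolves the
// symbol with dlsym(), and forwards the arguments; the trailing
// "extern DynLoad__cublasLtCreate cublasLtCreate" lets call sites use the
// ordinary cuBLASLt name.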
// APIs available after CUDA 10.1
// #if CUDA_VERSION >= 10100
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasLtCreate); \
__macro(cublasLtDestroy); \
__macro(cublasLtMatmul); \
__macro(cublasLtMatmulDescCreate); \
__macro(cublasLtMatmulDescDestroy); \
__macro(cublasLtMatmulDescSetAttribute); \
__macro(cublasLtMatrixLayoutCreate); \
__macro(cublasLtMatrixLayoutDestroy); \
__macro(cublasLtMatrixLayoutSetAttribute); \
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
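A minimal usage sketch for the wrappers declared above (CUDA build assumed; error handling omitted):

#include "paddle/fluid/platform/dynload/cublasLt.h"

void CublasLtHandleDemo() {
  // Each call goes through the generated DynLoad__ functor, so libcublasLt.so
  // is loaded lazily on first use rather than linked at build time.
  cublasLtHandle_t handle = nullptr;
  paddle::platform::dynload::cublasLtCreate(&handle);
  // ... create cublasLtMatmulDesc_t / cublasLtMatrixLayout_t descriptors and
  // call paddle::platform::dynload::cublasLtMatmul(...) as the new
  // matmul_op_int8_plugin.cu does ...
  paddle::platform::dynload::cublasLtDestroy(handle);
}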
......@@ -30,10 +30,11 @@ DEFINE_string(cudnn_dir, "",
"/usr/local/cudnn/lib. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH");
DEFINE_string(cuda_dir, "",
"Specify path for loading cuda library, such as libcublas, "
"libcurand, libcusolver. For instance, /usr/local/cuda/lib64. "
"If default, dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(
cuda_dir, "",
"Specify path for loading cuda library, such as libcublas, libcublasLt "
"libcurand, libcusolver. For instance, /usr/local/cuda/lib64. "
"If default, dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(nccl_dir, "",
"Specify path for loading nccl library, such as libnccl.so. "
......@@ -308,6 +309,19 @@ void* GetCublasDsoHandle() {
#endif
}
void* GetCublasLtDsoHandle() {
// APIs available after CUDA 10.1
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
#else
std::string warning_msg(
"Your CUDA_VERSION less 10.1, not support CublasLt. "
"If you want to use CublasLt, please upgrade CUDA and rebuild "
"PaddlePaddle.");
return nullptr;
#endif
}
void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
std::string mac_warn_meg(
......
......@@ -26,6 +26,7 @@ namespace dynload {
#endif
void* GetCublasDsoHandle();
void* GetCublasLtDsoHandle();
void* GetCUDNNDsoHandle();
void* GetCUPTIDsoHandle();
void* GetCurandDsoHandle();
......
......@@ -1997,10 +1997,12 @@ function gen_dockerfile() {
DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
DOCKERFILE_CUBLAS_DSO=""
DOCKERFILE_CUBLASLT_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -sf /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so"
DOCKERFILE_CUBLAS_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.${CUDA_MAJOR} /usr/lib/x86_64-linux-gnu/libcublas.so"
DOCKERFILE_CUBLASLT_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublasLt.so /usr/lib/x86_64-linux-gnu/libcublasLt.so"
fi
cat <<EOF
......@@ -2090,6 +2092,7 @@ EOF
ldconfig
${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_CUBLAS_DSO}
${DOCKERFILE_CUBLASLT_DSO}
${DOCKERFILE_GPU_ENV}
EOF
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -77,7 +77,7 @@ class TensorRTMatMulQuantDequantDims3Test(QuantDequantTest):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(
use_gpu, atol=1e-1, flatten=False, rtol=1e-1)
use_gpu, atol=1, flatten=False, rtol=1e-1)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
......@@ -87,7 +87,7 @@ class TensorRTMatMulQuantDequantDims3TransposeXTest(
def set_params(self):
self.transpose_x = True
self.transpose_y = False
self.alpha = 1.0
self.alpha = 2.1
class TensorRTMatMulQuantDequantDims3TransposeYTest(
......@@ -95,7 +95,7 @@ class TensorRTMatMulQuantDequantDims3TransposeYTest(
def set_params(self):
self.transpose_x = False
self.transpose_y = True
self.alpha = 1.0
self.alpha = 3.9
class TensorRTMatMulQuantDequantDims3TransposeXYTest(
......@@ -103,7 +103,7 @@ class TensorRTMatMulQuantDequantDims3TransposeXYTest(
def set_params(self):
self.transpose_x = True
self.transpose_y = True
self.alpha = 1.0
self.alpha = 8.4
class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest):
......@@ -163,7 +163,7 @@ class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(
use_gpu, atol=1e-1, flatten=False, rtol=1e-1)
use_gpu, atol=1, flatten=False, rtol=1e-1)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
......@@ -173,7 +173,7 @@ class TensorRTMatMulQuantDequantDims4TransposeXTest(
def set_params(self):
self.transpose_x = True
self.transpose_y = False
self.alpha = 1.0
self.alpha = 3.2
class TensorRTMatMulQuantDequantDims4TransposeYTest(
......@@ -181,7 +181,7 @@ class TensorRTMatMulQuantDequantDims4TransposeYTest(
def set_params(self):
self.transpose_x = False
self.transpose_y = True
self.alpha = 1.0
self.alpha = 7.5
class TensorRTMatMulQuantDequantDims4TransposeXYTest(
......@@ -189,16 +189,97 @@ class TensorRTMatMulQuantDequantDims4TransposeXYTest(
def set_params(self):
self.transpose_x = True
self.transpose_y = True
self.alpha = 1.0
self.alpha = 11.2
class TensorRTMatMulQuantDequantDims4ScaleTest(
TensorRTMatMulQuantDequantDims4Test):
class TensorRTMatMulQuantDequantDims3DynamicTest(QuantDequantTest):
def setUp(self):
self.set_params()
def network():
self.data = fluid.data(
name='data', shape=[-1, 28, 28], dtype='float32')
self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
matmul_out = fluid.layers.matmul(
x=self.data,
y=self.data,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
alpha=self.alpha)
out = fluid.layers.batch_norm(matmul_out, is_test=True)
fc_out = fluid.layers.fc(input=matmul_out,
size=10,
num_flatten_dims=1,
bias_attr=False,
act=None)
result = fluid.layers.relu(fc_out)
loss = fluid.layers.cross_entropy(input=result, label=self.label)
avg_loss = fluid.layers.mean(loss)
return avg_loss, result
self.main_program.random_seed = 2
self.startup_program.random_seed = 2
self.test_main_program.random_seed = 2
#self.test_startup_program.random_seed = 2
with fluid.unique_name.guard():
with fluid.program_guard(self.main_program, self.startup_program):
self.loss, result = network()
opt = fluid.optimizer.Adam(learning_rate=0.0001)
opt.minimize(self.loss)
with fluid.unique_name.guard():
with fluid.program_guard(self.test_main_program,
self.startup_program):
network()
self.feeds = {"data": np.random.random([3, 28, 28]).astype("float32")}
self.fetch_list = [result]
self.enable_trt = True
self.trt_parameters = TensorRTMatMulQuantDequantDims3DynamicTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Int8, False, False)
self.dynamic_shape_params = TensorRTMatMulQuantDequantDims3DynamicTest.DynamicShapeParam(
{
'data': [1, 28, 28]
}, {'data': [4, 28, 28]}, {'data': [3, 28, 28]}, False)
self.activation_quantize_type = 'moving_average_abs_max'
self.weight_quantize_type = 'channel_wise_abs_max'
def set_params(self):
self.transpose_x = False
self.transpose_y = False
self.alpha = 1.0
def test_check_output(self):
#self.quant_dequant()
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(
use_gpu, atol=1, flatten=False, rtol=1e-1)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
class TensorRTMatMulQuantDequantDims4TransposeXDynamicTest(
TensorRTMatMulQuantDequantDims3DynamicTest):
def set_params(self):
self.transpose_x = True
self.transpose_y = False
self.alpha = 2.0
class TensorRTMatMulQuantDequantDims4TransposeYDynamicTest(
TensorRTMatMulQuantDequantDims3DynamicTest):
def set_params(self):
self.transpose_x = False
self.transpose_y = True
self.alpha = 2.2
class TensorRTMatMulQuantDequantDims4TransposeXYDynamicTest(
TensorRTMatMulQuantDequantDims3DynamicTest):
def set_params(self):
self.transpose_x = True
self.transpose_y = True
self.alpha = 7.8
if __name__ == "__main__":
unittest.main()