Unverified commit 16590799, authored by W Wangzheee, committed by GitHub

[Paddle-Inference] Matmul_int8_convert: tensor*tensor (#37285)

* matmul_convert_int8

* matmul_convert_int8

* matmulconvert_int8

* Matmul_int8_convert: tensor*tensor

* Matmul_int8_convert: tensor*tensor

* Matmul_int8_convert: tensor*tensor
Parent 025053b4
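In short, this change adds an int8 tensor*tensor path to the TensorRT matmul converter: when the op carries a "support_int8" attribute and the engine precision is kInt8, the converter emits a cuBLASLt-based plugin (MatmulPluginDynamic for dynamic-shape engines, MatmulPlugin for static shape); otherwise it keeps the existing MatrixMultiply layer and adds a Scale layer only when alpha != 1. As a rough, illustrative sketch (not part of this commit; model path, sizes and flags are placeholders), the int8 path would be reached from the C++ inference API roughly like this:

#include "paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./quantized_model");  // hypothetical quant-aware-trained model
  config.EnableUseGpu(100 /* MB */, 0 /* device id */);
  // workspace_size, max_batch_size, min_subgraph_size, precision, use_static, use_calib_mode
  config.EnableTensorRtEngine(1 << 30, 1, 3,
                              paddle::AnalysisConfig::Precision::kInt8,
                              false, false);
  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor != nullptr ? 0 : 1;
}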
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h"
namespace paddle {
namespace framework {
@@ -35,16 +36,26 @@ class MatMulOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(3) << "convert a fluid matmul op to tensorrt matmul layer";
framework::OpDesc op_desc(op, nullptr);
nvinfer1::ILayer* layer = nullptr;
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
nvinfer1::Dims dims_x = input1->getDimensions();
nvinfer1::Dims dims_y = input2->getDimensions();
bool transpose_X = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_X"));
bool transpose_Y = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_Y"));
auto output_name = op_desc.Output("Out")[0];
float alpha = 1;
if (op_desc.HasAttr("alpha")) {
float alpha_tem = BOOST_GET_CONST(float, op_desc.GetAttr("alpha"));
alpha = alpha_tem;
}
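// alpha defaults to 1 when the op carries no "alpha" attribute; the int8
// plugins below take it as a constructor argument, while the float path only
// inserts an extra Scale layer when alpha != 1.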
nvinfer1::MatrixOperation matrix_operation_X =
transpose_X ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
@@ -52,15 +63,40 @@ class MatMulOpConverter : public OpConverter {
transpose_Y ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
if (op_desc.HasAttr("support_int8") &&
engine_->precision() == AnalysisConfig::Precision::kInt8) {
if (engine_->with_dynamic_shape()) {
VLOG(3) << "Convert a fluid matmul_op_int8_dynamic to TensorRT "
"MatmulPluginLayer";
plugin::MatmulPluginDynamic* plugin =
new plugin::MatmulPluginDynamic(transpose_X, transpose_Y, alpha);
std::vector<nvinfer1::ITensor*> inputs{input1, input2};
layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin);
RreplenishLayerAndOutput(layer, "matmul_op_int8_dynamic", {output_name},
test_mode);
} else {
VLOG(3) << "Convert a fluid matmul_op_int8_static to TensorRT "
"MatmulPluginLayer";
plugin::MatmulPlugin* plugin = new plugin::MatmulPlugin(
dims_x, dims_y, transpose_X, transpose_Y, alpha);
std::vector<nvinfer1::ITensor*> inputs{input1, input2};
layer = engine_->AddPluginV2IOExt(inputs.data(), inputs.size(), plugin);
RreplenishLayerAndOutput(layer, "matmul_op_int8_static", {output_name},
test_mode);
}
} else {
VLOG(3) << "Convert a fluid matmul_op_float to TensorRT ";
layer =
TRT_ENGINE_ADD_LAYER(engine_, MatrixMultiply, *input1,
matrix_operation_X, *input2, matrix_operation_Y);
if (alpha == 1) {
RreplenishLayerAndOutput(layer, "matmul_op_float_no_alpha",
{output_name}, test_mode);
engine_->SetITensor(output_name, layer->getOutput(0));
} else {
layer->setName(
("matmul_op_float_has_alpha: MatrixMultiplyLayer (Output: " +
output_name + ")")
.c_str());
// IScaleLayer requires the input must have at least
// three dimensions in static shape mode and at least
// four dimensions in dynamic shape mode.
@@ -91,12 +127,19 @@ class MatMulOpConverter : public OpConverter {
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *matmul_out);
reshape_layer->setReshapeDimensions(reshape_dim);
matmul_out = reshape_layer->getOutput(0);
reshape_layer->setName(("matmul_op_float_has_alpha_reshape_before: "
"ShuffleLayer (Output: " +
output_name + ")")
.c_str());
}
auto create_weights = [&](float data,
const std::string& type) -> float* {
std::unique_ptr<framework::Tensor> tmp_tensor(
new framework::Tensor());
tmp_tensor->Resize({1});
auto* tmp_data =
tmp_tensor->mutable_data<float>(platform::CPUPlace());
tmp_data[0] = data;
engine_->SetWeights(output_name + "_add_scale_op_" + type,
std::move(tmp_tensor));
@@ -115,21 +158,29 @@ class MatMulOpConverter : public OpConverter {
engine_, Scale, *matmul_out, nvinfer1::ScaleMode::kUNIFORM,
nv_shift.get(), nv_alpha.get(), nv_power.get());
auto* scale_out = scale_layer->getOutput(0);
scale_layer->setName(
("matmul_op_float_has_alpha: ScaleLayer (Output: " + output_name +
")")
.c_str());
if (need_change_dim) {
auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scale_out);
reshape_layer->setReshapeDimensions(out_shape);
scale_out = reshape_layer->getOutput(0);
reshape_layer->setName(("matmul_op_float_has_alpha_reshape_after: "
"ShuffleLayer (Output: " +
output_name + ")")
.c_str());
}
engine_->SetITensor(output_name, scale_out);
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
}
}
};
} // namespace tensorrt
......
@@ -12,6 +12,7 @@ nv_library(tensorrt_plugin
mish_op_plugin.cu
pool3d_op_plugin.cu
deformable_conv_op_plugin.cu
matmul_op_int8_plugin.cu
DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h"
namespace plf = paddle::platform;
namespace dyl = paddle::platform::dynload;
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
float zero = 0;
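// Helper for the int8 path: re-tile A and B into the cuBLASLt layouts chosen
// in configurePlugin (COL32 for A and C, COL4_4R2_8C for B) with
// cublasLtMatrixTransform, run the int8 cublasLtMatmul into the COL32 C
// buffer, then transform the result back into the row-major output C.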
void Ltgemm_int8_linear(
cublasLtHandle_t ltHandle, const int8_t* A, cublasLtMatrixLayout_t Adesc,
int8_t* Atransform, cublasLtMatrixLayout_t AtransformDesc, bool transA_,
const int8_t* B, cublasLtMatrixLayout_t Bdesc, int8_t* Btransform,
cublasLtMatrixLayout_t BtransformDesc, bool transB_, int8_t* C,
cublasLtMatrixLayout_t Cdesc, int8_t* Ctransform,
cublasLtMatrixLayout_t CtransformDesc,
cublasLtMatrixTransformDesc_t transformDescT,
cublasLtMatrixTransformDesc_t transformDescN,
cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero,
void* alpha_one, void* workspace, cudaStream_t stream) {
if (transA_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform(
ltHandle, transformDescT, alpha_one, A, Adesc, alpha_zero, nullptr,
nullptr, Atransform, AtransformDesc, stream));
} else {
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform(
ltHandle, transformDescN, alpha_one, A, Adesc, alpha_zero, nullptr,
nullptr, Atransform, AtransformDesc, stream));
}
if (transB_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform(
ltHandle, transformDescN, alpha_one, B, Bdesc, alpha_zero, nullptr,
nullptr, Btransform, BtransformDesc, stream));
} else {
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform(
ltHandle, transformDescT, alpha_one, B, Bdesc, alpha_zero, nullptr,
nullptr, Btransform, BtransformDesc, stream));
}
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul(
ltHandle, matmulDesc, alpha_scale, Atransform, AtransformDesc, Btransform,
BtransformDesc, nullptr, Ctransform, CtransformDesc, Ctransform,
CtransformDesc, nullptr, workspace, 0, stream));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform(
ltHandle, transformDescN, alpha_one, Ctransform, CtransformDesc,
alpha_zero, nullptr, nullptr, C, Cdesc, stream));
}
void Ltgemm_fp32_linear(cublasLtHandle_t ltHandle, const float* A,
cublasLtMatrixLayout_t Adesc, const float* B,
cublasLtMatrixLayout_t Bdesc, float* C,
cublasLtMatrixLayout_t Cdesc,
cublasLtMatmulDesc_t matmulDesc, void* alpha_scale,
void* alpha_zero, void* workspace,
cudaStream_t stream) {
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul(
ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C,
Cdesc, C, Cdesc, nullptr, workspace, 0, stream));
}
void Ltgemm_fp16_linear(cublasLtHandle_t ltHandle, const half* A,
cublasLtMatrixLayout_t Adesc, const half* B,
cublasLtMatrixLayout_t Bdesc, half* C,
cublasLtMatrixLayout_t Cdesc,
cublasLtMatmulDesc_t matmulDesc, void* alpha_scale,
void* alpha_zero, void* workspace,
cudaStream_t stream) {
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul(
ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C,
Cdesc, C, Cdesc, nullptr, workspace, 0, stream));
}
nvinfer1::DataType MatmulPlugin::getOutputDataType(
int index, const nvinfer1::DataType* input_types,
int nb_inputs) const TRT_NOEXCEPT {
return input_types[0];
}
nvinfer1::Dims MatmulPlugin::getOutputDimensions(
int index, const nvinfer1::Dims* input_dims, int num_inputs) TRT_NOEXCEPT {
if (transB_) {
m_ = dims_x_.d[dims_x_.nbDims - 1];
k_ = dims_x_.d[dims_x_.nbDims - 2];
} else {
m_ = dims_x_.d[dims_x_.nbDims - 2];
k_ = dims_x_.d[dims_x_.nbDims - 1];
}
if (transA_) {
n_ = dims_y_.d[dims_y_.nbDims - 2];
} else {
n_ = dims_y_.d[dims_y_.nbDims - 1];
}
batch_ = 1;
for (int i = 0; i < dims_x_.nbDims - 2; i++) {
batch_ *= dims_x_.d[i];
}
nvinfer1::Dims output_dims;
output_dims.nbDims = dims_x_.nbDims;
for (int i = 0; i < output_dims.nbDims - 2; i++) {
output_dims.d[i] = dims_x_.d[i];
}
output_dims.d[output_dims.nbDims - 2] = m_;
output_dims.d[output_dims.nbDims - 1] = n_;
return output_dims;
}
bool MatmulPlugin::supportsFormatCombination(
int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs,
int32_t nbOutputs) const TRT_NOEXCEPT {
PADDLE_ENFORCE_EQ(nbInputs, 2,
platform::errors::InvalidArgument("Must have 2 inputs, "
"but got %d input(s). ",
nbInputs));
PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(),
platform::errors::InvalidArgument("Must have 1 output, "
"but got %d output(s). ",
nbOutputs));
if (pos == 0) {
return (inOut[pos].type == nvinfer1::DataType::kHALF ||
inOut[pos].type == nvinfer1::DataType::kFLOAT ||
inOut[pos].type == nvinfer1::DataType::kINT8) &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
} else {
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == inOut[0].format;
}
}
void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs,
int32_t nbInputs,
const nvinfer1::PluginTensorDesc* out,
int32_t nbOutputs) TRT_NOEXCEPT {
float inscale_0 = inputs[0].scale;
float inscale_1 = inputs[1].scale;
float outscale = out[0].scale;
type_ = inputs[0].type;
int64_t stridea = k_ * n_;
int64_t strideb = k_ * m_;
int64_t stridec = m_ * n_;
cublasOperation_t AopTranspose, BopTranspose;
if (transA_) {
AopTranspose = CUBLAS_OP_T;
} else {
AopTranspose = CUBLAS_OP_N;
}
if (transB_) {
BopTranspose = CUBLAS_OP_T;
} else {
BopTranspose = CUBLAS_OP_N;
}
if (type_ == nvinfer1::DataType::kINT8) {
cudaDataType_t cudadataTypeIO = CUDA_R_8I;
cudaDataType_t cudaDataTypeS = CUDA_R_32F;
#if CUBLAS_VER_MAJOR < 11
cudaDataType_t cudaComputeType = CUDA_R_32I;
#else
cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I;
#endif
cublasLtOrder_t COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C;
int const ldatransform = 32 * n_;
int const ldbtransform = 32 * ((m_ + 8 - 1) / 8 * 8);
int const ldctransform = 32 * n_;
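// Scratch buffers for the re-tiled int8 operands: A and C use
// CUBLASLT_ORDER_COL32 (leading dimension = 32 * rows), B uses
// CUBLASLT_ORDER_COL4_4R2_8C, whose row count is padded to a multiple of 8.
// The allocations below size each buffer as (columns rounded up to 32) / 32
// tiles times the transformed leading dimension.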
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(
(void**)&Atransform_,
sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldatransform));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(
(void**)&Btransform_,
sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldbtransform));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(
(void**)&Ctransform_,
sizeof(int8_t) * ((m_ + 32 - 1) / 32 * 32) / 32 * ldctransform));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_,
AopTranspose == CUBLAS_OP_N ? k_ : n_,
AopTranspose == CUBLAS_OP_N ? n_ : k_));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_,
BopTranspose == CUBLAS_OP_N ? m_ : k_,
BopTranspose == CUBLAS_OP_N ? k_ : m_));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
sizeof(stridec)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&AtransformDesc_, cudadataTypeIO, n_, k_, ldatransform));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&BtransformDesc_, cudadataTypeIO, m_, k_, ldbtransform));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C,
sizeof(COL4_4R2_8C)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&CtransformDesc_, cudadataTypeIO, n_, m_, ldctransform));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
cublasOperation_t Transpose = CUBLAS_OP_T;
cublasLtPointerMode_t transform_model = CUBLASLT_POINTER_MODE_DEVICE;
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescCreate(
&transformDescT_, cudaDataTypeS));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS, sizeof(cudaDataTypeS)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose,
sizeof(Transpose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model, sizeof(transform_model)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescCreate(
&transformDescN_, cudaDataTypeS));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS, sizeof(cudaDataTypeS)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model, sizeof(transform_model)));
cublasOperation_t ATranspose = CUBLAS_OP_N, BTranspose = CUBLAS_OP_T;
cublasLtPointerMode_t matmul_model =
CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
#if CUBLAS_VER_MAJOR < 11
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate(
&matmulDesc_, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose,
sizeof(ATranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose,
sizeof(BTranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
sizeof(matmul_model)));
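// Requantization: with int8 I/O the effective scale is
// alpha_ * inscale_0 * inscale_1 / outscale, i.e. the input dequantization
// scales and the output quantization scale folded into one factor. Because the
// matmul descriptor uses CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO,
// the factor is replicated into a device-side vector of length n_.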
float alpha_tem[n_];
for (int i = 0; i < n_; i++) {
alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale;
}
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMalloc((void**)&alpha_scale_, n_ * sizeof(float)));
cudaMemcpyAsync(alpha_scale_, alpha_tem, n_ * sizeof(float),
cudaMemcpyHostToDevice);
float zero_tem = zero;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMalloc((void**)&alpha_zero_, sizeof(float)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float),
cudaMemcpyHostToDevice);
float one_tem = 1;
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float)));
cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float),
cudaMemcpyHostToDevice);
} else if (type_ == nvinfer1::DataType::kHALF) {
cudaDataType_t cudadataTypeIO = CUDA_R_16F;
cudaDataType_t cudaDataTypeS = CUDA_R_16F;
#if CUBLAS_VER_MAJOR < 11
cudaDataType_t cudaComputeType = CUDA_R_16F;
#else
cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F;
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_,
AopTranspose == CUBLAS_OP_N ? k_ : n_,
AopTranspose == CUBLAS_OP_N ? n_ : k_));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_,
BopTranspose == CUBLAS_OP_N ? m_ : k_,
BopTranspose == CUBLAS_OP_N ? k_ : m_));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
sizeof(stridec)));
cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
#if CUBLAS_VER_MAJOR < 11
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate(
&matmulDesc_, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose,
sizeof(AopTranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose,
sizeof(BopTranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
sizeof(matmul_model)));
half alpha_tem = static_cast<half>(alpha_);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMalloc((void**)&alpha_scale_, sizeof(half)));
cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half),
cudaMemcpyHostToDevice);
half zero_tem = static_cast<half>(zero);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half),
cudaMemcpyHostToDevice);
} else {
cudaDataType_t cudadataTypeIO = CUDA_R_32F;
cudaDataType_t cudaDataTypeS = CUDA_R_32F;
#if CUBLAS_VER_MAJOR < 11
cudaDataType_t cudaComputeType = CUDA_R_32F;
#else
cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F;
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_,
AopTranspose == CUBLAS_OP_N ? k_ : n_,
AopTranspose == CUBLAS_OP_N ? n_ : k_));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_,
BopTranspose == CUBLAS_OP_N ? m_ : k_,
BopTranspose == CUBLAS_OP_N ? k_ : m_));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
sizeof(stridec)));
cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
#if CUBLAS_VER_MAJOR < 11
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate(
&matmulDesc_, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose,
sizeof(AopTranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose,
sizeof(BopTranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
sizeof(matmul_model)));
float alpha_tem = alpha_;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMalloc((void**)&alpha_scale_, sizeof(float)));
cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float),
cudaMemcpyHostToDevice);
float zero_tem = zero;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMalloc((void**)&alpha_zero_, sizeof(float)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float),
cudaMemcpyHostToDevice);
}
}
void MatmulPlugin::attachToContext(
cudnnContext* cudnnContext, cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {
dyl::cublasLtCreate(&cublas_);
}
void MatmulPlugin::detachFromContext() TRT_NOEXCEPT {
dyl::cublasLtDestroy(cublas_);
}
// When the TensorRT engine is freed there is a "double free" error, so these
// buffers are left unreleased for now. TODO(Wangzheee): fix and re-enable the
// frees below.
void MatmulPlugin::terminate() TRT_NOEXCEPT {
/*
if(alpha_scale_){
cudaFree((void *)alpha_scale_);
alpha_scale_ = nullptr;
}
if(alpha_zero_){
cudaFree((void *)alpha_zero_);
alpha_zero_ = nullptr;
}
if(alpha_one_){
cudaFree((void *)alpha_one_);
alpha_one_ = nullptr;
}
if(Atransform_){
cudaFree((void *)Atransform_);
Atransform_ = nullptr;
}
if(Btransform_){
cudaFree((void *)Btransform_);
Btransform_ = nullptr;
}
if(Ctransform_){
cudaFree((void *)Ctransform_);
Ctransform_ = nullptr;
} */
}
int MatmulPlugin::enqueue(int batchSize, const void* const* inputs,
#if IS_TRT_VERSION_LT(8000)
void** outputs, void* workspace,
cudaStream_t stream) {
#else
void* const* outputs, void* workspace,
cudaStream_t stream) TRT_NOEXCEPT {
#endif
if (type_ == nvinfer1::DataType::kINT8) {
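// Note the operand swap: inputs[0] is the converter's X and inputs[1] is Y,
// but the plugin works in the cuBLAS column-major convention and writes an
// n_ x m_ column-major result (Cdesc_), so Y is bound to the "A" operand and
// X to the "B" operand; transA_ / transB_ were swapped the same way in the
// constructor.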
const int8_t* B = static_cast<const int8_t*>(inputs[0]);
const int8_t* A = static_cast<const int8_t*>(inputs[1]);
int8_t* C = static_cast<int8_t*>(outputs[0]);
Ltgemm_int8_linear(
cublas_, A, Adesc_, Atransform_, AtransformDesc_, transA_, B, Bdesc_,
Btransform_, BtransformDesc_, transB_, C, Cdesc_, Ctransform_,
CtransformDesc_, transformDescT_, transformDescN_, matmulDesc_,
alpha_scale_, alpha_zero_, alpha_one_, workspace, stream);
} else if (type_ == nvinfer1::DataType::kFLOAT) {
const float* B = static_cast<const float*>(inputs[0]);
const float* A = static_cast<const float*>(inputs[1]);
float* C = static_cast<float*>(outputs[0]);
Ltgemm_fp32_linear(cublas_, A, Adesc_, B, Bdesc_, C, Cdesc_, matmulDesc_,
alpha_scale_, alpha_zero_, workspace, stream);
} else if (type_ == nvinfer1::DataType::kHALF) {
const half* B = static_cast<const half*>(inputs[0]);
const half* A = static_cast<const half*>(inputs[1]);
half* C = static_cast<half*>(outputs[0]);
Ltgemm_fp16_linear(cublas_, A, Adesc_, B, Bdesc_, C, Cdesc_, matmulDesc_,
alpha_scale_, alpha_zero_, workspace, stream);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"VarMessageToVarType:Unsupported type"));
}
return cudaGetLastError() != cudaSuccess;
}
nvinfer1::DataType MatmulPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType* input_types,
int nb_inputs) const TRT_NOEXCEPT {
return input_types[0];
}
nvinfer1::DimsExprs MatmulPluginDynamic::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
nvinfer1::DimsExprs output_dims(inputs[0]);
if (transB_) {
output_dims.d[output_dims.nbDims - 2] = inputs[0].d[inputs[0].nbDims - 1];
} else {
output_dims.d[output_dims.nbDims - 2] = inputs[0].d[inputs[0].nbDims - 2];
}
if (transA_) {
output_dims.d[output_dims.nbDims - 1] = inputs[1].d[inputs[1].nbDims - 2];
} else {
output_dims.d[output_dims.nbDims - 1] = inputs[1].d[inputs[1].nbDims - 1];
}
return output_dims;
}
bool MatmulPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
int nbOutputs) TRT_NOEXCEPT {
PADDLE_ENFORCE_EQ(nbInputs, 2,
platform::errors::InvalidArgument("Must have 2 inputs, "
"but got %d input(s). ",
nbInputs));
PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(),
platform::errors::InvalidArgument("Must have 1 output, "
"but got %d output(s). ",
nbOutputs));
if (pos == 0) {
return (inOut[pos].type == nvinfer1::DataType::kHALF ||
inOut[pos].type == nvinfer1::DataType::kFLOAT ||
inOut[pos].type == nvinfer1::DataType::kINT8) &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
} else {
return inOut[pos].type == inOut[0].type &&
inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
}
}
void MatmulPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT {
float inscale_0 = inputs[0].desc.scale;
float inscale_1 = inputs[1].desc.scale;
float outscale = outputs[0].desc.scale;
type_ = inputs[0].desc.type;
uint64_t m_max, n_max, k_max;
if (transB_) {
m_max = inputs[0].max.d[inputs[0].max.nbDims - 1];
k_max = inputs[0].max.d[inputs[0].max.nbDims - 2];
} else {
m_max = inputs[0].max.d[inputs[0].max.nbDims - 2];
k_max = inputs[0].max.d[inputs[0].max.nbDims - 1];
}
if (transA_) {
n_max = inputs[1].max.d[inputs[1].max.nbDims - 2];
} else {
n_max = inputs[1].max.d[inputs[1].max.nbDims - 1];
}
int const ldatransform = 32 * n_max;
int const ldbtransform = 32 * ((m_max + 8 - 1) / 8 * 8);
int const ldctransform = 32 * n_max;
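// In the dynamic-shape plugin the transform buffers are sized from the
// maximum profile shapes (m_max / n_max / k_max), so one allocation here
// covers every shape later seen in enqueue().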
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(
(void**)&Atransform_,
sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldatransform));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(
(void**)&Btransform_,
sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldbtransform));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(
(void**)&Ctransform_,
sizeof(int8_t) * ((m_max + 32 - 1) / 32 * 32) / 32 * ldctransform));
if (type_ == nvinfer1::DataType::kINT8) {
float alpha_tem[n_max];
for (int i = 0; i < n_max; i++) {
alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale;
}
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMalloc((void**)&alpha_scale_, n_max * sizeof(float)));
cudaMemcpyAsync(alpha_scale_, alpha_tem, n_max * sizeof(float),
cudaMemcpyHostToDevice);
float zero_tem = zero;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMalloc((void**)&alpha_zero_, sizeof(float)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float),
cudaMemcpyHostToDevice);
float one_tem = 1;
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float)));
cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float),
cudaMemcpyHostToDevice);
} else if (type_ == nvinfer1::DataType::kHALF) {
half alpha_tem = static_cast<half>(alpha_);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMalloc((void**)&alpha_scale_, sizeof(half)));
cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half),
cudaMemcpyHostToDevice);
half zero_tem = static_cast<half>(zero);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half),
cudaMemcpyHostToDevice);
} else {
float alpha_tem = alpha_;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMalloc((void**)&alpha_scale_, sizeof(float)));
cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float),
cudaMemcpyHostToDevice);
float zero_tem = zero;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMalloc((void**)&alpha_zero_, sizeof(float)));
cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float),
cudaMemcpyHostToDevice);
}
}
void MatmulPluginDynamic::attachToContext(
cudnnContext* cudnnContext, cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {
dyl::cublasLtCreate(&cublas_);
}
void MatmulPluginDynamic::detachFromContext() TRT_NOEXCEPT {
dyl::cublasLtDestroy(cublas_);
}
// When the TensorRT engine is freed there is a "double free" error, so these
// buffers are left unreleased for now. TODO(Wangzheee): fix and re-enable the
// frees below.
void MatmulPluginDynamic::terminate() TRT_NOEXCEPT {
/*if(alpha_scale_){
cudaFree((void *)alpha_scale_);
alpha_scale_ = nullptr;
}
if(alpha_zero_){
cudaFree((void *)alpha_zero_);
alpha_zero_ = nullptr;
}
if(alpha_one_){
cudaFree((void *)alpha_one_);
alpha_one_ = nullptr;
}
if(Atransform_){
cudaFree((void *)Atransform_);
Atransform_ = nullptr;
}
if(Btransform_){
cudaFree((void *)Btransform_);
Btransform_ = nullptr;
}
if(Ctransform_){
cudaFree((void *)Ctransform_);
Ctransform_ = nullptr;
} */
}
int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs,
void* const* outputs, void* workspace,
cudaStream_t stream) TRT_NOEXCEPT {
const auto Input0Desc = inputDesc[0];
const auto Input1Desc = inputDesc[1];
uint64_t m, n, k;
if (transB_) {
m = Input0Desc.dims.d[Input0Desc.dims.nbDims - 1];
k = Input0Desc.dims.d[Input0Desc.dims.nbDims - 2];
} else {
m = Input0Desc.dims.d[Input0Desc.dims.nbDims - 2];
k = Input0Desc.dims.d[Input0Desc.dims.nbDims - 1];
}
if (transA_) {
n = Input1Desc.dims.d[Input1Desc.dims.nbDims - 2];
} else {
n = Input1Desc.dims.d[Input1Desc.dims.nbDims - 1];
}
int batch = 1;
for (int i = 0; i < Input0Desc.dims.nbDims - 2; i++) {
batch *= Input0Desc.dims.d[i];
}
int const ldatransform = 32 * n;
int const ldbtransform = 32 * ((m + 8 - 1) / 8 * 8);
int const ldctransform = 32 * n;
int64_t stridea = k * n;
int64_t strideb = k * m;
int64_t stridec = m * n;
cublasOperation_t AopTranspose, BopTranspose;
if (transA_) {
AopTranspose = CUBLAS_OP_T;
} else {
AopTranspose = CUBLAS_OP_N;
}
if (transB_) {
BopTranspose = CUBLAS_OP_T;
} else {
BopTranspose = CUBLAS_OP_N;
}
cublasLtMatrixLayout_t Adesc{nullptr}, Bdesc{nullptr}, Cdesc{nullptr};
cublasLtMatmulDesc_t matmulDesc{nullptr};
cublasLtMatrixLayout_t AtransformDesc{nullptr}, BtransformDesc{nullptr},
CtransformDesc{nullptr};
int8_t *Atransform{nullptr}, *Btransform{nullptr}, *Ctransform{nullptr};
cublasLtMatrixTransformDesc_t transformDescT{nullptr},
transformDescN{nullptr};
if (type_ == nvinfer1::DataType::kINT8) {
cudaDataType_t cudadataTypeIO = CUDA_R_8I;
cudaDataType_t cudaDataTypeS = CUDA_R_32F;
#if CUBLAS_VER_MAJOR < 11
cudaDataType_t cudaComputeType = CUDA_R_32I;
#else
cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I;
#endif
cublasLtOrder_t COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C;
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k,
AopTranspose == CUBLAS_OP_N ? k : n,
AopTranspose == CUBLAS_OP_N ? n : k));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m,
BopTranspose == CUBLAS_OP_N ? m : k,
BopTranspose == CUBLAS_OP_N ? k : m));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
sizeof(stridec)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&AtransformDesc, cudadataTypeIO, n, k, ldatransform));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&BtransformDesc, cudadataTypeIO, m, k, ldbtransform));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C,
sizeof(COL4_4R2_8C)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&CtransformDesc, cudadataTypeIO, n, m, ldctransform));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32)));
cublasOperation_t Transpose = CUBLAS_OP_T;
cublasLtPointerMode_t transform_model = CUBLASLT_POINTER_MODE_DEVICE;
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatrixTransformDescCreate(&transformDescT, cudaDataTypeS));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS, sizeof(cudaDataTypeS)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose,
sizeof(Transpose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model, sizeof(transform_model)));
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatrixTransformDescCreate(&transformDescN, cudaDataTypeS));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
&cudaDataTypeS, sizeof(cudaDataTypeS)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute(
transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
&transform_model, sizeof(transform_model)));
cublasOperation_t ATranspose = CUBLAS_OP_N, BTranspose = CUBLAS_OP_T;
cublasLtPointerMode_t matmul_model =
CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
#if CUBLAS_VER_MAJOR < 11
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate(
&matmulDesc, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose,
sizeof(ATranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose,
sizeof(BTranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
sizeof(matmul_model)));
const int8_t* B = static_cast<const int8_t*>(inputs[0]);
const int8_t* A = static_cast<const int8_t*>(inputs[1]);
int8_t* C = static_cast<int8_t*>(outputs[0]);
Ltgemm_int8_linear(cublas_, A, Adesc, Atransform_, AtransformDesc, transA_,
B, Bdesc, Btransform_, BtransformDesc, transB_, C, Cdesc,
Ctransform_, CtransformDesc, transformDescT,
transformDescN, matmulDesc, alpha_scale_, alpha_zero_,
alpha_one_, workspace, stream);
} else if (type_ == nvinfer1::DataType::kHALF) {
cudaDataType_t cudadataTypeIO = CUDA_R_16F;
cudaDataType_t cudaDataTypeS = CUDA_R_16F;
#if CUBLAS_VER_MAJOR < 11
cudaDataType_t cudaComputeType = CUDA_R_16F;
#else
cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F;
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k,
AopTranspose == CUBLAS_OP_N ? k : n,
AopTranspose == CUBLAS_OP_N ? n : k));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m,
BopTranspose == CUBLAS_OP_N ? m : k,
BopTranspose == CUBLAS_OP_N ? k : m));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
sizeof(stridec)));
cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
#if CUBLAS_VER_MAJOR < 11
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate(
&matmulDesc, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose,
sizeof(AopTranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose,
sizeof(BopTranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
sizeof(matmul_model)));
const half* B = static_cast<const half*>(inputs[0]);
const half* A = static_cast<const half*>(inputs[1]);
half* C = static_cast<half*>(outputs[0]);
Ltgemm_fp16_linear(cublas_, A, Adesc, B, Bdesc, C, Cdesc, matmulDesc,
alpha_scale_, alpha_zero_, workspace, stream);
} else {
cudaDataType_t cudadataTypeIO = CUDA_R_32F;
cudaDataType_t cudaDataTypeS = CUDA_R_32F;
#if CUBLAS_VER_MAJOR < 11
cudaDataType_t cudaComputeType = CUDA_R_32F;
#else
cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F;
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k,
AopTranspose == CUBLAS_OP_N ? k : n,
AopTranspose == CUBLAS_OP_N ? n : k));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea),
sizeof(stridea)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate(
&Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m,
BopTranspose == CUBLAS_OP_N ? m : k,
BopTranspose == CUBLAS_OP_N ? k : m));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb),
sizeof(strideb)));
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO,
sizeof(cudadataTypeIO)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec),
sizeof(stridec)));
cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE;
#if CUBLAS_VER_MAJOR < 11
PADDLE_ENFORCE_CUDA_SUCCESS(
dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate(
&matmulDesc, cudaComputeType, cudaDataTypeS));
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose,
sizeof(AopTranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose,
sizeof(BopTranspose)));
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model,
sizeof(matmul_model)));
const float* B = static_cast<const float*>(inputs[0]);
const float* A = static_cast<const float*>(inputs[1]);
float* C = static_cast<float*>(outputs[0]);
Ltgemm_fp32_linear(cublas_, A, Adesc, B, Bdesc, C, Cdesc, matmulDesc,
alpha_scale_, alpha_zero_, workspace, stream);
}
return cudaGetLastError() != cudaSuccess;
}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cassert>
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/platform/dynload/cublasLt.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class MatmulPlugin : public nvinfer1::IPluginV2IOExt {
public:
MatmulPlugin(nvinfer1::Dims const& dims_x, nvinfer1::Dims const& dims_y,
bool transA, bool transB, float alpha)
: dims_x_(dims_x),
dims_y_(dims_y),
transB_(transA),
transA_(transB),
alpha_(alpha) {}
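// Note: transA/transB are intentionally stored swapped (transB_ keeps the
// transpose flag of X, transA_ that of Y), matching the column-major operand
// binding used in enqueue(), where X is the cuBLASLt "B" operand and Y the
// "A" operand.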
MatmulPlugin(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &dims_x_);
DeserializeValue(&serial_data, &serial_length, &dims_y_);
DeserializeValue(&serial_data, &serial_length, &transB_);
DeserializeValue(&serial_data, &serial_length, &transA_);
DeserializeValue(&serial_data, &serial_length, &alpha_);
DeserializeValue(&serial_data, &serial_length, &alpha_scale_);
DeserializeValue(&serial_data, &serial_length, &alpha_one_);
DeserializeValue(&serial_data, &serial_length, &alpha_zero_);
DeserializeValue(&serial_data, &serial_length, &batch_);
DeserializeValue(&serial_data, &serial_length, &k_);
DeserializeValue(&serial_data, &serial_length, &m_);
DeserializeValue(&serial_data, &serial_length, &n_);
DeserializeValue(&serial_data, &serial_length, &cublas_);
DeserializeValue(&serial_data, &serial_length, &type_);
DeserializeValue(&serial_data, &serial_length, &Adesc_);
DeserializeValue(&serial_data, &serial_length, &Bdesc_);
DeserializeValue(&serial_data, &serial_length, &Cdesc_);
DeserializeValue(&serial_data, &serial_length, &AtransformDesc_);
DeserializeValue(&serial_data, &serial_length, &BtransformDesc_);
DeserializeValue(&serial_data, &serial_length, &CtransformDesc_);
DeserializeValue(&serial_data, &serial_length, &Atransform_);
DeserializeValue(&serial_data, &serial_length, &Btransform_);
DeserializeValue(&serial_data, &serial_length, &Ctransform_);
DeserializeValue(&serial_data, &serial_length, &transformDescT_);
DeserializeValue(&serial_data, &serial_length, &transformDescN_);
DeserializeValue(&serial_data, &serial_length, &matmulDesc_);
}
virtual bool isOutputBroadcastAcrossBatch(
int32_t output_index, const bool* input_is_broadcasted,
int32_t nb_inputs) const TRT_NOEXCEPT {
return false;
}
virtual bool canBroadcastInputAcrossBatch(int32_t input_index) const
TRT_NOEXCEPT {
return false;
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
size_t getWorkspaceSize(int) const TRT_NOEXCEPT override { return 0; }
void setPluginNamespace(const char* plugin_namespace) TRT_NOEXCEPT override {
name_space_ = plugin_namespace;
}
nvinfer1::IPluginV2IOExt* clone() const TRT_NOEXCEPT override {
MatmulPlugin* ptr =
new MatmulPlugin(dims_x_, dims_y_, transB_, transA_, alpha_);
ptr->setPluginNamespace(this->getPluginNamespace());
ptr->batch_ = batch_;
ptr->k_ = k_;
ptr->m_ = m_;
ptr->n_ = n_;
ptr->alpha_scale_ = alpha_scale_;
ptr->alpha_one_ = alpha_one_;
ptr->alpha_zero_ = alpha_zero_;
ptr->cublas_ = cublas_;
ptr->type_ = type_;
ptr->Adesc_ = Adesc_;
ptr->Bdesc_ = Bdesc_;
ptr->Cdesc_ = Cdesc_;
ptr->AtransformDesc_ = AtransformDesc_;
ptr->BtransformDesc_ = BtransformDesc_;
ptr->CtransformDesc_ = CtransformDesc_;
ptr->Atransform_ = Atransform_;
ptr->Btransform_ = Btransform_;
ptr->Ctransform_ = Ctransform_;
ptr->transformDescT_ = transformDescT_;
ptr->transformDescN_ = transformDescN_;
ptr->matmulDesc_ = matmulDesc_;
return ptr;
}
const char* getPluginNamespace() const TRT_NOEXCEPT override {
return name_space_.c_str();
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "matmul_int8_plugin";
}
nvinfer1::DataType getOutputDataType(
int index, const nvinfer1::DataType* input_types,
int nb_inputs) const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims* input_dims,
int num_inputs) TRT_NOEXCEPT override;
bool supportsFormatCombination(int32_t pos,
nvinfer1::PluginTensorDesc const* inOut,
int32_t nbInputs,
int32_t nbOutputs) const TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::PluginTensorDesc* in, int32_t nbInputs,
const nvinfer1::PluginTensorDesc* out,
int32_t nbOutputs) TRT_NOEXCEPT override;
/*
bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format)
const TRT_NOEXCEPT override;
*/
int initialize() TRT_NOEXCEPT { return 0; }
void terminate() TRT_NOEXCEPT;
#if IS_TRT_VERSION_LT(8000)
int enqueue(int batch_size, const void* const* inputs, void** outputs,
#else
int enqueue(int batch_size, const void* const* inputs, void* const* outputs,
#endif
void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override { delete this; }
void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator)
TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
protected:
nvinfer1::Dims dims_x_;
nvinfer1::Dims dims_y_;
bool transB_;
bool transA_;
float alpha_;
void *alpha_scale_{nullptr}, *alpha_one_{nullptr}, *alpha_zero_{nullptr};
int batch_;
uint64_t k_;
uint64_t m_;
uint64_t n_;
cublasLtHandle_t cublas_{nullptr};
nvinfer1::DataType type_;
cublasLtMatrixLayout_t Adesc_{nullptr}, Bdesc_{nullptr}, Cdesc_{nullptr};
cublasLtMatrixLayout_t AtransformDesc_{nullptr}, BtransformDesc_{nullptr},
CtransformDesc_{nullptr};
int8_t *Atransform_{nullptr}, *Btransform_{nullptr}, *Ctransform_{nullptr};
cublasLtMatrixTransformDesc_t transformDescT_{nullptr},
transformDescN_{nullptr};
cublasLtMatmulDesc_t matmulDesc_{nullptr};
std::string name_space_;
size_t getSerializationSize() const TRT_NOEXCEPT override {
return SerializedSize(dims_x_) + SerializedSize(dims_y_) +
SerializedSize(transB_) + SerializedSize(transA_) +
SerializedSize(alpha_) + SerializedSize(alpha_scale_) +
SerializedSize(alpha_one_) + SerializedSize(alpha_zero_) +
SerializedSize(batch_) + SerializedSize(k_) + SerializedSize(m_) +
SerializedSize(n_) + SerializedSize(cublas_) +
SerializedSize(type_) + SerializedSize(Adesc_) +
SerializedSize(Bdesc_) + SerializedSize(Cdesc_) +
SerializedSize(AtransformDesc_) + SerializedSize(BtransformDesc_) +
SerializedSize(CtransformDesc_) + SerializedSize(Atransform_) +
SerializedSize(Btransform_) + SerializedSize(Ctransform_) +
SerializedSize(transformDescT_) + SerializedSize(transformDescN_) +
SerializedSize(matmulDesc_);
}
void serialize(void* buffer) const TRT_NOEXCEPT override {
SerializeValue(&buffer, dims_x_);
SerializeValue(&buffer, dims_y_);
SerializeValue(&buffer, transB_);
SerializeValue(&buffer, transA_);
SerializeValue(&buffer, alpha_);
SerializeValue(&buffer, alpha_scale_);
SerializeValue(&buffer, alpha_one_);
SerializeValue(&buffer, alpha_zero_);
SerializeValue(&buffer, batch_);
SerializeValue(&buffer, k_);
SerializeValue(&buffer, m_);
SerializeValue(&buffer, n_);
SerializeValue(&buffer, cublas_);
SerializeValue(&buffer, type_);
SerializeValue(&buffer, Adesc_);
SerializeValue(&buffer, Bdesc_);
SerializeValue(&buffer, Cdesc_);
SerializeValue(&buffer, AtransformDesc_);
SerializeValue(&buffer, BtransformDesc_);
SerializeValue(&buffer, CtransformDesc_);
SerializeValue(&buffer, Atransform_);
SerializeValue(&buffer, Btransform_);
SerializeValue(&buffer, Ctransform_);
SerializeValue(&buffer, transformDescT_);
SerializeValue(&buffer, transformDescN_);
SerializeValue(&buffer, matmulDesc_);
}
};
class MatmulPluginCreator : public nvinfer1::IPluginCreator {
public:
MatmulPluginCreator() {}
const char* getPluginName() const TRT_NOEXCEPT override {
return "matmul_int8_plugin";
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override {
return &field_collection_;
}
nvinfer1::IPluginV2IOExt* createPlugin(
const char* name,
const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override {
return nullptr;
}
nvinfer1::IPluginV2IOExt* deserializePlugin(
const char* name, void const* serial_data,
size_t serial_length) TRT_NOEXCEPT override {
MatmulPlugin* obj = new MatmulPlugin(serial_data, serial_length);
obj->setPluginNamespace(name);
return obj;
}
void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const TRT_NOEXCEPT override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(MatmulPluginCreator);
#if IS_TRT_VERSION_GE(6000)
class MatmulPluginDynamic : public DynamicPluginTensorRT {
public:
MatmulPluginDynamic(bool transA, bool transB, float alpha)
: transB_(transA), transA_(transB), alpha_(alpha) {}
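// NOTE: transA is stored in transB_ and transB in transA_; clone() below passes
// (transB_, transA_) back into this constructor, so the swap round-trips
// consistently.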
MatmulPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &transB_);
DeserializeValue(&serial_data, &serial_length, &transA_);
DeserializeValue(&serial_data, &serial_length, &alpha_);
DeserializeValue(&serial_data, &serial_length, &alpha_scale_);
DeserializeValue(&serial_data, &serial_length, &alpha_one_);
DeserializeValue(&serial_data, &serial_length, &alpha_zero_);
DeserializeValue(&serial_data, &serial_length, &Atransform_);
DeserializeValue(&serial_data, &serial_length, &Btransform_);
DeserializeValue(&serial_data, &serial_length, &Ctransform_);
DeserializeValue(&serial_data, &serial_length, &cublas_);
DeserializeValue(&serial_data, &serial_length, &type_);
}
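// NOTE: the read order above must mirror the write order in serialize() below.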
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
MatmulPluginDynamic* ptr =
new MatmulPluginDynamic(transB_, transA_, alpha_);
ptr->setPluginNamespace(this->getPluginNamespace());
ptr->alpha_scale_ = alpha_scale_;
ptr->alpha_one_ = alpha_one_;
ptr->alpha_zero_ = alpha_zero_;
ptr->cublas_ = cublas_;
ptr->Atransform_ = Atransform_;
ptr->Btransform_ = Btransform_;
ptr->Ctransform_ = Ctransform_;
ptr->type_ = type_;
return ptr;
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "matmul_int8_dynamic_plugin";
}
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
int initialize() TRT_NOEXCEPT { return 0; }
void terminate() TRT_NOEXCEPT;
nvinfer1::DimsExprs getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT override {
return 0;
}
void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator)
TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) TRT_NOEXCEPT override;
nvinfer1::DataType getOutputDataType(
int index, const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override { delete this; }
protected:
bool transB_;
bool transA_;
float alpha_;
void *alpha_scale_{nullptr}, *alpha_one_{nullptr}, *alpha_zero_{nullptr};
cublasLtHandle_t cublas_{nullptr};
nvinfer1::DataType type_;
int8_t *Atransform_{nullptr}, *Btransform_{nullptr}, *Ctransform_{nullptr};
std::string name_space_;
size_t getSerializationSize() const TRT_NOEXCEPT override {
return SerializedSize(transB_) + SerializedSize(transA_) +
SerializedSize(alpha_) + SerializedSize(alpha_scale_) +
SerializedSize(alpha_one_) + SerializedSize(alpha_zero_) +
SerializedSize(Atransform_) + SerializedSize(Btransform_) +
SerializedSize(Ctransform_) + SerializedSize(cublas_) +
SerializedSize(type_);
}
void serialize(void* buffer) const TRT_NOEXCEPT override {
SerializeValue(&buffer, transB_);
SerializeValue(&buffer, transA_);
SerializeValue(&buffer, alpha_);
SerializeValue(&buffer, alpha_scale_);
SerializeValue(&buffer, alpha_one_);
SerializeValue(&buffer, alpha_zero_);
SerializeValue(&buffer, Atransform_);
SerializeValue(&buffer, Btransform_);
SerializeValue(&buffer, Ctransform_);
SerializeValue(&buffer, cublas_);
SerializeValue(&buffer, type_);
}
};
class MatmulPluginDynamicCreator : public nvinfer1::IPluginCreator {
public:
MatmulPluginDynamicCreator() {}
const char* getPluginName() const TRT_NOEXCEPT override {
return "matmul_int8_dynamic_plugin";
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(const char* name,
const nvinfer1::PluginFieldCollection* fc)
TRT_NOEXCEPT override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(
const char* name, void const* serial_data,
size_t serial_length) TRT_NOEXCEPT override {
MatmulPluginDynamic* obj =
new MatmulPluginDynamic(serial_data, serial_length);
obj->setPluginNamespace(name);
return obj;
}
void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const TRT_NOEXCEPT override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(MatmulPluginDynamicCreator);
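// Illustrative sketch (not part of this header): registration makes the creator
// discoverable through TensorRT's plugin registry, e.g. during engine
// deserialization:
//   auto* creator =
//       getPluginRegistry()->getPluginCreator("matmul_int8_dynamic_plugin", "1");
//   nvinfer1::IPluginV2* plugin =
//       creator->deserializePlugin(layer_name, serial_data, serial_length);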
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
if (NOT WITH_NV_JETSON) if (NOT WITH_NV_JETSON)
list(APPEND CUDA_SRCS nvjpeg.cc) list(APPEND CUDA_SRCS nvjpeg.cc)
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cublasLt.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cublasLt_dso_flag;
void *cublasLt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUBLASLT_BLAS_ROUTINE_EACH(DEFINE_WRAP);
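// Expands to one global DynLoad__<name> wrapper object per routine listed in
// CUBLASLT_BLAS_ROUTINE_EACH; the matching extern declarations live in cublasLt.h.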
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cublasLt.h>
#include <cuda.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cublasLt_dso_flag;
extern void *cublasLt_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cublasLt routine
* via operator overloading.
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = \
paddle::platform::dynload::GetCublasLtDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
// APIs available after CUDA 10.1
// #if CUDA_VERSION >= 10100
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasLtCreate); \
__macro(cublasLtDestroy); \
__macro(cublasLtMatmul); \
__macro(cublasLtMatmulDescCreate); \
__macro(cublasLtMatmulDescDestroy); \
__macro(cublasLtMatmulDescSetAttribute); \
__macro(cublasLtMatrixLayoutCreate); \
__macro(cublasLtMatrixLayoutDestroy); \
__macro(cublasLtMatrixLayoutSetAttribute); \
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
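// Illustrative usage sketch (not part of this header): the generated wrappers are
// called exactly like the native cuBLASLt API, e.g.
//   cublasLtHandle_t handle;
//   paddle::platform::dynload::cublasLtCreate(&handle);
//   ...
//   paddle::platform::dynload::cublasLtDestroy(handle);
// The first call dlopen()s libcublasLt.so via GetCublasLtDsoHandle(); each symbol
// is then resolved once with dlsym() and cached in a function-local static.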
} // namespace dynload
} // namespace platform
} // namespace paddle
...@@ -30,8 +30,9 @@ DEFINE_string(cudnn_dir, "", ...@@ -30,8 +30,9 @@ DEFINE_string(cudnn_dir, "",
"/usr/local/cudnn/lib. If empty [default], dlopen " "/usr/local/cudnn/lib. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH"); "will search cudnn from LD_LIBRARY_PATH");
DEFINE_string(cuda_dir, "", DEFINE_string(
"Specify path for loading cuda library, such as libcublas, " cuda_dir, "",
"Specify path for loading cuda library, such as libcublas, libcublasLt "
"libcurand, libcusolver. For instance, /usr/local/cuda/lib64. " "libcurand, libcusolver. For instance, /usr/local/cuda/lib64. "
"If default, dlopen will search cuda from LD_LIBRARY_PATH"); "If default, dlopen will search cuda from LD_LIBRARY_PATH");
...@@ -308,6 +309,19 @@ void* GetCublasDsoHandle() { ...@@ -308,6 +309,19 @@ void* GetCublasDsoHandle() {
#endif #endif
} }
void* GetCublasLtDsoHandle() {
// APIs available after CUDA 10.1
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
#else
std::string warning_msg(
"Your CUDA_VERSION less 10.1, not support CublasLt. "
"If you want to use CublasLt, please upgrade CUDA and rebuild "
"PaddlePaddle.");
return nullptr;
#endif
}
void* GetCUDNNDsoHandle() { void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
std::string mac_warn_meg( std::string mac_warn_meg(
......
...@@ -26,6 +26,7 @@ namespace dynload { ...@@ -26,6 +26,7 @@ namespace dynload {
#endif #endif
void* GetCublasDsoHandle(); void* GetCublasDsoHandle();
void* GetCublasLtDsoHandle();
void* GetCUDNNDsoHandle(); void* GetCUDNNDsoHandle();
void* GetCUPTIDsoHandle(); void* GetCUPTIDsoHandle();
void* GetCurandDsoHandle(); void* GetCurandDsoHandle();
......
...@@ -1997,10 +1997,12 @@ function gen_dockerfile() { ...@@ -1997,10 +1997,12 @@ function gen_dockerfile() {
DOCKERFILE_GPU_ENV="" DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO="" DOCKERFILE_CUDNN_DSO=""
DOCKERFILE_CUBLAS_DSO="" DOCKERFILE_CUBLAS_DSO=""
DOCKERFILE_CUBLASLT_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}" DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -sf /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so" DOCKERFILE_CUDNN_DSO="RUN ln -sf /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so"
DOCKERFILE_CUBLAS_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.${CUDA_MAJOR} /usr/lib/x86_64-linux-gnu/libcublas.so" DOCKERFILE_CUBLAS_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.${CUDA_MAJOR} /usr/lib/x86_64-linux-gnu/libcublas.so"
DOCKERFILE_CUBLASLT_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublasLt.so /usr/lib/x86_64-linux-gnu/libcublasLt.so"
fi fi
cat <<EOF cat <<EOF
...@@ -2090,6 +2092,7 @@ EOF ...@@ -2090,6 +2092,7 @@ EOF
ldconfig ldconfig
${DOCKERFILE_CUDNN_DSO} ${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_CUBLAS_DSO} ${DOCKERFILE_CUBLAS_DSO}
${DOCKERFILE_CUBLASLT_DSO}
${DOCKERFILE_GPU_ENV} ${DOCKERFILE_GPU_ENV}
EOF EOF
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -77,7 +77,7 @@ class TensorRTMatMulQuantDequantDims3Test(QuantDequantTest): ...@@ -77,7 +77,7 @@ class TensorRTMatMulQuantDequantDims3Test(QuantDequantTest):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
use_gpu = True use_gpu = True
self.check_output_with_option( self.check_output_with_option(
use_gpu, atol=1e-1, flatten=False, rtol=1e-1) use_gpu, atol=1, flatten=False, rtol=1e-1)
self.assertTrue( self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
...@@ -87,7 +87,7 @@ class TensorRTMatMulQuantDequantDims3TransposeXTest( ...@@ -87,7 +87,7 @@ class TensorRTMatMulQuantDequantDims3TransposeXTest(
def set_params(self): def set_params(self):
self.transpose_x = True self.transpose_x = True
self.transpose_y = False self.transpose_y = False
self.alpha = 1.0 self.alpha = 2.1
class TensorRTMatMulQuantDequantDims3TransposeYTest( class TensorRTMatMulQuantDequantDims3TransposeYTest(
...@@ -95,7 +95,7 @@ class TensorRTMatMulQuantDequantDims3TransposeYTest( ...@@ -95,7 +95,7 @@ class TensorRTMatMulQuantDequantDims3TransposeYTest(
def set_params(self): def set_params(self):
self.transpose_x = False self.transpose_x = False
self.transpose_y = True self.transpose_y = True
self.alpha = 1.0 self.alpha = 3.9
class TensorRTMatMulQuantDequantDims3TransposeXYTest( class TensorRTMatMulQuantDequantDims3TransposeXYTest(
...@@ -103,7 +103,7 @@ class TensorRTMatMulQuantDequantDims3TransposeXYTest( ...@@ -103,7 +103,7 @@ class TensorRTMatMulQuantDequantDims3TransposeXYTest(
def set_params(self): def set_params(self):
self.transpose_x = True self.transpose_x = True
self.transpose_y = True self.transpose_y = True
self.alpha = 1.0 self.alpha = 8.4
class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest): class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest):
...@@ -163,7 +163,7 @@ class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest): ...@@ -163,7 +163,7 @@ class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
use_gpu = True use_gpu = True
self.check_output_with_option( self.check_output_with_option(
use_gpu, atol=1e-1, flatten=False, rtol=1e-1) use_gpu, atol=1, flatten=False, rtol=1e-1)
self.assertTrue( self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
...@@ -173,7 +173,7 @@ class TensorRTMatMulQuantDequantDims4TransposeXTest( ...@@ -173,7 +173,7 @@ class TensorRTMatMulQuantDequantDims4TransposeXTest(
def set_params(self): def set_params(self):
self.transpose_x = True self.transpose_x = True
self.transpose_y = False self.transpose_y = False
self.alpha = 1.0 self.alpha = 3.2
class TensorRTMatMulQuantDequantDims4TransposeYTest( class TensorRTMatMulQuantDequantDims4TransposeYTest(
...@@ -181,7 +181,7 @@ class TensorRTMatMulQuantDequantDims4TransposeYTest( ...@@ -181,7 +181,7 @@ class TensorRTMatMulQuantDequantDims4TransposeYTest(
def set_params(self): def set_params(self):
self.transpose_x = False self.transpose_x = False
self.transpose_y = True self.transpose_y = True
self.alpha = 1.0 self.alpha = 7.5
class TensorRTMatMulQuantDequantDims4TransposeXYTest( class TensorRTMatMulQuantDequantDims4TransposeXYTest(
...@@ -189,16 +189,97 @@ class TensorRTMatMulQuantDequantDims4TransposeXYTest( ...@@ -189,16 +189,97 @@ class TensorRTMatMulQuantDequantDims4TransposeXYTest(
def set_params(self): def set_params(self):
self.transpose_x = True self.transpose_x = True
self.transpose_y = True self.transpose_y = True
self.alpha = 1.0 self.alpha = 11.2
class TensorRTMatMulQuantDequantDims4ScaleTest( class TensorRTMatMulQuantDequantDims3DynamicTest(QuantDequantTest):
TensorRTMatMulQuantDequantDims4Test): def setUp(self):
self.set_params()
def network():
self.data = fluid.data(
name='data', shape=[-1, 28, 28], dtype='float32')
self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
matmul_out = fluid.layers.matmul(
x=self.data,
y=self.data,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
alpha=self.alpha)
out = fluid.layers.batch_norm(matmul_out, is_test=True)
fc_out = fluid.layers.fc(input=matmul_out,
size=10,
num_flatten_dims=1,
bias_attr=False,
act=None)
result = fluid.layers.relu(fc_out)
loss = fluid.layers.cross_entropy(input=result, label=self.label)
avg_loss = fluid.layers.mean(loss)
return avg_loss, result
self.main_program.random_seed = 2
self.startup_program.random_seed = 2
self.test_main_program.random_seed = 2
#self.test_startup_program.random_seed = 2
with fluid.unique_name.guard():
with fluid.program_guard(self.main_program, self.startup_program):
self.loss, result = network()
opt = fluid.optimizer.Adam(learning_rate=0.0001)
opt.minimize(self.loss)
with fluid.unique_name.guard():
with fluid.program_guard(self.test_main_program,
self.startup_program):
network()
self.feeds = {"data": np.random.random([3, 28, 28]).astype("float32")}
self.fetch_list = [result]
self.enable_trt = True
self.trt_parameters = TensorRTMatMulQuantDequantDims3DynamicTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Int8, False, False)
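# TensorRTParam above (assumed field order): 1 GB workspace, max batch size 32,
# min subgraph size 0, Int8 precision, use_static=False, use_calib_mode=False.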
self.dynamic_shape_params = TensorRTMatMulQuantDequantDims3DynamicTest.DynamicShapeParam(
{
'data': [1, 28, 28]
}, {'data': [4, 28, 28]}, {'data': [3, 28, 28]}, False)
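# Dynamic-shape profile above (assumed min/max/opt order): batch 1 to 4 with
# optimum 3, all shaped [*, 28, 28]. Int8 scales come from the quant/dequant
# pass configured below rather than from TensorRT calibration.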
self.activation_quantize_type = 'moving_average_abs_max'
self.weight_quantize_type = 'channel_wise_abs_max'
def set_params(self): def set_params(self):
self.transpose_x = False self.transpose_x = False
self.transpose_y = False self.transpose_y = False
self.alpha = 1.0
def test_check_output(self):
#self.quant_dequant()
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(
use_gpu, atol=1, flatten=False, rtol=1e-1)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
class TensorRTMatMulQuantDequantDims4TransposeXDynamicTest(
TensorRTMatMulQuantDequantDims3DynamicTest):
def set_params(self):
self.transpose_x = True
self.transpose_y = False
self.alpha = 2.0 self.alpha = 2.0
class TensorRTMatMulQuantDequantDims4TransposeYDynamicTest(
TensorRTMatMulQuantDequantDims3DynamicTest):
def set_params(self):
self.transpose_x = False
self.transpose_y = True
self.alpha = 2.2
class TensorRTMatMulQuantDequantDims4TransposeXYDynamicTest(
TensorRTMatMulQuantDequantDims3DynamicTest):
def set_params(self):
self.transpose_x = True
self.transpose_y = True
self.alpha = 7.8
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()