Unverified · Commit 16590799 authored by Wangzheee, committed by GitHub

[Paddle-Inference] Matmul_int8_convert: tensor*tensor (#37285)

* matmul_convert_int8

* matmul_convert_int8

* matmulconvert_int8

* Matmul_int8_convert: tensor*tensor

* Matmul_int8_convert: tensor*tensor

* Matmul_int8_convert: tensor*tensor
Parent 025053b4
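For orientation (not part of the diff): the converter below takes the new INT8 plugin path only when TensorRT INT8 precision is enabled on the inference config. A minimal sketch using the paddle_infer C++ API; the include path, model file names, and shapes are placeholders:

#include <map>
#include <string>
#include <vector>
#include "paddle_inference_api.h"  // placeholder include path

int main() {
  paddle_infer::Config config("model.pdmodel", "model.pdiparams");
  config.EnableUseGpu(100 /* initial GPU memory in MB */, 0 /* device id */);
  // kInt8 precision plus the op's "support_int8" attribute route quantized
  // matmul through MatmulPlugin (static shape) or MatmulPluginDynamic
  // (dynamic shape) instead of the native MatrixMultiply layer.
  config.EnableTensorRtEngine(1 << 30 /* workspace */, 32 /* max batch */,
                              0 /* min subgraph size */,
                              paddle_infer::PrecisionType::kInt8,
                              false /* use_static */,
                              false /* use_calib_mode */);
  // Dynamic-shape setup mirroring the new Python test at the bottom.
  std::map<std::string, std::vector<int>> min_shape{{"data", {1, 28, 28}}};
  std::map<std::string, std::vector<int>> max_shape{{"data", {4, 28, 28}}};
  std::map<std::string, std::vector<int>> opt_shape{{"data", {3, 28, 28}}};
  config.SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);
  auto predictor = paddle_infer::CreatePredictor(config);
  return predictor != nullptr ? 0 : 1;
}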
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h"
namespace paddle {
namespace framework {
......@@ -35,16 +36,26 @@ class MatMulOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(3) << "convert a fluid matmul op to tensorrt mul layer without bias";
VLOG(3) << "convert a fluid matmul op to tensorrt matmul layer ";
framework::OpDesc op_desc(op, nullptr);
nvinfer1::ILayer* layer = nullptr;
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
nvinfer1::Dims dims_x = input1->getDimensions();
nvinfer1::Dims dims_y = input2->getDimensions();
bool transpose_X = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_X"));
bool transpose_Y = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_Y"));
auto output_name = op_desc.Output("Out")[0];
float alpha = 1;
if (op_desc.HasAttr("alpha")) {
float alpha_tem = BOOST_GET_CONST(float, op_desc.GetAttr("alpha"));
alpha = alpha_tem;
}
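// matmul computes Out = alpha * op(X) * op(Y), where op() is the optional
// transpose selected by transpose_X / transpose_Y; alpha defaults to 1 when
// the attribute is absent.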
nvinfer1::MatrixOperation matrix_operation_X =
transpose_X ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
......@@ -52,82 +63,122 @@ class MatMulOpConverter : public OpConverter {
transpose_Y ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
auto* layer =
TRT_ENGINE_ADD_LAYER(engine_, MatrixMultiply, *input1,
matrix_operation_X, *input2, matrix_operation_Y);
float alpha = BOOST_GET_CONST(float, op_desc.GetAttr("alpha"));
auto output_name = op_desc.Output("Out")[0];
if (fabs(alpha - 1.0) < std::numeric_limits<float>::epsilon()) {
engine_->SetITensor(output_name, layer->getOutput(0));
} else {
// IScaleLayer requires the input must have at least
// three dimensions in static shape mode and at least
// four dimensions in dynamic shape mode.
auto* matmul_out = layer->getOutput(0);
nvinfer1::Dims out_shape = matmul_out->getDimensions();
const int out_dims = out_shape.nbDims;
bool need_change_dim = false;
if (op_desc.HasAttr("support_int8") &&
engine_->precision() == AnalysisConfig::Precision::kInt8) {
if (engine_->with_dynamic_shape()) {
if (out_dims == 3) {
need_change_dim = true;
}
VLOG(3) << "Convert a fluid matmul_op_int8_dynamic to TensorRT "
"MatmulPluginLayer";
plugin::MatmulPluginDynamic* plugin =
new plugin::MatmulPluginDynamic(transpose_X, transpose_Y, alpha);
std::vector<nvinfer1::ITensor*> inputs{input1, input2};
layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin);
RreplenishLayerAndOutput(layer, "matmul_op_int8_dynamic", {output_name},
test_mode);
} else {
if (out_dims == 2) {
need_change_dim = true;
}
VLOG(3) << "Convert a fluid matmul_op_int8_static to TensorRT "
"MatmulPluginLayer";
plugin::MatmulPlugin* plugin = new plugin::MatmulPlugin(
dims_x, dims_y, transpose_X, transpose_Y, alpha);
std::vector<nvinfer1::ITensor*> inputs{input1, input2};
layer = engine_->AddPluginV2IOExt(inputs.data(), inputs.size(), plugin);
RreplenishLayerAndOutput(layer, "matmul_op_int8_static", {output_name},
test_mode);
}
if (need_change_dim) {
nvinfer1::Dims reshape_dim;
reshape_dim.nbDims = out_dims + 1;
reshape_dim.d[out_dims] = 1;
for (int i = 0; i < out_dims; i++) {
reshape_dim.d[i] = out_shape.d[i];
} else {
VLOG(3) << "Convert a fluid matmul_op_float to TensorRT ";
layer =
TRT_ENGINE_ADD_LAYER(engine_, MatrixMultiply, *input1,
matrix_operation_X, *input2, matrix_operation_Y);
if (alpha == 1) {
RreplenishLayerAndOutput(layer, "matmul_op_float_no_alpha",
{output_name}, test_mode);
} else {
layer->setName(
("matmul_op_float_has_alpha: MatrixMultiplyLayer (Output: " +
output_name + ")")
.c_str());
// IScaleLayer requires the input must have at least
// three dimensions in static shape mode and at least
// four dimensions in dynamic shape mode.
auto* matmul_out = layer->getOutput(0);
nvinfer1::Dims out_shape = matmul_out->getDimensions();
const int out_dims = out_shape.nbDims;
bool need_change_dim = false;
if (engine_->with_dynamic_shape()) {
if (out_dims == 3) {
need_change_dim = true;
}
} else {
if (out_dims == 2) {
need_change_dim = true;
}
}
auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *matmul_out);
reshape_layer->setReshapeDimensions(reshape_dim);
matmul_out = reshape_layer->getOutput(0);
}
if (need_change_dim) {
nvinfer1::Dims reshape_dim;
reshape_dim.nbDims = out_dims + 1;
reshape_dim.d[out_dims] = 1;
for (int i = 0; i < out_dims; i++) {
reshape_dim.d[i] = out_shape.d[i];
}
auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *matmul_out);
reshape_layer->setReshapeDimensions(reshape_dim);
matmul_out = reshape_layer->getOutput(0);
reshape_layer->setName(("matmul_op_float_has_alpha_reshape_before: "
"ShuffleLayer (Output: " +
output_name + ")")
.c_str());
}
auto create_weights = [&](float data, const std::string& type) -> float* {
std::unique_ptr<framework::Tensor> tmp_tensor(new framework::Tensor());
tmp_tensor->Resize({1});
auto* tmp_data = tmp_tensor->mutable_data<float>(platform::CPUPlace());
tmp_data[0] = data;
engine_->SetWeights(output_name + "_add_scale_op_" + type,
std::move(tmp_tensor));
return tmp_data;
};
float* alpha_data = create_weights(alpha, "alpha");
float* shift_data = create_weights(0.0, "shift");
float* power_data = create_weights(1.0, "power");
TensorRTEngine::Weight nv_alpha{nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data), 1};
TensorRTEngine::Weight nv_shift{nvinfer1::DataType::kFLOAT,
static_cast<void*>(shift_data), 1};
TensorRTEngine::Weight nv_power{nvinfer1::DataType::kFLOAT,
static_cast<void*>(power_data), 1};
auto* scale_layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *matmul_out, nvinfer1::ScaleMode::kUNIFORM,
nv_shift.get(), nv_alpha.get(), nv_power.get());
auto* scale_out = scale_layer->getOutput(0);
if (need_change_dim) {
auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scale_out);
reshape_layer->setReshapeDimensions(out_shape);
scale_out = reshape_layer->getOutput(0);
auto create_weights = [&](float data,
const std::string& type) -> float* {
std::unique_ptr<framework::Tensor> tmp_tensor(
new framework::Tensor());
tmp_tensor->Resize({1});
auto* tmp_data =
tmp_tensor->mutable_data<float>(platform::CPUPlace());
tmp_data[0] = data;
engine_->SetWeights(output_name + "_add_scale_op_" + type,
std::move(tmp_tensor));
return tmp_data;
};
float* alpha_data = create_weights(alpha, "alpha");
float* shift_data = create_weights(0.0, "shift");
float* power_data = create_weights(1.0, "power");
TensorRTEngine::Weight nv_alpha{nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data), 1};
TensorRTEngine::Weight nv_shift{nvinfer1::DataType::kFLOAT,
static_cast<void*>(shift_data), 1};
TensorRTEngine::Weight nv_power{nvinfer1::DataType::kFLOAT,
static_cast<void*>(power_data), 1};
auto* scale_layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *matmul_out, nvinfer1::ScaleMode::kUNIFORM,
nv_shift.get(), nv_alpha.get(), nv_power.get());
auto* scale_out = scale_layer->getOutput(0);
scale_layer->setName(
("matmul_op_float_has_alpha: ScaleLayer (Output: " + output_name +
")")
.c_str());
if (need_change_dim) {
auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scale_out);
reshape_layer->setReshapeDimensions(out_shape);
scale_out = reshape_layer->getOutput(0);
reshape_layer->setName(("matmul_op_float_has_alpha_reshape_after: "
"ShuffleLayer (Output: " +
output_name + ")")
.c_str());
}
engine_->SetITensor(output_name, scale_out);
if (test_mode) { // the test framework cannot determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
engine_->SetITensor(output_name, scale_out);
}
if (test_mode) { // the test framework cannot determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
};
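A note on the float path above: TensorRT's IScaleLayer in kUNIFORM mode computes out = (in * scale + shift)^power elementwise, so with shift = 0, power = 1, and scale = alpha it realizes Out = alpha * (op(X) * op(Y)). Because the layer requires at least three dimensions in static-shape mode (four in dynamic-shape mode), a 2-D or 3-D matmul result is first reshaped with a trailing dimension of 1, scaled, and then reshaped back to out_shape.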
......
......@@ -12,7 +12,8 @@ nv_library(tensorrt_plugin
mish_op_plugin.cu
pool3d_op_plugin.cu
deformable_conv_op_plugin.cu
DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
matmul_op_int8_plugin.cu
DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS
paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cassert>
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/platform/dynload/cublasLt.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class MatmulPlugin : public nvinfer1::IPluginV2IOExt {
public:
MatmulPlugin(nvinfer1::Dims const& dims_x, nvinfer1::Dims const& dims_y,
bool transA, bool transB, float alpha)
: dims_x_(dims_x),
dims_y_(dims_y),
transB_(transA),
transA_(transB),
alpha_(alpha) {}
MatmulPlugin(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &dims_x_);
DeserializeValue(&serial_data, &serial_length, &dims_y_);
DeserializeValue(&serial_data, &serial_length, &transB_);
DeserializeValue(&serial_data, &serial_length, &transA_);
DeserializeValue(&serial_data, &serial_length, &alpha_);
DeserializeValue(&serial_data, &serial_length, &alpha_scale_);
DeserializeValue(&serial_data, &serial_length, &alpha_one_);
DeserializeValue(&serial_data, &serial_length, &alpha_zero_);
DeserializeValue(&serial_data, &serial_length, &batch_);
DeserializeValue(&serial_data, &serial_length, &k_);
DeserializeValue(&serial_data, &serial_length, &m_);
DeserializeValue(&serial_data, &serial_length, &n_);
DeserializeValue(&serial_data, &serial_length, &cublas_);
DeserializeValue(&serial_data, &serial_length, &type_);
DeserializeValue(&serial_data, &serial_length, &Adesc_);
DeserializeValue(&serial_data, &serial_length, &Bdesc_);
DeserializeValue(&serial_data, &serial_length, &Cdesc_);
DeserializeValue(&serial_data, &serial_length, &AtransformDesc_);
DeserializeValue(&serial_data, &serial_length, &BtransformDesc_);
DeserializeValue(&serial_data, &serial_length, &CtransformDesc_);
DeserializeValue(&serial_data, &serial_length, &Atransform_);
DeserializeValue(&serial_data, &serial_length, &Btransform_);
DeserializeValue(&serial_data, &serial_length, &Ctransform_);
DeserializeValue(&serial_data, &serial_length, &transformDescT_);
DeserializeValue(&serial_data, &serial_length, &transformDescN_);
DeserializeValue(&serial_data, &serial_length, &matmulDesc_);
}
virtual bool isOutputBroadcastAcrossBatch(
int32_t output_index, const bool* input_is_broadcasted,
int32_t nb_inputs) const TRT_NOEXCEPT {
return false;
}
virtual bool canBroadcastInputAcrossBatch(int32_t input_index) const
TRT_NOEXCEPT {
return false;
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
size_t getWorkspaceSize(int) const TRT_NOEXCEPT override { return 0; }
void setPluginNamespace(const char* plugin_namespace) TRT_NOEXCEPT override {
name_space_ = plugin_namespace;
}
nvinfer1::IPluginV2IOExt* clone() const TRT_NOEXCEPT override {
MatmulPlugin* ptr =
new MatmulPlugin(dims_x_, dims_y_, transB_, transA_, alpha_);
ptr->setPluginNamespace(this->getPluginNamespace());
ptr->batch_ = batch_;
ptr->k_ = k_;
ptr->m_ = m_;
ptr->n_ = n_;
ptr->alpha_scale_ = alpha_scale_;
ptr->alpha_one_ = alpha_one_;
ptr->alpha_zero_ = alpha_zero_;
ptr->cublas_ = cublas_;
ptr->type_ = type_;
ptr->Adesc_ = Adesc_;
ptr->Bdesc_ = Bdesc_;
ptr->Cdesc_ = Cdesc_;
ptr->AtransformDesc_ = AtransformDesc_;
ptr->BtransformDesc_ = BtransformDesc_;
ptr->CtransformDesc_ = CtransformDesc_;
ptr->Atransform_ = Atransform_;
ptr->Btransform_ = Btransform_;
ptr->Ctransform_ = Ctransform_;
ptr->transformDescT_ = transformDescT_;
ptr->transformDescN_ = transformDescN_;
ptr->matmulDesc_ = matmulDesc_;
return ptr;
}
const char* getPluginNamespace() const TRT_NOEXCEPT override {
return name_space_.c_str();
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "matmul_int8_plugin";
}
nvinfer1::DataType getOutputDataType(
int index, const nvinfer1::DataType* input_types,
int nb_inputs) const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims* input_dims,
int num_inputs) TRT_NOEXCEPT override;
bool supportsFormatCombination(int32_t pos,
nvinfer1::PluginTensorDesc const* inOut,
int32_t nbInputs,
int32_t nbOutputs) const TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::PluginTensorDesc* in, int32_t nbInputs,
const nvinfer1::PluginTensorDesc* out,
int32_t nbOutputs) TRT_NOEXCEPT override;
/*
bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format)
const TRT_NOEXCEPT override;
*/
int initialize() TRT_NOEXCEPT { return 0; }
void terminate() TRT_NOEXCEPT;
#if IS_TRT_VERSION_LT(8000)
int enqueue(int batch_size, const void* const* inputs, void** outputs,
#else
int enqueue(int batch_size, const void* const* inputs, void* const* outputs,
#endif
void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override { delete this; }
void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator)
TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
protected:
nvinfer1::Dims dims_x_;
nvinfer1::Dims dims_y_;
bool transB_;
bool transA_;
float alpha_;
void *alpha_scale_{nullptr}, *alpha_one_{nullptr}, *alpha_zero_{nullptr};
int batch_;
uint64_t k_;
uint64_t m_;
uint64_t n_;
cublasLtHandle_t cublas_{nullptr};
nvinfer1::DataType type_;
cublasLtMatrixLayout_t Adesc_{nullptr}, Bdesc_{nullptr}, Cdesc_{nullptr};
cublasLtMatrixLayout_t AtransformDesc_{nullptr}, BtransformDesc_{nullptr},
CtransformDesc_{nullptr};
int8_t *Atransform_{nullptr}, *Btransform_{nullptr}, *Ctransform_{nullptr};
cublasLtMatrixTransformDesc_t transformDescT_{nullptr},
transformDescN_{nullptr};
cublasLtMatmulDesc_t matmulDesc_{nullptr};
std::string name_space_;
size_t getSerializationSize() const TRT_NOEXCEPT override {
return SerializedSize(dims_x_) + SerializedSize(dims_y_) +
SerializedSize(transB_) + SerializedSize(transA_) +
SerializedSize(alpha_) + SerializedSize(alpha_scale_) +
SerializedSize(alpha_one_) + SerializedSize(alpha_zero_) +
SerializedSize(batch_) + SerializedSize(k_) + SerializedSize(m_) +
SerializedSize(n_) + SerializedSize(cublas_) +
SerializedSize(type_) + SerializedSize(Adesc_) +
SerializedSize(Bdesc_) + SerializedSize(Cdesc_) +
SerializedSize(AtransformDesc_) + SerializedSize(BtransformDesc_) +
SerializedSize(CtransformDesc_) + SerializedSize(Atransform_) +
SerializedSize(Btransform_) + SerializedSize(Ctransform_) +
SerializedSize(transformDescT_) + SerializedSize(transformDescN_) +
SerializedSize(matmulDesc_);
}
void serialize(void* buffer) const TRT_NOEXCEPT override {
SerializeValue(&buffer, dims_x_);
SerializeValue(&buffer, dims_y_);
SerializeValue(&buffer, transB_);
SerializeValue(&buffer, transA_);
SerializeValue(&buffer, alpha_);
SerializeValue(&buffer, alpha_scale_);
SerializeValue(&buffer, alpha_one_);
SerializeValue(&buffer, alpha_zero_);
SerializeValue(&buffer, batch_);
SerializeValue(&buffer, k_);
SerializeValue(&buffer, m_);
SerializeValue(&buffer, n_);
SerializeValue(&buffer, cublas_);
SerializeValue(&buffer, type_);
SerializeValue(&buffer, Adesc_);
SerializeValue(&buffer, Bdesc_);
SerializeValue(&buffer, Cdesc_);
SerializeValue(&buffer, AtransformDesc_);
SerializeValue(&buffer, BtransformDesc_);
SerializeValue(&buffer, CtransformDesc_);
SerializeValue(&buffer, Atransform_);
SerializeValue(&buffer, Btransform_);
SerializeValue(&buffer, Ctransform_);
SerializeValue(&buffer, transformDescT_);
SerializeValue(&buffer, transformDescN_);
SerializeValue(&buffer, matmulDesc_);
}
};
class MatmulPluginCreator : public nvinfer1::IPluginCreator {
public:
MatmulPluginCreator() {}
const char* getPluginName() const TRT_NOEXCEPT override {
return "matmul_int8_plugin";
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override {
return &field_collection_;
}
nvinfer1::IPluginV2IOExt* createPlugin(
const char* name,
const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override {
return nullptr;
}
nvinfer1::IPluginV2IOExt* deserializePlugin(
const char* name, void const* serial_data,
size_t serial_length) TRT_NOEXCEPT override {
MatmulPlugin* obj = new MatmulPlugin(serial_data, serial_length);
obj->setPluginNamespace(name);
return obj;
}
void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const TRT_NOEXCEPT override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(MatmulPluginCreator);
#if IS_TRT_VERSION_GE(6000)
class MatmulPluginDynamic : public DynamicPluginTensorRT {
public:
MatmulPluginDynamic(bool transA, bool transB, float alpha)
: transB_(transA), transA_(transB), alpha_(alpha) {}
MatmulPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &transB_);
DeserializeValue(&serial_data, &serial_length, &transA_);
DeserializeValue(&serial_data, &serial_length, &alpha_);
DeserializeValue(&serial_data, &serial_length, &alpha_scale_);
DeserializeValue(&serial_data, &serial_length, &alpha_one_);
DeserializeValue(&serial_data, &serial_length, &alpha_zero_);
DeserializeValue(&serial_data, &serial_length, &cublas_);
DeserializeValue(&serial_data, &serial_length, &Atransform_);
DeserializeValue(&serial_data, &serial_length, &Btransform_);
DeserializeValue(&serial_data, &serial_length, &Ctransform_);
DeserializeValue(&serial_data, &serial_length, &type_);
}
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
MatmulPluginDynamic* ptr =
new MatmulPluginDynamic(transB_, transA_, alpha_);
ptr->setPluginNamespace(this->getPluginNamespace());
ptr->alpha_scale_ = alpha_scale_;
ptr->alpha_one_ = alpha_one_;
ptr->alpha_zero_ = alpha_zero_;
ptr->cublas_ = cublas_;
ptr->Atransform_ = Atransform_;
ptr->Btransform_ = Btransform_;
ptr->Ctransform_ = Ctransform_;
ptr->type_ = type_;
return ptr;
}
const char* getPluginType() const TRT_NOEXCEPT override {
return "matmul_int8_dynamic_plugin";
}
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
int initialize() TRT_NOEXCEPT { return 0; }
void terminate() TRT_NOEXCEPT;
nvinfer1::DimsExprs getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT override {
return 0;
}
void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
nvinfer1::IGpuAllocator* gpuAllocator)
TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) TRT_NOEXCEPT override;
nvinfer1::DataType getOutputDataType(
int index, const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
void destroy() TRT_NOEXCEPT override { delete this; }
protected:
bool transB_;
bool transA_;
float alpha_;
void *alpha_scale_{nullptr}, *alpha_one_{nullptr}, *alpha_zero_{nullptr};
cublasLtHandle_t cublas_{nullptr};
nvinfer1::DataType type_;
int8_t *Atransform_{nullptr}, *Btransform_{nullptr}, *Ctransform_{nullptr};
std::string name_space_;
size_t getSerializationSize() const TRT_NOEXCEPT override {
return SerializedSize(transB_) + SerializedSize(transA_) +
SerializedSize(alpha_) + SerializedSize(alpha_scale_) +
SerializedSize(alpha_one_) + SerializedSize(alpha_zero_) +
SerializedSize(Atransform_) + SerializedSize(Btransform_) +
SerializedSize(Ctransform_) + SerializedSize(cublas_) +
SerializedSize(type_);
}
void serialize(void* buffer) const TRT_NOEXCEPT override {
SerializeValue(&buffer, transB_);
SerializeValue(&buffer, transA_);
SerializeValue(&buffer, alpha_);
SerializeValue(&buffer, alpha_scale_);
SerializeValue(&buffer, alpha_one_);
SerializeValue(&buffer, alpha_zero_);
SerializeValue(&buffer, Atransform_);
SerializeValue(&buffer, Btransform_);
SerializeValue(&buffer, Ctransform_);
SerializeValue(&buffer, cublas_);
SerializeValue(&buffer, type_);
}
};
class MatmulPluginDynamicCreator : public nvinfer1::IPluginCreator {
public:
MatmulPluginDynamicCreator() {}
const char* getPluginName() const TRT_NOEXCEPT override {
return "matmul_int8_dynamic_plugin";
}
const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(const char* name,
const nvinfer1::PluginFieldCollection* fc)
TRT_NOEXCEPT override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(
const char* name, void const* serial_data,
size_t serial_length) TRT_NOEXCEPT override {
MatmulPluginDynamic* obj =
new MatmulPluginDynamic(serial_data, serial_length);
obj->setPluginNamespace(name);
return obj;
}
void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const TRT_NOEXCEPT override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(MatmulPluginDynamicCreator);
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
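A sketch of how the creators registered above are exercised when a serialized engine is reloaded; the logger and plan path are placeholders, and the two-argument deserializeCudaEngine assumes a TensorRT 8-style API:

#include <fstream>
#include <iterator>
#include <string>
#include <vector>
#include <NvInfer.h>

class SilentLogger : public nvinfer1::ILogger {
  void log(Severity, const char*) noexcept override {}
};

nvinfer1::ICudaEngine* LoadEngine(const std::string& plan_path) {
  std::ifstream file(plan_path, std::ios::binary);
  std::vector<char> blob((std::istreambuf_iterator<char>(file)),
                         std::istreambuf_iterator<char>());
  static SilentLogger logger;
  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
  // TensorRT looks up MatmulPluginCreator / MatmulPluginDynamicCreator in the
  // registry populated by REGISTER_TRT_PLUGIN_V2 (keyed by plugin type and
  // version "1") and calls deserializePlugin() with the bytes produced by
  // serialize() above.
  return runtime->deserializeCudaEngine(blob.data(), blob.size());
}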
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc)
if (NOT WITH_NV_JETSON)
list(APPEND CUDA_SRCS nvjpeg.cc)
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/cublasLt.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cublasLt_dso_flag;
void *cublasLt_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUBLASLT_BLAS_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cublasLt.h>
#include <cuda.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cublasLt_dso_flag;
extern void *cublasLt_dso_handle;
/**
* The following macro definition generates a struct
* (one per function) that dynamically loads the
* corresponding cublasLt routine via operator overloading.
*
* note: the libraries are dynamically linked by default
*/
#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using cublasLt_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(cublasLt_dso_flag, []() { \
cublasLt_dso_handle = \
paddle::platform::dynload::GetCublasLtDsoHandle(); \
}); \
static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \
return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
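// For illustration: DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(cublasLtCreate)
// expands to a DynLoad__cublasLtCreate functor whose operator() dlopen()s
// libcublasLt.so exactly once (guarded by cublasLt_dso_flag), resolves the
// symbol with dlsym(), and forwards the arguments; the trailing
// "extern DynLoad__cublasLtCreate cublasLtCreate" lets call sites use the
// ordinary cuBLASLt name.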
// APIs available after CUDA 10.1
// #if CUDA_VERSION >= 10100
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasLtCreate); \
__macro(cublasLtDestroy); \
__macro(cublasLtMatmul); \
__macro(cublasLtMatmulDescCreate); \
__macro(cublasLtMatmulDescDestroy); \
__macro(cublasLtMatmulDescSetAttribute); \
__macro(cublasLtMatrixLayoutCreate); \
__macro(cublasLtMatrixLayoutDestroy); \
__macro(cublasLtMatrixLayoutSetAttribute); \
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
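A minimal usage sketch for the wrappers declared above (CUDA build assumed; error handling omitted):

#include "paddle/fluid/platform/dynload/cublasLt.h"

void CublasLtHandleDemo() {
  // Each call goes through the generated DynLoad__ functor, so libcublasLt.so
  // is loaded lazily on first use rather than linked at build time.
  cublasLtHandle_t handle = nullptr;
  paddle::platform::dynload::cublasLtCreate(&handle);
  // ... create cublasLtMatmulDesc_t / cublasLtMatrixLayout_t descriptors and
  // call paddle::platform::dynload::cublasLtMatmul(...) as the new
  // matmul_op_int8_plugin.cu does ...
  paddle::platform::dynload::cublasLtDestroy(handle);
}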
......@@ -30,10 +30,11 @@ DEFINE_string(cudnn_dir, "",
"/usr/local/cudnn/lib. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH");
DEFINE_string(cuda_dir, "",
"Specify path for loading cuda library, such as libcublas, "
"libcurand, libcusolver. For instance, /usr/local/cuda/lib64. "
"If default, dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(
cuda_dir, "",
"Specify path for loading cuda library, such as libcublas, libcublasLt "
"libcurand, libcusolver. For instance, /usr/local/cuda/lib64. "
"If default, dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(nccl_dir, "",
"Specify path for loading nccl library, such as libnccl.so. "
......@@ -308,6 +309,19 @@ void* GetCublasDsoHandle() {
#endif
}
void* GetCublasLtDsoHandle() {
// APIs available after CUDA 10.1
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
#else
std::string warning_msg(
"Your CUDA_VERSION less 10.1, not support CublasLt. "
"If you want to use CublasLt, please upgrade CUDA and rebuild "
"PaddlePaddle.");
return nullptr;
#endif
}
void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
std::string mac_warn_meg(
......
......@@ -26,6 +26,7 @@ namespace dynload {
#endif
void* GetCublasDsoHandle();
void* GetCublasLtDsoHandle();
void* GetCUDNNDsoHandle();
void* GetCUPTIDsoHandle();
void* GetCurandDsoHandle();
......
......@@ -1997,10 +1997,12 @@ function gen_dockerfile() {
DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
DOCKERFILE_CUBLAS_DSO=""
DOCKERFILE_CUBLASLT_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -sf /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so"
DOCKERFILE_CUBLAS_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.${CUDA_MAJOR} /usr/lib/x86_64-linux-gnu/libcublas.so"
DOCKERFILE_CUBLASLT_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublasLt.so /usr/lib/x86_64-linux-gnu/libcublasLt.so"
fi
cat <<EOF
......@@ -2090,6 +2092,7 @@ EOF
ldconfig
${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_CUBLAS_DSO}
${DOCKERFILE_CUBLASLT_DSO}
${DOCKERFILE_GPU_ENV}
EOF
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -77,7 +77,7 @@ class TensorRTMatMulQuantDequantDims3Test(QuantDequantTest):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(
use_gpu, atol=1e-1, flatten=False, rtol=1e-1)
use_gpu, atol=1, flatten=False, rtol=1e-1)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
......@@ -87,7 +87,7 @@ class TensorRTMatMulQuantDequantDims3TransposeXTest(
def set_params(self):
self.transpose_x = True
self.transpose_y = False
self.alpha = 1.0
self.alpha = 2.1
class TensorRTMatMulQuantDequantDims3TransposeYTest(
......@@ -95,7 +95,7 @@ class TensorRTMatMulQuantDequantDims3TransposeYTest(
def set_params(self):
self.transpose_x = False
self.transpose_y = True
self.alpha = 1.0
self.alpha = 3.9
class TensorRTMatMulQuantDequantDims3TransposeXYTest(
......@@ -103,7 +103,7 @@ class TensorRTMatMulQuantDequantDims3TransposeXYTest(
def set_params(self):
self.transpose_x = True
self.transpose_y = True
self.alpha = 1.0
self.alpha = 8.4
class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest):
......@@ -163,7 +163,7 @@ class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(
use_gpu, atol=1e-1, flatten=False, rtol=1e-1)
use_gpu, atol=1, flatten=False, rtol=1e-1)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
......@@ -173,7 +173,7 @@ class TensorRTMatMulQuantDequantDims4TransposeXTest(
def set_params(self):
self.transpose_x = True
self.transpose_y = False
self.alpha = 1.0
self.alpha = 3.2
class TensorRTMatMulQuantDequantDims4TransposeYTest(
......@@ -181,7 +181,7 @@ class TensorRTMatMulQuantDequantDims4TransposeYTest(
def set_params(self):
self.transpose_x = False
self.transpose_y = True
self.alpha = 1.0
self.alpha = 7.5
class TensorRTMatMulQuantDequantDims4TransposeXYTest(
......@@ -189,16 +189,97 @@ class TensorRTMatMulQuantDequantDims4TransposeXYTest(
def set_params(self):
self.transpose_x = True
self.transpose_y = True
self.alpha = 1.0
self.alpha = 11.2
class TensorRTMatMulQuantDequantDims4ScaleTest(
TensorRTMatMulQuantDequantDims4Test):
class TensorRTMatMulQuantDequantDims3DynamicTest(QuantDequantTest):
def setUp(self):
self.set_params()
def network():
self.data = fluid.data(
name='data', shape=[-1, 28, 28], dtype='float32')
self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
matmul_out = fluid.layers.matmul(
x=self.data,
y=self.data,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
alpha=self.alpha)
out = fluid.layers.batch_norm(matmul_out, is_test=True)
fc_out = fluid.layers.fc(input=matmul_out,
size=10,
num_flatten_dims=1,
bias_attr=False,
act=None)
result = fluid.layers.relu(fc_out)
loss = fluid.layers.cross_entropy(input=result, label=self.label)
avg_loss = fluid.layers.mean(loss)
return avg_loss, result
self.main_program.random_seed = 2
self.startup_program.random_seed = 2
self.test_main_program.random_seed = 2
#self.test_startup_program.random_seed = 2
with fluid.unique_name.guard():
with fluid.program_guard(self.main_program, self.startup_program):
self.loss, result = network()
opt = fluid.optimizer.Adam(learning_rate=0.0001)
opt.minimize(self.loss)
with fluid.unique_name.guard():
with fluid.program_guard(self.test_main_program,
self.startup_program):
network()
self.feeds = {"data": np.random.random([3, 28, 28]).astype("float32")}
self.fetch_list = [result]
self.enable_trt = True
self.trt_parameters = TensorRTMatMulQuantDequantDims3DynamicTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Int8, False, False)
self.dynamic_shape_params = TensorRTMatMulQuantDequantDims3DynamicTest.DynamicShapeParam(
{
'data': [1, 28, 28]
}, {'data': [4, 28, 28]}, {'data': [3, 28, 28]}, False)
self.activation_quantize_type = 'moving_average_abs_max'
self.weight_quantize_type = 'channel_wise_abs_max'
def set_params(self):
self.transpose_x = False
self.transpose_y = False
self.alpha = 1.0
def test_check_output(self):
#self.quant_dequant()
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(
use_gpu, atol=1, flatten=False, rtol=1e-1)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
class TensorRTMatMulQuantDequantDims4TransposeXDynamicTest(
TensorRTMatMulQuantDequantDims3DynamicTest):
def set_params(self):
self.transpose_x = True
self.transpose_y = False
self.alpha = 2.0
class TensorRTMatMulQuantDequantDims4TransposeYDynamicTest(
TensorRTMatMulQuantDequantDims3DynamicTest):
def set_params(self):
self.transpose_x = False
self.transpose_y = True
self.alpha = 2.2
class TensorRTMatMulQuantDequantDims4TransposeXYDynamicTest(
TensorRTMatMulQuantDequantDims3DynamicTest):
def set_params(self):
self.transpose_x = True
self.transpose_y = True
self.alpha = 7.8
if __name__ == "__main__":
unittest.main()