Add INT8 support for fused_multi_transformer_op (#45284)

3d7e2118 · RichardWooSJTU · GitHub · 7f346a76 · 3d7e2118 · 3d7e2118
22 changed files
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -165,7 +165,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
        auto var_data_type = var_node->Var()->GetDataType();
        VLOG(5) << "var_name is " << var_name << ", data type is "
                << var_data_type;
-        if (var_data_type == paddle::framework::proto::VarType::FP16) {
+        if (var_data_type == paddle::framework::proto::VarType::FP16 &&
+            t->dtype() != paddle::experimental::DataType::FLOAT16) {
          framework::Tensor half_tensor;
          half_tensor.set_type(paddle::experimental::DataType::FLOAT16);
          half_tensor.Resize(t->dims());

--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -23,6 +23,7 @@ register_operators(
  fused_transformer_op
  fused_feedforward_op
  fused_multi_transformer_op
+  fused_multi_transformer_int8_op
  fused_bias_dropout_residual_layer_norm_op
  resnet_unit_op
  fused_gemm_epilogue_op
@@ -119,6 +120,7 @@ if(WITH_GPU OR WITH_ROCM)
    # fused_attention_op
    op_library(fused_attention_op)
    op_library(fused_multi_transformer_op)
+    op_library(fused_multi_transformer_int8_op)
    op_library(fused_bias_dropout_residual_layer_norm_op)
  endif()
  # resnet_unit needs cudnn 8.0 above

--- a/paddle/fluid/operators/fused/attention_layer_norm.h
+++ b/paddle/fluid/operators/fused/attention_layer_norm.h
@@ -19,7 +19,8 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T>
+// NOTE: T must be the same as OutType in ComputeBackward
+template <typename T, typename InType = T, typename OutType = T>
 class AttnLayerNorm {
 public:
  AttnLayerNorm(const phi::GPUContext& dev_ctx,
@@ -33,17 +34,28 @@ class AttnLayerNorm {

  ~AttnLayerNorm() {}

-  void ComputeForward(const T* x_data,
+  void ComputeForward(const InType* x_data,
                      const LayerNormParamType<T>* scale_data,
                      const LayerNormParamType<T>* bias_data,
-                      T* y_data,
+                      OutType* y_data,
                      LayerNormParamType<T>* mean_data,
-                      LayerNormParamType<T>* var_data) {
+                      LayerNormParamType<T>* var_data,
+                      const float* dequant_out_scale_data = nullptr,
+                      const int quant_out_scale_offset = 0,
+                      const float quant_in_scale = 1.0,
+                      const int quant_round_type = 1,
+                      const float quant_max_bound = 127.0,
+                      const float quant_min_bound = -127.0) {
    auto stream = dev_ctx_.stream();

    switch (GetDesiredBlockDim(feature_size_)) {
      FIXED_BLOCK_DIM_CASE(
-          LayerNormForward<T, LayerNormParamType<T>, kBlockDim>
+          LayerNormForward<T,
+                           LayerNormParamType<T>,
+                           kBlockDim,
+                           false,
+                           InType,
+                           OutType>
          <<<batch_size_, kBlockDim, 0, stream>>>(x_data,
                                                  scale_data,
                                                  bias_data,
@@ -51,7 +63,13 @@ class AttnLayerNorm {
                                                  mean_data,
                                                  var_data,
                                                  epsilon_,
-                                                  feature_size_));
+                                                  feature_size_,
+                                                  dequant_out_scale_data,
+                                                  quant_out_scale_offset,
+                                                  quant_in_scale,
+                                                  quant_round_type,
+                                                  quant_max_bound,
+                                                  quant_min_bound));
      default:
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Feature_size must be larger than 1"));

--- a/paddle/fluid/operators/fused/attn_gemm_int8.h
+++ b/paddle/fluid/operators/fused/attn_gemm_int8.h
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include "paddle/fluid/operators/fused/cublaslt.h"
+#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Runs an int8 x int8 -> int32 GEMM through CublasLtHelper and wraps it with
+// the optional quantize (T -> int8) and dequantize (int32 -> T) steps.
+//
+// The constructor forwards (m, k, n) to CublasLtHelper. The quantize step is
+// launched over m x k elements and the dequantize step over m x n elements.
+// NOTE(review): presumably input is [m, k], weight is [n, k] and output is
+// [m, n] -- confirm against the callers.
+//
+// The object keeps a reference to dev_ctx, so the context must outlive it.
+template <typename T>
+class AttnMatmulINT8 {
+ public:
+  // compute_bias selects whether the entry points that produce T also write
+  // bias_out = output + bias.
+  AttnMatmulINT8(
+      const phi::GPUContext& dev_ctx, int m, int n, int k, bool compute_bias)
+      : dev_ctx_(dev_ctx), m_(m), n_(n), k_(k), compute_bias_(compute_bias) {
+    helpers_.emplace_back(std::make_shared<CublasLtHelper>(m, k, n));
+  }
+  ~AttnMatmulINT8() = default;
+
+  // GEMM whose input and output both have type T.
+  //
+  // Steps, all enqueued on dev_ctx's stream:
+  //   1. quantize input (T) into input_tmp (int8) using quant_in_scale,
+  //   2. int8 GEMM of input_tmp and weight into output_tmp (int32),
+  //   3. dequantize output_tmp into output (T),
+  //   4. if compute_bias was set, bias_out = output + bias.
+  void ComputeForward(const framework::Tensor* weight,
+                      const framework::Tensor* input,
+                      framework::Tensor* input_tmp,
+                      const framework::Tensor* bias,
+                      framework::Tensor* output,
+                      framework::Tensor* output_tmp,
+                      framework::Tensor* bias_out,
+                      const float quant_in_scale,
+                      const framework::Tensor* dequant_out_scale,
+                      const int quant_out_scale_offset,
+                      const int quant_round_type = 1,
+                      const float quant_max_bound = 127.0,
+                      const float quant_min_bound = -127.0) {
+    QuantizeInput(input,
+                  input_tmp,
+                  quant_in_scale,
+                  quant_round_type,
+                  quant_max_bound,
+                  quant_min_bound);
+    // Once the input is int8 the remaining work is exactly the INT8 -> T path.
+    ComputeForwardINT8ToT(weight,
+                          quant_in_scale,
+                          input_tmp,
+                          bias,
+                          output,
+                          output_tmp,
+                          bias_out,
+                          dequant_out_scale,
+                          quant_out_scale_offset);
+  }
+
+  // GEMM whose input is int8 and whose output is the raw int32 accumulator.
+  // No dequantization and no bias add happen here: bias and bias_out are
+  // accepted for signature symmetry but are not read or written.
+  void ComputeForwardINT8ToINT8(const framework::Tensor* weight,
+                                framework::Tensor* input,
+                                const framework::Tensor* bias,
+                                framework::Tensor* output,
+                                framework::Tensor* bias_out) {
+    Gemm(weight, input, output);
+  }
+
+  // GEMM whose input is int8 and whose output has type T.
+  //
+  // The int32 GEMM result is written to output_tmp, dequantized into output,
+  // and, if compute_bias was set, bias_out = output + bias is computed.
+  void ComputeForwardINT8ToT(const framework::Tensor* weight,
+                             const float quant_in_scale,
+                             framework::Tensor* input,
+                             const framework::Tensor* bias,
+                             framework::Tensor* output,
+                             framework::Tensor* output_tmp,
+                             framework::Tensor* bias_out,
+                             const framework::Tensor* dequant_out_scale,
+                             const int quant_out_scale_offset) {
+    Gemm(weight, input, output_tmp);
+    DequantizeAndAddBias(bias,
+                         output,
+                         output_tmp,
+                         bias_out,
+                         quant_in_scale,
+                         dequant_out_scale,
+                         quant_out_scale_offset);
+  }
+
+  // GEMM whose input has type T and whose output is the raw int32
+  // accumulator. The input is quantized into input_tmp first. As in
+  // ComputeForwardINT8ToINT8, bias and bias_out are not used.
+  void ComputeForwardTToINT8(const framework::Tensor* weight,
+                             const float quant_in_scale,
+                             const framework::Tensor* input,
+                             framework::Tensor* input_tmp,
+                             const framework::Tensor* bias,
+                             framework::Tensor* output,
+                             framework::Tensor* bias_out,
+                             const int quant_round_type = 1,
+                             const float quant_max_bound = 127.0,
+                             const float quant_min_bound = -127.0) {
+    QuantizeInput(input,
+                  input_tmp,
+                  quant_in_scale,
+                  quant_round_type,
+                  quant_max_bound,
+                  quant_min_bound);
+    ComputeForwardINT8ToINT8(weight, input_tmp, bias, output, bias_out);
+  }
+
+ private:
+  // Launches the T -> int8 quantize kernel over m_ x k_ elements.
+  void QuantizeInput(const framework::Tensor* input,
+                     framework::Tensor* input_tmp,
+                     const float quant_in_scale,
+                     const int quant_round_type,
+                     const float quant_max_bound,
+                     const float quant_min_bound) {
+    quantize_kernel_launcher<T>(input->data<T>(),
+                                input_tmp->data<int8_t>(),
+                                quant_in_scale,
+                                m_,
+                                k_,
+                                quant_round_type,
+                                quant_max_bound,
+                                quant_min_bound,
+                                dev_ctx_.stream());
+  }
+
+  // Launches the int8 GEMM; out_int32 receives the int32 accumulator.
+  void Gemm(const framework::Tensor* weight,
+            framework::Tensor* input_int8,
+            framework::Tensor* out_int32) {
+    helpers_[0]->GEMM(input_int8->data<int8_t>(),
+                      weight->data<int8_t>(),
+                      out_int32->data<int32_t>(),
+                      dev_ctx_.stream());
+  }
+
+  // Dequantizes output_tmp (int32) into output (T) over m_ x n_ elements and,
+  // when compute_bias_ is set, writes bias_out = output + bias.
+  void DequantizeAndAddBias(const framework::Tensor* bias,
+                            framework::Tensor* output,
+                            framework::Tensor* output_tmp,
+                            framework::Tensor* bias_out,
+                            const float quant_in_scale,
+                            const framework::Tensor* dequant_out_scale,
+                            const int quant_out_scale_offset) {
+    dequantize_kernel_launcher<T>(output_tmp->data<int32_t>(),
+                                  output->data<T>(),
+                                  m_,
+                                  n_,
+                                  dev_ctx_.stream(),
+                                  quant_in_scale,
+                                  dequant_out_scale->data<float>(),
+                                  quant_out_scale_offset);
+
+    if (!compute_bias_) {
+      return;
+    }
+
+    // bias_out = output + bias
+    std::vector<const framework::Tensor*> ins = {output, bias};
+    std::vector<framework::Tensor*> outs = {bias_out};
+    phi::funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
+        dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor<T>());
+    // cudaGetLastError() reports any pending error on the device, so a
+    // failure here may originate from an earlier launch, as the message says.
+    PADDLE_ENFORCE_EQ(cudaGetLastError(),
+                      cudaSuccess,
+                      platform::errors::Fatal(
+                          "cuda error occured after computing bias. "
+                          "But it does not mean this error is caused by "
+                          "bias computing"));
+  }
+
+  const phi::GPUContext& dev_ctx_;
+
+  int m_;  // m
+  int n_;  // n
+  int k_;  // k
+
+  bool compute_bias_;  // add bias after dequantization when true
+  std::vector<std::shared_ptr<CublasLtHelper>> helpers_;
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/fused/cublaslt.h
+++ b/paddle/fluid/operators/fused/cublaslt.h
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/dynload/cublasLt.h"
+
+namespace dyl = paddle::platform::dynload;
+
+namespace paddle {
+namespace operators {
+// Owns a cuBLASLt handle and the descriptors for one fixed-shape
+// int8 x int8 -> int32 matrix multiplication.
+//
+// The shape (m, k, n) is fixed at construction. cublasLtMatrixLayoutCreate
+// takes (type, rows, cols, leading dimension), so the layouts created here
+// are:
+//   B_desc_: int8,  k x n, ld = k   (passed first, with TRANSA = T)
+//   A_desc_: int8,  k x m, ld = k
+//   C_desc_: int32, n x m, ld = n
+// NOTE(review): with cuBLASLt's default column-major order this presumably
+// matches row-major A[m, k], B[n, k] and C[m, n] = A * B^T -- confirm.
+//
+// The class owns raw library handles, so it is not copyable.
+class CublasLtHelper {
+ public:
+  // Creates the handle, the matmul descriptor and the three matrix layouts.
+  // Throws through PADDLE_ENFORCE_EQ if any cuBLASLt call fails; whatever was
+  // created before the failure is released first.
+  CublasLtHelper(int m, int k, int n)
+      : alpha_(1), beta_(0), m_(m), k_(k), n_(n) {
+    try {
+      Init(m, k, n);
+    } catch (...) {
+      // The destructor does not run for a partially constructed object, so
+      // release explicitly before propagating the error.
+      Release();
+      throw;
+    }
+  }
+
+  CublasLtHelper(const CublasLtHelper&) = delete;
+  CublasLtHelper& operator=(const CublasLtHelper&) = delete;
+
+  ~CublasLtHelper() { Release(); }
+
+  // Enqueues C = alpha * op(B) * A + beta * C on `stream`, with alpha = 1 and
+  // beta = 0, using the descriptors built in the constructor.
+  //
+  // A_dev and B_dev are int8 device buffers that are only read; C_dev is the
+  // int32 device buffer that receives the result. No workspace is supplied.
+  void GEMM(const int8_t* A_dev,
+            const int8_t* B_dev,
+            int32_t* C_dev,
+            cudaStream_t stream) {
+    // NOTE(review): __CUDA_ARCH__ is documented as defined only during device
+    // compilation. In this host function the guarded sections are therefore
+    // presumably never compiled in and cublasLtMatmul receives a null
+    // algorithm. Confirm, and if the hand-picked algorithm is wanted, select
+    // it from the device's compute capability at run time instead.
+#if __CUDA_ARCH__ >= 800 && CUDA_VERSION >= 11020
+    cublasLtMatmulAlgo_t algo;
+    int algoId = 21;
+    int swizzle = 0;
+    int customOption = 0;
+    int tile = 15;
+    int splitK_val = 0;
+    int reductionScheme = 0;
+#if CUDA_VERSION >= 11000
+    int stages = 23;
+#endif
+
+#if CUBLAS_VER_MAJOR < 11
+    cudaDataType_t cudaComputeType = CUDA_R_32I;
+#else
+    cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I;
+#endif
+
+    dyl::cublasLtMatmulAlgoInit(handle_,
+                                cudaComputeType,
+                                CUDA_R_32I,
+                                CUDA_R_8I,
+                                CUDA_R_8I,
+                                CUDA_R_32I,
+                                CUDA_R_32I,
+                                algoId,
+                                &algo);
+    dyl::cublasLtMatmulAlgoConfigSetAttribute(
+        &algo,
+        CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION,
+        &(customOption),
+        sizeof(customOption));
+    dyl::cublasLtMatmulAlgoConfigSetAttribute(
+        &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile));
+    dyl::cublasLtMatmulAlgoConfigSetAttribute(&algo,
+                                              CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
+                                              &(splitK_val),
+                                              sizeof(splitK_val));
+    dyl::cublasLtMatmulAlgoConfigSetAttribute(
+        &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
+    dyl::cublasLtMatmulAlgoConfigSetAttribute(
+        &algo,
+        CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
+        &(reductionScheme),
+        sizeof(int));
+#if CUDA_VERSION >= 11000
+    dyl::cublasLtMatmulAlgoConfigSetAttribute(
+        &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
+#endif
+#endif
+    // The weight buffer (B_dev) is passed as the first operand and the
+    // activation buffer (A_dev) as the second; C_dev is both the C input and
+    // the D output of the call.
+    const cublasStatus_t status = dyl::cublasLtMatmul(handle_,
+                                                      matmul_desc_,
+                                                      &alpha_,
+                                                      B_dev,
+                                                      B_desc_,
+                                                      A_dev,
+                                                      A_desc_,
+                                                      &beta_,
+                                                      C_dev,
+                                                      C_desc_,
+                                                      C_dev,
+                                                      C_desc_,
+#if __CUDA_ARCH__ >= 800 && CUDA_VERSION >= 11020
+                                                      &algo,
+#else
+                                                      nullptr,
+#endif
+                                                      nullptr,
+                                                      0,
+                                                      stream);
+    PADDLE_ENFORCE_EQ(
+        status,
+        CUBLAS_STATUS_SUCCESS,
+        platform::errors::External(
+            "cublasLtMatmul execution error, "
+            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
+            "information"));
+  }
+
+ private:
+  // Creates every cuBLASLt object this helper owns. May throw part-way; the
+  // constructor then calls Release().
+  void Init(int m, int k, int n) {
+    // handle
+    cublasStatus_t status = dyl::cublasLtCreate(&handle_);
+    PADDLE_ENFORCE_EQ(
+        status,
+        CUBLAS_STATUS_SUCCESS,
+        platform::errors::External(
+            "cublasLtCreate execution error, "
+            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
+            "information"));
+
+    // matmul desc: int32 compute; cuBLAS 11+ also takes the scale type.
+#if CUBLAS_VER_MAJOR < 11
+    cudaDataType_t cudaComputeType = CUDA_R_32I;
+    status = dyl::cublasLtMatmulDescCreate(&matmul_desc_, cudaComputeType);
+#else
+    cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I;
+    status = dyl::cublasLtMatmulDescCreate(
+        &matmul_desc_, cudaComputeType, CUDA_R_32I);
+#endif
+    PADDLE_ENFORCE_EQ(
+        status,
+        CUBLAS_STATUS_SUCCESS,
+        platform::errors::External(
+            "cublasLtMatmulDescCreate execution error, "
+            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
+            "information"));
+
+    // The first operand handed to cublasLtMatmul is used transposed.
+    cublasOperation_t op_transpose = CUBLAS_OP_T;
+    status = dyl::cublasLtMatmulDescSetAttribute(matmul_desc_,
+                                                 CUBLASLT_MATMUL_DESC_TRANSA,
+                                                 &op_transpose,
+                                                 sizeof(op_transpose));
+    PADDLE_ENFORCE_EQ(
+        status,
+        CUBLAS_STATUS_SUCCESS,
+        platform::errors::External(
+            "cublasLtMatmulDescSetAttribute execution error, "
+            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
+            "information"));
+
+    // matrix desc
+    status = dyl::cublasLtMatrixLayoutCreate(&B_desc_, CUDA_R_8I, k, n, k);
+    PADDLE_ENFORCE_EQ(
+        status,
+        CUBLAS_STATUS_SUCCESS,
+        platform::errors::External(
+            "cublasLtMatrixLayoutCreate execution error, "
+            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
+            "information"));
+
+    status = dyl::cublasLtMatrixLayoutCreate(&A_desc_, CUDA_R_8I, k, m, k);
+    PADDLE_ENFORCE_EQ(
+        status,
+        CUBLAS_STATUS_SUCCESS,
+        platform::errors::External(
+            "cublasLtMatrixLayoutCreate execution error, "
+            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
+            "information"));
+
+    status = dyl::cublasLtMatrixLayoutCreate(&C_desc_, CUDA_R_32I, n, m, n);
+    PADDLE_ENFORCE_EQ(
+        status,
+        CUBLAS_STATUS_SUCCESS,
+        platform::errors::External(
+            "cublasLtMatrixLayoutCreate execution error, "
+            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
+            "information"));
+  }
+
+  // Destroys whatever has been created so far, in reverse creation order, and
+  // resets the members so a second call is a no-op.
+  void Release() {
+    if (C_desc_) dyl::cublasLtMatrixLayoutDestroy(C_desc_);
+    if (A_desc_) dyl::cublasLtMatrixLayoutDestroy(A_desc_);
+    if (B_desc_) dyl::cublasLtMatrixLayoutDestroy(B_desc_);
+    if (matmul_desc_) dyl::cublasLtMatmulDescDestroy(matmul_desc_);
+    if (handle_) dyl::cublasLtDestroy(handle_);
+    C_desc_ = nullptr;
+    A_desc_ = nullptr;
+    B_desc_ = nullptr;
+    matmul_desc_ = nullptr;
+    handle_ = nullptr;
+  }
+
+  // Null until created, so Release() can tell what exists.
+  cublasLtHandle_t handle_ = nullptr;
+  cublasLtMatmulDesc_t matmul_desc_ = nullptr;
+  cublasLtMatrixLayout_t A_desc_ = nullptr;
+  cublasLtMatrixLayout_t B_desc_ = nullptr;
+  cublasLtMatrixLayout_t C_desc_ = nullptr;
+  // Scaling factors passed to cublasLtMatmul; int32 to match the scale type.
+  int32_t alpha_;
+  int32_t beta_;
+
+  int m_;
+  int k_;
+  int n_;
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h
+++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h
@@ -60,8 +60,14 @@ struct GeluGradFunctor {
 * the src, mask and dst shape is (rows, cols)
 * the bias shape is (1, cols)
 */
-template <typename T, typename MaskType, int VecSize, typename Functor>
-__global__ void FusedDropoutActBias(Functor act,
+template <typename T,
+          typename MaskType,
+          int VecSize,
+          typename Functor,
+          typename InType = T,
+          typename OutType = T>
+__global__ void FusedDropoutActBias(
+    Functor act,
    const uint64_t seed,
    const uint64_t rows,
    const uint64_t cols,
@@ -69,10 +75,17 @@ __global__ void FusedDropoutActBias(Functor act,
    const float dropout_prob,
    const bool is_upscale_in_train,
    const bool is_test,
-                                    const T *__restrict__ src,
+    const InType *__restrict__ src,
    const T *__restrict__ bias,
-                                    T *dst,
-                                    MaskType *mask) {
+    OutType *dst,
+    MaskType *mask,
+    const float quant_last_in_scale = 1.0,
+    const float *dequant_out_scale_data = nullptr,
+    const int quant_out_scale_offset = 0,
+    const float quant_next_in_scale = 1.0,
+    const int quant_round_type = 1,
+    const float quant_max_bound = 127.0,
+    const float quant_min_bound = -127.0) {
  int col_id = blockDim.x * blockIdx.x + threadIdx.x;
  int row_id = blockIdx.y;
  int idx = row_id * cols + col_id;
@@ -90,7 +103,9 @@ __global__ void FusedDropoutActBias(Functor act,
                                        VecSize,
                                        false,
                                        true,
-                                        Functor>(r,
+                                        Functor,
+                                        InType,
+                                        OutType>(r,
                                                 i,
                                                 cols,
                                                 &state,
@@ -104,7 +119,14 @@ __global__ void FusedDropoutActBias(Functor act,
                                                 is_test,
                                                 nullptr,
                                                 nullptr,
-                                                 act);
+                                                 act,
+                                                 quant_last_in_scale,
+                                                 dequant_out_scale_data,
+                                                 quant_out_scale_offset,
+                                                 quant_next_in_scale,
+                                                 quant_round_type,
+                                                 quant_max_bound,
+                                                 quant_min_bound);
    }
  }
 }
@@ -112,7 +134,11 @@ __global__ void FusedDropoutActBias(Functor act,
 /**
 * @brief dst = dropout(activation(src + bias));
 */
-template <typename T, typename MaskType, typename Functor>
+template <typename T,
+          typename MaskType,
+          typename Functor,
+          typename InType = T,
+          typename OutType = T>
 void LaunchDropoutActBias(Functor act_functor,
                          const uint64_t seed,
                          const uint32_t rows,
@@ -121,14 +147,21 @@ void LaunchDropoutActBias(Functor act_functor,
                          const float dropout_prob,
                          const bool is_upscale_in_train,
                          const bool is_test,
-                          const T *src,
+                          const InType *src,
                          const T *bias,
-                          T *dst,
+                          OutType *dst,
                          MaskType *mask_data,
-                          const phi::GPUContext &ctx) {
+                          const phi::GPUContext &ctx,
+                          const float quant_last_in_scale = 1.0,
+                          const float *dequant_out_scale_data = nullptr,
+                          const int quant_out_scale_offset = 0,
+                          const float quant_next_in_scale = 1.0,
+                          const int quant_round_type = 1,
+                          const float quant_max_bound = 127.0,
+                          const float quant_min_bound = -127.0) {
  // dropout_prob == 1.0f
  if (std::abs(dropout_prob - 1.0f) < 1e-5) {
-    SetZero<T>(ctx, dst, rows * cols);
+    SetZero<T>(ctx, reinterpret_cast<T *>(dst), rows * cols);
    SetZero<MaskType>(ctx, mask_data, rows * cols);
    return;
  }
@@ -137,7 +170,7 @@ void LaunchDropoutActBias(Functor act_functor,
  const int real_vec_size = cols % VecSize == 0 ? VecSize : 1;
  const auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size);
  if (cols % VecSize == 0) {
-    FusedDropoutActBias<T, MaskType, VecSize, Functor>
+    FusedDropoutActBias<T, MaskType, VecSize, Functor, InType, OutType>
        <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
            act_functor,
            seed,
@@ -150,9 +183,13 @@ void LaunchDropoutActBias(Functor act_functor,
            src,
            bias,
            dst,
-            mask_data);
+            mask_data,
+            quant_last_in_scale,
+            dequant_out_scale_data,
+            quant_out_scale_offset,
+            quant_next_in_scale);
  } else {
-    FusedDropoutActBias<T, MaskType, 1, Functor>
+    FusedDropoutActBias<T, MaskType, 1, Functor, InType, OutType>
        <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
            act_functor,
            seed,
@@ -165,7 +202,11 @@ void LaunchDropoutActBias(Functor act_functor,
            src,
            bias,
            dst,
-            mask_data);
+            mask_data,
+            quant_last_in_scale,
+            dequant_out_scale_data,
+            quant_out_scale_offset,
+            quant_next_in_scale);
  }
 }


--- a/paddle/fluid/operators/fused/fused_dropout_common.h
+++ b/paddle/fluid/operators/fused/fused_dropout_common.h
@@ -20,6 +20,7 @@ limitations under the License. */

 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
 #include "paddle/fluid/operators/layer_norm_kernel.cu.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"

--- a/paddle/fluid/operators/fused/fused_dropout_helper.h
+++ b/paddle/fluid/operators/fused/fused_dropout_helper.h
@@ -109,7 +109,10 @@ struct DropoutParam {
  }
 };

-template <typename T, typename MaskType>
+template <typename T,
+          typename MaskType,
+          typename InType = T,
+          typename OutType = T>
 class FusedDropoutHelper {
 private:
  int GetIncrement(const phi::GPUContext& ctx) {
@@ -140,13 +143,18 @@ class FusedDropoutHelper {

  // out = residual + dropout( src + bias )
  void ResidualDropoutBias(const phi::GPUContext& ctx,
-                           const T* src,
+                           const InType* src,
                           const T* residual,
                           const T* bias,
-                           T* out,
-                           MaskType* mask) {
+                           OutType* out,
+                           MaskType* mask,
+                           const float quant_last_in_scale = 1.0,
+                           const float* dequant_out_scale_data = nullptr,
+                           const int quant_out_scale_offset = 0,
+                           const float quant_next_in_scale = 1.0) {
    auto increment = GetIncrement(ctx);
-    LaunchResidualDropoutBias<T, MaskType>(rows_,
+    LaunchResidualDropoutBias<T, MaskType, InType, OutType>(
+        rows_,
        cols_,
        increment,
        dropout_param_.seed,
@@ -158,7 +166,11 @@ class FusedDropoutHelper {
        bias,
        mask,
        out,
-                                           ctx);
+        ctx,
+        quant_last_in_scale,
+        dequant_out_scale_data,
+        quant_out_scale_offset,
+        quant_next_in_scale);
  }

  void ResidualDropoutBiasGrad(const phi::GPUContext& ctx,
@@ -189,15 +201,22 @@ class FusedDropoutHelper {

  // out = dropout(activation(src + bias))
  void DropoutActBias(const phi::GPUContext& ctx,
-                      const T* src,
+                      const InType* src,
                      const T* bias,
                      const std::string& act_method,
-                      T* out,
-                      MaskType* mask) {
+                      OutType* out,
+                      MaskType* mask,
+                      const float quant_last_in_scale = 1.0,
+                      const float* dequant_out_scale_data = nullptr,
+                      const int quant_out_scale_offset = 0,
+                      const float quant_next_in_scale = 1.0,
+                      const int quant_round_type = 1,
+                      const float quant_max_bound = 127.0,
+                      const float quant_min_bound = -127.0) {
    auto increment = GetIncrement(ctx);
    if (act_method == "gelu") {
      GeluFunctor<T> gelu;
-      LaunchDropoutActBias<T, MaskType, GeluFunctor<T>>(
+      LaunchDropoutActBias<T, MaskType, GeluFunctor<T>, InType, OutType>(
          gelu,
          dropout_param_.seed,
          rows_,
@@ -210,11 +229,21 @@ class FusedDropoutHelper {
          bias,
          out,
          mask,
-          ctx);
+          ctx,
+          quant_last_in_scale,
+          dequant_out_scale_data,
+          quant_out_scale_offset,
+          quant_next_in_scale,
+          quant_round_type,
+          quant_max_bound,
+          quant_min_bound);
    } else if (act_method == "relu") {
      phi::funcs::ReluFunctor<T> relu;
-      LaunchDropoutActBias<T, MaskType, phi::funcs::ReluFunctor<T>>(
-          relu,
+      LaunchDropoutActBias<T,
+                           MaskType,
+                           phi::funcs::ReluFunctor<T>,
+                           InType,
+                           OutType>(relu,
                                    dropout_param_.seed,
                                    rows_,
                                    cols_,
@@ -226,7 +255,14 @@ class FusedDropoutHelper {
                                    bias,
                                    out,
                                    mask,
-          ctx);
+                                    ctx,
+                                    quant_last_in_scale,
+                                    dequant_out_scale_data,
+                                    quant_out_scale_offset,
+                                    quant_next_in_scale,
+                                    quant_round_type,
+                                    quant_max_bound,
+                                    quant_min_bound);
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Currently only supports gelu or relu activation functions!"));
@@ -283,8 +319,12 @@ class FusedDropoutHelper {
  DropoutParam dropout_param_;
 };

-template <typename T, typename MaskType>
-class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
+template <typename T,
+          typename MaskType,
+          typename InType = T,
+          typename OutType = T>
+class FusedDropoutLayerNormHelper
+    : public FusedDropoutHelper<T, MaskType, InType, OutType> {
 public:
  FusedDropoutLayerNormHelper() {}
  FusedDropoutLayerNormHelper(const int rows,
@@ -301,23 +341,24 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
                              const int cols,
                              const DropoutParam& dropout_param,
                              const float epsilon)
-      : FusedDropoutHelper<T, MaskType>(ctx, rows, cols, dropout_param) {
+      : FusedDropoutHelper<T, MaskType, InType, OutType>(
+            ctx, rows, cols, dropout_param) {
    using U = LayerNormParamType<T>;
    epsilon_ = epsilon;
  }

  // call layer_norm
  void LayerNorm(const phi::GPUContext& ctx,
-                 const T* src,
+                 const InType* src,
                 const LayerNormParamType<T>* gamma,
                 const LayerNormParamType<T>* beta,
-                 T* out,
+                 OutType* out,
                 LayerNormParamType<T>* mean,
                 LayerNormParamType<T>* variance) {
    using U = LayerNormParamType<T>;
    switch (GetDesiredBlockDim(this->cols_)) {
      FIXED_BLOCK_DIM_CASE(
-          LayerNormForward<T, U, kBlockDim>
+          LayerNormForward<T, U, kBlockDim, false, InType, OutType>
          <<<this->rows_, kBlockDim, 0, ctx.stream()>>>(
              src, gamma, beta, out, mean, variance, epsilon_, this->cols_));
    }
@@ -349,17 +390,25 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {

  // out = layernorm(residual + dropout(src + bias))
  template <typename P = LayerNormParamType<T>, bool is_same_type = false>
-  void LayernormResidualDropoutBias(const phi::GPUContext& ctx,
-                                    const T* src,
+  void LayernormResidualDropoutBias(
+      const phi::GPUContext& ctx,
+      const InType* src,
      const T* residual,
      const T* bias,
      const P* gamma,
      const P* beta,
      T* dropout_out,
      MaskType* mask,
-                                    T* out,
+      OutType* out,
      LayerNormParamType<T>* mean,
-                                    LayerNormParamType<T>* variance) {
+      LayerNormParamType<T>* variance,
+      const float quant_last_in_scale = 1.0,
+      const float* dequant_out_scale_data = nullptr,
+      const int quant_out_scale_offset = 0,
+      const float quant_next_in_scale = 1.0,
+      const int quant_round_type = 1,
+      const float quant_max_bound = 127.0,
+      const float quant_min_bound = -127.0) {
    using U = LayerNormParamType<T>;
    int vec_size = MAX_CACHE_BYTES / sizeof(T);
    if (this->cols_ % vec_size != 0) {
@@ -368,7 +417,12 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
    int threads = GetDesiredBlockDim(this->cols_ / vec_size);
    int increment = ((this->cols_ - 1) / (threads * vec_size) + 1) * vec_size;
    increment = this->dropout_param_.UpdateSeedAndIncrement(ctx, increment);
-    LaunchLayernormResidualDropoutBias<T, MaskType, U, is_same_type>(
+    LaunchLayernormResidualDropoutBias<T,
+                                       MaskType,
+                                       U,
+                                       is_same_type,
+                                       InType,
+                                       OutType>(
        this->rows_,
        this->cols_,
        increment,
@@ -387,7 +441,14 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
        out,
        mean,
        variance,
-        ctx);
+        ctx,
+        quant_last_in_scale,
+        dequant_out_scale_data,
+        quant_out_scale_offset,
+        quant_next_in_scale,
+        quant_round_type,
+        quant_max_bound,
+        quant_min_bound);
  }

  template <typename P = LayerNormParamType<T>, bool is_same_type = false>

--- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
+++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
@@ -418,7 +418,9 @@ template <typename T,
          int THREADS_PER_CTA = WARPS_M *THREADS_PER_ROW,
          int ROWS_PER_CTA = WARPS_M,
          int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize,
-          int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA>
+          int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA,
+          typename InType = T,
+          typename OutType = T>
 __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
    int rows,
    int cols,
@@ -428,7 +430,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
    const bool is_test,
    const uint64_t increment,
    const float epsilon,
-    const T *__restrict__ x_ptr,
+    const InType *__restrict__ x_ptr,
    const T *__restrict__ residual_ptr,
    const T *__restrict__ bias_ptr,
    const ScaleT *__restrict__ gamma_ptr,
@@ -437,10 +439,20 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
    U *__restrict__ mean_out_ptr,
    U *__restrict__ var_out_ptr,
    T *__restrict__ residual_out_ptr,
-    T *__restrict__ y_ptr) {
+    OutType *__restrict__ y_ptr,
+    const float quant_last_in_scale = 1.0,
+    const float *__restrict__ quant_out_scale_ptr = nullptr,
+    const int quant_out_scale_offset = 0,
+    const float quant_next_in_scale = 1.0,
+    const int quant_round_type = 1,
+    const float quant_max_bound = 127.0,
+    const float quant_min_bound = -127.0) {
  __shared__ U smem[WARPS_M * WARPS_N];
  using Vec = phi::AlignedVector<T, VecSize>;
  using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
+  using Vec_in_type = phi::AlignedVector<InType, VecSize>;
+  using Vec_out_type = phi::AlignedVector<OutType, VecSize>;
+  using Vec_float = phi::AlignedVector<float, VecSize>;
  using MaskStoreT = phi::AlignedVector<MaskType, VecSize>;

  const int tidx = threadIdx.x;
@@ -481,12 +493,21 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
  constexpr U rn = 1.f / U(ELTS_PER_ROW);
  for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) {
    Vec x[LDGS];
+    Vec_in_type x_input[LDGS];
    Vec residual[LDGS];
+    Vec_float dequant_out_scale[LDGS];
+
 #pragma unroll
    for (int it = 0, col = c; it < LDGS; it++) {
-      phi::Load<T, VecSize>(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]);
      phi::Load<T, VecSize>(residual_ptr + row * ELTS_PER_ROW + col * VecSize,
                            &residual[it]);
+      phi::Load<InType, VecSize>(x_ptr + row * ELTS_PER_ROW + col * VecSize,
+                                 &x_input[it]);
+      if (quant_out_scale_ptr != nullptr) {
+        phi::Load<float, VecSize>(
+            quant_out_scale_ptr + quant_out_scale_offset + col * VecSize,
+            &dequant_out_scale[it]);
+      }
      col += THREADS_PER_ROW;
    }

@@ -520,20 +541,42 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
 #pragma unroll
        for (int jt = 0; jt < VecSize; jt++) {
          // dropout(x) + residual
-          x[it][jt] = (x[it][jt] + bias[it][jt]) *
+          if (std::is_same<InType, int32_t>::value) {
+            T tmp = (static_cast<T>(static_cast<float>(x_input[it][jt]) *
+                                    quant_last_in_scale /
+                                    dequant_out_scale[it][jt]) +
+                     bias[it][jt]) *
+                        static_cast<T>(mask_vec[it][jt]) * factor +
+                    residual[it][jt];
+            x[it][jt] = tmp;
+            xf[it * VecSize + jt] = U(tmp);
+          } else {
+            x[it][jt] = (static_cast<T>(x_input[it][jt]) + bias[it][jt]) *
                            static_cast<T>(mask_vec[it][jt]) * factor +
                        residual[it][jt];
            xf[it * VecSize + jt] = U(x[it][jt]);
          }
        }
+      }
    } else {
 #pragma unroll
      for (int it = 0; it < LDGS; it++) {
 #pragma unroll
        for (int jt = 0; jt < VecSize; jt++) {
          // dropout(x) + residual
-          x[it][jt] = x[it][jt] * static_cast<T>(mask_vec[it][jt]) * factor +
+          if (std::is_same<InType, int32_t>::value) {
+            // for int32 input, we need to dequantize.
+            T tmp = static_cast<T>(static_cast<float>(x_input[it][jt]) *
+                                   quant_last_in_scale /
+                                   dequant_out_scale[it][jt]) *
+                        static_cast<T>(mask_vec[it][jt]) * factor +
                    residual[it][jt];
+            x[it][jt] = tmp;
+          } else {
+            x[it][jt] = static_cast<T>(x_input[it][jt]) *
+                            static_cast<T>(mask_vec[it][jt]) * factor +
+                        residual[it][jt];
+          }
          xf[it * VecSize + jt] = U(x[it][jt]);
        }
      }
@@ -626,6 +669,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
      var_out_ptr[row] = var_local * rn;
    }

+    Vec_out_type x_output[LDGS];
+
 #pragma unroll
    for (int it = 0; it < LDGS; it++) {
 #pragma unroll
@@ -638,12 +683,26 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
        U tmp = rsigma * (static_cast<U>(xf[it * VecSize + jt]) - mu_local);
        x[it][jt] = static_cast<T>(static_cast<U>(gamma[it][jt]) * tmp +
                                   static_cast<U>(beta[it][jt]));
+
+        if (std::is_same<OutType, int8_t>::value)
+          x_output[it][jt] = quant_helper(x[it][jt],
+                                          quant_next_in_scale,
+                                          quant_round_type,
+                                          quant_max_bound,
+                                          quant_min_bound);
      }
    }

 #pragma unroll
    for (int it = 0, col = c; it < LDGS; it++) {
-      phi::Store<T, VecSize>(x[it], y_ptr + row * ELTS_PER_ROW + col * VecSize);
+      if (std::is_same<OutType, int8_t>::value) {
+        phi::Store<OutType, VecSize>(
+            x_output[it], y_ptr + row * ELTS_PER_ROW + col * VecSize);
+      } else {
+        phi::Store<T, VecSize>(
+            x[it],
+            reinterpret_cast<T *>(y_ptr) + row * ELTS_PER_ROW + col * VecSize);
+      }
      col += THREADS_PER_ROW;
    }
  }
@@ -668,7 +727,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
 template <typename T,
          typename MaskType,
          typename U,
-          bool ScaleBiasWithSameTypeX = false>
+          bool ScaleBiasWithSameTypeX = false,
+          typename InType = T,
+          typename OutType = T>
 void LaunchLayernormResidualDropoutBias(
    const uint32_t rows,
    const uint32_t cols,
@@ -678,18 +739,26 @@ void LaunchLayernormResidualDropoutBias(
    const float epsilon,
    const bool is_upscale_in_train,
    const bool is_test,
-    const T *src,
+    const InType *src,
    const T *residual,
    const T *bias,
    const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *scale,
    const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *layernorm_bias,
    MaskType *mask_data,
    T *dst,
-    T *layernorm_dst,
+    OutType *layernorm_dst,
    LayerNormParamType<T> *mean,
    LayerNormParamType<T> *var,
-    const phi::GPUContext &ctx) {
+    const phi::GPUContext &ctx,
+    const float quant_last_in_scale = 1.0,
+    const float *dequant_out_scale_data = nullptr,
+    const int quant_out_scale_offset = 0,
+    const float quant_next_in_scale = 1.0,
+    const int quant_round_type = 1,
+    const float quant_max_bound = 127.0,
+    const float quant_min_bound = -127.0) {
  // dropout_prob == 1.0f
+  // NOTE(minghaoBD): OutType should be T if drop_out_rate == 1.0
  if (std::abs(dropout_prob - 1.0f) < 1e-5) {
    auto cuda_place = ctx.GetPlace();
    memory::Copy(cuda_place,
@@ -705,10 +774,11 @@ void LaunchLayernormResidualDropoutBias(
    switch (GetDesiredBlockDim(cols)) {
      FIXED_BLOCK_DIM_CASE(
          LayerNormForward<T, U, kBlockDim, ScaleBiasWithSameTypeX>
-          <<<rows, kBlockDim, 0, ctx.stream()>>>(dst,
+          <<<rows, kBlockDim, 0, ctx.stream()>>>(
+              dst,
              scale,
              layernorm_bias,
-                                                 layernorm_dst,
+              reinterpret_cast<T *>(layernorm_dst),
              mean,
              var,
              epsilon,
@@ -731,6 +801,9 @@ void LaunchLayernormResidualDropoutBias(
    const int VecSize = BYTES_PER_LDG / sizeof(T);                            \
    const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M;         \
    const int ROWS_PER_CTA = WARPS_M;                                         \
+    const int THREADS_PER_ROW = WARPS_N * THREADS_PER_WARP;                   \
+    const int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW * VecSize;               \
+    const int LDGS = cols / ELTS_PER_ROW_PER_CTA;                             \
    const int grid =                                                          \
        static_cast<int>(std::ceil(rows / static_cast<float>(ROWS_PER_CTA))); \
    fused_fast_ln_fwd_kernel<                                                 \
@@ -742,7 +815,16 @@ void LaunchLayernormResidualDropoutBias(
        WARPS_M,                                                              \
        WARPS_N,                                                              \
        BYTES_PER_LDG,                                                        \
-        cols><<<grid, THREADS_PER_CTA, 0, ctx.stream()>>>(rows,                \
+        cols,                                                                 \
+        THREADS_PER_WARP,                                                     \
+        THREADS_PER_ROW,                                                      \
+        THREADS_PER_CTA,                                                      \
+        ROWS_PER_CTA,                                                         \
+        ELTS_PER_ROW_PER_CTA,                                                 \
+        LDGS,                                                                 \
+        InType,                                                               \
+        OutType>                                                              \
+        <<<grid, THREADS_PER_CTA, 0, ctx.stream()>>>(rows,                    \
                                                     cols,                    \
                                                     seed,                    \
                                                     dropout_prob,            \
@@ -759,7 +841,14 @@ void LaunchLayernormResidualDropoutBias(
                                                     mean,                    \
                                                     var,                     \
                                                     dst,                     \
-                                                          layernorm_dst);      \
+                                                     layernorm_dst,           \
+                                                     quant_last_in_scale,     \
+                                                     dequant_out_scale_data,  \
+                                                     quant_out_scale_offset,  \
+                                                     quant_next_in_scale,     \
+                                                     quant_round_type,        \
+                                                     quant_max_bound,         \
+                                                     quant_min_bound);        \
  } break

 #define LAUNCH_FUSED_FAST_LN_KERNEL       \
@@ -784,7 +873,8 @@ void LaunchLayernormResidualDropoutBias(
  if (cols % VecSize != 0) {
    int blockDim = GetDesiredBlockDim(cols);
    FusedLayernormResidualDropoutBias<T, uint8_t, 1, U, ScaleBiasWithSameTypeX>
-        <<<rows, blockDim, 0, ctx.stream()>>>(rows,
+        <<<rows, blockDim, 0, ctx.stream()>>>(
+            rows,
            cols,
            seed,
            dropout_prob,
@@ -792,14 +882,14 @@ void LaunchLayernormResidualDropoutBias(
            is_test,
            increment,
            epsilon,
-                                              src,
+            reinterpret_cast<const T *>(src),
            residual,
            bias,
            scale,
            layernorm_bias,
            mask_data,
            dst,
-                                              layernorm_dst,
+            reinterpret_cast<T *>(layernorm_dst),
            mean,
            var);
  } else {
@@ -819,7 +909,8 @@ void LaunchLayernormResidualDropoutBias(
                                        VecSize,
                                        U,
                                        ScaleBiasWithSameTypeX>
-          <<<rows, blockDim, 0, ctx.stream()>>>(rows,
+          <<<rows, blockDim, 0, ctx.stream()>>>(
+              rows,
              cols,
              seed,
              dropout_prob,
@@ -827,14 +918,14 @@ void LaunchLayernormResidualDropoutBias(
              is_test,
              increment,
              epsilon,
-                                                src,
+              reinterpret_cast<const T *>(src),
              residual,
              bias,
              scale,
              layernorm_bias,
              mask_data,
              dst,
-                                                layernorm_dst,
+              reinterpret_cast<T *>(layernorm_dst),
              mean,
              var);
    }

--- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Operator definition for fused_multi_transformer_int8.
+//
+// InferShape verifies that the mandatory inputs/outputs exist and that the
+// shapes of X, QKVW[0] and (when present) CacheKV[0] agree with each other,
+// then gives Out the same dims as X. The kernel type is chosen from the data
+// type of X.
+class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel {
+ private:
+  // Name reported in OP_INOUT_CHECK error messages.
+  static constexpr const char *OpName = "FusedMultiTransformerINT8Op";
+
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Validates inputs/outputs and shapes; sets the dims of "Out" to those of
+  // "X". Only the first element of the duplicable QKVW / CacheKV inputs is
+  // inspected.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+#define CHECK_INPUT(name) \
+  OP_INOUT_CHECK(ctx->HasInput(#name), "Input", #name, OpName)
+#define CHECK_INPUTS(name) \
+  OP_INOUT_CHECK(ctx->HasInputs(#name), "Input", #name, OpName)
+#define CHECK_OUTPUT(name) \
+  OP_INOUT_CHECK(ctx->HasOutput(#name), "Output", #name, OpName)
+#define CHECK_OUTPUTS(name) \
+  OP_INOUT_CHECK(ctx->HasOutputs(#name), "Output", #name, OpName)
+
+    CHECK_INPUT(X);
+
+    // attention
+    CHECK_INPUTS(QKVW);
+    CHECK_INPUTS(OutLinearW);
+
+    // TimeStep is only meaningful together with a KV cache.
+    if (ctx->HasInput("TimeStep")) {
+      CHECK_INPUTS(CacheKV);
+    }
+
+    // A KV cache input requires the matching cache output.
+    if (ctx->HasInputs("CacheKV")) {
+      CHECK_OUTPUTS(CacheKVOut);
+    }
+
+    // ffn
+    CHECK_INPUTS(FFN1Weight);
+    CHECK_INPUTS(FFN2Weight);
+
+    CHECK_OUTPUT(Out);
+
+// The helper macros are local to this function; do not leak them into the
+// rest of the translation unit.
+#undef CHECK_INPUT
+#undef CHECK_INPUTS
+#undef CHECK_OUTPUT
+#undef CHECK_OUTPUTS
+
+    // x: qkv's input [batch_size, seq_len, dim_embed]
+    // y: qkv's weight: [3, num_head, dim_head, dim_embed] when trans_qkvw is
+    //    true, otherwise [dim_embed, 3, num_head, dim_head]
+    auto x_dim = ctx->GetInputDim("X");
+    auto y_dim = ctx->GetInputsDim("QKVW")[0];
+    bool trans_qkvw = ctx->Attrs().Get<bool>("trans_qkvw");
+    PADDLE_ENFORCE_EQ(
+        x_dim.size(),
+        3,
+        platform::errors::InvalidArgument("The dimensions of x must be 3 "
+                                          "(batch_size, seq_len, dim_embed), "
+                                          "but received dimensions of "
+                                          "Input is [%d]",
+                                          x_dim.size()));
+    PADDLE_ENFORCE_EQ(y_dim.size(),
+                      4,
+                      platform::errors::InvalidArgument(
+                          "The dimensions of qkv_weight must be 4 "
+                          "(3, num_head, dim_head, dim_embed), "
+                          "but received dimensions of "
+                          "Input is [%d]",
+                          y_dim.size()));
+    PADDLE_ENFORCE_EQ(
+        x_dim[2],
+        trans_qkvw ? y_dim[3] : y_dim[0],
+        platform::errors::InvalidArgument(
+            "ShapeError: the dimension of x_dim[2] and y_dim[3](trans_qkvw is "
+            "true) or y_dim[0](trans_qkvw is false) "
+            "must be equal. But received: the shape "
+            "of input x = [%s], and the shape of "
+            "input qkv_weight = [%s]",
+            x_dim,
+            y_dim));
+
+    // The num_head * dim_head == dim_embed constraint is only enforced when
+    // ring_id is -1.
+    if (ctx->Attrs().Get<int>("ring_id") == -1) {
+      if (trans_qkvw) {
+        PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2],
+                          y_dim[3],
+                          platform::errors::InvalidArgument(
+                              "The dimensions of qkv_weight must be 4 "
+                              "(3, num_head, dim_head, dim_embed), "
+                              "and must satisfy the limitations: "
+                              "(num_head * dim_head == dim_embed)"));
+
+      } else {
+        PADDLE_ENFORCE_EQ(y_dim[2] * y_dim[3],
+                          y_dim[0],
+                          platform::errors::InvalidArgument(
+                              "The dimensions of qkv_weight must be 4 "
+                              "(dim_embed, 3, num_head, dim_head), "
+                              "and must satisfy the limitations: "
+                              "(num_head * dim_head == dim_embed)"));
+      }
+    }
+
+    if (ctx->HasInputs("CacheKV")) {
+      // [2, batch_size, num_head, max_seq_len, head_size]
+      const auto &c_dims = ctx->GetInputsDim("CacheKV");
+      const auto &c_dim = c_dims[0];
+
+      PADDLE_ENFORCE_EQ(
+          c_dim.size(),
+          5,
+          paddle::platform::errors::InvalidArgument(
+              "The CacheKV must be 5 dims, but got %d", c_dim.size()));
+      PADDLE_ENFORCE_EQ(c_dim[0],
+                        2,
+                        paddle::platform::errors::InvalidArgument(
+                            "The first dim of CacheKV must be 2, but got %d",
+                            c_dim[0]));  // 2
+      PADDLE_ENFORCE_EQ(c_dim[1],
+                        x_dim[0],
+                        paddle::platform::errors::InvalidArgument(
+                            "The second dim of CacheKV must be equal with "
+                            "batch size %d, but got %d",
+                            x_dim[0],
+                            c_dim[1]));  // batch_size
+      PADDLE_ENFORCE_EQ(c_dim[2],
+                        trans_qkvw ? y_dim[1] : y_dim[2],
+                        paddle::platform::errors::InvalidArgument(
+                            "The third dim of CacheKV must be equal with num "
+                            "head %d, but got %d",
+                            trans_qkvw ? y_dim[1] : y_dim[2],
+                            c_dim[2]));  // num_head
+      PADDLE_ENFORCE_GT(
+          c_dim[3],
+          0,
+          paddle::platform::errors::InvalidArgument(
+              "The fourth dim of CacheKV must be greater than 0, but got %d",
+              c_dim[3]));  // cache_seq_len
+      PADDLE_ENFORCE_EQ(c_dim[4],
+                        trans_qkvw ? y_dim[2] : y_dim[3],
+                        paddle::platform::errors::InvalidArgument(
+                            "The fifth dim of CacheKV must be equal with head "
+                            "size %d, but got %d",
+                            trans_qkvw ? y_dim[2] : y_dim[3],
+                            c_dim[4]));  // head_size
+    }
+
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+  }
+
+ protected:
+  // The kernel data type follows the data type of input "X"; the place is the
+  // execution context's place.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+
+  // "TimeStep" is returned with the expected kernel type unchanged; every
+  // other variable keeps its own place and layout and only takes the expected
+  // data type.
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name,
+      const Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const override {
+    if (var_name == "TimeStep") {
+      VLOG(10) << "var_name:" << var_name << " need not to transform";
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(
+        expected_kernel_type.data_type_, tensor.place(), tensor.layout());
+  }
+};
+
+// Proto maker for fused_multi_transformer_int8: declares the operator's
+// inputs, outputs and attributes (with defaults and value checkers).
+//
+// Inputs marked AsDuplicable() are lists; inputs marked AsDispensable() are
+// optional. The *OutScale inputs and *_in_scale attributes carry the
+// quantization parameters described in their help strings.
+class FusedMultiTransformerINT8OpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor.");
+    AddInput("LnScale",
+             "Scale is a 1-dimensional tensor of size "
+             "H. Here, H represents the last dimension of its input tensor.")
+        .AsDuplicable();
+    AddInput("LnBias",
+             "Bias is a 1-dimensional tensor of size "
+             "H. Here, H represents the last dimension of its input tensor.")
+        .AsDuplicable();
+    AddInput("QKVW", "The qkv weight tensor.").AsDuplicable();
+    AddInput("QKVBias", "The qkv bias tensor.").AsDispensable().AsDuplicable();
+
+    AddInput("CacheKV", "(optional) The cached KV for generation inference.")
+        .AsDispensable()
+        .AsDuplicable();
+    AddInput("TimeStep",
+             "(optional, int) The time step for generation inference.")
+        .AsDispensable();
+    AddInput("SrcMask", "(optional) The attention mask tensor in fmha.")
+        .AsDispensable();
+    AddInput("OutLinearW", "The out_linear weight tensor.").AsDuplicable();
+    AddInput("OutLinearBias", "The out_linear bias tensor.")
+        .AsDispensable()
+        .AsDuplicable();
+
+    AddInput("FFNLnScale", "The layer_norm scale of FusedFeedForward op")
+        .AsDuplicable();
+    AddInput("FFNLnBias", "The layer_norm bias of FusedFeedForward op")
+        .AsDuplicable();
+
+    AddInput("FFN1Weight", "The linear1 weight of FusedFeedForward op")
+        .AsDuplicable();
+    AddInput("FFN1Bias", "The linear1 bias of FusedFeedForward op")
+        .AsDispensable()
+        .AsDuplicable();
+
+    AddInput("FFN2Weight", "The linear2 weight of FusedFeedForward op")
+        .AsDuplicable();
+    AddInput("FFN2Bias", "The linear2 bias input of FusedFeedForward op")
+        .AsDispensable()
+        .AsDuplicable();
+
+    // Dequantization scales. Adjacent literals are concatenated verbatim, so
+    // each fragment ends with an explicit space.
+    AddInput("QKVOutScale",
+             "QKVOutScale is used to dequantize qkv output tensor. "
+             "In order to keep consistent with the PTQ/QAT calculation logic, "
+             "QKVOutScale should be max_bound * max_bound / max_range. "
+             "Here max_range is per-channel weight scale. "
+             "The shape of QKVOutScale is [num_layers, num_channels]")
+        .AsDispensable();
+    AddInput("OutLinearOutScale",
+             "OutLinearOutScale is used to dequantize out_linear output "
+             "tensor. "
+             "The definition and shape is the same as QKVOutScale")
+        .AsDispensable();
+    AddInput("FFN1OutScale",
+             "FFN1OutScale is used to dequantize ffn1 output tensor. "
+             "The definition and shape is the same as QKVOutScale")
+        .AsDispensable();
+    AddInput("FFN2OutScale",
+             "FFN2OutScale is used to dequantize ffn2 output tensor. "
+             "The definition and shape is the same as QKVOutScale")
+        .AsDispensable();
+
+    AddOutput("CacheKVOut", "The updated cache KV. Inplace with CacheKV")
+        .AsDispensable()
+        .AsDuplicable();
+    AddOutput("Out", "Result after multi .");
+
+    AddAttr<bool>("pre_layer_norm",
+                  "if true, the attention op uses pre_layer_norm architecture, "
+                  "else, uses post_layer_norm architecture. "
+                  "[default true].")
+        .SetDefault(true);
+    // epsilon must lie in [0.0, 0.001].
+    AddAttr<float>("epsilon",
+                   "Constant for numerical stability [default 1e-5].")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f,
+                            true,
+                            platform::errors::InvalidArgument(
+                                "'epsilon' in Op(LayerNorm) should be between "
+                                "0.0 and 0.001, But received [%s].",
+                                epsilon));
+        });
+
+    // dropout_rate must lie in [0.0, 1.0].
+    AddAttr<float>("dropout_rate", "Probability of setting units to zero.")
+        .SetDefault(.5f)
+        .AddCustomChecker([](const float &drop_p) {
+          PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f,
+                            true,
+                            platform::errors::InvalidArgument(
+                                "'dropout_rate' must be between 0.0 and 1.0."));
+        });
+
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "dropout_implementation",
+        "[\"downgrade_in_infer\"|\"upscale_in_train\"] "
+        "The meaning is the same as 'attn_dropout_implementation'.")
+        .SetDefault("downgrade_in_infer")
+        .AddCustomChecker([](const std::string &type) {
+          PADDLE_ENFORCE_EQ(
+              type == "downgrade_in_infer" || type == "upscale_in_train",
+              true,
+              platform::errors::InvalidArgument(
+                  "dropout_implementation can only be downgrade_in_infer or "
+                  "upscale_in_train"));
+        });
+    AddAttr<std::string>("act_method", "act_method").SetDefault("gelu");
+    AddAttr<bool>(
+        "trans_qkvw",
+        "Whether the weights of qkv should be transposed. If true, "
+        "the shape of weights of qkv should be "
+        "[3, num_head, dim_head, dim_embed]. "
+        "Otherwise the shape of weights of qkv should be "
+        "[dim_embed, 3, num_head, dim_head]")
+        .SetDefault(true);
+
+    AddAttr<int>(
+        "ring_id",
+        "ring id for tensor model parallel. distributed training and inference")
+        .SetDefault(-1);
+
+    AddAttr<int>("num_head", "num_head").SetDefault(0);
+    AddAttr<int>("dim_head", "dim_head").SetDefault(0);
+    AddAttr<int>("dim_ffn", "dim_ffn").SetDefault(0);
+
+    // Per-layer input quantization scales; all default to an empty list.
+    AddAttr<std::vector<float>>(
+        "qkv_in_scale",
+        "qkv_in_scale is used to quantize qkv input tensor. "
+        "in_scale is generated by PTQ or QAT, which represents valid max range "
+        "of this tensor. "
+        "the size of qkv_in_scale should be num_layers, which is equal to "
+        "QKVW.dims()[0]")
+        .SetDefault({});
+    AddAttr<std::vector<float>>(
+        "out_linear_in_scale",
+        "out_linear_in_scale is used to quantize out_linear input tensor. "
+        "the size of out_linear_in_scale is the same as qkv_in_scale")
+        .SetDefault({});
+    AddAttr<std::vector<float>>(
+        "ffn1_in_scale",
+        "ffn1_in_scale is used to quantize ffn1 input tensor. "
+        "the size of ffn1_in_scale is the same as qkv_in_scale")
+        .SetDefault({});
+    AddAttr<std::vector<float>>(
+        "ffn2_in_scale",
+        "ffn2_in_scale is used to quantize ffn2 input tensor. "
+        "the size of ffn2_in_scale is the same as qkv_in_scale")
+        .SetDefault({});
+
+    // Rounding mode and clamp bounds used when quantizing float to int.
+    AddAttr<int>(
+        "quant_round_type",
+        "(int, default 1) The round type of fp32 to int. "
+        "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2. "
+        "1: rounding to nearest ties away from zero. Eg: round(1.5)=2, "
+        "round(-2.5)=-3")
+        .SetDefault(1);
+    AddAttr<float>(
+        "quant_max_bound",
+        "(float, default 127.0) the max bound of float type to int type")
+        .SetDefault(127.0);
+    AddAttr<float>(
+        "quant_min_bound",
+        "(float, default -127.0) the min bound of float type to int type")
+        .SetDefault(-127.0);
+
+    AddComment(R"DOC(fused multi transformer layers op)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Registers the operator under the name fused_multi_transformer_int8, wiring
+// together its shape/kernel-type definition (FusedMultiTransformerINT8Op) and
+// its proto maker (FusedMultiTransformerINT8OpMaker).
+// NOTE(review): both grad-maker slots use EmptyGradOpMaker, which presumably
+// means no backward op is generated for this operator - confirm against the
+// EmptyGradOpMaker definition.
+REGISTER_OPERATOR(
+    fused_multi_transformer_int8,
+    ops::FusedMultiTransformerINT8Op,
+    ops::FusedMultiTransformerINT8OpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
--- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.h
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.h
--- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h
+++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h
@@ -28,7 +28,9 @@ template <typename T,
          int VecSize,
          bool ComputeLayerNorm,
          bool Activation,
-          typename Functor>
+          typename Functor,
+          typename InType = T,
+          typename OutType = T>
 __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
    const int row_id,
    const int col_id,
@@ -36,30 +38,45 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
    curandStatePhilox4_32_10_t *state,
    const float dropout_prob,
    const T factor,
-    const T *__restrict__ src,
+    const InType *__restrict__ src,
    const T *__restrict__ residual,
    const T *__restrict__ bias,
-    T *dst,
+    OutType *dst,
    MaskType *mask,
    const bool is_test,
    typename details::MPTypeTrait<T>::Type *mean_val,
    typename details::MPTypeTrait<T>::Type *var_val,
-    Functor act_func) {
+    Functor act_func,
+    const float quant_last_in_scale = 1.0,
+    const float *dequant_out_scale_data = nullptr,
+    const int quant_out_scale_offset = 0,
+    const float quant_next_in_scale = 1.0,
+    const int quant_round_type = 1,
+    const float quant_max_bound = 127.0,
+    const float quant_min_bound = -127.0) {
  using LoadT = phi::AlignedVector<T, VecSize>;
+  using LoadInType = phi::AlignedVector<InType, VecSize>;
+  using LoadFloat = phi::AlignedVector<float, VecSize>;
  using StoreT = phi::AlignedVector<T, VecSize>;
+  using StoreOutType = phi::AlignedVector<OutType, VecSize>;
+
  using MaskStoreT = phi::AlignedVector<MaskType, VecSize>;
  using U = typename details::MPTypeTrait<T>::Type;

-  LoadT src_vec;
+  LoadInType src_vec;
  LoadT residual_vec;
  LoadT bias_vec;
+  LoadFloat quant_out_scale_vec;
 #pragma unroll
  for (int ii = 0; ii < VecSize; ii++) {
    bias_vec[ii] = static_cast<T>(0);
    residual_vec[ii] = static_cast<T>(0);
  }
  // vectorize load data from global
-  phi::Load<T, VecSize>(&src[row_id * cols + col_id], &src_vec);
+  phi::Load<InType, VecSize>(&src[row_id * cols + col_id], &src_vec);
+  phi::Load<float, VecSize>(
+      &dequant_out_scale_data[quant_out_scale_offset + col_id],
+      &quant_out_scale_vec);
  if (residual) {
    phi::Load<T, VecSize>(&residual[row_id * cols + col_id], &residual_vec);
  }
@@ -84,10 +101,18 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
  }

  StoreT dest_vec;
+  StoreOutType dest_vec_out_type;

 #pragma unroll
  for (int ii = 0; ii < VecSize; ii++) {
-    T tmp = src_vec[ii] + bias_vec[ii];
+    T tmp;
+    if (std::is_same<InType, int32_t>::value) {
+      T tmp0 = static_cast<T>(static_cast<float>(src_vec[ii]) *
+                              quant_last_in_scale / quant_out_scale_vec[ii]);
+      tmp = tmp0 + bias_vec[ii];
+    } else {
+      tmp = static_cast<T>(src_vec[ii]) + bias_vec[ii];
+    }
    if (Activation) {
      tmp = act_func(tmp);
    }
@@ -98,10 +123,23 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
      *mean_val += tmp;
      *var_val += (tmp * tmp);
    }
+    if (std::is_same<OutType, int8_t>::value) {
+      dest_vec_out_type[ii] = quant_helper(dest_vec[ii],
+                                           quant_next_in_scale,
+                                           quant_round_type,
+                                           quant_max_bound,
+                                           quant_min_bound);
+    }
  }

  // store result to global
-  phi::Store<T, VecSize>(dest_vec, &dst[row_id * cols + col_id]);
+  if (std::is_same<OutType, int8_t>::value) {
+    phi::Store<OutType, VecSize>(dest_vec_out_type,
+                                 &dst[row_id * cols + col_id]);
+  } else {
+    phi::Store<T, VecSize>(dest_vec,
+                           reinterpret_cast<T *>(&dst[row_id * cols + col_id]));
+  }
  if (!is_test) {
    phi::Store<MaskType, VecSize>(mask_vec, &mask[row_id * cols + col_id]);
  }
@@ -114,19 +152,28 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
 * is_test: only used in inference
 * mask: can be null if is_test=true
 */
-template <typename T, typename MaskType, int VecSize>
-__global__ void FusedResidualDropoutBias(const size_t rows,
+template <typename T,
+          typename MaskType,
+          int VecSize,
+          typename InType = T,
+          typename OutType = T>
+__global__ void FusedResidualDropoutBias(
+    const size_t rows,
    const size_t cols,
    uint64_t seed,
    const float dropout_prob,
    const bool is_upscale_in_train,
-                                         const T *__restrict__ src,
+    const InType *__restrict__ src,
    const T *__restrict__ residual,
    const T *__restrict__ bias,
    MaskType *mask,
-                                         T *dst,
+    OutType *dst,
    uint64_t increment,
-                                         const bool is_test) {
+    const bool is_test,
+    const float quant_last_in_scale = 1.0,
+    const float *dequant_out_scale_data = nullptr,
+    const int quant_out_scale_offset = 0,
+    const float quant_next_in_scale = 1.0) {
  int col_id = blockDim.x * blockIdx.x + threadIdx.x;
  int row_id = blockIdx.y;
  int idx = row_id * cols + col_id;
@@ -142,8 +189,9 @@ __global__ void FusedResidualDropoutBias(const size_t rows,
                                        VecSize,
                                        false,
                                        false,
-                                        phi::funcs::ReluFunctor<T>>(
-          r,
+                                        phi::funcs::ReluFunctor<T>,
+                                        InType,
+                                        OutType>(r,
                                                 i,
                                                 cols,
                                                 &state,
@@ -157,7 +205,11 @@ __global__ void FusedResidualDropoutBias(const size_t rows,
                                                 is_test,
                                                 nullptr,
                                                 nullptr,
-          relu);
+                                                 relu,
+                                                 quant_last_in_scale,
+                                                 dequant_out_scale_data,
+                                                 quant_out_scale_offset,
+                                                 quant_next_in_scale);
    }
  }
 }
@@ -165,7 +217,10 @@ __global__ void FusedResidualDropoutBias(const size_t rows,
 /**
 * @brief dst = residual + dropout(src + bias);
 */
-template <typename T, typename MaskType>
+template <typename T,
+          typename MaskType,
+          typename InType = T,
+          typename OutType = T>
 void LaunchResidualDropoutBias(const uint32_t rows,
                               const uint32_t cols,
                               const int increment,
@@ -173,14 +228,19 @@ void LaunchResidualDropoutBias(const uint32_t rows,
                               const float dropout_prob,
                               const bool is_test,
                               bool is_upscale_in_train,
-                               const T *src,
+                               const InType *src,
                               const T *residual,
                               const T *bias,
                               MaskType *mask_data,
-                               T *dst,
-                               const phi::GPUContext &ctx) {
+                               OutType *dst,
+                               const phi::GPUContext &ctx,
+                               const float quant_last_in_scale = 1.0,
+                               const float *dequant_out_scale_data = nullptr,
+                               const int quant_out_scale_offset = 0,
+                               const float quant_next_in_scale = 1.0) {
  // dropout_prob == 1.0f
  if (std::abs(dropout_prob - 1.0f) < 1e-5) {
+    // NOTE(minghaoBD): OutType should be T if dropout_prob == 1.0
    if (residual == dst) return;
    if (residual) {
      memory::Copy(ctx.GetPlace(),
@@ -202,7 +262,7 @@ void LaunchResidualDropoutBias(const uint32_t rows,
  const int real_vec_size = cols % VecSize == 0 ? VecSize : 1;
  auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size);
  if (cols % VecSize == 0) {
-    FusedResidualDropoutBias<T, uint8_t, VecSize>
+    FusedResidualDropoutBias<T, uint8_t, VecSize, InType, OutType>
        <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
            rows,
            cols,
@@ -215,9 +275,13 @@ void LaunchResidualDropoutBias(const uint32_t rows,
            mask_data,
            dst,
            increment,
-            is_test);
+            is_test,
+            quant_last_in_scale,
+            dequant_out_scale_data,
+            quant_out_scale_offset,
+            quant_next_in_scale);
  } else {
-    FusedResidualDropoutBias<T, uint8_t, 1>
+    FusedResidualDropoutBias<T, uint8_t, 1, InType, OutType>
        <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
            rows,
            cols,
@@ -230,7 +294,11 @@ void LaunchResidualDropoutBias(const uint32_t rows,
            mask_data,
            dst,
            increment,
-            is_test);
+            is_test,
+            quant_last_in_scale,
+            dequant_out_scale_data,
+            quant_out_scale_offset,
+            quant_next_in_scale);
  }
 }


--- a/paddle/fluid/operators/fused/quant_dequant_kernel.h
+++ b/paddle/fluid/operators/fused/quant_dequant_kernel.h
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/operators/fake_quantize_op.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace paddle {
+namespace operators {
+
+/**
+ * @brief Quantize a single value to int8.
+ *
+ * The value is computed in float as max_bound * inverse(scale) * input,
+ * rounded according to round_type (0 selects roundWithTiesToEven, any other
+ * value selects round), limited to max_bound from above and then to
+ * min_bound from below, and finally converted to int8_t.
+ */
+template <typename T>
+__forceinline__ __device__ int8_t quant_helper(const T input,
+                                               const float scale,
+                                               const int round_type,
+                                               const float max_bound,
+                                               const float min_bound) {
+  const float scaled = max_bound * inverse(scale) * static_cast<float>(input);
+  float rounded = (round_type == 0)
+                      ? static_cast<float>(roundWithTiesToEven(scaled))
+                      : static_cast<float>(round(scaled));
+  // The upper bound is applied before the lower bound.
+  if (rounded > max_bound) {
+    rounded = max_bound;
+  }
+  if (rounded < min_bound) {
+    rounded = min_bound;
+  }
+  return static_cast<int8_t>(rounded);
+}
+
+/**
+ * @brief Quantize an m x n row-major matrix to int8; each thread handles up
+ * to four consecutive columns of one row.
+ *
+ * @param input      source values, indexed as input[m_id * n + n_id]
+ * @param output     destination bytes, addressed as char4 groups
+ * @param scale      forwarded to quant_helper
+ * @param m          number of rows
+ * @param n          number of columns
+ * @param round_type forwarded to quant_helper
+ * @param max_bound  forwarded to quant_helper
+ * @param min_bound  forwarded to quant_helper
+ *
+ * When a full group of four columns fits in the row and its linear index is
+ * a multiple of four, the group is written with one char4 store (this is
+ * always the case when n is a multiple of four). Otherwise the remaining
+ * columns are written one byte at a time, so no element outside the row is
+ * read or written when n is not a multiple of four.
+ */
+template <typename T>
+__global__ void quantize_kernel(const T* input,
+                                char4* output,
+                                const float scale,
+                                const int m,
+                                const int n,
+                                const int round_type,
+                                const float max_bound,
+                                const float min_bound) {
+  const int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
+  const int m_id = blockIdx.y * blockDim.y + threadIdx.y;
+  if (m_id >= m || n_id >= n) {
+    return;
+  }
+
+  const int base = m_id * n + n_id;
+  if (n_id + 3 < n && (base & 3) == 0) {
+    // Fast path: four in-range columns starting on a char4 boundary.
+    char4 packed;
+    packed.x =
+        quant_helper(input[base], scale, round_type, max_bound, min_bound);
+    packed.y = quant_helper(
+        input[base + 1], scale, round_type, max_bound, min_bound);
+    packed.z = quant_helper(
+        input[base + 2], scale, round_type, max_bound, min_bound);
+    packed.w = quant_helper(
+        input[base + 3], scale, round_type, max_bound, min_bound);
+    output[base >> 2] = packed;
+  } else {
+    // Slow path: row tail or unaligned group; store byte by byte and stop at
+    // the end of the row.
+    int8_t* out_bytes = reinterpret_cast<int8_t*>(output);
+    for (int k = 0; k < 4 && n_id + k < n; ++k) {
+      out_bytes[base + k] = quant_helper(
+          input[base + k], scale, round_type, max_bound, min_bound);
+    }
+  }
+}
+
+/**
+ * @brief Launch quantize_kernel over an m x n matrix on the given stream.
+ *
+ * @param input      source values (m * n elements)
+ * @param output     destination int8 buffer, passed to the kernel as char4*
+ * @param scale      forwarded to the kernel
+ * @param m          number of rows
+ * @param n          number of columns
+ * @param round_type forwarded to the kernel
+ * @param max_bound  forwarded to the kernel
+ * @param min_bound  forwarded to the kernel
+ * @param stream     stream the kernel is enqueued on
+ *
+ * Blocks are 32 x 32 threads. Each thread of quantize_kernel covers four
+ * columns, so the x dimension of the grid is sized for ceil(n / 4) threads
+ * rather than n threads.
+ */
+template <typename T>
+void quantize_kernel_launcher(const T* input,
+                              int8_t* output,
+                              const float scale,
+                              const int m,
+                              const int n,
+                              const int round_type,
+                              const float max_bound,
+                              const float min_bound,
+                              gpuStream_t stream) {
+  // Nothing to do for an empty matrix; a zero-sized grid is not a valid
+  // launch configuration.
+  if (m <= 0 || n <= 0) {
+    return;
+  }
+
+  // TODO(minghaoBD): optimize the kernel launch times when m==1 or n==1
+  constexpr int kBlockEdge = 32;
+  constexpr int kColsPerThread = 4;
+  const int col_threads = (n + kColsPerThread - 1) / kColsPerThread;
+  dim3 grid((col_threads + kBlockEdge - 1) / kBlockEdge,
+            (m + kBlockEdge - 1) / kBlockEdge);
+  dim3 block(kBlockEdge, kBlockEdge);
+
+  quantize_kernel<<<grid, block, 0, stream>>>(
+      input,
+      reinterpret_cast<char4*>(output),
+      scale,
+      m,
+      n,
+      round_type,
+      max_bound,
+      min_bound);
+}
+
+// Dequantize int32 values to T: each element is converted to float,
+// multiplied by quant_in_scale and divided by the scale read from
+// dequant_out_scale_data at (quant_out_scale_offset + column index).
+template <typename T>
+__global__ void dequantize_kernel(T* output,
+                                  const int32_t* input,
+                                  const int m,  // hidden
+                                  const int n,  // batch size
+                                  const float quant_in_scale,
+                                  const float* dequant_out_scale_data,
+                                  const int quant_out_scale_offset) {
+  const int col = blockIdx.x * blockDim.x + threadIdx.x;  // index along m
+  const int row = blockIdx.y * blockDim.y + threadIdx.y;  // index along n
+
+  // Threads outside the n x m extent have nothing to do.
+  if (col >= m || row >= n) {
+    return;
+  }
+
+  const int idx = row * m + col;
+  const float out_scale = dequant_out_scale_data[quant_out_scale_offset + col];
+  const float acc = static_cast<float>(input[idx]);
+  output[idx] = static_cast<T>(acc * quant_in_scale / out_scale);
+}
+
+/**
+ * @brief Launch dequantize_kernel over a batch_size x hidden_units matrix.
+ *
+ * @param input                  int32 source values
+ * @param output                 destination buffer of type T
+ * @param batch_size             number of rows (passed as the kernel's n)
+ * @param hidden_units           number of columns (passed as the kernel's m)
+ * @param stream                 stream the kernel is enqueued on
+ * @param quant_in_scale         forwarded to the kernel
+ * @param dequant_out_scale_data forwarded to the kernel
+ * @param quant_out_scale_offset forwarded to the kernel
+ *
+ * Blocks are 32 x 32 threads; grid x covers hidden_units and grid y covers
+ * batch_size.
+ */
+template <typename T>
+void dequantize_kernel_launcher(const int32_t* input,
+                                T* output,
+                                const int batch_size,    // kernel's n
+                                const int hidden_units,  // kernel's m
+                                gpuStream_t stream,
+                                const float quant_in_scale,
+                                const float* dequant_out_scale_data,
+                                const int quant_out_scale_offset) {
+  // Nothing to do for an empty matrix; a zero-sized grid is not a valid
+  // launch configuration.
+  if (batch_size <= 0 || hidden_units <= 0) {
+    return;
+  }
+
+  constexpr int kBlockEdge = 32;
+  dim3 grid((hidden_units + kBlockEdge - 1) / kBlockEdge,
+            (batch_size + kBlockEdge - 1) / kBlockEdge);
+  dim3 block(kBlockEdge, kBlockEdge);
+
+  dequantize_kernel<<<grid, block, 0, stream>>>(output,
+                                                input,
+                                                hidden_units,
+                                                batch_size,
+                                                quant_in_scale,
+                                                dequant_out_scale_data,
+                                                quant_out_scale_offset);
+}
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/layer_norm_kernel.cu.h
+++ b/paddle/fluid/operators/layer_norm_kernel.cu.h
@@ -24,6 +24,7 @@ namespace cub = hipcub;

 #include <iostream>

+#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/phi/core/ddim.h"
@@ -338,16 +339,24 @@ using LayerNormScaleBiasT =
 template <typename T,
          typename U,
          int BlockDim,
-          bool ScaleBiasWithSameTypeX = false>
+          bool ScaleBiasWithSameTypeX = false,
+          typename InType = T,
+          typename OutType = T>
 __global__ void LayerNormForward(
-    const T *x,
+    const InType *x,
    const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *scale,
    const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *bias,
-    T *y,
+    OutType *y,
    U *mean,
    U *var,
    float epsilon,
-    int64_t feature_size) {
+    int64_t feature_size,
+    const float *dequant_out_scale_data = nullptr,
+    const int quant_out_scale_offset = 0,
+    const float quant_in_scale = 1.0,
+    const int quant_round_type = 1,
+    const float quant_max_bound = 127.0,
+    const float quant_min_bound = -127.0) {
  __shared__ U mean_share;
  __shared__ U var_share;
  __shared__ U shared_mean[32];  // threadIdx.x / warpSize <= kMaxBlockDim /
@@ -387,28 +396,72 @@ __global__ void LayerNormForward(
    if (bias != nullptr) {
      for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx;
           i += BlockDim, j += BlockDim) {
-        y[i] = static_cast<T>(static_cast<U>(scale[j]) *
+        if (std::is_same<OutType, int8_t>::value) {
+          y[i] = quant_helper(
+              static_cast<T>(static_cast<U>(scale[j]) *
                                 (static_cast<U>(x[i]) - mean_val) * invvar +
+                             static_cast<U>(bias[j])),
+              quant_in_scale,
+              quant_round_type,
+              quant_max_bound,
+              quant_min_bound);
+        } else {
+          y[i] = static_cast<OutType>(static_cast<U>(scale[j]) *
+                                          (static_cast<U>(x[i]) - mean_val) *
+                                          invvar +
                                      static_cast<U>(bias[j]));
        }
+      }
    } else {
      for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx;
           i += BlockDim, j += BlockDim) {
-        y[i] = static_cast<T>(static_cast<U>(scale[j]) *
+        if (std::is_same<OutType, int8_t>::value) {
+          y[i] = quant_helper(
+              static_cast<T>(static_cast<U>(scale[j]) *
+                             (static_cast<U>(x[i]) - mean_val) * invvar),
+              quant_in_scale,
+              quant_round_type,
+              quant_max_bound,
+              quant_min_bound);
+        } else {
+          y[i] =
+              static_cast<OutType>(static_cast<U>(scale[j]) *
                                   (static_cast<U>(x[i]) - mean_val) * invvar);
        }
      }
+    }
  } else {  // scale == nullptr
    if (bias != nullptr) {
      for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx;
           i += BlockDim, j += BlockDim) {
-        y[i] = static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar +
+        if (std::is_same<OutType, int8_t>::value) {
+          y[i] = quant_helper(
+              static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar +
+                             static_cast<U>(bias[j])),
+              quant_in_scale,
+              quant_round_type,
+              quant_max_bound,
+              quant_min_bound);
+        } else {
+          y[i] =
+              static_cast<OutType>((static_cast<U>(x[i]) - mean_val) * invvar +
                                   static_cast<U>(bias[j]));
        }
+      }
    } else {
      for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx;
           i += BlockDim, j += BlockDim) {
-        y[i] = static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar);
+        if (std::is_same<OutType, int8_t>::value) {
+          y[i] = quant_helper(
+              static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar),
+              quant_in_scale,
+              quant_round_type,
+              quant_max_bound,
+              quant_min_bound);
+        } else {
+          y[i] =
+              static_cast<OutType>((static_cast<U>(x[i]) - mean_val) * invvar);
+        }
      }
    }
  }

--- a/paddle/fluid/platform/dynload/cublasLt.h
+++ b/paddle/fluid/platform/dynload/cublasLt.h
@@ -59,7 +59,9 @@ namespace dynload {
  __macro(cublasLtMatrixTransform);                 \
  __macro(cublasLtMatrixTransformDescCreate);       \
  __macro(cublasLtMatrixTransformDescDestroy);      \
-  __macro(cublasLtMatrixTransformDescSetAttribute);
+  __macro(cublasLtMatrixTransformDescSetAttribute); \
+  __macro(cublasLtMatmulAlgoInit);                  \
+  __macro(cublasLtMatmulAlgoConfigSetAttribute);

 CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
 // #endif

--- a/paddle/fluid/pybind/op_function_generator.h
+++ b/paddle/fluid/pybind/op_function_generator.h
--- a/paddle/phi/backends/dynload/cublasLt.h
+++ b/paddle/phi/backends/dynload/cublasLt.h
@@ -73,7 +73,9 @@ extern void *cublasLt_dso_handle;
  __macro(cublasLtMatrixTransform);                 \
  __macro(cublasLtMatrixTransformDescCreate);       \
  __macro(cublasLtMatrixTransformDescDestroy);      \
-  __macro(cublasLtMatrixTransformDescSetAttribute);
+  __macro(cublasLtMatrixTransformDescSetAttribute); \
+  __macro(cublasLtMatmulAlgoInit);                  \
+  __macro(cublasLtMatmulAlgoConfigSetAttribute);

 CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
 // #endif

--- a/paddle/phi/backends/dynload/dynamic_loader.cc
+++ b/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -326,7 +326,7 @@ void* GetCublasDsoHandle() {

 void* GetCublasLtDsoHandle() {
 // APIs available after CUDA 10.1
-#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10010
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
 #else
  std::string warning_msg(

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py