未验证 提交 3d7e2118 编写于 作者: R RichardWooSJTU 提交者: GitHub

Add INT8 support for fused_multi_transformer_op (#45284)

上级 7f346a76
......@@ -165,7 +165,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
auto var_data_type = var_node->Var()->GetDataType();
VLOG(5) << "var_name is " << var_name << ", data type is "
<< var_data_type;
if (var_data_type == paddle::framework::proto::VarType::FP16) {
if (var_data_type == paddle::framework::proto::VarType::FP16 &&
t->dtype() != paddle::experimental::DataType::FLOAT16) {
framework::Tensor half_tensor;
half_tensor.set_type(paddle::experimental::DataType::FLOAT16);
half_tensor.Resize(t->dims());
......
......@@ -23,6 +23,7 @@ register_operators(
fused_transformer_op
fused_feedforward_op
fused_multi_transformer_op
fused_multi_transformer_int8_op
fused_bias_dropout_residual_layer_norm_op
resnet_unit_op
fused_gemm_epilogue_op
......@@ -119,6 +120,7 @@ if(WITH_GPU OR WITH_ROCM)
# fused_attention_op
op_library(fused_attention_op)
op_library(fused_multi_transformer_op)
op_library(fused_multi_transformer_int8_op)
op_library(fused_bias_dropout_residual_layer_norm_op)
endif()
# resnet_unit needs cudnn 8.0 above
......
......@@ -19,7 +19,8 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
// NOTE: T must be the same as OutType in ComputeBackward
template <typename T, typename InType = T, typename OutType = T>
class AttnLayerNorm {
public:
AttnLayerNorm(const phi::GPUContext& dev_ctx,
......@@ -33,17 +34,28 @@ class AttnLayerNorm {
~AttnLayerNorm() {}
void ComputeForward(const T* x_data,
void ComputeForward(const InType* x_data,
const LayerNormParamType<T>* scale_data,
const LayerNormParamType<T>* bias_data,
T* y_data,
OutType* y_data,
LayerNormParamType<T>* mean_data,
LayerNormParamType<T>* var_data) {
LayerNormParamType<T>* var_data,
const float* dequant_out_scale_data = nullptr,
const int quant_out_scale_offset = 0,
const float quant_in_scale = 1.0,
const int quant_round_type = 1,
const float quant_max_bound = 127.0,
const float quant_min_bound = -127.0) {
auto stream = dev_ctx_.stream();
switch (GetDesiredBlockDim(feature_size_)) {
FIXED_BLOCK_DIM_CASE(
LayerNormForward<T, LayerNormParamType<T>, kBlockDim>
LayerNormForward<T,
LayerNormParamType<T>,
kBlockDim,
false,
InType,
OutType>
<<<batch_size_, kBlockDim, 0, stream>>>(x_data,
scale_data,
bias_data,
......@@ -51,7 +63,13 @@ class AttnLayerNorm {
mean_data,
var_data,
epsilon_,
feature_size_));
feature_size_,
dequant_out_scale_data,
quant_out_scale_offset,
quant_in_scale,
quant_round_type,
quant_max_bound,
quant_min_bound));
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Feature_size must be larger than 1"));
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <vector>
#include "paddle/fluid/operators/fused/cublaslt.h"
#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
// Helper that runs an INT8 GEMM (via CublasLtHelper) for a fixed problem size
// and optionally wraps it with quantization of the input, dequantization of
// the INT32 result and a broadcast bias add.
//
// T is the element type of the non-quantized tensors. The (m, n, k) passed to
// the constructor are forwarded to CublasLtHelper as (m, k, n) and are also
// used as the row/column counts of the quantize (m x k) and dequantize (m x n)
// kernels.
template <typename T>
class AttnMatmulINT8 {
 public:
  // compute_bias: when true, the methods that produce a T output additionally
  // compute bias_out = output + bias.
  AttnMatmulINT8(
      const phi::GPUContext& dev_ctx, int m, int n, int k, bool compute_bias)
      : dev_ctx_(dev_ctx), m_(m), n_(n), k_(k), compute_bias_(compute_bias) {
    helpers_.emplace_back(std::make_shared<CublasLtHelper>(m, k, n));
  }

  ~AttnMatmulINT8() {}

  // GEMM whose input and output are both of type T.
  //
  // Steps: quantize `input` (T) into `input_tmp` (int8), multiply by `weight`
  // (int8) into `output_tmp` (int32), dequantize into `output` (T) and, if
  // compute_bias was requested, write `output + bias` into `bias_out`.
  void ComputeForward(const framework::Tensor* weight,
                      const framework::Tensor* input,
                      framework::Tensor* input_tmp,
                      const framework::Tensor* bias,
                      framework::Tensor* output,
                      framework::Tensor* output_tmp,
                      framework::Tensor* bias_out,
                      const float quant_in_scale,
                      const framework::Tensor* dequant_out_scale,
                      const int quant_out_scale_offset,
                      const int quant_round_type = 1,
                      const float quant_max_bound = 127.0,
                      const float quant_min_bound = -127.0) {
    QuantizeInput(input,
                  input_tmp,
                  quant_in_scale,
                  quant_round_type,
                  quant_max_bound,
                  quant_min_bound);
    // From here on the work is exactly the INT8 -> T path applied to the
    // freshly quantized buffer.
    ComputeForwardINT8ToT(weight,
                          quant_in_scale,
                          input_tmp,
                          bias,
                          output,
                          output_tmp,
                          bias_out,
                          dequant_out_scale,
                          quant_out_scale_offset);
  }

  // GEMM whose input is already INT8; the raw INT32 accumulators are written
  // to `output` without dequantization. `bias` and `bias_out` are accepted for
  // signature symmetry with the other methods but are not used.
  void ComputeForwardINT8ToINT8(const framework::Tensor* weight,
                                framework::Tensor* input,
                                const framework::Tensor* bias,
                                framework::Tensor* output,
                                framework::Tensor* bias_out) {
    helpers_[0]->GEMM(input->data<int8_t>(),
                      weight->data<int8_t>(),
                      output->data<int32_t>(),
                      dev_ctx_.stream());
  }

  // GEMM whose input is already INT8 and whose output is of type T.
  //
  // The INT32 result is staged in `output_tmp`, dequantized into `output`
  // using `quant_in_scale` and the per-output scales read from
  // `dequant_out_scale` starting at `quant_out_scale_offset`, and then the
  // optional bias add is applied.
  void ComputeForwardINT8ToT(const framework::Tensor* weight,
                             const float quant_in_scale,
                             framework::Tensor* input,
                             const framework::Tensor* bias,
                             framework::Tensor* output,
                             framework::Tensor* output_tmp,
                             framework::Tensor* bias_out,
                             const framework::Tensor* dequant_out_scale,
                             const int quant_out_scale_offset) {
    helpers_[0]->GEMM(input->data<int8_t>(),
                      weight->data<int8_t>(),
                      output_tmp->data<int32_t>(),
                      dev_ctx_.stream());
    dequantize_kernel_launcher<T>(output_tmp->data<int32_t>(),
                                  output->data<T>(),
                                  m_,
                                  n_,
                                  dev_ctx_.stream(),
                                  quant_in_scale,
                                  dequant_out_scale->data<float>(),
                                  quant_out_scale_offset);
    if (compute_bias_) {
      // bias_out = output + bias
      std::vector<const framework::Tensor*> ins = {output, bias};
      std::vector<framework::Tensor*> outs = {bias_out};
      phi::funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
          dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor<T>());
      // cudaGetLastError() reports any earlier asynchronous failure as well,
      // hence the wording of the message.
      PADDLE_ENFORCE_EQ(cudaGetLastError(),
                        cudaSuccess,
                        platform::errors::Fatal(
                            "cuda error occured after computing bias. "
                            "But it does not mean this error is caused by "
                            "bias computing"));
    }
  }

  // GEMM whose input is of type T; `input` is quantized into `input_tmp` and
  // the raw INT32 accumulators are written to `output` without
  // dequantization. `bias` and `bias_out` are not used.
  void ComputeForwardTToINT8(const framework::Tensor* weight,
                             const float quant_in_scale,
                             const framework::Tensor* input,
                             framework::Tensor* input_tmp,
                             const framework::Tensor* bias,
                             framework::Tensor* output,
                             framework::Tensor* bias_out,
                             const int quant_round_type = 1,
                             const float quant_max_bound = 127.0,
                             const float quant_min_bound = -127.0) {
    QuantizeInput(input,
                  input_tmp,
                  quant_in_scale,
                  quant_round_type,
                  quant_max_bound,
                  quant_min_bound);
    ComputeForwardINT8ToINT8(weight, input_tmp, bias, output, bias_out);
  }

 private:
  // Quantizes the m_ x k_ tensor `input` (T) into `input_tmp` (int8) on the
  // context's stream.
  void QuantizeInput(const framework::Tensor* input,
                     framework::Tensor* input_tmp,
                     const float quant_in_scale,
                     const int quant_round_type,
                     const float quant_max_bound,
                     const float quant_min_bound) {
    quantize_kernel_launcher<T>(input->data<T>(),
                                input_tmp->data<int8_t>(),
                                quant_in_scale,
                                m_,
                                k_,
                                quant_round_type,
                                quant_max_bound,
                                quant_min_bound,
                                dev_ctx_.stream());
  }

  const phi::GPUContext& dev_ctx_;
  int m_;  // m
  int n_;  // n
  int k_;  // k
  bool compute_bias_;  // whether to compute bias_out = output + bias
  std::vector<std::shared_ptr<CublasLtHelper>> helpers_;
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <sstream>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/dynload/cublasLt.h"
namespace dyl = paddle::platform::dynload;
namespace paddle {
namespace operators {
// Owns a cuBLASLt handle, one matmul descriptor and three matrix layouts for
// a fixed-shape INT8 x INT8 -> INT32 matrix multiplication.
//
// Layouts as created below (rows, cols, leading dimension):
//   B_desc_: CUDA_R_8I,  k x n, ld = k   (first operand, TRANSA = T)
//   A_desc_: CUDA_R_8I,  k x m, ld = k   (second operand)
//   C_desc_: CUDA_R_32I, n x m, ld = n   (result)
// NOTE(review): with cuBLASLt's default column-major order this presumably
// corresponds to row-major C[m x n] = A[m x k] * B[n x k]^T -- confirm against
// the callers' tensor layouts.
//
// The object is non-copyable because it holds raw library handles that are
// destroyed exactly once.
class CublasLtHelper {
 public:
  // Creates the handle and all descriptors. Throws (via PADDLE_ENFORCE_EQ) if
  // any cuBLASLt call fails; anything created before the failure is released.
  CublasLtHelper(int m, int k, int n)
      : alpha_(1), beta_(0), m_(m), k_(k), n_(n) {
    try {
      CreateDescriptors(m, k, n);
    } catch (...) {
      // The destructor does not run for a partially constructed object, so
      // release whatever was created before re-throwing.
      Release();
      throw;
    }
  }

  CublasLtHelper(const CublasLtHelper&) = delete;
  CublasLtHelper& operator=(const CublasLtHelper&) = delete;

  ~CublasLtHelper() { Release(); }

  // Enqueues the INT8 GEMM on `stream`, writing INT32 results to C_dev.
  //   A_dev: device pointer matching A_desc_ (read only)
  //   B_dev: device pointer matching B_desc_ (read only)
  //   C_dev: device pointer matching C_desc_ (used as both C and D)
  // Throws (via PADDLE_ENFORCE_EQ) if cublasLtMatmul reports a failure.
  void GEMM(const int8_t* A_dev,
            const int8_t* B_dev,
            int32_t* C_dev,
            cudaStream_t stream) {
    cublasStatus_t status;

    // NOTE(review): __CUDA_ARCH__ is only defined while compiling device
    // code. GEMM is a host function, so this condition is expected to be
    // false here and cublasLtMatmul below receives a null algo (library
    // heuristics). Enabling the hand-picked algorithm would need a runtime
    // compute-capability check instead -- confirm the intent before changing.
#if __CUDA_ARCH__ >= 800 && CUDA_VERSION >= 11020
    cublasLtMatmulAlgo_t algo;
    int algoId = 21;
    int swizzle = 0;
    int customOption = 0;
    int tile = 15;
    int splitK_val = 0;
    int reductionScheme = 0;
    int stages = 23;
#if CUBLAS_VER_MAJOR < 11
    cudaDataType_t cudaComputeType = CUDA_R_32I;
#else
    cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I;
#endif

    // The statuses of the algo configuration calls are not checked; a bad
    // configuration would surface as a cublasLtMatmul failure below.
    dyl::cublasLtMatmulAlgoInit(handle_,
                                cudaComputeType,
                                CUDA_R_32I,
                                CUDA_R_8I,
                                CUDA_R_8I,
                                CUDA_R_32I,
                                CUDA_R_32I,
                                algoId,
                                &algo);
    dyl::cublasLtMatmulAlgoConfigSetAttribute(
        &algo,
        CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION,
        &(customOption),
        sizeof(customOption));
    dyl::cublasLtMatmulAlgoConfigSetAttribute(
        &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile));
    dyl::cublasLtMatmulAlgoConfigSetAttribute(&algo,
                                              CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
                                              &(splitK_val),
                                              sizeof(splitK_val));
    dyl::cublasLtMatmulAlgoConfigSetAttribute(
        &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
    dyl::cublasLtMatmulAlgoConfigSetAttribute(
        &algo,
        CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
        &(reductionScheme),
        sizeof(reductionScheme));
    // The enclosing guard already implies CUDA_VERSION >= 11000.
    dyl::cublasLtMatmulAlgoConfigSetAttribute(
        &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
#endif

    // The weight (B) is passed as the first operand and the activation (A) as
    // the second; no workspace is supplied.
    status = dyl::cublasLtMatmul(handle_,
                                 matmul_desc_,
                                 &alpha_,
                                 B_dev,
                                 B_desc_,
                                 A_dev,
                                 A_desc_,
                                 &beta_,
                                 C_dev,
                                 C_desc_,
                                 C_dev,
                                 C_desc_,
#if __CUDA_ARCH__ >= 800 && CUDA_VERSION >= 11020
                                 &algo,
#else
                                 nullptr,
#endif
                                 nullptr,
                                 0,
                                 stream);
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatmul execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));
  }

 private:
  // Creates the cuBLASLt handle, the matmul descriptor (INT32 compute, first
  // operand transposed) and the three matrix layouts.
  void CreateDescriptors(int m, int k, int n) {
    cublasStatus_t status;

    // handle and matmul desc
    status = dyl::cublasLtCreate(&handle_);
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtCreate execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));

    // cublasLtMatmulDescCreate changed its signature in cuBLAS 11: it takes
    // a cublasComputeType_t plus a separate scale type.
#if CUBLAS_VER_MAJOR < 11
    cudaDataType_t cudaComputeType = CUDA_R_32I;
    status = dyl::cublasLtMatmulDescCreate(&matmul_desc_, cudaComputeType);
#else
    cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I;
    status = dyl::cublasLtMatmulDescCreate(
        &matmul_desc_, cudaComputeType, CUDA_R_32I);
#endif
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatmulDescCreate execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));

    cublasOperation_t op_transpose = CUBLAS_OP_T;
    status = dyl::cublasLtMatmulDescSetAttribute(matmul_desc_,
                                                 CUBLASLT_MATMUL_DESC_TRANSA,
                                                 &op_transpose,
                                                 sizeof(op_transpose));
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatmulDescSetAttribute execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));

    // matrix desc
    status = dyl::cublasLtMatrixLayoutCreate(&B_desc_, CUDA_R_8I, k, n, k);
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatrixLayoutCreate execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));

    status = dyl::cublasLtMatrixLayoutCreate(&A_desc_, CUDA_R_8I, k, m, k);
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatrixLayoutCreate execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));

    status = dyl::cublasLtMatrixLayoutCreate(&C_desc_, CUDA_R_32I, n, m, n);
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatrixLayoutCreate execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));
  }

  // Destroys every object that was created and resets the members so a second
  // call is a no-op. Descriptors go first, the handle last. Return statuses
  // are ignored because this runs on the destruction path.
  void Release() {
    if (C_desc_) dyl::cublasLtMatrixLayoutDestroy(C_desc_);
    if (B_desc_) dyl::cublasLtMatrixLayoutDestroy(B_desc_);
    if (A_desc_) dyl::cublasLtMatrixLayoutDestroy(A_desc_);
    if (matmul_desc_) dyl::cublasLtMatmulDescDestroy(matmul_desc_);
    if (handle_) dyl::cublasLtDestroy(handle_);
    C_desc_ = nullptr;
    B_desc_ = nullptr;
    A_desc_ = nullptr;
    matmul_desc_ = nullptr;
    handle_ = nullptr;
  }

  // Null-initialized so Release() can tell which objects actually exist.
  cublasLtHandle_t handle_ = nullptr;
  cublasLtMatmulDesc_t matmul_desc_ = nullptr;
  cublasLtMatrixLayout_t A_desc_ = nullptr;
  cublasLtMatrixLayout_t B_desc_ = nullptr;
  cublasLtMatrixLayout_t C_desc_ = nullptr;

  // GEMM scaling factors passed to cublasLtMatmul (alpha = 1, beta = 0).
  int32_t alpha_;
  int32_t beta_;

  int m_;
  int k_;
  int n_;
};
} // namespace operators
} // namespace paddle
......@@ -60,8 +60,14 @@ struct GeluGradFunctor {
* the src, mask and dst shape is (rows, cols)
* the bias shape is (1, cols)
*/
template <typename T, typename MaskType, int VecSize, typename Functor>
__global__ void FusedDropoutActBias(Functor act,
template <typename T,
typename MaskType,
int VecSize,
typename Functor,
typename InType = T,
typename OutType = T>
__global__ void FusedDropoutActBias(
Functor act,
const uint64_t seed,
const uint64_t rows,
const uint64_t cols,
......@@ -69,10 +75,17 @@ __global__ void FusedDropoutActBias(Functor act,
const float dropout_prob,
const bool is_upscale_in_train,
const bool is_test,
const T *__restrict__ src,
const InType *__restrict__ src,
const T *__restrict__ bias,
T *dst,
MaskType *mask) {
OutType *dst,
MaskType *mask,
const float quant_last_in_scale = 1.0,
const float *dequant_out_scale_data = nullptr,
const int quant_out_scale_offset = 0,
const float quant_next_in_scale = 1.0,
const int quant_round_type = 1,
const float quant_max_bound = 127.0,
const float quant_min_bound = -127.0) {
int col_id = blockDim.x * blockIdx.x + threadIdx.x;
int row_id = blockIdx.y;
int idx = row_id * cols + col_id;
......@@ -90,7 +103,9 @@ __global__ void FusedDropoutActBias(Functor act,
VecSize,
false,
true,
Functor>(r,
Functor,
InType,
OutType>(r,
i,
cols,
&state,
......@@ -104,7 +119,14 @@ __global__ void FusedDropoutActBias(Functor act,
is_test,
nullptr,
nullptr,
act);
act,
quant_last_in_scale,
dequant_out_scale_data,
quant_out_scale_offset,
quant_next_in_scale,
quant_round_type,
quant_max_bound,
quant_min_bound);
}
}
}
......@@ -112,7 +134,11 @@ __global__ void FusedDropoutActBias(Functor act,
/**
* @brief dst = dropout(activation(src + bias));
*/
template <typename T, typename MaskType, typename Functor>
template <typename T,
typename MaskType,
typename Functor,
typename InType = T,
typename OutType = T>
void LaunchDropoutActBias(Functor act_functor,
const uint64_t seed,
const uint32_t rows,
......@@ -121,14 +147,21 @@ void LaunchDropoutActBias(Functor act_functor,
const float dropout_prob,
const bool is_upscale_in_train,
const bool is_test,
const T *src,
const InType *src,
const T *bias,
T *dst,
OutType *dst,
MaskType *mask_data,
const phi::GPUContext &ctx) {
const phi::GPUContext &ctx,
const float quant_last_in_scale = 1.0,
const float *dequant_out_scale_data = nullptr,
const int quant_out_scale_offset = 0,
const float quant_next_in_scale = 1.0,
const int quant_round_type = 1,
const float quant_max_bound = 127.0,
const float quant_min_bound = -127.0) {
// dropout_prob == 1.0f
if (std::abs(dropout_prob - 1.0f) < 1e-5) {
SetZero<T>(ctx, dst, rows * cols);
SetZero<T>(ctx, reinterpret_cast<T *>(dst), rows * cols);
SetZero<MaskType>(ctx, mask_data, rows * cols);
return;
}
......@@ -137,7 +170,7 @@ void LaunchDropoutActBias(Functor act_functor,
const int real_vec_size = cols % VecSize == 0 ? VecSize : 1;
const auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size);
if (cols % VecSize == 0) {
FusedDropoutActBias<T, MaskType, VecSize, Functor>
FusedDropoutActBias<T, MaskType, VecSize, Functor, InType, OutType>
<<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
act_functor,
seed,
......@@ -150,9 +183,13 @@ void LaunchDropoutActBias(Functor act_functor,
src,
bias,
dst,
mask_data);
mask_data,
quant_last_in_scale,
dequant_out_scale_data,
quant_out_scale_offset,
quant_next_in_scale);
} else {
FusedDropoutActBias<T, MaskType, 1, Functor>
FusedDropoutActBias<T, MaskType, 1, Functor, InType, OutType>
<<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
act_functor,
seed,
......@@ -165,7 +202,11 @@ void LaunchDropoutActBias(Functor act_functor,
src,
bias,
dst,
mask_data);
mask_data,
quant_last_in_scale,
dequant_out_scale_data,
quant_out_scale_offset,
quant_next_in_scale);
}
}
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
......
......@@ -109,7 +109,10 @@ struct DropoutParam {
}
};
template <typename T, typename MaskType>
template <typename T,
typename MaskType,
typename InType = T,
typename OutType = T>
class FusedDropoutHelper {
private:
int GetIncrement(const phi::GPUContext& ctx) {
......@@ -140,13 +143,18 @@ class FusedDropoutHelper {
// out = residual + dropout( src + bias )
void ResidualDropoutBias(const phi::GPUContext& ctx,
const T* src,
const InType* src,
const T* residual,
const T* bias,
T* out,
MaskType* mask) {
OutType* out,
MaskType* mask,
const float quant_last_in_scale = 1.0,
const float* dequant_out_scale_data = nullptr,
const int quant_out_scale_offset = 0,
const float quant_next_in_scale = 1.0) {
auto increment = GetIncrement(ctx);
LaunchResidualDropoutBias<T, MaskType>(rows_,
LaunchResidualDropoutBias<T, MaskType, InType, OutType>(
rows_,
cols_,
increment,
dropout_param_.seed,
......@@ -158,7 +166,11 @@ class FusedDropoutHelper {
bias,
mask,
out,
ctx);
ctx,
quant_last_in_scale,
dequant_out_scale_data,
quant_out_scale_offset,
quant_next_in_scale);
}
void ResidualDropoutBiasGrad(const phi::GPUContext& ctx,
......@@ -189,15 +201,22 @@ class FusedDropoutHelper {
// out = dropout(activation(src + bias))
void DropoutActBias(const phi::GPUContext& ctx,
const T* src,
const InType* src,
const T* bias,
const std::string& act_method,
T* out,
MaskType* mask) {
OutType* out,
MaskType* mask,
const float quant_last_in_scale = 1.0,
const float* dequant_out_scale_data = nullptr,
const int quant_out_scale_offset = 0,
const float quant_next_in_scale = 1.0,
const int quant_round_type = 1,
const float quant_max_bound = 127.0,
const float quant_min_bound = -127.0) {
auto increment = GetIncrement(ctx);
if (act_method == "gelu") {
GeluFunctor<T> gelu;
LaunchDropoutActBias<T, MaskType, GeluFunctor<T>>(
LaunchDropoutActBias<T, MaskType, GeluFunctor<T>, InType, OutType>(
gelu,
dropout_param_.seed,
rows_,
......@@ -210,11 +229,21 @@ class FusedDropoutHelper {
bias,
out,
mask,
ctx);
ctx,
quant_last_in_scale,
dequant_out_scale_data,
quant_out_scale_offset,
quant_next_in_scale,
quant_round_type,
quant_max_bound,
quant_min_bound);
} else if (act_method == "relu") {
phi::funcs::ReluFunctor<T> relu;
LaunchDropoutActBias<T, MaskType, phi::funcs::ReluFunctor<T>>(
relu,
LaunchDropoutActBias<T,
MaskType,
phi::funcs::ReluFunctor<T>,
InType,
OutType>(relu,
dropout_param_.seed,
rows_,
cols_,
......@@ -226,7 +255,14 @@ class FusedDropoutHelper {
bias,
out,
mask,
ctx);
ctx,
quant_last_in_scale,
dequant_out_scale_data,
quant_out_scale_offset,
quant_next_in_scale,
quant_round_type,
quant_max_bound,
quant_min_bound);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently only supports gelu or relu activation functions!"));
......@@ -283,8 +319,12 @@ class FusedDropoutHelper {
DropoutParam dropout_param_;
};
template <typename T, typename MaskType>
class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
template <typename T,
typename MaskType,
typename InType = T,
typename OutType = T>
class FusedDropoutLayerNormHelper
: public FusedDropoutHelper<T, MaskType, InType, OutType> {
public:
FusedDropoutLayerNormHelper() {}
FusedDropoutLayerNormHelper(const int rows,
......@@ -301,23 +341,24 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
const int cols,
const DropoutParam& dropout_param,
const float epsilon)
: FusedDropoutHelper<T, MaskType>(ctx, rows, cols, dropout_param) {
: FusedDropoutHelper<T, MaskType, InType, OutType>(
ctx, rows, cols, dropout_param) {
using U = LayerNormParamType<T>;
epsilon_ = epsilon;
}
// call layer_norm
void LayerNorm(const phi::GPUContext& ctx,
const T* src,
const InType* src,
const LayerNormParamType<T>* gamma,
const LayerNormParamType<T>* beta,
T* out,
OutType* out,
LayerNormParamType<T>* mean,
LayerNormParamType<T>* variance) {
using U = LayerNormParamType<T>;
switch (GetDesiredBlockDim(this->cols_)) {
FIXED_BLOCK_DIM_CASE(
LayerNormForward<T, U, kBlockDim>
LayerNormForward<T, U, kBlockDim, false, InType, OutType>
<<<this->rows_, kBlockDim, 0, ctx.stream()>>>(
src, gamma, beta, out, mean, variance, epsilon_, this->cols_));
}
......@@ -349,17 +390,25 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
// out = layernorm(residual + dropout(src + bias))
template <typename P = LayerNormParamType<T>, bool is_same_type = false>
void LayernormResidualDropoutBias(const phi::GPUContext& ctx,
const T* src,
void LayernormResidualDropoutBias(
const phi::GPUContext& ctx,
const InType* src,
const T* residual,
const T* bias,
const P* gamma,
const P* beta,
T* dropout_out,
MaskType* mask,
T* out,
OutType* out,
LayerNormParamType<T>* mean,
LayerNormParamType<T>* variance) {
LayerNormParamType<T>* variance,
const float quant_last_in_scale = 1.0,
const float* dequant_out_scale_data = nullptr,
const int quant_out_scale_offset = 0,
const float quant_next_in_scale = 1.0,
const int quant_round_type = 1,
const float quant_max_bound = 127.0,
const float quant_min_bound = -127.0) {
using U = LayerNormParamType<T>;
int vec_size = MAX_CACHE_BYTES / sizeof(T);
if (this->cols_ % vec_size != 0) {
......@@ -368,7 +417,12 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
int threads = GetDesiredBlockDim(this->cols_ / vec_size);
int increment = ((this->cols_ - 1) / (threads * vec_size) + 1) * vec_size;
increment = this->dropout_param_.UpdateSeedAndIncrement(ctx, increment);
LaunchLayernormResidualDropoutBias<T, MaskType, U, is_same_type>(
LaunchLayernormResidualDropoutBias<T,
MaskType,
U,
is_same_type,
InType,
OutType>(
this->rows_,
this->cols_,
increment,
......@@ -387,7 +441,14 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
out,
mean,
variance,
ctx);
ctx,
quant_last_in_scale,
dequant_out_scale_data,
quant_out_scale_offset,
quant_next_in_scale,
quant_round_type,
quant_max_bound,
quant_min_bound);
}
template <typename P = LayerNormParamType<T>, bool is_same_type = false>
......
......@@ -418,7 +418,9 @@ template <typename T,
int THREADS_PER_CTA = WARPS_M *THREADS_PER_ROW,
int ROWS_PER_CTA = WARPS_M,
int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize,
int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA>
int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA,
typename InType = T,
typename OutType = T>
__global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
int rows,
int cols,
......@@ -428,7 +430,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
const bool is_test,
const uint64_t increment,
const float epsilon,
const T *__restrict__ x_ptr,
const InType *__restrict__ x_ptr,
const T *__restrict__ residual_ptr,
const T *__restrict__ bias_ptr,
const ScaleT *__restrict__ gamma_ptr,
......@@ -437,10 +439,20 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
U *__restrict__ mean_out_ptr,
U *__restrict__ var_out_ptr,
T *__restrict__ residual_out_ptr,
T *__restrict__ y_ptr) {
OutType *__restrict__ y_ptr,
const float quant_last_in_scale = 1.0,
const float *__restrict__ quant_out_scale_ptr = nullptr,
const int quant_out_scale_offset = 0,
const float quant_next_in_scale = 1.0,
const int quant_round_type = 1,
const float quant_max_bound = 127.0,
const float quant_min_bound = -127.0) {
__shared__ U smem[WARPS_M * WARPS_N];
using Vec = phi::AlignedVector<T, VecSize>;
using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
using Vec_in_type = phi::AlignedVector<InType, VecSize>;
using Vec_out_type = phi::AlignedVector<OutType, VecSize>;
using Vec_float = phi::AlignedVector<float, VecSize>;
using MaskStoreT = phi::AlignedVector<MaskType, VecSize>;
const int tidx = threadIdx.x;
......@@ -481,12 +493,21 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
constexpr U rn = 1.f / U(ELTS_PER_ROW);
for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) {
Vec x[LDGS];
Vec_in_type x_input[LDGS];
Vec residual[LDGS];
Vec_float dequant_out_scale[LDGS];
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
phi::Load<T, VecSize>(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]);
phi::Load<T, VecSize>(residual_ptr + row * ELTS_PER_ROW + col * VecSize,
&residual[it]);
phi::Load<InType, VecSize>(x_ptr + row * ELTS_PER_ROW + col * VecSize,
&x_input[it]);
if (quant_out_scale_ptr != nullptr) {
phi::Load<float, VecSize>(
quant_out_scale_ptr + quant_out_scale_offset + col * VecSize,
&dequant_out_scale[it]);
}
col += THREADS_PER_ROW;
}
......@@ -520,20 +541,42 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
#pragma unroll
for (int jt = 0; jt < VecSize; jt++) {
// dropout(x) + residual
x[it][jt] = (x[it][jt] + bias[it][jt]) *
if (std::is_same<InType, int32_t>::value) {
T tmp = (static_cast<T>(static_cast<float>(x_input[it][jt]) *
quant_last_in_scale /
dequant_out_scale[it][jt]) +
bias[it][jt]) *
static_cast<T>(mask_vec[it][jt]) * factor +
residual[it][jt];
x[it][jt] = tmp;
xf[it * VecSize + jt] = U(tmp);
} else {
x[it][jt] = (static_cast<T>(x_input[it][jt]) + bias[it][jt]) *
static_cast<T>(mask_vec[it][jt]) * factor +
residual[it][jt];
xf[it * VecSize + jt] = U(x[it][jt]);
}
}
}
} else {
#pragma unroll
for (int it = 0; it < LDGS; it++) {
#pragma unroll
for (int jt = 0; jt < VecSize; jt++) {
// dropout(x) + residual
x[it][jt] = x[it][jt] * static_cast<T>(mask_vec[it][jt]) * factor +
if (std::is_same<InType, int32_t>::value) {
// for int32 input, we need to dequantize.
T tmp = static_cast<T>(static_cast<float>(x_input[it][jt]) *
quant_last_in_scale /
dequant_out_scale[it][jt]) *
static_cast<T>(mask_vec[it][jt]) * factor +
residual[it][jt];
x[it][jt] = tmp;
} else {
x[it][jt] = static_cast<T>(x_input[it][jt]) *
static_cast<T>(mask_vec[it][jt]) * factor +
residual[it][jt];
}
xf[it * VecSize + jt] = U(x[it][jt]);
}
}
......@@ -626,6 +669,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
var_out_ptr[row] = var_local * rn;
}
Vec_out_type x_output[LDGS];
#pragma unroll
for (int it = 0; it < LDGS; it++) {
#pragma unroll
......@@ -638,12 +683,26 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
U tmp = rsigma * (static_cast<U>(xf[it * VecSize + jt]) - mu_local);
x[it][jt] = static_cast<T>(static_cast<U>(gamma[it][jt]) * tmp +
static_cast<U>(beta[it][jt]));
if (std::is_same<OutType, int8_t>::value)
x_output[it][jt] = quant_helper(x[it][jt],
quant_next_in_scale,
quant_round_type,
quant_max_bound,
quant_min_bound);
}
}
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
phi::Store<T, VecSize>(x[it], y_ptr + row * ELTS_PER_ROW + col * VecSize);
if (std::is_same<OutType, int8_t>::value) {
phi::Store<OutType, VecSize>(
x_output[it], y_ptr + row * ELTS_PER_ROW + col * VecSize);
} else {
phi::Store<T, VecSize>(
x[it],
reinterpret_cast<T *>(y_ptr) + row * ELTS_PER_ROW + col * VecSize);
}
col += THREADS_PER_ROW;
}
}
......@@ -668,7 +727,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
template <typename T,
typename MaskType,
typename U,
bool ScaleBiasWithSameTypeX = false>
bool ScaleBiasWithSameTypeX = false,
typename InType = T,
typename OutType = T>
void LaunchLayernormResidualDropoutBias(
const uint32_t rows,
const uint32_t cols,
......@@ -678,18 +739,26 @@ void LaunchLayernormResidualDropoutBias(
const float epsilon,
const bool is_upscale_in_train,
const bool is_test,
const T *src,
const InType *src,
const T *residual,
const T *bias,
const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *scale,
const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *layernorm_bias,
MaskType *mask_data,
T *dst,
T *layernorm_dst,
OutType *layernorm_dst,
LayerNormParamType<T> *mean,
LayerNormParamType<T> *var,
const phi::GPUContext &ctx) {
const phi::GPUContext &ctx,
const float quant_last_in_scale = 1.0,
const float *dequant_out_scale_data = nullptr,
const int quant_out_scale_offset = 0,
const float quant_next_in_scale = 1.0,
const int quant_round_type = 1,
const float quant_max_bound = 127.0,
const float quant_min_bound = -127.0) {
// dropout_prob == 1.0f
// NOTE(minghaoBD): OutType should be T if drop_out_rate == 1.0
if (std::abs(dropout_prob - 1.0f) < 1e-5) {
auto cuda_place = ctx.GetPlace();
memory::Copy(cuda_place,
......@@ -705,10 +774,11 @@ void LaunchLayernormResidualDropoutBias(
switch (GetDesiredBlockDim(cols)) {
FIXED_BLOCK_DIM_CASE(
LayerNormForward<T, U, kBlockDim, ScaleBiasWithSameTypeX>
<<<rows, kBlockDim, 0, ctx.stream()>>>(dst,
<<<rows, kBlockDim, 0, ctx.stream()>>>(
dst,
scale,
layernorm_bias,
layernorm_dst,
reinterpret_cast<T *>(layernorm_dst),
mean,
var,
epsilon,
......@@ -731,6 +801,9 @@ void LaunchLayernormResidualDropoutBias(
const int VecSize = BYTES_PER_LDG / sizeof(T); \
const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \
const int ROWS_PER_CTA = WARPS_M; \
const int THREADS_PER_ROW = WARPS_N * THREADS_PER_WARP; \
const int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW * VecSize; \
const int LDGS = cols / ELTS_PER_ROW_PER_CTA; \
const int grid = \
static_cast<int>(std::ceil(rows / static_cast<float>(ROWS_PER_CTA))); \
fused_fast_ln_fwd_kernel< \
......@@ -742,7 +815,16 @@ void LaunchLayernormResidualDropoutBias(
WARPS_M, \
WARPS_N, \
BYTES_PER_LDG, \
cols><<<grid, THREADS_PER_CTA, 0, ctx.stream()>>>(rows, \
cols, \
THREADS_PER_WARP, \
THREADS_PER_ROW, \
THREADS_PER_CTA, \
ROWS_PER_CTA, \
ELTS_PER_ROW_PER_CTA, \
LDGS, \
InType, \
OutType> \
<<<grid, THREADS_PER_CTA, 0, ctx.stream()>>>(rows, \
cols, \
seed, \
dropout_prob, \
......@@ -759,7 +841,14 @@ void LaunchLayernormResidualDropoutBias(
mean, \
var, \
dst, \
layernorm_dst); \
layernorm_dst, \
quant_last_in_scale, \
dequant_out_scale_data, \
quant_out_scale_offset, \
quant_next_in_scale, \
quant_round_type, \
quant_max_bound, \
quant_min_bound); \
} break
#define LAUNCH_FUSED_FAST_LN_KERNEL \
......@@ -784,7 +873,8 @@ void LaunchLayernormResidualDropoutBias(
if (cols % VecSize != 0) {
int blockDim = GetDesiredBlockDim(cols);
FusedLayernormResidualDropoutBias<T, uint8_t, 1, U, ScaleBiasWithSameTypeX>
<<<rows, blockDim, 0, ctx.stream()>>>(rows,
<<<rows, blockDim, 0, ctx.stream()>>>(
rows,
cols,
seed,
dropout_prob,
......@@ -792,14 +882,14 @@ void LaunchLayernormResidualDropoutBias(
is_test,
increment,
epsilon,
src,
reinterpret_cast<const T *>(src),
residual,
bias,
scale,
layernorm_bias,
mask_data,
dst,
layernorm_dst,
reinterpret_cast<T *>(layernorm_dst),
mean,
var);
} else {
......@@ -819,7 +909,8 @@ void LaunchLayernormResidualDropoutBias(
VecSize,
U,
ScaleBiasWithSameTypeX>
<<<rows, blockDim, 0, ctx.stream()>>>(rows,
<<<rows, blockDim, 0, ctx.stream()>>>(
rows,
cols,
seed,
dropout_prob,
......@@ -827,14 +918,14 @@ void LaunchLayernormResidualDropoutBias(
is_test,
increment,
epsilon,
src,
reinterpret_cast<const T *>(src),
residual,
bias,
scale,
layernorm_bias,
mask_data,
dst,
layernorm_dst,
reinterpret_cast<T *>(layernorm_dst),
mean,
var);
}
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
/**
 * Operator definition of fused_multi_transformer_int8.
 *
 * InferShape validates the presence and shapes of the mandatory inputs
 * (X, QKVW, OutLinearW, FFN1Weight, FFN2Weight), the optional CacheKV /
 * TimeStep inputs, and sets the shape of Out to the shape of X.
 */
class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel {
 private:
  static constexpr const char *OpName = "FusedMultiTransformerINT8Op";

 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  /**
   * Checks inputs/outputs and infers the output shape.
   *
   * X is expected as [batch_size, seq_len, dim_embed]. The first QKVW tensor
   * is expected as [3, num_head, dim_head, dim_embed] when trans_qkvw is true
   * and as [dim_embed, 3, num_head, dim_head] otherwise.
   */
  void InferShape(framework::InferShapeContext *ctx) const override {
#define CHECK_INPUT(name) \
  OP_INOUT_CHECK(ctx->HasInput(#name), "Input", #name, OpName)
#define CHECK_INPUTS(name) \
  OP_INOUT_CHECK(ctx->HasInputs(#name), "Input", #name, OpName)
#define CHECK_OUTPUT(name) \
  OP_INOUT_CHECK(ctx->HasOutput(#name), "Output", #name, OpName)
#define CHECK_OUTPUTS(name) \
  OP_INOUT_CHECK(ctx->HasOutputs(#name), "Output", #name, OpName)

    CHECK_INPUT(X);

    // attention
    CHECK_INPUTS(QKVW);
    CHECK_INPUTS(OutLinearW);

    // A time step only makes sense together with a KV cache, and a KV cache
    // must have a matching output to be updated.
    if (ctx->HasInput("TimeStep")) {
      CHECK_INPUTS(CacheKV);
    }
    if (ctx->HasInputs("CacheKV")) {
      CHECK_OUTPUTS(CacheKVOut);
    }

    // ffn
    CHECK_INPUTS(FFN1Weight);
    CHECK_INPUTS(FFN2Weight);

    CHECK_OUTPUT(Out);

// The helper macros are local to InferShape; do not leak them into the rest
// of the translation unit.
#undef CHECK_INPUT
#undef CHECK_INPUTS
#undef CHECK_OUTPUT
#undef CHECK_OUTPUTS

    // x: qkv's input [batch_size, seq_len, dim_embed]
    // y: qkv's weight: [3, num_head, dim_head, dim_embed]
    auto x_dim = ctx->GetInputDim("X");
    auto y_dim = ctx->GetInputsDim("QKVW")[0];
    bool trans_qkvw = ctx->Attrs().Get<bool>("trans_qkvw");
    PADDLE_ENFORCE_EQ(
        x_dim.size(),
        3,
        platform::errors::InvalidArgument("The dimensions of x must be 3 "
                                          "(batch_size, seq_len, dim_embed), "
                                          "but received dimensions of "
                                          "Input is [%d]",
                                          x_dim.size()));
    PADDLE_ENFORCE_EQ(y_dim.size(),
                      4,
                      platform::errors::InvalidArgument(
                          "The dimensions of qkv_weight must be 4 "
                          "(3, num_head, dim_head, dim_embed), "
                          "but received dimensions of "
                          "Input is [%d]",
                          y_dim.size()));
    PADDLE_ENFORCE_EQ(
        x_dim[2],
        trans_qkvw ? y_dim[3] : y_dim[0],
        platform::errors::InvalidArgument(
            "ShapeError: the dimension of x_dim[2] and y_dim[3](trans_qkvw is "
            "true) or y_dim[0](trans_qkvw is false) "
            "must be equal. But received: the shape "
            "of input x = [%s], and the shape of "
            "input qkv_weight = [%s]",
            x_dim,
            y_dim));

    // The num_head * dim_head == dim_embed constraint is only enforced when
    // ring_id is -1.
    if (ctx->Attrs().Get<int>("ring_id") == -1) {
      if (trans_qkvw) {
        PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2],
                          y_dim[3],
                          platform::errors::InvalidArgument(
                              "The dimensions of qkv_weight must be 4 "
                              "(3, num_head, dim_head, dim_embed), "
                              "and must satisfy the limitations: "
                              "(num_head * dim_head == dim_embed)"));
      } else {
        PADDLE_ENFORCE_EQ(y_dim[2] * y_dim[3],
                          y_dim[0],
                          platform::errors::InvalidArgument(
                              "The dimensions of qkv_weight must be 4 "
                              "(dim_embed, 3, num_head, dim_head), "
                              "and must satisfy the limitations: "
                              "(num_head * dim_head == dim_embed)"));
      }
    }

    if (ctx->HasInputs("CacheKV")) {
      // [2, batch_size, num_head, max_seq_len, head_size]
      const auto &c_dims = ctx->GetInputsDim("CacheKV");
      const auto &c_dim = c_dims[0];

      PADDLE_ENFORCE_EQ(
          c_dim.size(),
          5,
          paddle::platform::errors::InvalidArgument(
              "The CacheKV must be 5 dims, but got %d", c_dim.size()));
      PADDLE_ENFORCE_EQ(c_dim[0],
                        2,
                        paddle::platform::errors::InvalidArgument(
                            "The first dim of CacheKV must be 2, but got %d",
                            c_dim[0]));  // 2
      PADDLE_ENFORCE_EQ(c_dim[1],
                        x_dim[0],
                        paddle::platform::errors::InvalidArgument(
                            "The second dim of CacheKV must be equal with "
                            "batch size %d, but got %d",
                            x_dim[0],
                            c_dim[1]));  // batch_size
      PADDLE_ENFORCE_EQ(c_dim[2],
                        trans_qkvw ? y_dim[1] : y_dim[2],
                        paddle::platform::errors::InvalidArgument(
                            "The third dim of CacheKV must be equal with num "
                            "head %d, but got %d",
                            trans_qkvw ? y_dim[1] : y_dim[2],
                            c_dim[2]));  // num_head
      PADDLE_ENFORCE_GT(
          c_dim[3],
          0,
          paddle::platform::errors::InvalidArgument(
              "The fourth dim of CacheKV must be greater than 0, but got %d",
              c_dim[3]));  // cache_seq_len
      PADDLE_ENFORCE_EQ(c_dim[4],
                        trans_qkvw ? y_dim[2] : y_dim[3],
                        paddle::platform::errors::InvalidArgument(
                            "The fifth dim of CacheKV must be equal with head "
                            "size %d, but got %d",
                            trans_qkvw ? y_dim[2] : y_dim[3],
                            c_dim[4]));  // head_size
    }

    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
  }

 protected:
  // The kernel is selected from the data type of X and the execution place.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
  }

  // TimeStep is returned with the expected kernel type unchanged (no
  // transform requested); every other variable keeps its own place and
  // layout and only adopts the expected data type.
  framework::OpKernelType GetKernelTypeForVar(
      const std::string &var_name,
      const Tensor &tensor,
      const framework::OpKernelType &expected_kernel_type) const override {
    if (var_name == "TimeStep") {
      VLOG(10) << "var_name:" << var_name << " need not to transform";
      return expected_kernel_type;
    }
    return framework::OpKernelType(
        expected_kernel_type.data_type_, tensor.place(), tensor.layout());
  }
};
/**
 * Declares the inputs, outputs and attributes of fused_multi_transformer_int8.
 *
 * Besides the tensors of the floating point fused multi transformer op, it
 * adds the per-GEMM dequantization scale inputs (*OutScale) and the
 * quantization attributes (*_in_scale, quant_round_type, quant_max_bound,
 * quant_min_bound).
 */
class FusedMultiTransformerINT8OpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "The input tensor.");

    // ---- attention inputs ----
    AddInput("LnScale",
             "Scale is a 1-dimensional tensor of size "
             "H. Here, H represents the last dimension of its input tensor.")
        .AsDuplicable();
    AddInput("LnBias",
             "Bias is a 1-dimensional tensor of size "
             "H. Here, H represents the last dimension of its input tensor.")
        .AsDuplicable();
    AddInput("QKVW", "The qkv weight tensor.").AsDuplicable();
    AddInput("QKVBias", "The qkv bias tensor.").AsDispensable().AsDuplicable();
    AddInput("CacheKV", "(optional) The cached KV for generation inference.")
        .AsDispensable()
        .AsDuplicable();
    AddInput("TimeStep",
             "(optional, int) The time step for generation inference.")
        .AsDispensable();
    AddInput("SrcMask", "(optional) The attention mask tensor in fmha.")
        .AsDispensable();
    AddInput("OutLinearW", "The out_linear weight tensor.").AsDuplicable();
    AddInput("OutLinearBias", "The out_linear bias tensor.")
        .AsDispensable()
        .AsDuplicable();

    // ---- feed-forward inputs ----
    AddInput("FFNLnScale", "The layer_norm scale of FusedFeedForward op")
        .AsDuplicable();
    AddInput("FFNLnBias", "The layer_norm bias of FusedFeedForward op")
        .AsDuplicable();
    AddInput("FFN1Weight", "The linear1 weight of FusedFeedForward op")
        .AsDuplicable();
    AddInput("FFN1Bias", "The linear1 bias of FusedFeedForward op")
        .AsDispensable()
        .AsDuplicable();
    AddInput("FFN2Weight", "The linear2 weight of FusedFeedForward op")
        .AsDuplicable();
    AddInput("FFN2Bias", "The linear2 bias input of FusedFeedForward op")
        .AsDispensable()
        .AsDuplicable();

    // ---- dequantization scales of the four GEMM outputs ----
    AddInput("QKVOutScale",
             "QKVOutScale is used to dequantize qkv output tensor. "
             "In order to keep consistent with the PTQ/QAT calculation logic, "
             "QKVOutScale should be max_bound * max_bound / max_range. "
             "Here max_range is per-channel weight scale. "
             "The shape of QKVOutScale is [num_layers, num_channels]")
        .AsDispensable();
    AddInput("OutLinearOutScale",
             "OutLinearOutScale is used to dequantize out_linear output "
             "tensor. "
             "The definition and shape is the same as QKVOutScale")
        .AsDispensable();
    AddInput("FFN1OutScale",
             "FFN1OutScale is used to dequantize ffn1 output tensor. "
             "The definition and shape is the same as QKVOutScale")
        .AsDispensable();
    AddInput("FFN2OutScale",
             "FFN2OutScale is used to dequantize ffn2 output tensor. "
             "The definition and shape is the same as QKVOutScale")
        .AsDispensable();

    // ---- outputs ----
    AddOutput("CacheKVOut", "The updated cache KV. Inplace with CacheKV")
        .AsDispensable()
        .AsDuplicable();
    AddOutput("Out", "Result after multi transformer layers.");

    // ---- attributes shared with the floating point op ----
    AddAttr<bool>("pre_layer_norm",
                  "if true, the attention op uses pre_layer_norm architecture, "
                  "else, uses post_layer_norm architecture. "
                  "[default true].")
        .SetDefault(true);
    AddAttr<float>("epsilon",
                   "Constant for numerical stability [default 1e-5].")
        .SetDefault(1e-5)
        .AddCustomChecker([](const float &epsilon) {
          PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f,
                            true,
                            platform::errors::InvalidArgument(
                                "'epsilon' in Op(LayerNorm) should be between "
                                "0.0 and 0.001, But received [%s].",
                                epsilon));
        });

    AddAttr<float>("dropout_rate", "Probability of setting units to zero.")
        .SetDefault(.5f)
        .AddCustomChecker([](const float &drop_p) {
          PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f,
                            true,
                            platform::errors::InvalidArgument(
                                "'dropout_rate' must be between 0.0 and 1.0."));
        });

    AddAttr<bool>("is_test",
                  "(bool, default false) Set to true for inference only, false "
                  "for training. Some layers may run faster when this is true.")
        .SetDefault(false);
    AddAttr<std::string>(
        "dropout_implementation",
        "[\"downgrade_in_infer\"|\"upscale_in_train\"] "
        "The meaning is the same as 'attn_dropout_implementation'.")
        .SetDefault("downgrade_in_infer")
        .AddCustomChecker([](const std::string &type) {
          PADDLE_ENFORCE_EQ(
              type == "downgrade_in_infer" || type == "upscale_in_train",
              true,
              platform::errors::InvalidArgument(
                  "dropout_implementation can only be downgrade_in_infer or "
                  "upscale_in_train"));
        });
    AddAttr<std::string>("act_method", "act_method").SetDefault("gelu");
    AddAttr<bool>(
        "trans_qkvw",
        "Whether the weights of qkv should be transposed. If true, "
        "the shape of weights of qkv should be "
        "[3, num_head, dim_head, dim_embed]. "
        "Otherwise the shape of weights of qkv should be "
        "[dim_embed, 3, num_head, dim_head]")
        .SetDefault(true);

    AddAttr<int>(
        "ring_id",
        "ring id for tensor model parallel. distributed training and inference")
        .SetDefault(-1);

    AddAttr<int>("num_head", "num_head").SetDefault(0);
    AddAttr<int>("dim_head", "dim_head").SetDefault(0);
    AddAttr<int>("dim_ffn", "dim_ffn").SetDefault(0);

    // ---- quantization attributes ----
    AddAttr<std::vector<float>>(
        "qkv_in_scale",
        "qkv_in_scale is used to quantize qkv input tensor. "
        "in_scale is generated by PTQ or QAT, which represents valid max range "
        "of this tensor. "
        "the size of qkv_in_scale should be num_layers, which is equal to "
        "the number of QKVW tensors")
        .SetDefault({});
    AddAttr<std::vector<float>>(
        "out_linear_in_scale",
        "out_linear_in_scale is used to quantize out_linear input tensor. "
        "the size of out_linear_in_scale is the same as qkv_in_scale")
        .SetDefault({});
    AddAttr<std::vector<float>>(
        "ffn1_in_scale",
        "ffn1_in_scale is used to quantize ffn1 input tensor. "
        "the size of ffn1_in_scale is the same as qkv_in_scale")
        .SetDefault({});
    AddAttr<std::vector<float>>(
        "ffn2_in_scale",
        "ffn2_in_scale is used to quantize ffn2 input tensor. "
        "the size of ffn2_in_scale is the same as qkv_in_scale")
        .SetDefault({});

    AddAttr<int>(
        "quant_round_type",
        "(int, default 1) The round type of fp32 to int. "
        "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2. "
        "1: rounding to nearest ties away from zero. Eg: round(1.5)=2, "
        "round(-2.5)=-3")
        .SetDefault(1);
    AddAttr<float>(
        "quant_max_bound",
        "(float, default 127.0) the max bound of float type to int type")
        .SetDefault(127.0);
    AddAttr<float>(
        "quant_min_bound",
        "(float, default -127.0) the min bound of float type to int type")
        .SetDefault(-127.0);

    AddComment(R"DOC(fused multi transformer layers op)DOC");
  }
};
} // namespace operators
} // namespace paddle
// Registers the fused_multi_transformer_int8 operator with its definition
// class and proto maker. Both grad-op maker slots (static-graph OpDesc and
// imperative OpBase) are filled with EmptyGradOpMaker, so no gradient op is
// attached to this operator.
namespace ops = paddle::operators;
REGISTER_OPERATOR(
fused_multi_transformer_int8,
ops::FusedMultiTransformerINT8Op,
ops::FusedMultiTransformerINT8OpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
......@@ -28,7 +28,9 @@ template <typename T,
int VecSize,
bool ComputeLayerNorm,
bool Activation,
typename Functor>
typename Functor,
typename InType = T,
typename OutType = T>
__forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
const int row_id,
const int col_id,
......@@ -36,30 +38,45 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
curandStatePhilox4_32_10_t *state,
const float dropout_prob,
const T factor,
const T *__restrict__ src,
const InType *__restrict__ src,
const T *__restrict__ residual,
const T *__restrict__ bias,
T *dst,
OutType *dst,
MaskType *mask,
const bool is_test,
typename details::MPTypeTrait<T>::Type *mean_val,
typename details::MPTypeTrait<T>::Type *var_val,
Functor act_func) {
Functor act_func,
const float quant_last_in_scale = 1.0,
const float *dequant_out_scale_data = nullptr,
const int quant_out_scale_offset = 0,
const float quant_next_in_scale = 1.0,
const int quant_round_type = 1,
const float quant_max_bound = 127.0,
const float quant_min_bound = -127.0) {
using LoadT = phi::AlignedVector<T, VecSize>;
using LoadInType = phi::AlignedVector<InType, VecSize>;
using LoadFloat = phi::AlignedVector<float, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using StoreOutType = phi::AlignedVector<OutType, VecSize>;
using MaskStoreT = phi::AlignedVector<MaskType, VecSize>;
using U = typename details::MPTypeTrait<T>::Type;
LoadT src_vec;
LoadInType src_vec;
LoadT residual_vec;
LoadT bias_vec;
LoadFloat quant_out_scale_vec;
#pragma unroll
for (int ii = 0; ii < VecSize; ii++) {
bias_vec[ii] = static_cast<T>(0);
residual_vec[ii] = static_cast<T>(0);
}
// vectorize load data from global
phi::Load<T, VecSize>(&src[row_id * cols + col_id], &src_vec);
phi::Load<InType, VecSize>(&src[row_id * cols + col_id], &src_vec);
phi::Load<float, VecSize>(
&dequant_out_scale_data[quant_out_scale_offset + col_id],
&quant_out_scale_vec);
if (residual) {
phi::Load<T, VecSize>(&residual[row_id * cols + col_id], &residual_vec);
}
......@@ -84,10 +101,18 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
}
StoreT dest_vec;
StoreOutType dest_vec_out_type;
#pragma unroll
for (int ii = 0; ii < VecSize; ii++) {
T tmp = src_vec[ii] + bias_vec[ii];
T tmp;
if (std::is_same<InType, int32_t>::value) {
T tmp0 = static_cast<T>(static_cast<float>(src_vec[ii]) *
quant_last_in_scale / quant_out_scale_vec[ii]);
tmp = tmp0 + bias_vec[ii];
} else {
tmp = static_cast<T>(src_vec[ii]) + bias_vec[ii];
}
if (Activation) {
tmp = act_func(tmp);
}
......@@ -98,10 +123,23 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
*mean_val += tmp;
*var_val += (tmp * tmp);
}
if (std::is_same<OutType, int8_t>::value) {
dest_vec_out_type[ii] = quant_helper(dest_vec[ii],
quant_next_in_scale,
quant_round_type,
quant_max_bound,
quant_min_bound);
}
}
// store result to global
phi::Store<T, VecSize>(dest_vec, &dst[row_id * cols + col_id]);
if (std::is_same<OutType, int8_t>::value) {
phi::Store<OutType, VecSize>(dest_vec_out_type,
&dst[row_id * cols + col_id]);
} else {
phi::Store<T, VecSize>(dest_vec,
reinterpret_cast<T *>(&dst[row_id * cols + col_id]));
}
if (!is_test) {
phi::Store<MaskType, VecSize>(mask_vec, &mask[row_id * cols + col_id]);
}
......@@ -114,19 +152,28 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
* is_test: only used in inference
* mask: can be null if is_test=true
*/
template <typename T, typename MaskType, int VecSize>
__global__ void FusedResidualDropoutBias(const size_t rows,
template <typename T,
typename MaskType,
int VecSize,
typename InType = T,
typename OutType = T>
__global__ void FusedResidualDropoutBias(
const size_t rows,
const size_t cols,
uint64_t seed,
const float dropout_prob,
const bool is_upscale_in_train,
const T *__restrict__ src,
const InType *__restrict__ src,
const T *__restrict__ residual,
const T *__restrict__ bias,
MaskType *mask,
T *dst,
OutType *dst,
uint64_t increment,
const bool is_test) {
const bool is_test,
const float quant_last_in_scale = 1.0,
const float *dequant_out_scale_data = nullptr,
const int quant_out_scale_offset = 0,
const float quant_next_in_scale = 1.0) {
int col_id = blockDim.x * blockIdx.x + threadIdx.x;
int row_id = blockIdx.y;
int idx = row_id * cols + col_id;
......@@ -142,8 +189,9 @@ __global__ void FusedResidualDropoutBias(const size_t rows,
VecSize,
false,
false,
phi::funcs::ReluFunctor<T>>(
r,
phi::funcs::ReluFunctor<T>,
InType,
OutType>(r,
i,
cols,
&state,
......@@ -157,7 +205,11 @@ __global__ void FusedResidualDropoutBias(const size_t rows,
is_test,
nullptr,
nullptr,
relu);
relu,
quant_last_in_scale,
dequant_out_scale_data,
quant_out_scale_offset,
quant_next_in_scale);
}
}
}
......@@ -165,7 +217,10 @@ __global__ void FusedResidualDropoutBias(const size_t rows,
/**
* @brief dst = residual + dropout(src + bias);
*/
template <typename T, typename MaskType>
template <typename T,
typename MaskType,
typename InType = T,
typename OutType = T>
void LaunchResidualDropoutBias(const uint32_t rows,
const uint32_t cols,
const int increment,
......@@ -173,14 +228,19 @@ void LaunchResidualDropoutBias(const uint32_t rows,
const float dropout_prob,
const bool is_test,
bool is_upscale_in_train,
const T *src,
const InType *src,
const T *residual,
const T *bias,
MaskType *mask_data,
T *dst,
const phi::GPUContext &ctx) {
OutType *dst,
const phi::GPUContext &ctx,
const float quant_last_in_scale = 1.0,
const float *dequant_out_scale_data = nullptr,
const int quant_out_scale_offset = 0,
const float quant_next_in_scale = 1.0) {
// dropout_prob == 1.0f
if (std::abs(dropout_prob - 1.0f) < 1e-5) {
// NOTE(minghaoBD): OutType should be T if dropout_prob == 1.0
if (residual == dst) return;
if (residual) {
memory::Copy(ctx.GetPlace(),
......@@ -202,7 +262,7 @@ void LaunchResidualDropoutBias(const uint32_t rows,
const int real_vec_size = cols % VecSize == 0 ? VecSize : 1;
auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size);
if (cols % VecSize == 0) {
FusedResidualDropoutBias<T, uint8_t, VecSize>
FusedResidualDropoutBias<T, uint8_t, VecSize, InType, OutType>
<<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
rows,
cols,
......@@ -215,9 +275,13 @@ void LaunchResidualDropoutBias(const uint32_t rows,
mask_data,
dst,
increment,
is_test);
is_test,
quant_last_in_scale,
dequant_out_scale_data,
quant_out_scale_offset,
quant_next_in_scale);
} else {
FusedResidualDropoutBias<T, uint8_t, 1>
FusedResidualDropoutBias<T, uint8_t, 1, InType, OutType>
<<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
rows,
cols,
......@@ -230,7 +294,11 @@ void LaunchResidualDropoutBias(const uint32_t rows,
mask_data,
dst,
increment,
is_test);
is_test,
quant_last_in_scale,
dequant_out_scale_data,
quant_out_scale_offset,
quant_next_in_scale);
}
}
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
/**
 * Quantizes one value to int8.
 *
 * The input is converted to float and multiplied by max_bound * inverse(scale),
 * rounded according to round_type (0: roundWithTiesToEven, anything else:
 * round), then clamped to [min_bound, max_bound] before the final cast.
 */
template <typename T>
__forceinline__ __device__ int8_t quant_helper(const T input,
                                               const float scale,
                                               const int round_type,
                                               const float max_bound,
                                               const float min_bound) {
  const float scaled = max_bound * inverse(scale) * static_cast<float>(input);

  float result = (round_type == 0)
                     ? static_cast<float>(roundWithTiesToEven(scaled))
                     : static_cast<float>(round(scaled));

  // Clamp: upper bound first, then lower bound.
  if (result > max_bound) {
    result = max_bound;
  }
  if (result < min_bound) {
    result = min_bound;
  }
  return static_cast<int8_t>(result);
}
/**
 * Quantizes an m x n row-major matrix to int8, four consecutive columns per
 * thread.
 *
 * Thread (x, y) handles row m_id = y and columns [n_id, n_id + 4) with
 * n_id = 4 * x. When all four columns are in range and the flat offset is a
 * multiple of 4, the four results are packed into one char4 and written with
 * a single store. Otherwise (only possible when n is not a multiple of 4)
 * the in-range columns are written one byte at a time, so no element past
 * the end of a row is read and no misaligned char4 is stored.
 *
 * @param input      source matrix, m * n elements
 * @param output     destination buffer of m * n int8 values, viewed as char4
 * @param scale      quantization scale forwarded to quant_helper
 * @param m          number of rows
 * @param n          number of columns
 * @param round_type rounding mode forwarded to quant_helper
 * @param max_bound  upper clamp forwarded to quant_helper
 * @param min_bound  lower clamp forwarded to quant_helper
 */
template <typename T>
__global__ void quantize_kernel(const T* input,
                                char4* output,
                                const float scale,
                                const int m,
                                const int n,
                                const int round_type,
                                const float max_bound,
                                const float min_bound) {
  const int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
  const int m_id = blockIdx.y * blockDim.y + threadIdx.y;
  if (m_id >= m || n_id >= n) {
    return;
  }

  const int base = m_id * n + n_id;
  if (n_id + 3 < n && (base & 3) == 0) {
    // Fast path: always taken when n is a multiple of 4.
    char4 packed;
    packed.x =
        quant_helper(input[base], scale, round_type, max_bound, min_bound);
    packed.y =
        quant_helper(input[base + 1], scale, round_type, max_bound, min_bound);
    packed.z =
        quant_helper(input[base + 2], scale, round_type, max_bound, min_bound);
    packed.w =
        quant_helper(input[base + 3], scale, round_type, max_bound, min_bound);
    output[base >> 2] = packed;
  } else {
    // Tail / unaligned path: byte-wise stores restricted to this row.
    int8_t* out_bytes = reinterpret_cast<int8_t*>(output);
    for (int i = 0; i < 4 && n_id + i < n; ++i) {
      out_bytes[base + i] = quant_helper(
          input[base + i], scale, round_type, max_bound, min_bound);
    }
  }
}
template <typename T>
void quantize_kernel_launcher(const T* input,
int8_t* output,
const float scale,
const int m,
const int n,
const int round_type,
const float max_bound,
const float min_bound,
gpuStream_t stream) {
// TODO(minghaoBD): optimize the kennel launch times when m==1 or n==1
dim3 grid((n + 31) / 32, (m + 31) / 32);
dim3 block(32, 32);
quantize_kernel<<<grid, block, 0, stream>>>(input,
(char4*)output, // NOLINT
scale,
m,
n,
round_type,
max_bound,
min_bound);
}
/**
 * Dequantizes int32 values using the input scale and per-column output scales.
 *
 * For row n_id and column m_id the result is
 *   input * quant_in_scale / dequant_out_scale_data[offset + m_id],
 * converted to T. Here m is the extent indexed by the x thread dimension
 * (the contiguous one) and n the extent indexed by the y thread dimension.
 */
template <typename T>
__global__ void dequantize_kernel(T* output,
                                  const int32_t* input,
                                  const int m,  // hidden
                                  const int n,  // batch size
                                  const float quant_in_scale,
                                  const float* dequant_out_scale_data,
                                  const int quant_out_scale_offset) {
  const int m_id = blockIdx.x * blockDim.x + threadIdx.x;  // hidden
  const int n_id = blockIdx.y * blockDim.y + threadIdx.y;  // batch size
  if (!(m_id < m && n_id < n)) {
    return;
  }

  const int flat_idx = n_id * m + m_id;
  const float col_scale =
      dequant_out_scale_data[quant_out_scale_offset + m_id];
  const float dequantized =
      static_cast<float>(input[flat_idx]) * quant_in_scale / col_scale;
  output[flat_idx] = static_cast<T>(dequantized);
}
template <typename T>
void dequantize_kernel_launcher(const int32_t* input,
T* output,
const int batch_size, // m
const int hidden_units, // n
gpuStream_t stream,
const float quant_in_scale,
const float* dequant_out_scale_data,
const int quant_out_scale_offset) {
dim3 grid((hidden_units + 31) / 32, (batch_size + 31) / 32);
dim3 block(32, 32);
dequantize_kernel<<<grid, block, 0, stream>>>(output,
input,
hidden_units,
batch_size,
quant_in_scale,
dequant_out_scale_data,
quant_out_scale_offset);
}
} // namespace operators
} // namespace paddle
......@@ -24,6 +24,7 @@ namespace cub = hipcub;
#include <iostream>
#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/core/ddim.h"
......@@ -338,16 +339,24 @@ using LayerNormScaleBiasT =
template <typename T,
typename U,
int BlockDim,
bool ScaleBiasWithSameTypeX = false>
bool ScaleBiasWithSameTypeX = false,
typename InType = T,
typename OutType = T>
__global__ void LayerNormForward(
const T *x,
const InType *x,
const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *scale,
const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *bias,
T *y,
OutType *y,
U *mean,
U *var,
float epsilon,
int64_t feature_size) {
int64_t feature_size,
const float *dequant_out_scale_data = nullptr,
const int quant_out_scale_offset = 0,
const float quant_in_scale = 1.0,
const int quant_round_type = 1,
const float quant_max_bound = 127.0,
const float quant_min_bound = -127.0) {
__shared__ U mean_share;
__shared__ U var_share;
__shared__ U shared_mean[32]; // threadIdx.x / warpSize <= kMaxBlockDim /
......@@ -387,28 +396,72 @@ __global__ void LayerNormForward(
if (bias != nullptr) {
for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>(static_cast<U>(scale[j]) *
if (std::is_same<OutType, int8_t>::value) {
y[i] = quant_helper(
static_cast<T>(static_cast<U>(scale[j]) *
(static_cast<U>(x[i]) - mean_val) * invvar +
static_cast<U>(bias[j])),
quant_in_scale,
quant_round_type,
quant_max_bound,
quant_min_bound);
} else {
y[i] = static_cast<OutType>(static_cast<U>(scale[j]) *
(static_cast<U>(x[i]) - mean_val) *
invvar +
static_cast<U>(bias[j]));
}
}
} else {
for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>(static_cast<U>(scale[j]) *
if (std::is_same<OutType, int8_t>::value) {
y[i] = quant_helper(
static_cast<T>(static_cast<U>(scale[j]) *
(static_cast<U>(x[i]) - mean_val) * invvar),
quant_in_scale,
quant_round_type,
quant_max_bound,
quant_min_bound);
} else {
y[i] =
static_cast<OutType>(static_cast<U>(scale[j]) *
(static_cast<U>(x[i]) - mean_val) * invvar);
}
}
}
} else { // scale == nullptr
if (bias != nullptr) {
for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar +
if (std::is_same<OutType, int8_t>::value) {
y[i] = quant_helper(
static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar +
static_cast<U>(bias[j])),
quant_in_scale,
quant_round_type,
quant_max_bound,
quant_min_bound);
} else {
y[i] =
static_cast<OutType>((static_cast<U>(x[i]) - mean_val) * invvar +
static_cast<U>(bias[j]));
}
}
} else {
for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar);
if (std::is_same<OutType, int8_t>::value) {
y[i] = quant_helper(
static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar),
quant_in_scale,
quant_round_type,
quant_max_bound,
quant_min_bound);
} else {
y[i] =
static_cast<OutType>((static_cast<U>(x[i]) - mean_val) * invvar);
}
}
}
}
......
......@@ -59,7 +59,9 @@ namespace dynload {
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
__macro(cublasLtMatrixTransformDescSetAttribute); \
__macro(cublasLtMatmulAlgoInit); \
__macro(cublasLtMatmulAlgoConfigSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif
......
......@@ -73,7 +73,9 @@ extern void *cublasLt_dso_handle;
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
__macro(cublasLtMatrixTransformDescSetAttribute); \
__macro(cublasLtMatmulAlgoInit); \
__macro(cublasLtMatmulAlgoConfigSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP)
// #endif
......
......@@ -326,7 +326,7 @@ void* GetCublasDsoHandle() {
void* GetCublasLtDsoHandle() {
// APIs available after CUDA 10.1
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10010
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so");
#else
std::string warning_msg(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册