Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
3d7e2118
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2299
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3d7e2118
编写于
9月 18, 2022
作者:
R
RichardWooSJTU
提交者:
GitHub
9月 18, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add INT8 support for fused_multi_transformer_op (#45284)
上级
7f346a76
变更
22
展开全部
显示空白变更内容
内联
并排
Showing
22 changed file
with
4168 addition
and
1428 deletion
+4168
-1428
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
...ence/analysis/passes/ir_params_sync_among_devices_pass.cc
+2
-1
paddle/fluid/operators/fused/CMakeLists.txt
paddle/fluid/operators/fused/CMakeLists.txt
+2
-0
paddle/fluid/operators/fused/attention_layer_norm.h
paddle/fluid/operators/fused/attention_layer_norm.h
+24
-6
paddle/fluid/operators/fused/attn_gemm_int8.h
paddle/fluid/operators/fused/attn_gemm_int8.h
+189
-0
paddle/fluid/operators/fused/cublaslt.h
paddle/fluid/operators/fused/cublaslt.h
+211
-0
paddle/fluid/operators/fused/fused_dropout_act_bias.h
paddle/fluid/operators/fused/fused_dropout_act_bias.h
+65
-24
paddle/fluid/operators/fused/fused_dropout_common.h
paddle/fluid/operators/fused/fused_dropout_common.h
+1
-0
paddle/fluid/operators/fused/fused_dropout_helper.h
paddle/fluid/operators/fused/fused_dropout_helper.h
+116
-55
paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
...d/operators/fused/fused_layernorm_residual_dropout_bias.h
+188
-97
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc
.../fluid/operators/fused/fused_multi_transformer_int8_op.cc
+369
-0
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
.../fluid/operators/fused/fused_multi_transformer_int8_op.cu
+670
-0
paddle/fluid/operators/fused/fused_multi_transformer_op.cu
paddle/fluid/operators/fused/fused_multi_transformer_op.cu
+5
-1147
paddle/fluid/operators/fused/fused_multi_transformer_op.h
paddle/fluid/operators/fused/fused_multi_transformer_op.h
+1161
-0
paddle/fluid/operators/fused/fused_residual_dropout_bias.h
paddle/fluid/operators/fused/fused_residual_dropout_bias.h
+113
-45
paddle/fluid/operators/fused/quant_dequant_kernel.h
paddle/fluid/operators/fused/quant_dequant_kernel.h
+136
-0
paddle/fluid/operators/layer_norm_kernel.cu.h
paddle/fluid/operators/layer_norm_kernel.cu.h
+65
-12
paddle/fluid/platform/dynload/cublasLt.h
paddle/fluid/platform/dynload/cublasLt.h
+22
-20
paddle/fluid/pybind/op_function_generator.h
paddle/fluid/pybind/op_function_generator.h
+8
-0
paddle/phi/backends/dynload/cublasLt.h
paddle/phi/backends/dynload/cublasLt.h
+22
-20
paddle/phi/backends/dynload/dynamic_loader.cc
paddle/phi/backends/dynload/dynamic_loader.cc
+1
-1
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+6
-0
python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py
...d/tests/unittests/test_fused_multi_transformer_int8_op.py
+792
-0
未找到文件。
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
浏览文件 @
3d7e2118
...
...
@@ -165,7 +165,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
auto
var_data_type
=
var_node
->
Var
()
->
GetDataType
();
VLOG
(
5
)
<<
"var_name is "
<<
var_name
<<
", data type is "
<<
var_data_type
;
if
(
var_data_type
==
paddle
::
framework
::
proto
::
VarType
::
FP16
)
{
if
(
var_data_type
==
paddle
::
framework
::
proto
::
VarType
::
FP16
&&
t
->
dtype
()
!=
paddle
::
experimental
::
DataType
::
FLOAT16
)
{
framework
::
Tensor
half_tensor
;
half_tensor
.
set_type
(
paddle
::
experimental
::
DataType
::
FLOAT16
);
half_tensor
.
Resize
(
t
->
dims
());
...
...
paddle/fluid/operators/fused/CMakeLists.txt
浏览文件 @
3d7e2118
...
...
@@ -23,6 +23,7 @@ register_operators(
fused_transformer_op
fused_feedforward_op
fused_multi_transformer_op
fused_multi_transformer_int8_op
fused_bias_dropout_residual_layer_norm_op
resnet_unit_op
fused_gemm_epilogue_op
...
...
@@ -119,6 +120,7 @@ if(WITH_GPU OR WITH_ROCM)
# fused_attention_op
op_library
(
fused_attention_op
)
op_library
(
fused_multi_transformer_op
)
op_library
(
fused_multi_transformer_int8_op
)
op_library
(
fused_bias_dropout_residual_layer_norm_op
)
endif
()
# resnet_unit needs cudnn 8.0 above
...
...
paddle/fluid/operators/fused/attention_layer_norm.h
浏览文件 @
3d7e2118
...
...
@@ -19,7 +19,8 @@ limitations under the License. */
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
// NOTE: T must be the same as OutType in ComputeBackward
template
<
typename
T
,
typename
InType
=
T
,
typename
OutType
=
T
>
class
AttnLayerNorm
{
public:
AttnLayerNorm
(
const
phi
::
GPUContext
&
dev_ctx
,
...
...
@@ -33,17 +34,28 @@ class AttnLayerNorm {
~
AttnLayerNorm
()
{}
void
ComputeForward
(
const
T
*
x_data
,
void
ComputeForward
(
const
InType
*
x_data
,
const
LayerNormParamType
<
T
>*
scale_data
,
const
LayerNormParamType
<
T
>*
bias_data
,
T
*
y_data
,
OutType
*
y_data
,
LayerNormParamType
<
T
>*
mean_data
,
LayerNormParamType
<
T
>*
var_data
)
{
LayerNormParamType
<
T
>*
var_data
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
auto
stream
=
dev_ctx_
.
stream
();
switch
(
GetDesiredBlockDim
(
feature_size_
))
{
FIXED_BLOCK_DIM_CASE
(
LayerNormForward
<
T
,
LayerNormParamType
<
T
>
,
kBlockDim
>
LayerNormForward
<
T
,
LayerNormParamType
<
T
>
,
kBlockDim
,
false
,
InType
,
OutType
>
<<<
batch_size_
,
kBlockDim
,
0
,
stream
>>>
(
x_data
,
scale_data
,
bias_data
,
...
...
@@ -51,7 +63,13 @@ class AttnLayerNorm {
mean_data
,
var_data
,
epsilon_
,
feature_size_
));
feature_size_
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
));
default:
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Feature_size must be larger than 1"
));
...
...
paddle/fluid/operators/fused/attn_gemm_int8.h
0 → 100644
浏览文件 @
3d7e2118
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <vector>
#include "paddle/fluid/operators/fused/cublaslt.h"
#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
/**
 * @brief INT8 GEMM helper used by the fused attention / transformer kernels.
 *
 * Every entry point runs an int8 x int8 -> int32 matmul through
 * CublasLtHelper. Depending on the entry point, the input is quantized from T
 * to int8 before the GEMM and/or the int32 result is dequantized back to T
 * (optionally followed by a broadcast bias add) after it.
 *
 * The quantize launcher is invoked with (m, k) and the dequantize launcher
 * with (m, n); presumably the input is m x k and the output m x n --
 * NOTE(review): confirm the expected weight layout against CublasLtHelper.
 *
 * @tparam T floating point type used on the non-quantized side.
 */
template <typename T>
class AttnMatmulINT8 {
 public:
  /**
   * @param dev_ctx       GPU context; only a reference is stored, so it must
   *                      outlive this object.
   * @param m, n, k       GEMM sizes, forwarded to CublasLtHelper as (m, k, n).
   * @param compute_bias  when true, entry points that produce T output also
   *                      compute bias_out = output + bias.
   */
  AttnMatmulINT8(
      const phi::GPUContext& dev_ctx, int m, int n, int k, bool compute_bias)
      : dev_ctx_(dev_ctx), m_(m), n_(n), k_(k), compute_bias_(compute_bias) {
    helpers_.emplace_back(std::make_shared<CublasLtHelper>(m, k, n));
  }

  ~AttnMatmulINT8() = default;

  // This function is used to execute GEMM, with input and output's types are
  // both T.
  //
  // Steps: quantize `input` (T) into `input_tmp` (int8), run the GEMM into
  // `output_tmp` (int32), dequantize into `output` (T) and, if compute_bias
  // was requested, write output + bias into `bias_out`.
  void ComputeForward(const framework::Tensor* weight,
                      const framework::Tensor* input,
                      framework::Tensor* input_tmp,
                      const framework::Tensor* bias,
                      framework::Tensor* output,
                      framework::Tensor* output_tmp,
                      framework::Tensor* bias_out,
                      const float quant_in_scale,
                      const framework::Tensor* dequant_out_scale,
                      const int quant_out_scale_offset,
                      const int quant_round_type = 1,
                      const float quant_max_bound = 127.0,
                      const float quant_min_bound = -127.0) {
    QuantizeInput(input,
                  input_tmp,
                  quant_in_scale,
                  quant_round_type,
                  quant_max_bound,
                  quant_min_bound);
    RunGemm(input_tmp, weight, output_tmp);
    DequantizeAndAddBias(output_tmp,
                         output,
                         bias,
                         bias_out,
                         quant_in_scale,
                         dequant_out_scale,
                         quant_out_scale_offset);
  }

  // This function is used to execute GEMM, with input and output's types are
  // both INT8.
  //
  // Only the GEMM is executed: `input` is read as int8 and `output` is
  // written as int32. `bias` and `bias_out` are accepted but not used here.
  void ComputeForwardINT8ToINT8(const framework::Tensor* weight,
                                framework::Tensor* input,
                                const framework::Tensor* bias,
                                framework::Tensor* output,
                                framework::Tensor* bias_out) {
    RunGemm(input, weight, output);
  }

  // This function is used to execute GEMM, with input and output's types are
  // INT8 and T.
  //
  // `input` is already int8; the int32 GEMM result lands in `output_tmp`, is
  // dequantized into `output` and, if compute_bias was requested,
  // output + bias is written into `bias_out`.
  void ComputeForwardINT8ToT(const framework::Tensor* weight,
                             const float quant_in_scale,
                             framework::Tensor* input,
                             const framework::Tensor* bias,
                             framework::Tensor* output,
                             framework::Tensor* output_tmp,
                             framework::Tensor* bias_out,
                             const framework::Tensor* dequant_out_scale,
                             const int quant_out_scale_offset) {
    RunGemm(input, weight, output_tmp);
    DequantizeAndAddBias(output_tmp,
                         output,
                         bias,
                         bias_out,
                         quant_in_scale,
                         dequant_out_scale,
                         quant_out_scale_offset);
  }

  // This function is used to execute GEMM, with input and output's types are T
  // and INT8.
  //
  // `input` (T) is quantized into `input_tmp` (int8) and the GEMM result is
  // written to `output` as int32. `bias` and `bias_out` are accepted but not
  // used here.
  void ComputeForwardTToINT8(const framework::Tensor* weight,
                             const float quant_in_scale,
                             const framework::Tensor* input,
                             framework::Tensor* input_tmp,
                             const framework::Tensor* bias,
                             framework::Tensor* output,
                             framework::Tensor* bias_out,
                             const int quant_round_type = 1,
                             const float quant_max_bound = 127.0,
                             const float quant_min_bound = -127.0) {
    QuantizeInput(input,
                  input_tmp,
                  quant_in_scale,
                  quant_round_type,
                  quant_max_bound,
                  quant_min_bound);
    RunGemm(input_tmp, weight, output);
  }

 private:
  // Quantizes `src` (T) into `dst` (int8) on the context's stream.
  void QuantizeInput(const framework::Tensor* src,
                     framework::Tensor* dst,
                     const float quant_in_scale,
                     const int quant_round_type,
                     const float quant_max_bound,
                     const float quant_min_bound) {
    quantize_kernel_launcher<T>(src->data<T>(),
                                dst->data<int8_t>(),
                                quant_in_scale,
                                m_,
                                k_,
                                quant_round_type,
                                quant_max_bound,
                                quant_min_bound,
                                dev_ctx_.stream());
  }

  // Runs the int8 x int8 -> int32 matmul on the context's stream.
  void RunGemm(framework::Tensor* int8_input,
               const framework::Tensor* weight,
               framework::Tensor* int32_output) {
    helpers_[0]->GEMM(int8_input->data<int8_t>(),
                      weight->data<int8_t>(),
                      int32_output->data<int32_t>(),
                      dev_ctx_.stream());
  }

  // Dequantizes the int32 GEMM result into `output` (T) and, when bias
  // computation is enabled, stores output + bias into `bias_out`.
  void DequantizeAndAddBias(framework::Tensor* int32_result,
                            framework::Tensor* output,
                            const framework::Tensor* bias,
                            framework::Tensor* bias_out,
                            const float quant_in_scale,
                            const framework::Tensor* dequant_out_scale,
                            const int quant_out_scale_offset) {
    dequantize_kernel_launcher<T>(int32_result->data<int32_t>(),
                                  output->data<T>(),
                                  m_,
                                  n_,
                                  dev_ctx_.stream(),
                                  quant_in_scale,
                                  dequant_out_scale->data<float>(),
                                  quant_out_scale_offset);

    if (!compute_bias_) {
      return;
    }

    // bias_out = output + bias
    std::vector<const framework::Tensor*> ins = {output, bias};
    std::vector<framework::Tensor*> outs = {bias_out};
    phi::funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
        dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor<T>());
    // cudaGetLastError() reports any pending error on the device, which is
    // why the message does not blame the bias kernel specifically.
    PADDLE_ENFORCE_EQ(cudaGetLastError(),
                      cudaSuccess,
                      platform::errors::Fatal(
                          "cuda error occured after computing bias. "
                          "But it does not mean this error is caused by "
                          "bias computing"));
  }

  const phi::GPUContext& dev_ctx_;

  int m_;  // m
  int n_;  // n
  int k_;  // k

  bool compute_bias_;
  std::vector<std::shared_ptr<CublasLtHelper>> helpers_;
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fused/cublaslt.h
0 → 100644
浏览文件 @
3d7e2118
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <sstream>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/dynload/cublasLt.h"
namespace
dyl
=
paddle
::
platform
::
dynload
;
namespace
paddle
{
namespace
operators
{
/**
 * @brief Owns the cuBLASLt handle and descriptors for one fixed-size
 *        int8 x int8 -> int32 matmul and runs it on demand.
 *
 * The descriptors are created once in the constructor:
 *   - A_desc_: CUDA_R_8I,  rows = k, cols = m, leading dimension = k
 *   - B_desc_: CUDA_R_8I,  rows = k, cols = n, leading dimension = k
 *   - C_desc_: CUDA_R_32I, rows = n, cols = m, leading dimension = n
 * and the matmul descriptor sets CUBLASLT_MATMUL_DESC_TRANSA to CUBLAS_OP_T.
 * GEMM() passes B in cuBLASLt's first operand slot and A in the second.
 * NOTE(review): this presumably yields a row-major (m x n) result from a
 * row-major (m x k) input and an (n x k) weight -- confirm against callers.
 *
 * The object owns raw cuBLASLt resources, so it is not copyable.
 */
class CublasLtHelper {
 public:
  /**
   * @param m, k, n  GEMM sizes used to build the matrix layouts (see above).
   *
   * Raises through PADDLE_ENFORCE_EQ if any cuBLASLt call fails; resources
   * created before the failure are released before the error propagates.
   */
  CublasLtHelper(int m, int k, int n)
      : alpha_(1), beta_(0), m_(m), k_(k), n_(n) {
    try {
      CreateResources(m, k, n);
    } catch (...) {
      // The destructor does not run for a partially constructed object, so
      // whatever was created so far has to be released here.
      Release();
      throw;
    }
  }

  // Copying would make two objects destroy the same handle/descriptors.
  CublasLtHelper(const CublasLtHelper&) = delete;
  CublasLtHelper& operator=(const CublasLtHelper&) = delete;

  ~CublasLtHelper() { Release(); }

  /**
   * @brief Enqueues C = alpha * op(B) * A + beta * C on `stream`, with
   *        alpha = 1 and beta = 0 (int32 scalars).
   *
   * @param A_dev   int8 device buffer described by A_desc_ (read only).
   * @param B_dev   int8 device buffer described by B_desc_ (read only).
   * @param C_dev   int32 device buffer described by C_desc_ (written).
   * @param stream  CUDA stream the matmul is issued on.
   *
   * No workspace is supplied (nullptr / 0 bytes).
   */
  void GEMM(const int8_t* A_dev,
            const int8_t* B_dev,
            int32_t* C_dev,
            cudaStream_t stream) {
    cublasStatus_t status;

    // NOTE(review): __CUDA_ARCH__ is only defined by nvcc while compiling
    // device code, so in this host function the condition is presumably
    // never true and the hand-picked algorithm below is never used (the
    // nullptr branch lets cuBLASLt choose). Confirm the intent before
    // changing the guard: algo id / tile / stages are hardware specific.
#if __CUDA_ARCH__ >= 800 && CUDA_VERSION >= 11020
    cublasLtMatmulAlgo_t algo;
    int algoId = 21;
    int swizzle = 0;
    int customOption = 0;
    int tile = 15;
    int splitK_val = 0;
    int reductionScheme = 0;
#if CUDA_VERSION >= 11000
    int stages = 23;
#endif

#if CUBLAS_VER_MAJOR < 11
    cudaDataType_t cudaComputeType = CUDA_R_32I;
#else
    cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I;
#endif

    dyl::cublasLtMatmulAlgoInit(handle_,
                                cudaComputeType,
                                CUDA_R_32I,
                                CUDA_R_8I,
                                CUDA_R_8I,
                                CUDA_R_32I,
                                CUDA_R_32I,
                                algoId,
                                &algo);
    dyl::cublasLtMatmulAlgoConfigSetAttribute(
        &algo,
        CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION,
        &(customOption),
        sizeof(customOption));
    dyl::cublasLtMatmulAlgoConfigSetAttribute(
        &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile));
    dyl::cublasLtMatmulAlgoConfigSetAttribute(&algo,
                                              CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
                                              &(splitK_val),
                                              sizeof(splitK_val));
    dyl::cublasLtMatmulAlgoConfigSetAttribute(
        &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
    dyl::cublasLtMatmulAlgoConfigSetAttribute(
        &algo,
        CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
        &(reductionScheme),
        sizeof(reductionScheme));
#if CUDA_VERSION >= 11000
    dyl::cublasLtMatmulAlgoConfigSetAttribute(
        &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
#endif
#endif

    // B goes into the first operand slot (transposed via TRANSA), A into the
    // second; C is used both as the accumulator input and as the output.
    status = dyl::cublasLtMatmul(handle_,
                                 matmul_desc_,
                                 &alpha_,
                                 B_dev,
                                 B_desc_,
                                 A_dev,
                                 A_desc_,
                                 &beta_,
                                 C_dev,
                                 C_desc_,
                                 C_dev,
                                 C_desc_,
#if __CUDA_ARCH__ >= 800 && CUDA_VERSION >= 11020
                                 &algo,
#else
                                 nullptr,
#endif
                                 nullptr,
                                 0,
                                 stream);
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatmul execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));
  }

 private:
  // Creates the handle, the matmul descriptor and the three matrix layouts.
  // May throw through PADDLE_ENFORCE_EQ; the constructor handles the cleanup.
  void CreateResources(int m, int k, int n) {
    cublasStatus_t status;

    // handle and matmul desc
    status = dyl::cublasLtCreate(&handle_);
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtCreate execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));

    // cuBLAS < 11 takes the compute type as a cudaDataType_t and has no
    // separate scale-type argument.
#if CUBLAS_VER_MAJOR < 11
    cudaDataType_t cudaComputeType = CUDA_R_32I;
    status = dyl::cublasLtMatmulDescCreate(&matmul_desc_, cudaComputeType);
#else
    cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I;
    status = dyl::cublasLtMatmulDescCreate(
        &matmul_desc_, cudaComputeType, CUDA_R_32I);
#endif
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatmulDescCreate execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));

    cublasOperation_t op_transpose = CUBLAS_OP_T;
    status = dyl::cublasLtMatmulDescSetAttribute(matmul_desc_,
                                                 CUBLASLT_MATMUL_DESC_TRANSA,
                                                 &op_transpose,
                                                 sizeof(op_transpose));
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatmulDescSetAttribute execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));

    // matrix desc
    status = dyl::cublasLtMatrixLayoutCreate(&B_desc_, CUDA_R_8I, k, n, k);
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatrixLayoutCreate execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));

    status = dyl::cublasLtMatrixLayoutCreate(&A_desc_, CUDA_R_8I, k, m, k);
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatrixLayoutCreate execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));

    status = dyl::cublasLtMatrixLayoutCreate(&C_desc_, CUDA_R_32I, n, m, n);
    PADDLE_ENFORCE_EQ(
        status,
        CUBLAS_STATUS_SUCCESS,
        platform::errors::External(
            "cublasLtMatrixLayoutCreate execution error, "
            "refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
            "information"));
  }

  // Destroys whatever has been created so far and resets the members so a
  // second call is a no-op.
  void Release() {
    if (handle_) {
      dyl::cublasLtDestroy(handle_);
      handle_ = nullptr;
    }
    if (matmul_desc_) {
      dyl::cublasLtMatmulDescDestroy(matmul_desc_);
      matmul_desc_ = nullptr;
    }
    if (A_desc_) {
      dyl::cublasLtMatrixLayoutDestroy(A_desc_);
      A_desc_ = nullptr;
    }
    if (B_desc_) {
      dyl::cublasLtMatrixLayoutDestroy(B_desc_);
      B_desc_ = nullptr;
    }
    if (C_desc_) {
      dyl::cublasLtMatrixLayoutDestroy(C_desc_);
      C_desc_ = nullptr;
    }
  }

  // Null until created, so Release() can tell what needs destroying.
  cublasLtHandle_t handle_ = nullptr;
  cublasLtMatmulDesc_t matmul_desc_ = nullptr;
  cublasLtMatrixLayout_t A_desc_ = nullptr;
  cublasLtMatrixLayout_t B_desc_ = nullptr;
  cublasLtMatrixLayout_t C_desc_ = nullptr;

  // Scalars passed to cublasLtMatmul (int32 to match the 32I scale type).
  int32_t alpha_;
  int32_t beta_;

  int m_;
  int k_;
  int n_;
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fused/fused_dropout_act_bias.h
浏览文件 @
3d7e2118
...
...
@@ -60,8 +60,14 @@ struct GeluGradFunctor {
* the src, mask and dst shape is (rows, cols)
* the bias shape is (1, cols)
*/
template
<
typename
T
,
typename
MaskType
,
int
VecSize
,
typename
Functor
>
__global__
void
FusedDropoutActBias
(
Functor
act
,
template
<
typename
T
,
typename
MaskType
,
int
VecSize
,
typename
Functor
,
typename
InType
=
T
,
typename
OutType
=
T
>
__global__
void
FusedDropoutActBias
(
Functor
act
,
const
uint64_t
seed
,
const
uint64_t
rows
,
const
uint64_t
cols
,
...
...
@@ -69,10 +75,17 @@ __global__ void FusedDropoutActBias(Functor act,
const
float
dropout_prob
,
const
bool
is_upscale_in_train
,
const
bool
is_test
,
const
T
*
__restrict__
src
,
const
InType
*
__restrict__
src
,
const
T
*
__restrict__
bias
,
T
*
dst
,
MaskType
*
mask
)
{
OutType
*
dst
,
MaskType
*
mask
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
int
col_id
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
row_id
=
blockIdx
.
y
;
int
idx
=
row_id
*
cols
+
col_id
;
...
...
@@ -90,7 +103,9 @@ __global__ void FusedDropoutActBias(Functor act,
VecSize
,
false
,
true
,
Functor
>
(
r
,
Functor
,
InType
,
OutType
>
(
r
,
i
,
cols
,
&
state
,
...
...
@@ -104,7 +119,14 @@ __global__ void FusedDropoutActBias(Functor act,
is_test
,
nullptr
,
nullptr
,
act
);
act
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
}
}
...
...
@@ -112,7 +134,11 @@ __global__ void FusedDropoutActBias(Functor act,
/**
* @brief dst = dropout(activation(src + bias));
*/
template
<
typename
T
,
typename
MaskType
,
typename
Functor
>
template
<
typename
T
,
typename
MaskType
,
typename
Functor
,
typename
InType
=
T
,
typename
OutType
=
T
>
void
LaunchDropoutActBias
(
Functor
act_functor
,
const
uint64_t
seed
,
const
uint32_t
rows
,
...
...
@@ -121,14 +147,21 @@ void LaunchDropoutActBias(Functor act_functor,
const
float
dropout_prob
,
const
bool
is_upscale_in_train
,
const
bool
is_test
,
const
T
*
src
,
const
InType
*
src
,
const
T
*
bias
,
T
*
dst
,
OutType
*
dst
,
MaskType
*
mask_data
,
const
phi
::
GPUContext
&
ctx
)
{
const
phi
::
GPUContext
&
ctx
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
// dropout_prob == 1.0f
if
(
std
::
abs
(
dropout_prob
-
1.0
f
)
<
1e-5
)
{
SetZero
<
T
>
(
ctx
,
dst
,
rows
*
cols
);
SetZero
<
T
>
(
ctx
,
reinterpret_cast
<
T
*>
(
dst
)
,
rows
*
cols
);
SetZero
<
MaskType
>
(
ctx
,
mask_data
,
rows
*
cols
);
return
;
}
...
...
@@ -137,7 +170,7 @@ void LaunchDropoutActBias(Functor act_functor,
const
int
real_vec_size
=
cols
%
VecSize
==
0
?
VecSize
:
1
;
const
auto
config
=
Get1DBlocksAnd2DGrids
(
ctx
,
rows
,
cols
,
real_vec_size
);
if
(
cols
%
VecSize
==
0
)
{
FusedDropoutActBias
<
T
,
MaskType
,
VecSize
,
Functor
>
FusedDropoutActBias
<
T
,
MaskType
,
VecSize
,
Functor
,
InType
,
OutType
>
<<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
ctx
.
stream
()
>>>
(
act_functor
,
seed
,
...
...
@@ -150,9 +183,13 @@ void LaunchDropoutActBias(Functor act_functor,
src
,
bias
,
dst
,
mask_data
);
mask_data
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
else
{
FusedDropoutActBias
<
T
,
MaskType
,
1
,
Functor
>
FusedDropoutActBias
<
T
,
MaskType
,
1
,
Functor
,
InType
,
OutType
>
<<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
ctx
.
stream
()
>>>
(
act_functor
,
seed
,
...
...
@@ -165,7 +202,11 @@ void LaunchDropoutActBias(Functor act_functor,
src
,
bias
,
dst
,
mask_data
);
mask_data
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
}
...
...
paddle/fluid/operators/fused/fused_dropout_common.h
浏览文件 @
3d7e2118
...
...
@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
...
...
paddle/fluid/operators/fused/fused_dropout_helper.h
浏览文件 @
3d7e2118
...
...
@@ -109,7 +109,10 @@ struct DropoutParam {
}
};
template
<
typename
T
,
typename
MaskType
>
template
<
typename
T
,
typename
MaskType
,
typename
InType
=
T
,
typename
OutType
=
T
>
class
FusedDropoutHelper
{
private:
int
GetIncrement
(
const
phi
::
GPUContext
&
ctx
)
{
...
...
@@ -140,13 +143,18 @@ class FusedDropoutHelper {
// out = residual + dropout( src + bias )
void
ResidualDropoutBias
(
const
phi
::
GPUContext
&
ctx
,
const
T
*
src
,
const
InType
*
src
,
const
T
*
residual
,
const
T
*
bias
,
T
*
out
,
MaskType
*
mask
)
{
OutType
*
out
,
MaskType
*
mask
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
)
{
auto
increment
=
GetIncrement
(
ctx
);
LaunchResidualDropoutBias
<
T
,
MaskType
>
(
rows_
,
LaunchResidualDropoutBias
<
T
,
MaskType
,
InType
,
OutType
>
(
rows_
,
cols_
,
increment
,
dropout_param_
.
seed
,
...
...
@@ -158,7 +166,11 @@ class FusedDropoutHelper {
bias
,
mask
,
out
,
ctx
);
ctx
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
void
ResidualDropoutBiasGrad
(
const
phi
::
GPUContext
&
ctx
,
...
...
@@ -189,15 +201,22 @@ class FusedDropoutHelper {
// out = dropout(activation(src + bias))
void
DropoutActBias
(
const
phi
::
GPUContext
&
ctx
,
const
T
*
src
,
const
InType
*
src
,
const
T
*
bias
,
const
std
::
string
&
act_method
,
T
*
out
,
MaskType
*
mask
)
{
OutType
*
out
,
MaskType
*
mask
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
auto
increment
=
GetIncrement
(
ctx
);
if
(
act_method
==
"gelu"
)
{
GeluFunctor
<
T
>
gelu
;
LaunchDropoutActBias
<
T
,
MaskType
,
GeluFunctor
<
T
>>
(
LaunchDropoutActBias
<
T
,
MaskType
,
GeluFunctor
<
T
>
,
InType
,
OutType
>
(
gelu
,
dropout_param_
.
seed
,
rows_
,
...
...
@@ -210,11 +229,21 @@ class FusedDropoutHelper {
bias
,
out
,
mask
,
ctx
);
ctx
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
else
if
(
act_method
==
"relu"
)
{
phi
::
funcs
::
ReluFunctor
<
T
>
relu
;
LaunchDropoutActBias
<
T
,
MaskType
,
phi
::
funcs
::
ReluFunctor
<
T
>>
(
relu
,
LaunchDropoutActBias
<
T
,
MaskType
,
phi
::
funcs
::
ReluFunctor
<
T
>
,
InType
,
OutType
>
(
relu
,
dropout_param_
.
seed
,
rows_
,
cols_
,
...
...
@@ -226,7 +255,14 @@ class FusedDropoutHelper {
bias
,
out
,
mask
,
ctx
);
ctx
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Currently only supports gelu or relu activation functions!"
));
...
...
@@ -283,8 +319,12 @@ class FusedDropoutHelper {
DropoutParam
dropout_param_
;
};
template
<
typename
T
,
typename
MaskType
>
class
FusedDropoutLayerNormHelper
:
public
FusedDropoutHelper
<
T
,
MaskType
>
{
template
<
typename
T
,
typename
MaskType
,
typename
InType
=
T
,
typename
OutType
=
T
>
class
FusedDropoutLayerNormHelper
:
public
FusedDropoutHelper
<
T
,
MaskType
,
InType
,
OutType
>
{
public:
FusedDropoutLayerNormHelper
()
{}
FusedDropoutLayerNormHelper
(
const
int
rows
,
...
...
@@ -301,23 +341,24 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
const
int
cols
,
const
DropoutParam
&
dropout_param
,
const
float
epsilon
)
:
FusedDropoutHelper
<
T
,
MaskType
>
(
ctx
,
rows
,
cols
,
dropout_param
)
{
:
FusedDropoutHelper
<
T
,
MaskType
,
InType
,
OutType
>
(
ctx
,
rows
,
cols
,
dropout_param
)
{
using
U
=
LayerNormParamType
<
T
>
;
epsilon_
=
epsilon
;
}
// call layer_norm
void
LayerNorm
(
const
phi
::
GPUContext
&
ctx
,
const
T
*
src
,
const
InType
*
src
,
const
LayerNormParamType
<
T
>*
gamma
,
const
LayerNormParamType
<
T
>*
beta
,
T
*
out
,
OutType
*
out
,
LayerNormParamType
<
T
>*
mean
,
LayerNormParamType
<
T
>*
variance
)
{
using
U
=
LayerNormParamType
<
T
>
;
switch
(
GetDesiredBlockDim
(
this
->
cols_
))
{
FIXED_BLOCK_DIM_CASE
(
LayerNormForward
<
T
,
U
,
kBlockDim
>
LayerNormForward
<
T
,
U
,
kBlockDim
,
false
,
InType
,
OutType
>
<<<
this
->
rows_
,
kBlockDim
,
0
,
ctx
.
stream
()
>>>
(
src
,
gamma
,
beta
,
out
,
mean
,
variance
,
epsilon_
,
this
->
cols_
));
}
...
...
@@ -349,17 +390,25 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
// out = layernorm(residual + dropout(src + bias))
template
<
typename
P
=
LayerNormParamType
<
T
>,
bool
is_same_type
=
false
>
void
LayernormResidualDropoutBias
(
const
phi
::
GPUContext
&
ctx
,
const
T
*
src
,
void
LayernormResidualDropoutBias
(
const
phi
::
GPUContext
&
ctx
,
const
InType
*
src
,
const
T
*
residual
,
const
T
*
bias
,
const
P
*
gamma
,
const
P
*
beta
,
T
*
dropout_out
,
MaskType
*
mask
,
T
*
out
,
OutType
*
out
,
LayerNormParamType
<
T
>*
mean
,
LayerNormParamType
<
T
>*
variance
)
{
LayerNormParamType
<
T
>*
variance
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
using
U
=
LayerNormParamType
<
T
>
;
int
vec_size
=
MAX_CACHE_BYTES
/
sizeof
(
T
);
if
(
this
->
cols_
%
vec_size
!=
0
)
{
...
...
@@ -368,7 +417,12 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
int
threads
=
GetDesiredBlockDim
(
this
->
cols_
/
vec_size
);
int
increment
=
((
this
->
cols_
-
1
)
/
(
threads
*
vec_size
)
+
1
)
*
vec_size
;
increment
=
this
->
dropout_param_
.
UpdateSeedAndIncrement
(
ctx
,
increment
);
LaunchLayernormResidualDropoutBias
<
T
,
MaskType
,
U
,
is_same_type
>
(
LaunchLayernormResidualDropoutBias
<
T
,
MaskType
,
U
,
is_same_type
,
InType
,
OutType
>
(
this
->
rows_
,
this
->
cols_
,
increment
,
...
...
@@ -387,7 +441,14 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
out
,
mean
,
variance
,
ctx
);
ctx
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
template
<
typename
P
=
LayerNormParamType
<
T
>,
bool
is_same_type
=
false
>
...
...
paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
浏览文件 @
3d7e2118
...
...
@@ -418,7 +418,9 @@ template <typename T,
int
THREADS_PER_CTA
=
WARPS_M
*
THREADS_PER_ROW
,
int
ROWS_PER_CTA
=
WARPS_M
,
int
ELTS_PER_ROW_PER_CTA
=
THREADS_PER_ROW
*
VecSize
,
int
LDGS
=
ELTS_PER_ROW
/
ELTS_PER_ROW_PER_CTA
>
int
LDGS
=
ELTS_PER_ROW
/
ELTS_PER_ROW_PER_CTA
,
typename
InType
=
T
,
typename
OutType
=
T
>
__global__
__launch_bounds__
(
THREADS_PER_CTA
)
void
fused_fast_ln_fwd_kernel
(
int
rows
,
int
cols
,
...
...
@@ -428,7 +430,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
const
bool
is_test
,
const
uint64_t
increment
,
const
float
epsilon
,
const
T
*
__restrict__
x_ptr
,
const
InType
*
__restrict__
x_ptr
,
const
T
*
__restrict__
residual_ptr
,
const
T
*
__restrict__
bias_ptr
,
const
ScaleT
*
__restrict__
gamma_ptr
,
...
...
@@ -437,10 +439,20 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
U
*
__restrict__
mean_out_ptr
,
U
*
__restrict__
var_out_ptr
,
T
*
__restrict__
residual_out_ptr
,
T
*
__restrict__
y_ptr
)
{
OutType
*
__restrict__
y_ptr
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
__restrict__
quant_out_scale_ptr
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
__shared__
U
smem
[
WARPS_M
*
WARPS_N
];
using
Vec
=
phi
::
AlignedVector
<
T
,
VecSize
>
;
using
Vec_scale
=
phi
::
AlignedVector
<
ScaleT
,
VecSize
>
;
using
Vec_in_type
=
phi
::
AlignedVector
<
InType
,
VecSize
>
;
using
Vec_out_type
=
phi
::
AlignedVector
<
OutType
,
VecSize
>
;
using
Vec_float
=
phi
::
AlignedVector
<
float
,
VecSize
>
;
using
MaskStoreT
=
phi
::
AlignedVector
<
MaskType
,
VecSize
>
;
const
int
tidx
=
threadIdx
.
x
;
...
...
@@ -481,12 +493,21 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
constexpr
U
rn
=
1.
f
/
U
(
ELTS_PER_ROW
);
for
(
int
row
=
r
;
row
<
rows
;
row
+=
gridDim
.
x
*
ROWS_PER_CTA
)
{
Vec
x
[
LDGS
];
Vec_in_type
x_input
[
LDGS
];
Vec
residual
[
LDGS
];
Vec_float
dequant_out_scale
[
LDGS
];
#pragma unroll
for
(
int
it
=
0
,
col
=
c
;
it
<
LDGS
;
it
++
)
{
phi
::
Load
<
T
,
VecSize
>
(
x_ptr
+
row
*
ELTS_PER_ROW
+
col
*
VecSize
,
&
x
[
it
]);
phi
::
Load
<
T
,
VecSize
>
(
residual_ptr
+
row
*
ELTS_PER_ROW
+
col
*
VecSize
,
&
residual
[
it
]);
phi
::
Load
<
InType
,
VecSize
>
(
x_ptr
+
row
*
ELTS_PER_ROW
+
col
*
VecSize
,
&
x_input
[
it
]);
if
(
quant_out_scale_ptr
!=
nullptr
)
{
phi
::
Load
<
float
,
VecSize
>
(
quant_out_scale_ptr
+
quant_out_scale_offset
+
col
*
VecSize
,
&
dequant_out_scale
[
it
]);
}
col
+=
THREADS_PER_ROW
;
}
...
...
@@ -520,20 +541,42 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
#pragma unroll
for
(
int
jt
=
0
;
jt
<
VecSize
;
jt
++
)
{
// dropout(x) + residual
x
[
it
][
jt
]
=
(
x
[
it
][
jt
]
+
bias
[
it
][
jt
])
*
if
(
std
::
is_same
<
InType
,
int32_t
>::
value
)
{
T
tmp
=
(
static_cast
<
T
>
(
static_cast
<
float
>
(
x_input
[
it
][
jt
])
*
quant_last_in_scale
/
dequant_out_scale
[
it
][
jt
])
+
bias
[
it
][
jt
])
*
static_cast
<
T
>
(
mask_vec
[
it
][
jt
])
*
factor
+
residual
[
it
][
jt
];
x
[
it
][
jt
]
=
tmp
;
xf
[
it
*
VecSize
+
jt
]
=
U
(
tmp
);
}
else
{
x
[
it
][
jt
]
=
(
static_cast
<
T
>
(
x_input
[
it
][
jt
])
+
bias
[
it
][
jt
])
*
static_cast
<
T
>
(
mask_vec
[
it
][
jt
])
*
factor
+
residual
[
it
][
jt
];
xf
[
it
*
VecSize
+
jt
]
=
U
(
x
[
it
][
jt
]);
}
}
}
}
else
{
#pragma unroll
for
(
int
it
=
0
;
it
<
LDGS
;
it
++
)
{
#pragma unroll
for
(
int
jt
=
0
;
jt
<
VecSize
;
jt
++
)
{
// dropout(x) + residual
x
[
it
][
jt
]
=
x
[
it
][
jt
]
*
static_cast
<
T
>
(
mask_vec
[
it
][
jt
])
*
factor
+
if
(
std
::
is_same
<
InType
,
int32_t
>::
value
)
{
// for int32 input, we need to dequantize.
T
tmp
=
static_cast
<
T
>
(
static_cast
<
float
>
(
x_input
[
it
][
jt
])
*
quant_last_in_scale
/
dequant_out_scale
[
it
][
jt
])
*
static_cast
<
T
>
(
mask_vec
[
it
][
jt
])
*
factor
+
residual
[
it
][
jt
];
x
[
it
][
jt
]
=
tmp
;
}
else
{
x
[
it
][
jt
]
=
static_cast
<
T
>
(
x_input
[
it
][
jt
])
*
static_cast
<
T
>
(
mask_vec
[
it
][
jt
])
*
factor
+
residual
[
it
][
jt
];
}
xf
[
it
*
VecSize
+
jt
]
=
U
(
x
[
it
][
jt
]);
}
}
...
...
@@ -626,6 +669,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
var_out_ptr
[
row
]
=
var_local
*
rn
;
}
Vec_out_type
x_output
[
LDGS
];
#pragma unroll
for
(
int
it
=
0
;
it
<
LDGS
;
it
++
)
{
#pragma unroll
...
...
@@ -638,12 +683,26 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
U
tmp
=
rsigma
*
(
static_cast
<
U
>
(
xf
[
it
*
VecSize
+
jt
])
-
mu_local
);
x
[
it
][
jt
]
=
static_cast
<
T
>
(
static_cast
<
U
>
(
gamma
[
it
][
jt
])
*
tmp
+
static_cast
<
U
>
(
beta
[
it
][
jt
]));
if
(
std
::
is_same
<
OutType
,
int8_t
>::
value
)
x_output
[
it
][
jt
]
=
quant_helper
(
x
[
it
][
jt
],
quant_next_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
}
#pragma unroll
for
(
int
it
=
0
,
col
=
c
;
it
<
LDGS
;
it
++
)
{
phi
::
Store
<
T
,
VecSize
>
(
x
[
it
],
y_ptr
+
row
*
ELTS_PER_ROW
+
col
*
VecSize
);
if
(
std
::
is_same
<
OutType
,
int8_t
>::
value
)
{
phi
::
Store
<
OutType
,
VecSize
>
(
x_output
[
it
],
y_ptr
+
row
*
ELTS_PER_ROW
+
col
*
VecSize
);
}
else
{
phi
::
Store
<
T
,
VecSize
>
(
x
[
it
],
reinterpret_cast
<
T
*>
(
y_ptr
)
+
row
*
ELTS_PER_ROW
+
col
*
VecSize
);
}
col
+=
THREADS_PER_ROW
;
}
}
...
...
@@ -668,7 +727,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
template
<
typename
T
,
typename
MaskType
,
typename
U
,
bool
ScaleBiasWithSameTypeX
=
false
>
bool
ScaleBiasWithSameTypeX
=
false
,
typename
InType
=
T
,
typename
OutType
=
T
>
void
LaunchLayernormResidualDropoutBias
(
const
uint32_t
rows
,
const
uint32_t
cols
,
...
...
@@ -678,18 +739,26 @@ void LaunchLayernormResidualDropoutBias(
const
float
epsilon
,
const
bool
is_upscale_in_train
,
const
bool
is_test
,
const
T
*
src
,
const
InType
*
src
,
const
T
*
residual
,
const
T
*
bias
,
const
LayerNormScaleBiasT
<
T
,
U
,
ScaleBiasWithSameTypeX
>
*
scale
,
const
LayerNormScaleBiasT
<
T
,
U
,
ScaleBiasWithSameTypeX
>
*
layernorm_bias
,
MaskType
*
mask_data
,
T
*
dst
,
T
*
layernorm_dst
,
OutType
*
layernorm_dst
,
LayerNormParamType
<
T
>
*
mean
,
LayerNormParamType
<
T
>
*
var
,
const
phi
::
GPUContext
&
ctx
)
{
const
phi
::
GPUContext
&
ctx
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
// dropout_prob == 1.0f
// NOTE(minghaoBD): OutType should be T if drop_out_rate == 1.0
if
(
std
::
abs
(
dropout_prob
-
1.0
f
)
<
1e-5
)
{
auto
cuda_place
=
ctx
.
GetPlace
();
memory
::
Copy
(
cuda_place
,
...
...
@@ -705,10 +774,11 @@ void LaunchLayernormResidualDropoutBias(
switch
(
GetDesiredBlockDim
(
cols
))
{
FIXED_BLOCK_DIM_CASE
(
LayerNormForward
<
T
,
U
,
kBlockDim
,
ScaleBiasWithSameTypeX
>
<<<
rows
,
kBlockDim
,
0
,
ctx
.
stream
()
>>>
(
dst
,
<<<
rows
,
kBlockDim
,
0
,
ctx
.
stream
()
>>>
(
dst
,
scale
,
layernorm_bias
,
layernorm_dst
,
reinterpret_cast
<
T
*>
(
layernorm_dst
)
,
mean
,
var
,
epsilon
,
...
...
@@ -731,6 +801,9 @@ void LaunchLayernormResidualDropoutBias(
const int VecSize = BYTES_PER_LDG / sizeof(T); \
const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \
const int ROWS_PER_CTA = WARPS_M; \
const int THREADS_PER_ROW = WARPS_N * THREADS_PER_WARP; \
const int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW * VecSize; \
const int LDGS = cols / ELTS_PER_ROW_PER_CTA; \
const int grid = \
static_cast<int>(std::ceil(rows / static_cast<float>(ROWS_PER_CTA))); \
fused_fast_ln_fwd_kernel< \
...
...
@@ -742,7 +815,16 @@ void LaunchLayernormResidualDropoutBias(
WARPS_M, \
WARPS_N, \
BYTES_PER_LDG, \
cols><<<grid, THREADS_PER_CTA, 0, ctx.stream()>>>(rows, \
cols, \
THREADS_PER_WARP, \
THREADS_PER_ROW, \
THREADS_PER_CTA, \
ROWS_PER_CTA, \
ELTS_PER_ROW_PER_CTA, \
LDGS, \
InType, \
OutType> \
<<<grid, THREADS_PER_CTA, 0, ctx.stream()>>>(rows, \
cols, \
seed, \
dropout_prob, \
...
...
@@ -759,7 +841,14 @@ void LaunchLayernormResidualDropoutBias(
mean, \
var, \
dst, \
layernorm_dst); \
layernorm_dst, \
quant_last_in_scale, \
dequant_out_scale_data, \
quant_out_scale_offset, \
quant_next_in_scale, \
quant_round_type, \
quant_max_bound, \
quant_min_bound); \
} break
#define LAUNCH_FUSED_FAST_LN_KERNEL \
...
...
@@ -784,7 +873,8 @@ void LaunchLayernormResidualDropoutBias(
if
(
cols
%
VecSize
!=
0
)
{
int
blockDim
=
GetDesiredBlockDim
(
cols
);
FusedLayernormResidualDropoutBias
<
T
,
uint8_t
,
1
,
U
,
ScaleBiasWithSameTypeX
>
<<<
rows
,
blockDim
,
0
,
ctx
.
stream
()
>>>
(
rows
,
<<<
rows
,
blockDim
,
0
,
ctx
.
stream
()
>>>
(
rows
,
cols
,
seed
,
dropout_prob
,
...
...
@@ -792,14 +882,14 @@ void LaunchLayernormResidualDropoutBias(
is_test
,
increment
,
epsilon
,
src
,
reinterpret_cast
<
const
T
*>
(
src
)
,
residual
,
bias
,
scale
,
layernorm_bias
,
mask_data
,
dst
,
layernorm_dst
,
reinterpret_cast
<
T
*>
(
layernorm_dst
)
,
mean
,
var
);
}
else
{
...
...
@@ -819,7 +909,8 @@ void LaunchLayernormResidualDropoutBias(
VecSize
,
U
,
ScaleBiasWithSameTypeX
>
<<<
rows
,
blockDim
,
0
,
ctx
.
stream
()
>>>
(
rows
,
<<<
rows
,
blockDim
,
0
,
ctx
.
stream
()
>>>
(
rows
,
cols
,
seed
,
dropout_prob
,
...
...
@@ -827,14 +918,14 @@ void LaunchLayernormResidualDropoutBias(
is_test
,
increment
,
epsilon
,
src
,
reinterpret_cast
<
const
T
*>
(
src
)
,
residual
,
bias
,
scale
,
layernorm_bias
,
mask_data
,
dst
,
layernorm_dst
,
reinterpret_cast
<
T
*>
(
layernorm_dst
)
,
mean
,
var
);
}
...
...
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc
0 → 100644
浏览文件 @
3d7e2118
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace
paddle
{
namespace
operators
{
// Local shorthand: within paddle::operators, `Tensor` names framework::Tensor.
using
Tensor
=
framework
::
Tensor
;
// Operator class for the INT8 variant of the fused multi-layer transformer.
//
// InferShape validates that the required inputs/outputs are present, checks
// the ranks and mutual consistency of X, the first QKVW tensor and (when
// given) the first CacheKV tensor, and sets the shape of Out to the shape of
// X. The kernel type is selected from the data type of X.
class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel {
 private:
  // Name reported by OP_INOUT_CHECK when an input/output is missing.
  static constexpr const char *OpName = "FusedMultiTransformerINT8Op";

 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Validates inputs/outputs and infers the shape of "Out".
  //
  // ctx: shape-inference context used to query inputs, outputs and attributes.
  // Raises InvalidArgument (via PADDLE_ENFORCE_*) when a shape constraint is
  // violated.
  void InferShape(framework::InferShapeContext *ctx) const override {
#define CHECK_INPUT(name) \
  OP_INOUT_CHECK(ctx->HasInput(#name), "Input", #name, OpName)
#define CHECK_INPUTS(name) \
  OP_INOUT_CHECK(ctx->HasInputs(#name), "Input", #name, OpName)
#define CHECK_OUTPUT(name) \
  OP_INOUT_CHECK(ctx->HasOutput(#name), "Output", #name, OpName)
#define CHECK_OUTPUTS(name) \
  OP_INOUT_CHECK(ctx->HasOutputs(#name), "Output", #name, OpName)

    CHECK_INPUT(X);

    // attention
    CHECK_INPUTS(QKVW);
    CHECK_INPUTS(OutLinearW);

    // TimeStep is only meaningful together with a KV cache.
    if (ctx->HasInput("TimeStep")) {
      CHECK_INPUTS(CacheKV);
    }

    // A KV cache input requires the matching cache output.
    if (ctx->HasInputs("CacheKV")) {
      CHECK_OUTPUTS(CacheKVOut);
    }

    // ffn
    CHECK_INPUTS(FFN1Weight);
    CHECK_INPUTS(FFN2Weight);

    CHECK_OUTPUT(Out);

// The helper macros are only needed for the checks above; do not let them
// leak into the rest of the translation unit.
#undef CHECK_INPUT
#undef CHECK_INPUTS
#undef CHECK_OUTPUT
#undef CHECK_OUTPUTS

    // x: qkv's input [batch_size, seq_len, dim_embed]
    // y: qkv's weight: [3, num_head, dim_head, dim_embed]
    auto x_dim = ctx->GetInputDim("X");
    auto y_dim = ctx->GetInputsDim("QKVW")[0];
    bool trans_qkvw = ctx->Attrs().Get<bool>("trans_qkvw");
    PADDLE_ENFORCE_EQ(
        x_dim.size(),
        3,
        platform::errors::InvalidArgument("The dimensions of x must be 3 "
                                          "(batch_size, seq_len, dim_embed), "
                                          "but received dimensions of "
                                          "Input is [%d]",
                                          x_dim.size()));
    PADDLE_ENFORCE_EQ(
        y_dim.size(),
        4,
        platform::errors::InvalidArgument(
            "The dimensions of qkv_weight must be 4 "
            "(3, num_head, dim_head, dim_embed), "
            "but received dimensions of "
            "Input is [%d]",
            y_dim.size()));
    // dim_embed sits in the last axis of the weight when it is transposed,
    // and in the first axis otherwise.
    PADDLE_ENFORCE_EQ(
        x_dim[2],
        trans_qkvw ? y_dim[3] : y_dim[0],
        platform::errors::InvalidArgument(
            "ShapeError: the dimension of x_dim[2] and y_dim[3](trans_qkvw is "
            "true) or y_dim[0](trans_qkvw is false) "
            "must be equal. But received: the shape "
            "of input x = [%s], and the shape of "
            "input qkv_weight = [%s]",
            x_dim,
            y_dim));

    // The num_head * dim_head == dim_embed constraint is only enforced when
    // ring_id is -1.
    if (ctx->Attrs().Get<int>("ring_id") == -1) {
      if (trans_qkvw) {
        PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2],
                          y_dim[3],
                          platform::errors::InvalidArgument(
                              "The dimensions of qkv_weight must be 4 "
                              "(3, num_head, dim_head, dim_embed), "
                              "and must satisfy the limitations: "
                              "(num_head * dim_head == dim_embed)"));
      } else {
        PADDLE_ENFORCE_EQ(y_dim[2] * y_dim[3],
                          y_dim[0],
                          platform::errors::InvalidArgument(
                              "The dimensions of qkv_weight must be 4 "
                              "(dim_embed, 3, num_head, dim_head), "
                              "and must satisfy the limitations: "
                              "(num_head * dim_head == dim_embed)"));
      }
    }

    if (ctx->HasInputs("CacheKV")) {
      // [2, batch_size, num_head, max_seq_len, head_size]
      const auto &c_dims = ctx->GetInputsDim("CacheKV");
      const auto &c_dim = c_dims[0];

      PADDLE_ENFORCE_EQ(
          c_dim.size(),
          5,
          paddle::platform::errors::InvalidArgument(
              "The CacheKV must be 5 dims, but got %d", c_dim.size()));
      PADDLE_ENFORCE_EQ(c_dim[0],
                        2,
                        paddle::platform::errors::InvalidArgument(
                            "The first dim of CacheKV must be 2, but got %d",
                            c_dim[0]));  // 2
      PADDLE_ENFORCE_EQ(c_dim[1],
                        x_dim[0],
                        paddle::platform::errors::InvalidArgument(
                            "The second dim of CacheKV must be equal with "
                            "batch size %d, but got %d",
                            x_dim[0],
                            c_dim[1]));  // batch_size
      PADDLE_ENFORCE_EQ(c_dim[2],
                        trans_qkvw ? y_dim[1] : y_dim[2],
                        paddle::platform::errors::InvalidArgument(
                            "The third dim of CacheKV must be equal with num "
                            "head %d, but got %d",
                            trans_qkvw ? y_dim[1] : y_dim[2],
                            c_dim[2]));  // num_head
      PADDLE_ENFORCE_GT(
          c_dim[3],
          0,
          paddle::platform::errors::InvalidArgument(
              "The fourth dim of CacheKV must be greater than 0, but got %d",
              c_dim[3]));  // cache_seq_len
      PADDLE_ENFORCE_EQ(c_dim[4],
                        trans_qkvw ? y_dim[2] : y_dim[3],
                        paddle::platform::errors::InvalidArgument(
                            "The fifth dim of CacheKV must be equal with head "
                            "size %d, but got %d",
                            trans_qkvw ? y_dim[2] : y_dim[3],
                            c_dim[4]));  // head_size
    }

    // The output has the same shape as the input X.
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
  }

 protected:
  // The kernel is chosen from the data type of input "X" and the execution
  // place of the context.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
  }

  // Per-variable kernel type: "TimeStep" is returned with the expected kernel
  // type unchanged; every other variable keeps its own place and layout while
  // adopting the expected data type.
  framework::OpKernelType GetKernelTypeForVar(
      const std::string &var_name,
      const Tensor &tensor,
      const framework::OpKernelType &expected_kernel_type) const override {
    if (var_name == "TimeStep") {
      VLOG(10) << "var_name:" << var_name << " need not to transform";
      return expected_kernel_type;
    }
    return framework::OpKernelType(
        expected_kernel_type.data_type_, tensor.place(), tensor.layout());
  }
};
// Declares the inputs, outputs and attributes (the op proto) of the
// fused_multi_transformer_int8 operator, together with range/value checkers
// for the attributes that need them.
class FusedMultiTransformerINT8OpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    // ---- inputs ----
    AddInput("X", "The input tensor.");
    AddInput("LnScale",
             "Scale is a 1-dimensional tensor of size "
             "H. Here, H represents the last dimension of its input tensor.")
        .AsDuplicable();
    AddInput("LnBias",
             "Bias is a 1-dimensional tensor of size "
             "H. Here, H represents the last dimension of its input tensor.")
        .AsDuplicable();
    AddInput("QKVW", "The qkv weight tensor.").AsDuplicable();
    AddInput("QKVBias", "The qkv bias tensor.").AsDispensable().AsDuplicable();
    AddInput("CacheKV", "(optional) The cached KV for generation inference.")
        .AsDispensable()
        .AsDuplicable();
    AddInput("TimeStep",
             "(optional, int) The time step for generation inference.")
        .AsDispensable();
    AddInput("SrcMask", "(optional) The attention mask tensor in fmha.")
        .AsDispensable();
    AddInput("OutLinearW", "The out_linear weight tensor.").AsDuplicable();
    AddInput("OutLinearBias", "The out_linear bias tensor.")
        .AsDispensable()
        .AsDuplicable();

    AddInput("FFNLnScale", "The layer_norm scale of FusedFeedForward op")
        .AsDuplicable();
    AddInput("FFNLnBias", "The layer_norm bias of FusedFeedForward op")
        .AsDuplicable();
    AddInput("FFN1Weight", "The linear1 weight of FusedFeedForward op")
        .AsDuplicable();
    AddInput("FFN1Bias", "The linear1 bias of FusedFeedForward op")
        .AsDispensable()
        .AsDuplicable();
    AddInput("FFN2Weight", "The linear2 weight of FusedFeedForward op")
        .AsDuplicable();
    AddInput("FFN2Bias", "The linear2 bias input of FusedFeedForward op")
        .AsDispensable()
        .AsDuplicable();

    // Dequantization scales for the four int8 GEMM outputs.
    AddInput("QKVOutScale",
             "QKVOutScale is used to dequantize qkv output tensor. "
             "In order to keep consistent with the PTQ/QAT calculation logic, "
             "QKVOutScale should be max_bound * max_bound / max_range. "
             "Here max_range is per-channel weight scale. "
             "The shape of QKVOutScale is [num_layers, num_channels]")
        .AsDispensable();
    AddInput("OutLinearOutScale",
             "OutLinearOutScale is used to dequantize out_linear output "
             "tensor. "
             "The definition and shape is the same as QKVOutScale")
        .AsDispensable();
    AddInput("FFN1OutScale",
             "FFN1OutScale is used to dequantize ffn1 output tensor. "
             "The definition and shape is the same as QKVOutScale")
        .AsDispensable();
    AddInput("FFN2OutScale",
             "FFN2OutScale is used to dequantize ffn2 output tensor. "
             "The definition and shape is the same as QKVOutScale")
        .AsDispensable();

    // ---- outputs ----
    AddOutput("CacheKVOut", "The updated cache KV. Inplace with CacheKV")
        .AsDispensable()
        .AsDuplicable();
    AddOutput("Out", "Result after multi transformer layers.");

    // ---- attributes ----
    AddAttr<bool>("pre_layer_norm",
                  "if true, the attention op uses pre_layer_norm "
                  "architecture, "
                  "else, uses post_layer_norm architecture. "
                  "[default true].")
        .SetDefault(true);
    AddAttr<float>("epsilon",
                   "Constant for numerical stability [default 1e-5].")
        .SetDefault(1e-5)
        .AddCustomChecker([](const float &epsilon) {
          // Accept only epsilon in the closed range [0.0, 0.001].
          PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f,
                            true,
                            platform::errors::InvalidArgument(
                                "'epsilon' in Op(LayerNorm) should be between "
                                "0.0 and 0.001, But received [%s].",
                                epsilon));
        });

    AddAttr<float>("dropout_rate", "Probability of setting units to zero.")
        .SetDefault(.5f)
        .AddCustomChecker([](const float &drop_p) {
          // Accept only probabilities in the closed range [0.0, 1.0].
          PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f,
                            true,
                            platform::errors::InvalidArgument(
                                "'dropout_rate' must be between 0.0 and 1.0."));
        });

    AddAttr<bool>("is_test",
                  "(bool, default false) Set to true for inference only, false "
                  "for training. Some layers may run faster when this is true.")
        .SetDefault(false);
    AddAttr<std::string>(
        "dropout_implementation",
        "[\"downgrade_in_infer\"|\"upscale_in_train\"] "
        "The meaning is the same as 'attn_dropout_implementation'.")
        .SetDefault("downgrade_in_infer")
        .AddCustomChecker([](const std::string &type) {
          PADDLE_ENFORCE_EQ(
              type == "downgrade_in_infer" || type == "upscale_in_train",
              true,
              platform::errors::InvalidArgument(
                  "dropout_implementation can only be downgrade_in_infer or "
                  "upscale_in_train"));
        });
    AddAttr<std::string>("act_method", "act_method").SetDefault("gelu");
    AddAttr<bool>(
        "trans_qkvw",
        "Whether the weights of qkv should be transposed. If true, "
        "the shape of weights of qkv should be "
        "[3, num_head, dim_head, dim_embed]. "
        "Otherwise the shape of weights of qkv should be "
        "[dim_embed, 3, num_head, dim_head]")
        .SetDefault(true);

    AddAttr<int>(
        "ring_id",
        "ring id for tensor model parallel. distributed training and inference")
        .SetDefault(-1);

    AddAttr<int>("num_head", "num_head").SetDefault(0);
    AddAttr<int>("dim_head", "dim_head").SetDefault(0);
    AddAttr<int>("dim_ffn", "dim_ffn").SetDefault(0);

    // Per-layer input quantization scales for the four int8 GEMMs.
    AddAttr<std::vector<float>>(
        "qkv_in_scale",
        "qkv_in_scale is used to quantize qkv input tensor. "
        "in_scale is generated by PTQ or QAT, which represents valid max range "
        "of this tensor. "
        "the size of qkv_in_scale should be num_layers, which is equal to "
        "QKVW.dims()[0]")
        .SetDefault({});
    AddAttr<std::vector<float>>(
        "out_linear_in_scale",
        "out_linear_in_scale is used to quantize out_linear input tensor. "
        "the size of out_linear_in_scale is the same as qkv_in_scale")
        .SetDefault({});
    AddAttr<std::vector<float>>(
        "ffn1_in_scale",
        "ffn1_in_scale is used to quantize ffn1 input tensor. "
        "the size of ffn1_in_scale is the same as qkv_in_scale")
        .SetDefault({});
    AddAttr<std::vector<float>>(
        "ffn2_in_scale",
        "ffn2_in_scale is used to quantize ffn2 input tensor. "
        "the size of ffn2_in_scale is the same as qkv_in_scale")
        .SetDefault({});

    AddAttr<int>(
        "quant_round_type",
        "(int, default 1) The round type of fp32 to int. "
        "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2 "
        "1: rounding to nearest ties away from zero. Eg: round(1.5)=2, "
        "round(-2.5)=-3")
        .SetDefault(1);
    AddAttr<float>(
        "quant_max_bound",
        "(float, default 127.0) the max bound of float type to int type")
        .SetDefault(127.0);
    AddAttr<float>(
        "quant_min_bound",
        "(float, default -127.0) the min bound of float type to int type")
        .SetDefault(-127.0);

    AddComment(R"DOC(fused multi transformer layers op)DOC");
  }
};
}
// namespace operators
}
// namespace paddle
// Registers the operator under the name `fused_multi_transformer_int8`,
// binding it to FusedMultiTransformerINT8Op (shape inference / kernel type)
// and FusedMultiTransformerINT8OpMaker (proto). EmptyGradOpMaker is supplied
// for both the OpDesc and the imperative OpBase paths.
// NOTE(review): presumably this means the op has no backward pass - confirm.
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fused_multi_transformer_int8
,
ops
::
FusedMultiTransformerINT8Op
,
ops
::
FusedMultiTransformerINT8OpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
<
paddle
::
framework
::
OpDesc
>
,
paddle
::
framework
::
EmptyGradOpMaker
<
paddle
::
imperative
::
OpBase
>
);
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
0 → 100644
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
paddle/fluid/operators/fused/fused_multi_transformer_op.cu
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
paddle/fluid/operators/fused/fused_multi_transformer_op.h
0 → 100644
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
paddle/fluid/operators/fused/fused_residual_dropout_bias.h
浏览文件 @
3d7e2118
...
...
@@ -28,7 +28,9 @@ template <typename T,
int
VecSize
,
bool
ComputeLayerNorm
,
bool
Activation
,
typename
Functor
>
typename
Functor
,
typename
InType
=
T
,
typename
OutType
=
T
>
__forceinline__
__device__
void
FusedResidualDropoutBiasOneThread
(
const
int
row_id
,
const
int
col_id
,
...
...
@@ -36,30 +38,45 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
curandStatePhilox4_32_10_t
*
state
,
const
float
dropout_prob
,
const
T
factor
,
const
T
*
__restrict__
src
,
const
InType
*
__restrict__
src
,
const
T
*
__restrict__
residual
,
const
T
*
__restrict__
bias
,
T
*
dst
,
OutType
*
dst
,
MaskType
*
mask
,
const
bool
is_test
,
typename
details
::
MPTypeTrait
<
T
>::
Type
*
mean_val
,
typename
details
::
MPTypeTrait
<
T
>::
Type
*
var_val
,
Functor
act_func
)
{
Functor
act_func
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
using
LoadT
=
phi
::
AlignedVector
<
T
,
VecSize
>
;
using
LoadInType
=
phi
::
AlignedVector
<
InType
,
VecSize
>
;
using
LoadFloat
=
phi
::
AlignedVector
<
float
,
VecSize
>
;
using
StoreT
=
phi
::
AlignedVector
<
T
,
VecSize
>
;
using
StoreOutType
=
phi
::
AlignedVector
<
OutType
,
VecSize
>
;
using
MaskStoreT
=
phi
::
AlignedVector
<
MaskType
,
VecSize
>
;
using
U
=
typename
details
::
MPTypeTrait
<
T
>::
Type
;
Load
T
src_vec
;
Load
InType
src_vec
;
LoadT
residual_vec
;
LoadT
bias_vec
;
LoadFloat
quant_out_scale_vec
;
#pragma unroll
for
(
int
ii
=
0
;
ii
<
VecSize
;
ii
++
)
{
bias_vec
[
ii
]
=
static_cast
<
T
>
(
0
);
residual_vec
[
ii
]
=
static_cast
<
T
>
(
0
);
}
// vectorize load data from global
phi
::
Load
<
T
,
VecSize
>
(
&
src
[
row_id
*
cols
+
col_id
],
&
src_vec
);
phi
::
Load
<
InType
,
VecSize
>
(
&
src
[
row_id
*
cols
+
col_id
],
&
src_vec
);
phi
::
Load
<
float
,
VecSize
>
(
&
dequant_out_scale_data
[
quant_out_scale_offset
+
col_id
],
&
quant_out_scale_vec
);
if
(
residual
)
{
phi
::
Load
<
T
,
VecSize
>
(
&
residual
[
row_id
*
cols
+
col_id
],
&
residual_vec
);
}
...
...
@@ -84,10 +101,18 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
}
StoreT
dest_vec
;
StoreOutType
dest_vec_out_type
;
#pragma unroll
for
(
int
ii
=
0
;
ii
<
VecSize
;
ii
++
)
{
T
tmp
=
src_vec
[
ii
]
+
bias_vec
[
ii
];
T
tmp
;
if
(
std
::
is_same
<
InType
,
int32_t
>::
value
)
{
T
tmp0
=
static_cast
<
T
>
(
static_cast
<
float
>
(
src_vec
[
ii
])
*
quant_last_in_scale
/
quant_out_scale_vec
[
ii
]);
tmp
=
tmp0
+
bias_vec
[
ii
];
}
else
{
tmp
=
static_cast
<
T
>
(
src_vec
[
ii
])
+
bias_vec
[
ii
];
}
if
(
Activation
)
{
tmp
=
act_func
(
tmp
);
}
...
...
@@ -98,10 +123,23 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
*
mean_val
+=
tmp
;
*
var_val
+=
(
tmp
*
tmp
);
}
if
(
std
::
is_same
<
OutType
,
int8_t
>::
value
)
{
dest_vec_out_type
[
ii
]
=
quant_helper
(
dest_vec
[
ii
],
quant_next_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
}
// store result to global
phi
::
Store
<
T
,
VecSize
>
(
dest_vec
,
&
dst
[
row_id
*
cols
+
col_id
]);
if
(
std
::
is_same
<
OutType
,
int8_t
>::
value
)
{
phi
::
Store
<
OutType
,
VecSize
>
(
dest_vec_out_type
,
&
dst
[
row_id
*
cols
+
col_id
]);
}
else
{
phi
::
Store
<
T
,
VecSize
>
(
dest_vec
,
reinterpret_cast
<
T
*>
(
&
dst
[
row_id
*
cols
+
col_id
]));
}
if
(
!
is_test
)
{
phi
::
Store
<
MaskType
,
VecSize
>
(
mask_vec
,
&
mask
[
row_id
*
cols
+
col_id
]);
}
...
...
@@ -114,19 +152,28 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
* is_test: only used in inference
* mask: can be null if is_test=true
*/
template
<
typename
T
,
typename
MaskType
,
int
VecSize
>
__global__
void
FusedResidualDropoutBias
(
const
size_t
rows
,
template
<
typename
T
,
typename
MaskType
,
int
VecSize
,
typename
InType
=
T
,
typename
OutType
=
T
>
__global__
void
FusedResidualDropoutBias
(
const
size_t
rows
,
const
size_t
cols
,
uint64_t
seed
,
const
float
dropout_prob
,
const
bool
is_upscale_in_train
,
const
T
*
__restrict__
src
,
const
InType
*
__restrict__
src
,
const
T
*
__restrict__
residual
,
const
T
*
__restrict__
bias
,
MaskType
*
mask
,
T
*
dst
,
OutType
*
dst
,
uint64_t
increment
,
const
bool
is_test
)
{
const
bool
is_test
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
)
{
int
col_id
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
row_id
=
blockIdx
.
y
;
int
idx
=
row_id
*
cols
+
col_id
;
...
...
@@ -142,8 +189,9 @@ __global__ void FusedResidualDropoutBias(const size_t rows,
VecSize
,
false
,
false
,
phi
::
funcs
::
ReluFunctor
<
T
>>
(
r
,
phi
::
funcs
::
ReluFunctor
<
T
>
,
InType
,
OutType
>
(
r
,
i
,
cols
,
&
state
,
...
...
@@ -157,7 +205,11 @@ __global__ void FusedResidualDropoutBias(const size_t rows,
is_test
,
nullptr
,
nullptr
,
relu
);
relu
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
}
}
...
...
@@ -165,7 +217,10 @@ __global__ void FusedResidualDropoutBias(const size_t rows,
/**
* @brief dst = residual + dropout(src + bias);
*/
template
<
typename
T
,
typename
MaskType
>
template
<
typename
T
,
typename
MaskType
,
typename
InType
=
T
,
typename
OutType
=
T
>
void
LaunchResidualDropoutBias
(
const
uint32_t
rows
,
const
uint32_t
cols
,
const
int
increment
,
...
...
@@ -173,14 +228,19 @@ void LaunchResidualDropoutBias(const uint32_t rows,
const
float
dropout_prob
,
const
bool
is_test
,
bool
is_upscale_in_train
,
const
T
*
src
,
const
InType
*
src
,
const
T
*
residual
,
const
T
*
bias
,
MaskType
*
mask_data
,
T
*
dst
,
const
phi
::
GPUContext
&
ctx
)
{
OutType
*
dst
,
const
phi
::
GPUContext
&
ctx
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
)
{
// dropout_prob == 1.0f
if
(
std
::
abs
(
dropout_prob
-
1.0
f
)
<
1e-5
)
{
// NOTE(minghaoBD): OutType should be T if dropout_prob == 1.0
if
(
residual
==
dst
)
return
;
if
(
residual
)
{
memory
::
Copy
(
ctx
.
GetPlace
(),
...
...
@@ -202,7 +262,7 @@ void LaunchResidualDropoutBias(const uint32_t rows,
const
int
real_vec_size
=
cols
%
VecSize
==
0
?
VecSize
:
1
;
auto
config
=
Get1DBlocksAnd2DGrids
(
ctx
,
rows
,
cols
,
real_vec_size
);
if
(
cols
%
VecSize
==
0
)
{
FusedResidualDropoutBias
<
T
,
uint8_t
,
VecSize
>
FusedResidualDropoutBias
<
T
,
uint8_t
,
VecSize
,
InType
,
OutType
>
<<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
ctx
.
stream
()
>>>
(
rows
,
cols
,
...
...
@@ -215,9 +275,13 @@ void LaunchResidualDropoutBias(const uint32_t rows,
mask_data
,
dst
,
increment
,
is_test
);
is_test
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
else
{
FusedResidualDropoutBias
<
T
,
uint8_t
,
1
>
FusedResidualDropoutBias
<
T
,
uint8_t
,
1
,
InType
,
OutType
>
<<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
ctx
.
stream
()
>>>
(
rows
,
cols
,
...
...
@@ -230,7 +294,11 @@ void LaunchResidualDropoutBias(const uint32_t rows,
mask_data
,
dst
,
increment
,
is_test
);
is_test
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
}
...
...
paddle/fluid/operators/fused/quant_dequant_kernel.h
0 → 100644
浏览文件 @
3d7e2118
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
// Quantizes a single value to int8.
//
// The value is scaled by max_bound / scale, rounded according to round_type
// (0 selects roundWithTiesToEven, any other value selects round), clamped to
// [min_bound, max_bound] and finally cast to int8_t.
// `inverse` and `roundWithTiesToEven` are not defined in this header;
// presumably they come from fake_quantize_op.h - confirm.
template <typename T>
__forceinline__ __device__ int8_t quant_helper(const T input,
                                               const float scale,
                                               const int round_type,
                                               const float max_bound,
                                               const float min_bound) {
  const float scaled = max_bound * inverse(scale) * static_cast<float>(input);

  float rounded = (round_type == 0)
                      ? static_cast<float>(roundWithTiesToEven(scaled))
                      : static_cast<float>(round(scaled));

  // Clamp the upper bound first, then the lower bound.
  if (rounded > max_bound) {
    rounded = max_bound;
  }
  if (rounded < min_bound) {
    rounded = min_bound;
  }
  return static_cast<int8_t>(rounded);
}
// Quantizes an m x n row-major matrix to int8 with a single scale.
//
// Each thread handles up to four consecutive columns of one row, starting at
// column 4 * (global x thread index); the row is the global y thread index.
//
// input:      source matrix, indexed as input[row * n + col].
// output:     int8 destination viewed as char4; must hold m * n bytes.
// scale, round_type, max_bound, min_bound: forwarded to quant_helper.
//
// When n is a multiple of 4 every group of four columns is complete and
// 4-byte aligned relative to `output`, so one char4 store is used. Otherwise
// the elements are written one byte at a time and the group is truncated at
// the end of the row, which avoids reading/writing past the row and avoids
// the misaligned char4 index the vector path would produce.
template <typename T>
__global__ void quantize_kernel(const T* input,
                                char4* output,
                                const float scale,
                                const int m,
                                const int n,
                                const int round_type,
                                const float max_bound,
                                const float min_bound) {
  const int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
  const int m_id = blockIdx.y * blockDim.y + threadIdx.y;

  if (m_id >= m || n_id >= n) {
    return;
  }

  // 64-bit flat index so large matrices do not overflow int arithmetic.
  const int64_t base = static_cast<int64_t>(m_id) * n + n_id;

  if ((n & 3) == 0) {
    // n % 4 == 0 and n_id % 4 == 0 imply n_id + 3 < n: full vector store.
    char4 tmp;
    tmp.x = quant_helper(
        input[base], scale, round_type, max_bound, min_bound);
    tmp.y = quant_helper(
        input[base + 1], scale, round_type, max_bound, min_bound);
    tmp.z = quant_helper(
        input[base + 2], scale, round_type, max_bound, min_bound);
    tmp.w = quant_helper(
        input[base + 3], scale, round_type, max_bound, min_bound);
    output[base >> 2] = tmp;
  } else {
    // Scalar fallback: only touch the columns that really exist in this row.
    int8_t* out_bytes = reinterpret_cast<int8_t*>(output);
    const int remaining = n - n_id;
    const int valid = remaining < 4 ? remaining : 4;
    for (int k = 0; k < valid; ++k) {
      out_bytes[base + k] = quant_helper(
          input[base + k], scale, round_type, max_bound, min_bound);
    }
  }
}
template
<
typename
T
>
void
quantize_kernel_launcher
(
const
T
*
input
,
int8_t
*
output
,
const
float
scale
,
const
int
m
,
const
int
n
,
const
int
round_type
,
const
float
max_bound
,
const
float
min_bound
,
gpuStream_t
stream
)
{
// TODO(minghaoBD): optimize the kennel launch times when m==1 or n==1
dim3
grid
((
n
+
31
)
/
32
,
(
m
+
31
)
/
32
);
dim3
block
(
32
,
32
);
quantize_kernel
<<<
grid
,
block
,
0
,
stream
>>>
(
input
,
(
char4
*
)
output
,
// NOLINT
scale
,
m
,
n
,
round_type
,
max_bound
,
min_bound
);
}
// Dequantizes an int32 matrix using the input scale and per-column output
// scales:
//   output[r * m + c] = T(float(input[r * m + c]) * quant_in_scale /
//                         dequant_out_scale_data[quant_out_scale_offset + c])
//
// m: number of columns (hidden size); n: number of rows (batch size).
// One thread processes one element; the x thread index selects the column and
// the y thread index selects the row.
template <typename T>
__global__ void dequantize_kernel(T* output,
                                  const int32_t* input,
                                  const int m,  // hidden
                                  const int n,  // batch size
                                  const float quant_in_scale,
                                  const float* dequant_out_scale_data,
                                  const int quant_out_scale_offset) {
  const int col = blockIdx.x * blockDim.x + threadIdx.x;  // hidden
  const int row = blockIdx.y * blockDim.y + threadIdx.y;  // batch size

  // Threads that fall outside the matrix have nothing to do.
  if (col >= m || row >= n) {
    return;
  }

  const float channel_scale =
      dequant_out_scale_data[quant_out_scale_offset + col];
  const int flat = row * m + col;
  output[flat] = static_cast<T>(static_cast<float>(input[flat]) *
                                quant_in_scale / channel_scale);
}
// Launches dequantize_kernel on `stream` for a batch_size x hidden_units
// int32 matrix.
//
// input:  int32 source matrix.
// output: destination matrix of type T with the same layout.
// quant_in_scale, dequant_out_scale_data, quant_out_scale_offset: scaling
//         parameters forwarded to the kernel.
//
// Note that the kernel takes (hidden_units, batch_size) as its (m, n)
// arguments. Returns without launching when either extent is not positive,
// since a grid with a zero dimension is an invalid launch configuration.
template <typename T>
void dequantize_kernel_launcher(const int32_t* input,
                                T* output,
                                const int batch_size,    // m
                                const int hidden_units,  // n
                                gpuStream_t stream,
                                const float quant_in_scale,
                                const float* dequant_out_scale_data,
                                const int quant_out_scale_offset) {
  if (batch_size <= 0 || hidden_units <= 0) {
    return;
  }

  constexpr int kBlockEdge = 32;  // threads per block along x and y

  // x covers the hidden dimension, y covers the batch dimension.
  dim3 grid((hidden_units + kBlockEdge - 1) / kBlockEdge,
            (batch_size + kBlockEdge - 1) / kBlockEdge);
  dim3 block(kBlockEdge, kBlockEdge);

  dequantize_kernel<<<grid, block, 0, stream>>>(output,
                                                input,
                                                hidden_units,
                                                batch_size,
                                                quant_in_scale,
                                                dequant_out_scale_data,
                                                quant_out_scale_offset);
}
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/layer_norm_kernel.cu.h
浏览文件 @
3d7e2118
...
...
@@ -24,6 +24,7 @@ namespace cub = hipcub;
#include <iostream>
#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/core/ddim.h"
...
...
@@ -338,16 +339,24 @@ using LayerNormScaleBiasT =
template
<
typename
T
,
typename
U
,
int
BlockDim
,
bool
ScaleBiasWithSameTypeX
=
false
>
bool
ScaleBiasWithSameTypeX
=
false
,
typename
InType
=
T
,
typename
OutType
=
T
>
__global__
void
LayerNormForward
(
const
T
*
x
,
const
InType
*
x
,
const
LayerNormScaleBiasT
<
T
,
U
,
ScaleBiasWithSameTypeX
>
*
scale
,
const
LayerNormScaleBiasT
<
T
,
U
,
ScaleBiasWithSameTypeX
>
*
bias
,
T
*
y
,
OutType
*
y
,
U
*
mean
,
U
*
var
,
float
epsilon
,
int64_t
feature_size
)
{
int64_t
feature_size
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
__shared__
U
mean_share
;
__shared__
U
var_share
;
__shared__
U
shared_mean
[
32
];
// threadIdx.x / warpSize <= kMaxBlockDim /
...
...
@@ -387,28 +396,72 @@ __global__ void LayerNormForward(
if
(
bias
!=
nullptr
)
{
for
(
int64_t
i
=
beg_idx
,
j
=
threadIdx
.
x
;
i
<
end_idx
;
i
+=
BlockDim
,
j
+=
BlockDim
)
{
y
[
i
]
=
static_cast
<
T
>
(
static_cast
<
U
>
(
scale
[
j
])
*
if
(
std
::
is_same
<
OutType
,
int8_t
>::
value
)
{
y
[
i
]
=
quant_helper
(
static_cast
<
T
>
(
static_cast
<
U
>
(
scale
[
j
])
*
(
static_cast
<
U
>
(
x
[
i
])
-
mean_val
)
*
invvar
+
static_cast
<
U
>
(
bias
[
j
])),
quant_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
else
{
y
[
i
]
=
static_cast
<
OutType
>
(
static_cast
<
U
>
(
scale
[
j
])
*
(
static_cast
<
U
>
(
x
[
i
])
-
mean_val
)
*
invvar
+
static_cast
<
U
>
(
bias
[
j
]));
}
}
}
else
{
for
(
int64_t
i
=
beg_idx
,
j
=
threadIdx
.
x
;
i
<
end_idx
;
i
+=
BlockDim
,
j
+=
BlockDim
)
{
y
[
i
]
=
static_cast
<
T
>
(
static_cast
<
U
>
(
scale
[
j
])
*
if
(
std
::
is_same
<
OutType
,
int8_t
>::
value
)
{
y
[
i
]
=
quant_helper
(
static_cast
<
T
>
(
static_cast
<
U
>
(
scale
[
j
])
*
(
static_cast
<
U
>
(
x
[
i
])
-
mean_val
)
*
invvar
),
quant_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
else
{
y
[
i
]
=
static_cast
<
OutType
>
(
static_cast
<
U
>
(
scale
[
j
])
*
(
static_cast
<
U
>
(
x
[
i
])
-
mean_val
)
*
invvar
);
}
}
}
}
else
{
// scale == nullptr
if
(
bias
!=
nullptr
)
{
for
(
int64_t
i
=
beg_idx
,
j
=
threadIdx
.
x
;
i
<
end_idx
;
i
+=
BlockDim
,
j
+=
BlockDim
)
{
y
[
i
]
=
static_cast
<
T
>
((
static_cast
<
U
>
(
x
[
i
])
-
mean_val
)
*
invvar
+
if
(
std
::
is_same
<
OutType
,
int8_t
>::
value
)
{
y
[
i
]
=
quant_helper
(
static_cast
<
T
>
((
static_cast
<
U
>
(
x
[
i
])
-
mean_val
)
*
invvar
+
static_cast
<
U
>
(
bias
[
j
])),
quant_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
else
{
y
[
i
]
=
static_cast
<
OutType
>
((
static_cast
<
U
>
(
x
[
i
])
-
mean_val
)
*
invvar
+
static_cast
<
U
>
(
bias
[
j
]));
}
}
}
else
{
for
(
int64_t
i
=
beg_idx
,
j
=
threadIdx
.
x
;
i
<
end_idx
;
i
+=
BlockDim
,
j
+=
BlockDim
)
{
y
[
i
]
=
static_cast
<
T
>
((
static_cast
<
U
>
(
x
[
i
])
-
mean_val
)
*
invvar
);
if
(
std
::
is_same
<
OutType
,
int8_t
>::
value
)
{
y
[
i
]
=
quant_helper
(
static_cast
<
T
>
((
static_cast
<
U
>
(
x
[
i
])
-
mean_val
)
*
invvar
),
quant_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
else
{
y
[
i
]
=
static_cast
<
OutType
>
((
static_cast
<
U
>
(
x
[
i
])
-
mean_val
)
*
invvar
);
}
}
}
}
...
...
paddle/fluid/platform/dynload/cublasLt.h
浏览文件 @
3d7e2118
...
...
@@ -59,7 +59,9 @@ namespace dynload {
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
__macro(cublasLtMatrixTransformDescSetAttribute); \
__macro(cublasLtMatmulAlgoInit); \
__macro(cublasLtMatmulAlgoConfigSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH
(
PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
)
// #endif
...
...
paddle/fluid/pybind/op_function_generator.h
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
paddle/phi/backends/dynload/cublasLt.h
浏览文件 @
3d7e2118
...
...
@@ -73,7 +73,9 @@ extern void *cublasLt_dso_handle;
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
__macro(cublasLtMatrixTransformDescSetAttribute); \
__macro(cublasLtMatmulAlgoInit); \
__macro(cublasLtMatmulAlgoConfigSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
)
// #endif
...
...
paddle/phi/backends/dynload/dynamic_loader.cc
浏览文件 @
3d7e2118
...
...
@@ -326,7 +326,7 @@ void* GetCublasDsoHandle() {
void
*
GetCublasLtDsoHandle
()
{
// APIs available after CUDA 10.1
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10
10
0
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10
01
0
return
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
"libcublasLt.so"
);
#else
std
::
string
warning_msg
(
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py
0 → 100644
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录