Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
3d7e2118
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3d7e2118
编写于
9月 18, 2022
作者:
R
RichardWooSJTU
提交者:
GitHub
9月 18, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add INT8 support for fused_multi_transformer_op (#45284)
上级
7f346a76
变更
22
展开全部
隐藏空白更改
内联
并排
Showing
22 changed file
with
4168 addition
and
1428 deletion
+4168
-1428
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
...ence/analysis/passes/ir_params_sync_among_devices_pass.cc
+2
-1
paddle/fluid/operators/fused/CMakeLists.txt
paddle/fluid/operators/fused/CMakeLists.txt
+2
-0
paddle/fluid/operators/fused/attention_layer_norm.h
paddle/fluid/operators/fused/attention_layer_norm.h
+24
-6
paddle/fluid/operators/fused/attn_gemm_int8.h
paddle/fluid/operators/fused/attn_gemm_int8.h
+189
-0
paddle/fluid/operators/fused/cublaslt.h
paddle/fluid/operators/fused/cublaslt.h
+211
-0
paddle/fluid/operators/fused/fused_dropout_act_bias.h
paddle/fluid/operators/fused/fused_dropout_act_bias.h
+65
-24
paddle/fluid/operators/fused/fused_dropout_common.h
paddle/fluid/operators/fused/fused_dropout_common.h
+1
-0
paddle/fluid/operators/fused/fused_dropout_helper.h
paddle/fluid/operators/fused/fused_dropout_helper.h
+116
-55
paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
...d/operators/fused/fused_layernorm_residual_dropout_bias.h
+188
-97
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc
.../fluid/operators/fused/fused_multi_transformer_int8_op.cc
+369
-0
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
.../fluid/operators/fused/fused_multi_transformer_int8_op.cu
+670
-0
paddle/fluid/operators/fused/fused_multi_transformer_op.cu
paddle/fluid/operators/fused/fused_multi_transformer_op.cu
+5
-1147
paddle/fluid/operators/fused/fused_multi_transformer_op.h
paddle/fluid/operators/fused/fused_multi_transformer_op.h
+1161
-0
paddle/fluid/operators/fused/fused_residual_dropout_bias.h
paddle/fluid/operators/fused/fused_residual_dropout_bias.h
+113
-45
paddle/fluid/operators/fused/quant_dequant_kernel.h
paddle/fluid/operators/fused/quant_dequant_kernel.h
+136
-0
paddle/fluid/operators/layer_norm_kernel.cu.h
paddle/fluid/operators/layer_norm_kernel.cu.h
+65
-12
paddle/fluid/platform/dynload/cublasLt.h
paddle/fluid/platform/dynload/cublasLt.h
+22
-20
paddle/fluid/pybind/op_function_generator.h
paddle/fluid/pybind/op_function_generator.h
+8
-0
paddle/phi/backends/dynload/cublasLt.h
paddle/phi/backends/dynload/cublasLt.h
+22
-20
paddle/phi/backends/dynload/dynamic_loader.cc
paddle/phi/backends/dynload/dynamic_loader.cc
+1
-1
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+6
-0
python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py
...d/tests/unittests/test_fused_multi_transformer_int8_op.py
+792
-0
未找到文件。
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
浏览文件 @
3d7e2118
...
...
@@ -165,7 +165,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
auto
var_data_type
=
var_node
->
Var
()
->
GetDataType
();
VLOG
(
5
)
<<
"var_name is "
<<
var_name
<<
", data type is "
<<
var_data_type
;
if
(
var_data_type
==
paddle
::
framework
::
proto
::
VarType
::
FP16
)
{
if
(
var_data_type
==
paddle
::
framework
::
proto
::
VarType
::
FP16
&&
t
->
dtype
()
!=
paddle
::
experimental
::
DataType
::
FLOAT16
)
{
framework
::
Tensor
half_tensor
;
half_tensor
.
set_type
(
paddle
::
experimental
::
DataType
::
FLOAT16
);
half_tensor
.
Resize
(
t
->
dims
());
...
...
paddle/fluid/operators/fused/CMakeLists.txt
浏览文件 @
3d7e2118
...
...
@@ -23,6 +23,7 @@ register_operators(
fused_transformer_op
fused_feedforward_op
fused_multi_transformer_op
fused_multi_transformer_int8_op
fused_bias_dropout_residual_layer_norm_op
resnet_unit_op
fused_gemm_epilogue_op
...
...
@@ -119,6 +120,7 @@ if(WITH_GPU OR WITH_ROCM)
# fused_attention_op
op_library
(
fused_attention_op
)
op_library
(
fused_multi_transformer_op
)
op_library
(
fused_multi_transformer_int8_op
)
op_library
(
fused_bias_dropout_residual_layer_norm_op
)
endif
()
# resnet_unit needs cudnn 8.0 above
...
...
paddle/fluid/operators/fused/attention_layer_norm.h
浏览文件 @
3d7e2118
...
...
@@ -19,7 +19,8 @@ limitations under the License. */
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
// NOTE: T must be the same as OutType in ComputeBackward
template
<
typename
T
,
typename
InType
=
T
,
typename
OutType
=
T
>
class
AttnLayerNorm
{
public:
AttnLayerNorm
(
const
phi
::
GPUContext
&
dev_ctx
,
...
...
@@ -33,17 +34,28 @@ class AttnLayerNorm {
~
AttnLayerNorm
()
{}
void
ComputeForward
(
const
T
*
x_data
,
void
ComputeForward
(
const
InType
*
x_data
,
const
LayerNormParamType
<
T
>*
scale_data
,
const
LayerNormParamType
<
T
>*
bias_data
,
T
*
y_data
,
OutType
*
y_data
,
LayerNormParamType
<
T
>*
mean_data
,
LayerNormParamType
<
T
>*
var_data
)
{
LayerNormParamType
<
T
>*
var_data
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
auto
stream
=
dev_ctx_
.
stream
();
switch
(
GetDesiredBlockDim
(
feature_size_
))
{
FIXED_BLOCK_DIM_CASE
(
LayerNormForward
<
T
,
LayerNormParamType
<
T
>
,
kBlockDim
>
LayerNormForward
<
T
,
LayerNormParamType
<
T
>
,
kBlockDim
,
false
,
InType
,
OutType
>
<<<
batch_size_
,
kBlockDim
,
0
,
stream
>>>
(
x_data
,
scale_data
,
bias_data
,
...
...
@@ -51,7 +63,13 @@ class AttnLayerNorm {
mean_data
,
var_data
,
epsilon_
,
feature_size_
));
feature_size_
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
));
default:
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Feature_size must be larger than 1"
));
...
...
paddle/fluid/operators/fused/attn_gemm_int8.h
0 → 100644
浏览文件 @
3d7e2118
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <vector>
#include "paddle/fluid/operators/fused/cublaslt.h"
#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
>
class
AttnMatmulINT8
{
public:
AttnMatmulINT8
(
const
phi
::
GPUContext
&
dev_ctx
,
int
m
,
int
n
,
int
k
,
bool
compute_bias
)
:
dev_ctx_
(
dev_ctx
),
m_
(
m
),
n_
(
n
),
k_
(
k
),
compute_bias_
(
compute_bias
)
{
auto
helper
=
std
::
make_shared
<
CublasLtHelper
>
(
m
,
k
,
n
);
helpers_
.
emplace_back
(
helper
);
}
~
AttnMatmulINT8
()
{}
// This function is used to execute GEMM, with input and output's types are
// both T.
void
ComputeForward
(
const
framework
::
Tensor
*
weight
,
const
framework
::
Tensor
*
input
,
framework
::
Tensor
*
input_tmp
,
const
framework
::
Tensor
*
bias
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
output_tmp
,
framework
::
Tensor
*
bias_out
,
const
float
quant_in_scale
,
const
framework
::
Tensor
*
dequant_out_scale
,
const
int
quant_out_scale_offset
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
quantize_kernel_launcher
<
T
>
(
input
->
data
<
T
>
(),
input_tmp
->
data
<
int8_t
>
(),
quant_in_scale
,
m_
,
k_
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
,
dev_ctx_
.
stream
());
helpers_
[
0
]
->
GEMM
(
input_tmp
->
data
<
int8_t
>
(),
weight
->
data
<
int8_t
>
(),
output_tmp
->
data
<
int32_t
>
(),
dev_ctx_
.
stream
());
dequantize_kernel_launcher
<
T
>
(
output_tmp
->
data
<
int32_t
>
(),
output
->
data
<
T
>
(),
m_
,
n_
,
dev_ctx_
.
stream
(),
quant_in_scale
,
dequant_out_scale
->
data
<
float
>
(),
quant_out_scale_offset
);
if
(
compute_bias_
)
{
// bias_out = output + bias
std
::
vector
<
const
framework
::
Tensor
*>
ins
=
{
output
,
bias
};
std
::
vector
<
framework
::
Tensor
*>
outs
=
{
bias_out
};
phi
::
funcs
::
BroadcastKernel
<
phi
::
ElementwiseType
::
kBinary
,
T
,
T
>
(
dev_ctx_
,
ins
,
&
outs
,
-
1
,
phi
::
funcs
::
AddFunctor
<
T
>
());
PADDLE_ENFORCE_EQ
(
cudaGetLastError
(),
cudaSuccess
,
platform
::
errors
::
Fatal
(
"cuda error occured after computing bias. "
"But it does not mean this error is caused by "
"bias computing"
));
}
}
// This function is used to execute GEMM, with input and output's types are
// both INT8.
void
ComputeForwardINT8ToINT8
(
const
framework
::
Tensor
*
weight
,
framework
::
Tensor
*
input
,
const
framework
::
Tensor
*
bias
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
bias_out
)
{
helpers_
[
0
]
->
GEMM
(
input
->
data
<
int8_t
>
(),
weight
->
data
<
int8_t
>
(),
output
->
data
<
int32_t
>
(),
dev_ctx_
.
stream
());
}
// This function is used to execute GEMM, with input and output's types are
// INT8 and T.
void
ComputeForwardINT8ToT
(
const
framework
::
Tensor
*
weight
,
const
float
quant_in_scale
,
framework
::
Tensor
*
input
,
const
framework
::
Tensor
*
bias
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
output_tmp
,
framework
::
Tensor
*
bias_out
,
const
framework
::
Tensor
*
dequant_out_scale
,
const
int
quant_out_scale_offset
)
{
helpers_
[
0
]
->
GEMM
(
input
->
data
<
int8_t
>
(),
weight
->
data
<
int8_t
>
(),
output_tmp
->
data
<
int32_t
>
(),
dev_ctx_
.
stream
());
dequantize_kernel_launcher
<
T
>
(
output_tmp
->
data
<
int32_t
>
(),
output
->
data
<
T
>
(),
m_
,
n_
,
dev_ctx_
.
stream
(),
quant_in_scale
,
dequant_out_scale
->
data
<
float
>
(),
quant_out_scale_offset
);
if
(
compute_bias_
)
{
// bias_out = output + bias
std
::
vector
<
const
framework
::
Tensor
*>
ins
=
{
output
,
bias
};
std
::
vector
<
framework
::
Tensor
*>
outs
=
{
bias_out
};
phi
::
funcs
::
BroadcastKernel
<
phi
::
ElementwiseType
::
kBinary
,
T
,
T
>
(
dev_ctx_
,
ins
,
&
outs
,
-
1
,
phi
::
funcs
::
AddFunctor
<
T
>
());
PADDLE_ENFORCE_EQ
(
cudaGetLastError
(),
cudaSuccess
,
platform
::
errors
::
Fatal
(
"cuda error occured after computing bias. "
"But it does not mean this error is caused by "
"bias computing"
));
}
}
// This function is used to execute GEMM, with input and output's types are T
// and INT8.
void
ComputeForwardTToINT8
(
const
framework
::
Tensor
*
weight
,
const
float
quant_in_scale
,
const
framework
::
Tensor
*
input
,
framework
::
Tensor
*
input_tmp
,
const
framework
::
Tensor
*
bias
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
bias_out
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
quantize_kernel_launcher
<
T
>
(
input
->
data
<
T
>
(),
input_tmp
->
data
<
int8_t
>
(),
quant_in_scale
,
m_
,
k_
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
,
dev_ctx_
.
stream
());
helpers_
[
0
]
->
GEMM
(
input_tmp
->
data
<
int8_t
>
(),
weight
->
data
<
int8_t
>
(),
output
->
data
<
int32_t
>
(),
dev_ctx_
.
stream
());
}
private:
const
phi
::
GPUContext
&
dev_ctx_
;
int
m_
;
// m
int
n_
;
// n
int
k_
;
// k
int
compute_bias_
;
std
::
vector
<
std
::
shared_ptr
<
CublasLtHelper
>>
helpers_
;
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fused/cublaslt.h
0 → 100644
浏览文件 @
3d7e2118
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <sstream>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/dynload/cublasLt.h"
namespace
dyl
=
paddle
::
platform
::
dynload
;
namespace
paddle
{
namespace
operators
{
class
CublasLtHelper
{
public:
CublasLtHelper
(
int
m
,
int
k
,
int
n
)
:
alpha_
(
1
),
beta_
(
0
),
m_
(
m
),
k_
(
k
),
n_
(
n
)
{
cublasStatus_t
status
;
// handle and matmul desc
status
=
dyl
::
cublasLtCreate
(
&
handle_
);
#if CUBLAS_VER_MAJOR < 11
cudaDataType_t
cudaComputeType
=
CUDA_R_32I
;
#else
cublasComputeType_t
cudaComputeType
=
CUBLAS_COMPUTE_32I
;
#endif
PADDLE_ENFORCE_EQ
(
status
,
CUBLAS_STATUS_SUCCESS
,
platform
::
errors
::
External
(
"cublasLtMatrixLayoutCreate execution error"
"refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
"information"
));
#if CUBLAS_VER_MAJOR < 11
status
=
dyl
::
cublasLtMatmulDescCreate
(
&
matmul_desc_
,
cudaComputeType
);
#else
status
=
dyl
::
cublasLtMatmulDescCreate
(
&
matmul_desc_
,
cudaComputeType
,
CUDA_R_32I
);
#endif
PADDLE_ENFORCE_EQ
(
status
,
CUBLAS_STATUS_SUCCESS
,
platform
::
errors
::
External
(
"cublasLtMatmulDescCreate execution error"
"refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
"information"
));
cublasOperation_t
op_transpose
=
CUBLAS_OP_T
;
status
=
dyl
::
cublasLtMatmulDescSetAttribute
(
matmul_desc_
,
CUBLASLT_MATMUL_DESC_TRANSA
,
&
op_transpose
,
sizeof
(
op_transpose
));
PADDLE_ENFORCE_EQ
(
status
,
CUBLAS_STATUS_SUCCESS
,
platform
::
errors
::
External
(
"cublasLtMatmulDescSetAttribute execution error"
"refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
"information"
));
// matrix desc
status
=
dyl
::
cublasLtMatrixLayoutCreate
(
&
B_desc_
,
CUDA_R_8I
,
k
,
n
,
k
);
PADDLE_ENFORCE_EQ
(
status
,
CUBLAS_STATUS_SUCCESS
,
platform
::
errors
::
External
(
"cublasLtMatrixLayoutCreate execution error"
"refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
"information"
));
status
=
dyl
::
cublasLtMatrixLayoutCreate
(
&
A_desc_
,
CUDA_R_8I
,
k
,
m
,
k
);
PADDLE_ENFORCE_EQ
(
status
,
CUBLAS_STATUS_SUCCESS
,
platform
::
errors
::
External
(
"cublasLtMatrixLayoutCreate execution error"
"refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
"information"
));
status
=
dyl
::
cublasLtMatrixLayoutCreate
(
&
C_desc_
,
CUDA_R_32I
,
n
,
m
,
n
);
PADDLE_ENFORCE_EQ
(
status
,
CUBLAS_STATUS_SUCCESS
,
platform
::
errors
::
External
(
"cublasLtMatrixLayoutCreate execution error"
"refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
"information"
));
}
~
CublasLtHelper
()
{
if
(
handle_
)
dyl
::
cublasLtDestroy
(
handle_
);
if
(
matmul_desc_
)
dyl
::
cublasLtMatmulDescDestroy
(
matmul_desc_
);
if
(
A_desc_
)
dyl
::
cublasLtMatrixLayoutDestroy
(
A_desc_
);
if
(
B_desc_
)
dyl
::
cublasLtMatrixLayoutDestroy
(
B_desc_
);
if
(
C_desc_
)
dyl
::
cublasLtMatrixLayoutDestroy
(
C_desc_
);
}
void
GEMM
(
int8_t
*
A_dev
,
const
int8_t
*
B_dev
,
int32_t
*
C_dev
,
cudaStream_t
stream
)
{
cublasStatus_t
status
;
#if __CUDA_ARCH__ >= 800 && CUDA_VERSION >= 11020
cublasLtMatmulAlgo_t
algo
;
int
algoId
=
21
;
int
swizzle
=
0
;
int
customOption
=
0
;
int
tile
=
15
;
int
splitK_val
=
0
;
int
reductionScheme
=
0
;
#if CUDA_VERSION >= 11000
int
stages
=
23
;
#endif
#if CUBLAS_VER_MAJOR < 11
cudaDataType_t
cudaComputeType
=
CUDA_R_32I
;
#else
cublasComputeType_t
cudaComputeType
=
CUBLAS_COMPUTE_32I
;
#endif
dyl
::
cublasLtMatmulAlgoInit
(
handle_
,
cudaComputeType
,
CUDA_R_32I
,
CUDA_R_8I
,
CUDA_R_8I
,
CUDA_R_32I
,
CUDA_R_32I
,
algoId
,
&
algo
);
dyl
::
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION
,
&
(
customOption
),
sizeof
(
customOption
));
dyl
::
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_TILE_ID
,
&
(
tile
),
sizeof
(
tile
));
dyl
::
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_SPLITK_NUM
,
&
(
splitK_val
),
sizeof
(
splitK_val
));
dyl
::
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
,
&
(
swizzle
),
sizeof
(
swizzle
));
dyl
::
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
reductionScheme
),
sizeof
(
int
));
#if CUDA_VERSION >= 11000
dyl
::
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_STAGES_ID
,
&
(
stages
),
sizeof
(
stages
));
#endif
#endif
status
=
dyl
::
cublasLtMatmul
(
handle_
,
matmul_desc_
,
&
alpha_
,
B_dev
,
B_desc_
,
A_dev
,
A_desc_
,
&
beta_
,
C_dev
,
C_desc_
,
C_dev
,
C_desc_
,
#if __CUDA_ARCH__ >= 800 && CUDA_VERSION >= 11020
&
algo
,
#else
nullptr
,
#endif
nullptr
,
0
,
stream
);
PADDLE_ENFORCE_EQ
(
status
,
CUBLAS_STATUS_SUCCESS
,
platform
::
errors
::
External
(
"cublasLtMatmul execution error"
"refer https://docs.nvidia.com/cuda/cublas/index.html to get more "
"information"
));
}
private:
cublasLtHandle_t
handle_
;
cublasLtMatmulDesc_t
matmul_desc_
;
cublasLtMatrixLayout_t
A_desc_
;
cublasLtMatrixLayout_t
B_desc_
;
cublasLtMatrixLayout_t
C_desc_
;
int32_t
alpha_
;
int32_t
beta_
;
int
m_
;
int
k_
;
int
n_
;
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fused/fused_dropout_act_bias.h
浏览文件 @
3d7e2118
...
...
@@ -60,19 +60,32 @@ struct GeluGradFunctor {
* the src, mask and dst shape is (rows, cols)
* the bias shape is (1, cols)
*/
template
<
typename
T
,
typename
MaskType
,
int
VecSize
,
typename
Functor
>
__global__
void
FusedDropoutActBias
(
Functor
act
,
const
uint64_t
seed
,
const
uint64_t
rows
,
const
uint64_t
cols
,
const
int
increment
,
const
float
dropout_prob
,
const
bool
is_upscale_in_train
,
const
bool
is_test
,
const
T
*
__restrict__
src
,
const
T
*
__restrict__
bias
,
T
*
dst
,
MaskType
*
mask
)
{
template
<
typename
T
,
typename
MaskType
,
int
VecSize
,
typename
Functor
,
typename
InType
=
T
,
typename
OutType
=
T
>
__global__
void
FusedDropoutActBias
(
Functor
act
,
const
uint64_t
seed
,
const
uint64_t
rows
,
const
uint64_t
cols
,
const
int
increment
,
const
float
dropout_prob
,
const
bool
is_upscale_in_train
,
const
bool
is_test
,
const
InType
*
__restrict__
src
,
const
T
*
__restrict__
bias
,
OutType
*
dst
,
MaskType
*
mask
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
int
col_id
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
row_id
=
blockIdx
.
y
;
int
idx
=
row_id
*
cols
+
col_id
;
...
...
@@ -90,7 +103,9 @@ __global__ void FusedDropoutActBias(Functor act,
VecSize
,
false
,
true
,
Functor
>
(
r
,
Functor
,
InType
,
OutType
>
(
r
,
i
,
cols
,
&
state
,
...
...
@@ -104,7 +119,14 @@ __global__ void FusedDropoutActBias(Functor act,
is_test
,
nullptr
,
nullptr
,
act
);
act
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
}
}
...
...
@@ -112,7 +134,11 @@ __global__ void FusedDropoutActBias(Functor act,
/**
* @brief dst = dropout(activation(src + bias));
*/
template
<
typename
T
,
typename
MaskType
,
typename
Functor
>
template
<
typename
T
,
typename
MaskType
,
typename
Functor
,
typename
InType
=
T
,
typename
OutType
=
T
>
void
LaunchDropoutActBias
(
Functor
act_functor
,
const
uint64_t
seed
,
const
uint32_t
rows
,
...
...
@@ -121,14 +147,21 @@ void LaunchDropoutActBias(Functor act_functor,
const
float
dropout_prob
,
const
bool
is_upscale_in_train
,
const
bool
is_test
,
const
T
*
src
,
const
InType
*
src
,
const
T
*
bias
,
T
*
dst
,
OutType
*
dst
,
MaskType
*
mask_data
,
const
phi
::
GPUContext
&
ctx
)
{
const
phi
::
GPUContext
&
ctx
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
// dropout_prob == 1.0f
if
(
std
::
abs
(
dropout_prob
-
1.0
f
)
<
1e-5
)
{
SetZero
<
T
>
(
ctx
,
dst
,
rows
*
cols
);
SetZero
<
T
>
(
ctx
,
reinterpret_cast
<
T
*>
(
dst
)
,
rows
*
cols
);
SetZero
<
MaskType
>
(
ctx
,
mask_data
,
rows
*
cols
);
return
;
}
...
...
@@ -137,7 +170,7 @@ void LaunchDropoutActBias(Functor act_functor,
const
int
real_vec_size
=
cols
%
VecSize
==
0
?
VecSize
:
1
;
const
auto
config
=
Get1DBlocksAnd2DGrids
(
ctx
,
rows
,
cols
,
real_vec_size
);
if
(
cols
%
VecSize
==
0
)
{
FusedDropoutActBias
<
T
,
MaskType
,
VecSize
,
Functor
>
FusedDropoutActBias
<
T
,
MaskType
,
VecSize
,
Functor
,
InType
,
OutType
>
<<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
ctx
.
stream
()
>>>
(
act_functor
,
seed
,
...
...
@@ -150,9 +183,13 @@ void LaunchDropoutActBias(Functor act_functor,
src
,
bias
,
dst
,
mask_data
);
mask_data
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
else
{
FusedDropoutActBias
<
T
,
MaskType
,
1
,
Functor
>
FusedDropoutActBias
<
T
,
MaskType
,
1
,
Functor
,
InType
,
OutType
>
<<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
ctx
.
stream
()
>>>
(
act_functor
,
seed
,
...
...
@@ -165,7 +202,11 @@ void LaunchDropoutActBias(Functor act_functor,
src
,
bias
,
dst
,
mask_data
);
mask_data
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
}
...
...
paddle/fluid/operators/fused/fused_dropout_common.h
浏览文件 @
3d7e2118
...
...
@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
...
...
paddle/fluid/operators/fused/fused_dropout_helper.h
浏览文件 @
3d7e2118
...
...
@@ -109,7 +109,10 @@ struct DropoutParam {
}
};
template
<
typename
T
,
typename
MaskType
>
template
<
typename
T
,
typename
MaskType
,
typename
InType
=
T
,
typename
OutType
=
T
>
class
FusedDropoutHelper
{
private:
int
GetIncrement
(
const
phi
::
GPUContext
&
ctx
)
{
...
...
@@ -140,25 +143,34 @@ class FusedDropoutHelper {
// out = residual + dropout( src + bias )
void
ResidualDropoutBias
(
const
phi
::
GPUContext
&
ctx
,
const
T
*
src
,
const
InType
*
src
,
const
T
*
residual
,
const
T
*
bias
,
T
*
out
,
MaskType
*
mask
)
{
OutType
*
out
,
MaskType
*
mask
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
)
{
auto
increment
=
GetIncrement
(
ctx
);
LaunchResidualDropoutBias
<
T
,
MaskType
>
(
rows_
,
cols_
,
increment
,
dropout_param_
.
seed
,
dropout_param_
.
dropout_prob
,
dropout_param_
.
is_test
,
dropout_param_
.
is_upscale_in_train
,
src
,
residual
,
bias
,
mask
,
out
,
ctx
);
LaunchResidualDropoutBias
<
T
,
MaskType
,
InType
,
OutType
>
(
rows_
,
cols_
,
increment
,
dropout_param_
.
seed
,
dropout_param_
.
dropout_prob
,
dropout_param_
.
is_test
,
dropout_param_
.
is_upscale_in_train
,
src
,
residual
,
bias
,
mask
,
out
,
ctx
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
void
ResidualDropoutBiasGrad
(
const
phi
::
GPUContext
&
ctx
,
...
...
@@ -189,15 +201,22 @@ class FusedDropoutHelper {
// out = dropout(activation(src + bias))
void
DropoutActBias
(
const
phi
::
GPUContext
&
ctx
,
const
T
*
src
,
const
InType
*
src
,
const
T
*
bias
,
const
std
::
string
&
act_method
,
T
*
out
,
MaskType
*
mask
)
{
OutType
*
out
,
MaskType
*
mask
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
auto
increment
=
GetIncrement
(
ctx
);
if
(
act_method
==
"gelu"
)
{
GeluFunctor
<
T
>
gelu
;
LaunchDropoutActBias
<
T
,
MaskType
,
GeluFunctor
<
T
>>
(
LaunchDropoutActBias
<
T
,
MaskType
,
GeluFunctor
<
T
>
,
InType
,
OutType
>
(
gelu
,
dropout_param_
.
seed
,
rows_
,
...
...
@@ -210,23 +229,40 @@ class FusedDropoutHelper {
bias
,
out
,
mask
,
ctx
);
ctx
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
else
if
(
act_method
==
"relu"
)
{
phi
::
funcs
::
ReluFunctor
<
T
>
relu
;
LaunchDropoutActBias
<
T
,
MaskType
,
phi
::
funcs
::
ReluFunctor
<
T
>>
(
relu
,
dropout_param_
.
seed
,
rows_
,
cols_
,
increment
,
dropout_param_
.
dropout_prob
,
dropout_param_
.
is_upscale_in_train
,
dropout_param_
.
is_test
,
src
,
bias
,
out
,
mask
,
ctx
);
LaunchDropoutActBias
<
T
,
MaskType
,
phi
::
funcs
::
ReluFunctor
<
T
>
,
InType
,
OutType
>
(
relu
,
dropout_param_
.
seed
,
rows_
,
cols_
,
increment
,
dropout_param_
.
dropout_prob
,
dropout_param_
.
is_upscale_in_train
,
dropout_param_
.
is_test
,
src
,
bias
,
out
,
mask
,
ctx
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Currently only supports gelu or relu activation functions!"
));
...
...
@@ -283,8 +319,12 @@ class FusedDropoutHelper {
DropoutParam
dropout_param_
;
};
template
<
typename
T
,
typename
MaskType
>
class
FusedDropoutLayerNormHelper
:
public
FusedDropoutHelper
<
T
,
MaskType
>
{
template
<
typename
T
,
typename
MaskType
,
typename
InType
=
T
,
typename
OutType
=
T
>
class
FusedDropoutLayerNormHelper
:
public
FusedDropoutHelper
<
T
,
MaskType
,
InType
,
OutType
>
{
public:
FusedDropoutLayerNormHelper
()
{}
FusedDropoutLayerNormHelper
(
const
int
rows
,
...
...
@@ -301,23 +341,24 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
const
int
cols
,
const
DropoutParam
&
dropout_param
,
const
float
epsilon
)
:
FusedDropoutHelper
<
T
,
MaskType
>
(
ctx
,
rows
,
cols
,
dropout_param
)
{
:
FusedDropoutHelper
<
T
,
MaskType
,
InType
,
OutType
>
(
ctx
,
rows
,
cols
,
dropout_param
)
{
using
U
=
LayerNormParamType
<
T
>
;
epsilon_
=
epsilon
;
}
// call layer_norm
void
LayerNorm
(
const
phi
::
GPUContext
&
ctx
,
const
T
*
src
,
const
InType
*
src
,
const
LayerNormParamType
<
T
>*
gamma
,
const
LayerNormParamType
<
T
>*
beta
,
T
*
out
,
OutType
*
out
,
LayerNormParamType
<
T
>*
mean
,
LayerNormParamType
<
T
>*
variance
)
{
using
U
=
LayerNormParamType
<
T
>
;
switch
(
GetDesiredBlockDim
(
this
->
cols_
))
{
FIXED_BLOCK_DIM_CASE
(
LayerNormForward
<
T
,
U
,
kBlockDim
>
LayerNormForward
<
T
,
U
,
kBlockDim
,
false
,
InType
,
OutType
>
<<<
this
->
rows_
,
kBlockDim
,
0
,
ctx
.
stream
()
>>>
(
src
,
gamma
,
beta
,
out
,
mean
,
variance
,
epsilon_
,
this
->
cols_
));
}
...
...
@@ -349,17 +390,25 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
// out = layernorm(residual + dropout(src + bias))
template
<
typename
P
=
LayerNormParamType
<
T
>,
bool
is_same_type
=
false
>
void
LayernormResidualDropoutBias
(
const
phi
::
GPUContext
&
ctx
,
const
T
*
src
,
const
T
*
residual
,
const
T
*
bias
,
const
P
*
gamma
,
const
P
*
beta
,
T
*
dropout_out
,
MaskType
*
mask
,
T
*
out
,
LayerNormParamType
<
T
>*
mean
,
LayerNormParamType
<
T
>*
variance
)
{
void
LayernormResidualDropoutBias
(
const
phi
::
GPUContext
&
ctx
,
const
InType
*
src
,
const
T
*
residual
,
const
T
*
bias
,
const
P
*
gamma
,
const
P
*
beta
,
T
*
dropout_out
,
MaskType
*
mask
,
OutType
*
out
,
LayerNormParamType
<
T
>*
mean
,
LayerNormParamType
<
T
>*
variance
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
using
U
=
LayerNormParamType
<
T
>
;
int
vec_size
=
MAX_CACHE_BYTES
/
sizeof
(
T
);
if
(
this
->
cols_
%
vec_size
!=
0
)
{
...
...
@@ -368,7 +417,12 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
int
threads
=
GetDesiredBlockDim
(
this
->
cols_
/
vec_size
);
int
increment
=
((
this
->
cols_
-
1
)
/
(
threads
*
vec_size
)
+
1
)
*
vec_size
;
increment
=
this
->
dropout_param_
.
UpdateSeedAndIncrement
(
ctx
,
increment
);
LaunchLayernormResidualDropoutBias
<
T
,
MaskType
,
U
,
is_same_type
>
(
LaunchLayernormResidualDropoutBias
<
T
,
MaskType
,
U
,
is_same_type
,
InType
,
OutType
>
(
this
->
rows_
,
this
->
cols_
,
increment
,
...
...
@@ -387,7 +441,14 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
out
,
mean
,
variance
,
ctx
);
ctx
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
template
<
typename
P
=
LayerNormParamType
<
T
>,
bool
is_same_type
=
false
>
...
...
paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc
0 → 100644
浏览文件 @
3d7e2118
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
class
FusedMultiTransformerINT8Op
:
public
framework
::
OperatorWithKernel
{
private:
static
constexpr
const
char
*
OpName
=
"FusedMultiTransformerINT8Op"
;
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
#define CHECK_INPUT(name) \
OP_INOUT_CHECK(ctx->HasInput(#name), "Input", #name, OpName)
#define CHECK_INPUTS(name) \
OP_INOUT_CHECK(ctx->HasInputs(#name), "Input", #name, OpName)
#define CHECK_OUTPUT(name) \
OP_INOUT_CHECK(ctx->HasOutput(#name), "Output", #name, OpName)
#define CHECK_OUTPUTS(name) \
OP_INOUT_CHECK(ctx->HasOutputs(#name), "Output", #name, OpName)
CHECK_INPUT
(
X
);
// attention
CHECK_INPUTS
(
QKVW
);
CHECK_INPUTS
(
OutLinearW
);
if
(
ctx
->
HasInput
(
"TimeStep"
))
{
CHECK_INPUTS
(
CacheKV
);
}
if
(
ctx
->
HasInputs
(
"CacheKV"
))
{
CHECK_OUTPUTS
(
CacheKVOut
);
}
// ffn
CHECK_INPUTS
(
FFN1Weight
);
CHECK_INPUTS
(
FFN2Weight
);
CHECK_OUTPUT
(
Out
);
// x: qkv's input [batch_size, seq_len, dim_embed]
// y: qkv's weight: [3, num_head, dim_head, dim_embed]
auto
x_dim
=
ctx
->
GetInputDim
(
"X"
);
auto
y_dim
=
ctx
->
GetInputsDim
(
"QKVW"
)[
0
];
bool
trans_qkvw
=
ctx
->
Attrs
().
Get
<
bool
>
(
"trans_qkvw"
);
PADDLE_ENFORCE_EQ
(
x_dim
.
size
(),
3
,
platform
::
errors
::
InvalidArgument
(
"The dimensions of x must be 3"
"(batch_size, seq_len, dim_embed),"
"but received dimensions of"
"Input is [%d]"
,
x_dim
.
size
()));
PADDLE_ENFORCE_EQ
(
y_dim
.
size
(),
4
,
platform
::
errors
::
InvalidArgument
(
"The dimensions of qkv_weight must be 4"
"(3, num_head, dim_head, dim_embed),"
"but received dimensions of"
"Input is [%d]"
,
y_dim
.
size
()));
PADDLE_ENFORCE_EQ
(
x_dim
[
2
],
trans_qkvw
?
y_dim
[
3
]
:
y_dim
[
0
],
platform
::
errors
::
InvalidArgument
(
"ShapeError: the dimension of x_dim[2] and y_dim[3](trans_qkvw is "
"true) or y_dim[0](trans_qkvw is false)"
"must be equal. But received: the shape "
"of input x = [%s], and the shape of "
"input qkv_weight = [%s]"
,
x_dim
,
y_dim
));
if
(
ctx
->
Attrs
().
Get
<
int
>
(
"ring_id"
)
==
-
1
)
{
if
(
trans_qkvw
)
{
PADDLE_ENFORCE_EQ
(
y_dim
[
1
]
*
y_dim
[
2
],
y_dim
[
3
],
platform
::
errors
::
InvalidArgument
(
"The dimensions of qkv_weight must be 4"
"(3, num_head, dim_head, dim_embed),"
"and must satisfy the limitations: "
"(num_head * dim_head == dim_embed)"
));
}
else
{
PADDLE_ENFORCE_EQ
(
y_dim
[
2
]
*
y_dim
[
3
],
y_dim
[
0
],
platform
::
errors
::
InvalidArgument
(
"The dimensions of qkv_weight must be 4"
"(dim_embed, 3, num_head, dim_head),"
"and must satisfy the limitations: "
"(num_head * dim_head == dim_embed)"
));
}
}
if
(
ctx
->
HasInputs
(
"CacheKV"
))
{
// [2, batch_size, num_head, max_seq_len, head_size]
const
auto
&
c_dims
=
ctx
->
GetInputsDim
(
"CacheKV"
);
const
auto
&
c_dim
=
c_dims
[
0
];
PADDLE_ENFORCE_EQ
(
c_dim
.
size
(),
5
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"The CacheKV must be 5 dims, but got %d"
,
c_dim
.
size
()));
PADDLE_ENFORCE_EQ
(
c_dim
[
0
],
2
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"The first dim of CacheKV must be 2, but got %d"
,
c_dim
[
0
]));
// 2
PADDLE_ENFORCE_EQ
(
c_dim
[
1
],
x_dim
[
0
],
paddle
::
platform
::
errors
::
InvalidArgument
(
"The second dim of CacheKV must be equal with "
"batch size %d, but got %d"
,
x_dim
[
0
],
c_dim
[
1
]));
// batch_size
PADDLE_ENFORCE_EQ
(
c_dim
[
2
],
trans_qkvw
?
y_dim
[
1
]
:
y_dim
[
2
],
paddle
::
platform
::
errors
::
InvalidArgument
(
"The third dim of CacheKV must be equal with num "
"head %d, but got %d"
,
trans_qkvw
?
y_dim
[
1
]
:
y_dim
[
2
],
c_dim
[
2
]));
// num_head
PADDLE_ENFORCE_GT
(
c_dim
[
3
],
0
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"The forth dim of CacheKV must be greater than 0, but got %d"
,
c_dim
[
3
]));
// cache_seq_len
PADDLE_ENFORCE_EQ
(
c_dim
[
4
],
trans_qkvw
?
y_dim
[
2
]
:
y_dim
[
3
],
paddle
::
platform
::
errors
::
InvalidArgument
(
"The fifth dim of CacheKV must be equal with head "
"size %d, but got %d"
,
trans_qkvw
?
y_dim
[
2
]
:
y_dim
[
3
],
c_dim
[
4
]));
// head_size
}
ctx
->
SetOutputDim
(
"Out"
,
ctx
->
GetInputDim
(
"X"
));
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
OperatorWithKernel
::
IndicateVarDataType
(
ctx
,
"X"
),
ctx
.
GetPlace
());
}
framework
::
OpKernelType
GetKernelTypeForVar
(
const
std
::
string
&
var_name
,
const
Tensor
&
tensor
,
const
framework
::
OpKernelType
&
expected_kernel_type
)
const
override
{
if
(
var_name
==
"TimeStep"
)
{
VLOG
(
10
)
<<
"var_name:"
<<
var_name
<<
" need not to transform"
;
return
expected_kernel_type
;
}
return
framework
::
OpKernelType
(
expected_kernel_type
.
data_type_
,
tensor
.
place
(),
tensor
.
layout
());
}
};
class
FusedMultiTransformerINT8OpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"The input tensor."
);
AddInput
(
"LnScale"
,
"Scale is a 1-dimensional tensor of size "
"H. Here, H represents the last dimension of its input tensor."
)
.
AsDuplicable
();
AddInput
(
"LnBias"
,
"Bias is a 1-dimensional tensor of size "
"H. Here, H represents the last dimension of its input tensor."
)
.
AsDuplicable
();
AddInput
(
"QKVW"
,
"The qkv weight tensor."
).
AsDuplicable
();
AddInput
(
"QKVBias"
,
"The qkv bias tensor."
).
AsDispensable
().
AsDuplicable
();
AddInput
(
"CacheKV"
,
"(optional) The cached KV for generation inference."
)
.
AsDispensable
()
.
AsDuplicable
();
AddInput
(
"TimeStep"
,
"(optional, int) The time step for generation inference."
)
.
AsDispensable
();
AddInput
(
"SrcMask"
,
"(optional) The attention mask tensor in fmha."
)
.
AsDispensable
();
AddInput
(
"OutLinearW"
,
"The out_linear weight tensor."
).
AsDuplicable
();
AddInput
(
"OutLinearBias"
,
"The out_linear bias tensor."
)
.
AsDispensable
()
.
AsDuplicable
();
AddInput
(
"FFNLnScale"
,
"The layer_norm scale of FusedFeedForward op"
)
.
AsDuplicable
();
AddInput
(
"FFNLnBias"
,
"The layer_norm bias of FusedFeedForward op"
)
.
AsDuplicable
();
AddInput
(
"FFN1Weight"
,
"The linear1 weight of FusedFeedForward op"
)
.
AsDuplicable
();
AddInput
(
"FFN1Bias"
,
"The linear1 bias of FusedFeedForward op"
)
.
AsDispensable
()
.
AsDuplicable
();
AddInput
(
"FFN2Weight"
,
"The linear2 weight of FusedFeedForward op"
)
.
AsDuplicable
();
AddInput
(
"FFN2Bias"
,
"The linear2 bias input of FusedFeedForward op"
)
.
AsDispensable
()
.
AsDuplicable
();
AddInput
(
"QKVOutScale"
,
"QKVOutScale is used to dequantize qkv output tensor."
"In order to keep consistent with the PTQ/QAT calculation logic,"
"QKVOutScale should be max_bound * max_bound / max_range."
"Here max_range is per-channel weight scale."
"The shape of QKVOutScale is [num_layers, num_channels]"
)
.
AsDispensable
();
AddInput
(
"OutLinearOutScale"
,
"OutLinearOutScale is used to dequantize out_linear output tensor."
"The definition and shape is the same as QKVOutScale"
)
.
AsDispensable
();
AddInput
(
"FFN1OutScale"
,
"FFN1OutScale is used to dequantize ffn1 output tensor."
"The definition and shape is the same as QKVOutScale"
)
.
AsDispensable
();
AddInput
(
"FFN2OutScale"
,
"FFN2OutScale is used to dequantize ffn2 output tensor."
"The definition and shape is the same as QKVOutScale"
)
.
AsDispensable
();
AddOutput
(
"CacheKVOut"
,
"The updated cache KV. Inplace with CacheKV"
)
.
AsDispensable
()
.
AsDuplicable
();
AddOutput
(
"Out"
,
"Result after multi ."
);
AddAttr
<
bool
>
(
"pre_layer_norm"
,
"if true, the attention op uses pre_layer_norm architecure, "
"else, uses post_layer_norm architecuture. "
"[default true]."
)
.
SetDefault
(
true
);
AddAttr
<
float
>
(
"epsilon"
,
"Constant for numerical stability [default 1e-5]."
)
.
SetDefault
(
1e-5
)
.
AddCustomChecker
([](
const
float
&
epsilon
)
{
PADDLE_ENFORCE_EQ
(
epsilon
>=
0.0
f
&&
epsilon
<=
0.001
f
,
true
,
platform
::
errors
::
InvalidArgument
(
"'epsilon' in Op(LayerNorm) should be between"
"0.0 and 0.001, But received [%s]."
,
epsilon
));
});
AddAttr
<
float
>
(
"dropout_rate"
,
"Probability of setting units to zero."
)
.
SetDefault
(
.5
f
)
.
AddCustomChecker
([](
const
float
&
drop_p
)
{
PADDLE_ENFORCE_EQ
(
drop_p
>=
0.0
f
&&
drop_p
<=
1.0
f
,
true
,
platform
::
errors
::
InvalidArgument
(
"'dropout_rate' must be between 0.0 and 1.0."
));
});
AddAttr
<
bool
>
(
"is_test"
,
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true."
)
.
SetDefault
(
false
);
AddAttr
<
std
::
string
>
(
"dropout_implementation"
,
"[
\"
downgrade_in_infer
\"
|
\"
upscale_in_train
\"
]"
"The meaning is the same as 'attn_dropout_implementation'."
)
.
SetDefault
(
"downgrade_in_infer"
)
.
AddCustomChecker
([](
const
std
::
string
&
type
)
{
PADDLE_ENFORCE_EQ
(
type
==
"downgrade_in_infer"
||
type
==
"upscale_in_train"
,
true
,
platform
::
errors
::
InvalidArgument
(
"dropout_implementation can only be downgrade_in_infer or "
"upscale_in_train"
));
});
AddAttr
<
std
::
string
>
(
"act_method"
,
"act_method"
).
SetDefault
(
"gelu"
);
AddAttr
<
bool
>
(
"trans_qkvw"
,
"Whether the weights of qkv should be transposed. If true,"
"the shape eights of qkv should be [3, num_head, dim_head, dim_embed]."
"Otherwise the shape of weights of qkv should be"
"[dim_embed, 3, num_head, dim_head]"
)
.
SetDefault
(
true
);
AddAttr
<
int
>
(
"ring_id"
,
"ring id for tensor model parallel. distributed training and inference"
)
.
SetDefault
(
-
1
);
AddAttr
<
int
>
(
"num_head"
,
"num_head"
).
SetDefault
(
0
);
AddAttr
<
int
>
(
"dim_head"
,
"dim_head"
).
SetDefault
(
0
);
AddAttr
<
int
>
(
"dim_ffn"
,
"dim_ffn"
).
SetDefault
(
0
);
AddAttr
<
std
::
vector
<
float
>>
(
"qkv_in_scale"
,
"qkv_in_scale is used to quantize qkv input tensor."
"in_scale is generated by PTQ or QAT, which represents valid max range "
"of this tensor."
"the size of qkv_in_scale should be num_layers, which is equal to "
"QKVW.dims()[0]"
)
.
SetDefault
({});
AddAttr
<
std
::
vector
<
float
>>
(
"out_linear_in_scale"
,
"out_linear_in_scale is used to quantize out_linear input tensor."
"the size of out_linear_in_scale is the same as qkv_in_scale"
)
.
SetDefault
({});
AddAttr
<
std
::
vector
<
float
>>
(
"ffn1_in_scale"
,
"ffn1_in_scale is used to quantize ffn1 input tensor."
"the size of ffn1_in_scale is the same as qkv_in_scale"
)
.
SetDefault
({});
AddAttr
<
std
::
vector
<
float
>>
(
"ffn2_in_scale"
,
"ffn2_in_scale is used to quantize ffn2 input tensor."
"the size of ffn2_in_scale is the same as qkv_in_scale"
)
.
SetDefault
({});
AddAttr
<
int
>
(
"quant_round_type"
,
"(int, default 1) The round type of fp32 to int."
"0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2"
"1: rounding to nearest ties away from zero. Eg: round(1.5)=2, "
"round(-2.5)=-3"
)
.
SetDefault
(
1
);
AddAttr
<
float
>
(
"quant_max_bound"
,
"(float, default 127.0) the max bound of float type to int type"
)
.
SetDefault
(
127.0
);
AddAttr
<
float
>
(
"quant_min_bound"
,
"(float, default -127.0) the min bound of float type to int type"
)
.
SetDefault
(
-
127.0
);
AddComment
(
R"DOC(fused multi transformer layers op)DOC"
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fused_multi_transformer_int8
,
ops
::
FusedMultiTransformerINT8Op
,
ops
::
FusedMultiTransformerINT8OpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
<
paddle
::
framework
::
OpDesc
>
,
paddle
::
framework
::
EmptyGradOpMaker
<
paddle
::
imperative
::
OpBase
>
);
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
0 → 100644
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
paddle/fluid/operators/fused/fused_multi_transformer_op.cu
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
paddle/fluid/operators/fused/fused_multi_transformer_op.h
0 → 100644
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
paddle/fluid/operators/fused/fused_residual_dropout_bias.h
浏览文件 @
3d7e2118
...
...
@@ -28,7 +28,9 @@ template <typename T,
int
VecSize
,
bool
ComputeLayerNorm
,
bool
Activation
,
typename
Functor
>
typename
Functor
,
typename
InType
=
T
,
typename
OutType
=
T
>
__forceinline__
__device__
void
FusedResidualDropoutBiasOneThread
(
const
int
row_id
,
const
int
col_id
,
...
...
@@ -36,30 +38,45 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
curandStatePhilox4_32_10_t
*
state
,
const
float
dropout_prob
,
const
T
factor
,
const
T
*
__restrict__
src
,
const
InType
*
__restrict__
src
,
const
T
*
__restrict__
residual
,
const
T
*
__restrict__
bias
,
T
*
dst
,
OutType
*
dst
,
MaskType
*
mask
,
const
bool
is_test
,
typename
details
::
MPTypeTrait
<
T
>::
Type
*
mean_val
,
typename
details
::
MPTypeTrait
<
T
>::
Type
*
var_val
,
Functor
act_func
)
{
Functor
act_func
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
,
const
int
quant_round_type
=
1
,
const
float
quant_max_bound
=
127.0
,
const
float
quant_min_bound
=
-
127.0
)
{
using
LoadT
=
phi
::
AlignedVector
<
T
,
VecSize
>
;
using
LoadInType
=
phi
::
AlignedVector
<
InType
,
VecSize
>
;
using
LoadFloat
=
phi
::
AlignedVector
<
float
,
VecSize
>
;
using
StoreT
=
phi
::
AlignedVector
<
T
,
VecSize
>
;
using
StoreOutType
=
phi
::
AlignedVector
<
OutType
,
VecSize
>
;
using
MaskStoreT
=
phi
::
AlignedVector
<
MaskType
,
VecSize
>
;
using
U
=
typename
details
::
MPTypeTrait
<
T
>::
Type
;
Load
T
src_vec
;
Load
InType
src_vec
;
LoadT
residual_vec
;
LoadT
bias_vec
;
LoadFloat
quant_out_scale_vec
;
#pragma unroll
for
(
int
ii
=
0
;
ii
<
VecSize
;
ii
++
)
{
bias_vec
[
ii
]
=
static_cast
<
T
>
(
0
);
residual_vec
[
ii
]
=
static_cast
<
T
>
(
0
);
}
// vectorize load data from global
phi
::
Load
<
T
,
VecSize
>
(
&
src
[
row_id
*
cols
+
col_id
],
&
src_vec
);
phi
::
Load
<
InType
,
VecSize
>
(
&
src
[
row_id
*
cols
+
col_id
],
&
src_vec
);
phi
::
Load
<
float
,
VecSize
>
(
&
dequant_out_scale_data
[
quant_out_scale_offset
+
col_id
],
&
quant_out_scale_vec
);
if
(
residual
)
{
phi
::
Load
<
T
,
VecSize
>
(
&
residual
[
row_id
*
cols
+
col_id
],
&
residual_vec
);
}
...
...
@@ -84,10 +101,18 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
}
StoreT
dest_vec
;
StoreOutType
dest_vec_out_type
;
#pragma unroll
for
(
int
ii
=
0
;
ii
<
VecSize
;
ii
++
)
{
T
tmp
=
src_vec
[
ii
]
+
bias_vec
[
ii
];
T
tmp
;
if
(
std
::
is_same
<
InType
,
int32_t
>::
value
)
{
T
tmp0
=
static_cast
<
T
>
(
static_cast
<
float
>
(
src_vec
[
ii
])
*
quant_last_in_scale
/
quant_out_scale_vec
[
ii
]);
tmp
=
tmp0
+
bias_vec
[
ii
];
}
else
{
tmp
=
static_cast
<
T
>
(
src_vec
[
ii
])
+
bias_vec
[
ii
];
}
if
(
Activation
)
{
tmp
=
act_func
(
tmp
);
}
...
...
@@ -98,10 +123,23 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
*
mean_val
+=
tmp
;
*
var_val
+=
(
tmp
*
tmp
);
}
if
(
std
::
is_same
<
OutType
,
int8_t
>::
value
)
{
dest_vec_out_type
[
ii
]
=
quant_helper
(
dest_vec
[
ii
],
quant_next_in_scale
,
quant_round_type
,
quant_max_bound
,
quant_min_bound
);
}
}
// store result to global
phi
::
Store
<
T
,
VecSize
>
(
dest_vec
,
&
dst
[
row_id
*
cols
+
col_id
]);
if
(
std
::
is_same
<
OutType
,
int8_t
>::
value
)
{
phi
::
Store
<
OutType
,
VecSize
>
(
dest_vec_out_type
,
&
dst
[
row_id
*
cols
+
col_id
]);
}
else
{
phi
::
Store
<
T
,
VecSize
>
(
dest_vec
,
reinterpret_cast
<
T
*>
(
&
dst
[
row_id
*
cols
+
col_id
]));
}
if
(
!
is_test
)
{
phi
::
Store
<
MaskType
,
VecSize
>
(
mask_vec
,
&
mask
[
row_id
*
cols
+
col_id
]);
}
...
...
@@ -114,19 +152,28 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
* is_test: only used in inference
* mask: can be null if is_test=true
*/
template
<
typename
T
,
typename
MaskType
,
int
VecSize
>
__global__
void
FusedResidualDropoutBias
(
const
size_t
rows
,
const
size_t
cols
,
uint64_t
seed
,
const
float
dropout_prob
,
const
bool
is_upscale_in_train
,
const
T
*
__restrict__
src
,
const
T
*
__restrict__
residual
,
const
T
*
__restrict__
bias
,
MaskType
*
mask
,
T
*
dst
,
uint64_t
increment
,
const
bool
is_test
)
{
template
<
typename
T
,
typename
MaskType
,
int
VecSize
,
typename
InType
=
T
,
typename
OutType
=
T
>
__global__
void
FusedResidualDropoutBias
(
const
size_t
rows
,
const
size_t
cols
,
uint64_t
seed
,
const
float
dropout_prob
,
const
bool
is_upscale_in_train
,
const
InType
*
__restrict__
src
,
const
T
*
__restrict__
residual
,
const
T
*
__restrict__
bias
,
MaskType
*
mask
,
OutType
*
dst
,
uint64_t
increment
,
const
bool
is_test
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
)
{
int
col_id
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
row_id
=
blockIdx
.
y
;
int
idx
=
row_id
*
cols
+
col_id
;
...
...
@@ -142,22 +189,27 @@ __global__ void FusedResidualDropoutBias(const size_t rows,
VecSize
,
false
,
false
,
phi
::
funcs
::
ReluFunctor
<
T
>>
(
r
,
i
,
cols
,
&
state
,
dropout_prob
,
factor
,
src
,
residual
,
bias
,
dst
,
mask
,
is_test
,
nullptr
,
nullptr
,
relu
);
phi
::
funcs
::
ReluFunctor
<
T
>
,
InType
,
OutType
>
(
r
,
i
,
cols
,
&
state
,
dropout_prob
,
factor
,
src
,
residual
,
bias
,
dst
,
mask
,
is_test
,
nullptr
,
nullptr
,
relu
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
}
}
...
...
@@ -165,7 +217,10 @@ __global__ void FusedResidualDropoutBias(const size_t rows,
/**
* @brief dst = residual + dropout(src + bias);
*/
template
<
typename
T
,
typename
MaskType
>
template
<
typename
T
,
typename
MaskType
,
typename
InType
=
T
,
typename
OutType
=
T
>
void
LaunchResidualDropoutBias
(
const
uint32_t
rows
,
const
uint32_t
cols
,
const
int
increment
,
...
...
@@ -173,14 +228,19 @@ void LaunchResidualDropoutBias(const uint32_t rows,
const
float
dropout_prob
,
const
bool
is_test
,
bool
is_upscale_in_train
,
const
T
*
src
,
const
InType
*
src
,
const
T
*
residual
,
const
T
*
bias
,
MaskType
*
mask_data
,
T
*
dst
,
const
phi
::
GPUContext
&
ctx
)
{
OutType
*
dst
,
const
phi
::
GPUContext
&
ctx
,
const
float
quant_last_in_scale
=
1.0
,
const
float
*
dequant_out_scale_data
=
nullptr
,
const
int
quant_out_scale_offset
=
0
,
const
float
quant_next_in_scale
=
1.0
)
{
// dropout_prob == 1.0f
if
(
std
::
abs
(
dropout_prob
-
1.0
f
)
<
1e-5
)
{
// NOTE(minghaoBD): OutType should be T if dropout_prob == 1.0
if
(
residual
==
dst
)
return
;
if
(
residual
)
{
memory
::
Copy
(
ctx
.
GetPlace
(),
...
...
@@ -202,7 +262,7 @@ void LaunchResidualDropoutBias(const uint32_t rows,
const
int
real_vec_size
=
cols
%
VecSize
==
0
?
VecSize
:
1
;
auto
config
=
Get1DBlocksAnd2DGrids
(
ctx
,
rows
,
cols
,
real_vec_size
);
if
(
cols
%
VecSize
==
0
)
{
FusedResidualDropoutBias
<
T
,
uint8_t
,
VecSize
>
FusedResidualDropoutBias
<
T
,
uint8_t
,
VecSize
,
InType
,
OutType
>
<<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
ctx
.
stream
()
>>>
(
rows
,
cols
,
...
...
@@ -215,9 +275,13 @@ void LaunchResidualDropoutBias(const uint32_t rows,
mask_data
,
dst
,
increment
,
is_test
);
is_test
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
else
{
FusedResidualDropoutBias
<
T
,
uint8_t
,
1
>
FusedResidualDropoutBias
<
T
,
uint8_t
,
1
,
InType
,
OutType
>
<<<
config
.
block_per_grid
,
config
.
thread_per_block
,
0
,
ctx
.
stream
()
>>>
(
rows
,
cols
,
...
...
@@ -230,7 +294,11 @@ void LaunchResidualDropoutBias(const uint32_t rows,
mask_data
,
dst
,
increment
,
is_test
);
is_test
,
quant_last_in_scale
,
dequant_out_scale_data
,
quant_out_scale_offset
,
quant_next_in_scale
);
}
}
...
...
paddle/fluid/operators/fused/quant_dequant_kernel.h
0 → 100644
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
paddle/fluid/operators/layer_norm_kernel.cu.h
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
paddle/fluid/platform/dynload/cublasLt.h
浏览文件 @
3d7e2118
...
...
@@ -40,26 +40,28 @@ namespace dynload {
// APIs available after CUDA 10.1
// #if CUDA_VERSION >= 10100
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasLtCreate); \
__macro(cublasLtDestroy); \
__macro(cublasLtMatmul); \
__macro(cublasLtMatmulDescCreate); \
__macro(cublasLtMatmulDescDestroy); \
__macro(cublasLtMatmulDescSetAttribute); \
__macro(cublasLtMatmulDescGetAttribute); \
__macro(cublasLtMatrixLayoutCreate); \
__macro(cublasLtMatrixLayoutDestroy); \
__macro(cublasLtMatrixLayoutSetAttribute); \
__macro(cublasLtMatrixLayoutGetAttribute); \
__macro(cublasLtMatmulPreferenceCreate); \
__macro(cublasLtMatmulPreferenceDestroy); \
__macro(cublasLtMatmulPreferenceSetAttribute); \
__macro(cublasLtMatmulAlgoGetHeuristic); \
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute);
#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasLtCreate); \
__macro(cublasLtDestroy); \
__macro(cublasLtMatmul); \
__macro(cublasLtMatmulDescCreate); \
__macro(cublasLtMatmulDescDestroy); \
__macro(cublasLtMatmulDescSetAttribute); \
__macro(cublasLtMatmulDescGetAttribute); \
__macro(cublasLtMatrixLayoutCreate); \
__macro(cublasLtMatrixLayoutDestroy); \
__macro(cublasLtMatrixLayoutSetAttribute); \
__macro(cublasLtMatrixLayoutGetAttribute); \
__macro(cublasLtMatmulPreferenceCreate); \
__macro(cublasLtMatmulPreferenceDestroy); \
__macro(cublasLtMatmulPreferenceSetAttribute); \
__macro(cublasLtMatmulAlgoGetHeuristic); \
__macro(cublasLtMatrixTransform); \
__macro(cublasLtMatrixTransformDescCreate); \
__macro(cublasLtMatrixTransformDescDestroy); \
__macro(cublasLtMatrixTransformDescSetAttribute); \
__macro(cublasLtMatmulAlgoInit); \
__macro(cublasLtMatmulAlgoConfigSetAttribute);
CUBLASLT_BLAS_ROUTINE_EACH
(
PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP
)
// #endif
...
...
paddle/fluid/pybind/op_function_generator.h
浏览文件 @
3d7e2118
...
...
@@ -71,6 +71,12 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
"FFN1Bias"
,
"FFN2Weight"
,
"FFN2Bias"
}},
{
"fused_multi_transformer_int8"
,
{
"X"
,
"LnScale"
,
"LnBias"
,
"QKVW"
,
"QKVBias"
,
"CacheKV"
,
"TimeStep"
,
"SrcMask"
,
"OutLinearW"
,
"OutLinearBias"
,
"FFNLnScale"
,
"FFNLnBias"
,
"FFN1Weight"
,
"FFN1Bias"
,
"FFN2Weight"
,
"FFN2Bias"
,
"QKVOutScale"
,
"OutLinearOutScale"
,
"FFN1OutScale"
,
"FFN2OutScale"
}},
{
"fused_bias_dropout_residual_layer_norm"
,
{
"X"
,
"Residual"
,
"Bias"
,
"LnScale"
,
"LnBias"
}},
{
"instance_norm"
,
{
"X"
,
"Scale"
,
"Bias"
}},
...
...
@@ -329,6 +335,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
"Beta2PowOut"
,
"MasterParamOut"
}},
{
"fused_multi_transformer"
,
{
"CacheKVOut"
,
"Out"
}},
{
"fused_multi_transformer_int8"
,
{
"CacheKVOut"
,
"Out"
}},
{
"resnet_basic_block"
,
{
"Y"
,
"Conv1"
,
"SavedMean1"
,
"SavedInvstd1"
,
"Mean1Out"
,
"Var1Out"
,
"Conv2"
,
"SavedMean2"
,
"SavedInvstd2"
,
"Mean2Out"
,
...
...
@@ -433,6 +440,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{
"split"
,
{
"Out"
}},
{
"concat"
,
{
"Out"
}},
{
"fused_multi_transformer"
,
{
"CacheKVOut"
}},
{
"fused_multi_transformer_int8"
,
{
"CacheKVOut"
}},
{
"group_norm"
,
{
"Mean"
,
"Variance"
}},
{
"resnet_basic_block"
,
{
"Mean1Out"
,
"Var1Out"
,
"Mean2Out"
,
"Var2Out"
,
"Mean3Out"
,
"Var3Out"
}},
...
...
paddle/phi/backends/dynload/cublasLt.h
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
paddle/phi/backends/dynload/dynamic_loader.cc
浏览文件 @
3d7e2118
...
...
@@ -326,7 +326,7 @@ void* GetCublasDsoHandle() {
void
*
GetCublasLtDsoHandle
()
{
// APIs available after CUDA 10.1
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10
10
0
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10
01
0
return
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
"libcublasLt.so"
);
#else
std
::
string
warning_msg
(
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py
0 → 100644
浏览文件 @
3d7e2118
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录