Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle, in sync with upstream)
Commit f529675c
Authored Sep 19, 2020 by seiriosPlus

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize/large_scale_kv_spped

Parents: c419044d, 99626502
Showing 19 changed files with 1355 additions and 522 deletions (+1355, -522).
cmake/cuda.cmake  +3  -0
paddle/fluid/framework/data_feed.cc  +58  -12
paddle/fluid/framework/fleet/gloo_wrapper.cc  +22  -3
paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc  +3  -3
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu  +133  -81
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h  +144  -34
paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc  +4  -6
paddle/scripts/paddle_build.sh  +44  -1
python/paddle/distributed/fleet/__init__.py  +1  -0
python/paddle/distributed/fleet/base/fleet_base.py  +6  -16
python/paddle/distributed/fleet/base/role_maker.py  +397  -195
python/paddle/distributed/fleet/base/util_factory.py  +5  -34
python/paddle/distributed/fleet/launch.py  +25  -1
python/paddle/distributed/fleet/launch_utils.py  +8  -2
python/paddle/fluid/tests/unittests/test_fleet_base.py  +8  -11
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py  +274  -9
python/paddle/fluid/tests/unittests/test_fleet_util.py  +3  -94
python/paddle/fluid/tests/unittests/test_transformer_api.py  +135  -0
python/paddle/nn/layer/transformer.py  +82  -20
cmake/cuda.cmake

@@ -107,6 +107,9 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
     set(cuda_arch_bin "50")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
+      add_definitions("-DSUPPORTS_CUDA_FP16")
+    endif()
     set(cuda_arch_bin "60 61")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
     if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
...
paddle/fluid/framework/data_feed.cc

@@ -527,6 +527,8 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
         VLOG(0) << "error: the number of ids is a negative number: " << num;
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occured when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       } else if (num == 0) {
         VLOG(0)
...

@@ -536,42 +538,66 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
             "characters.";
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occured when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       } else if (errno == ERANGE || num > INT_MAX) {
         VLOG(0) << "error: the number of ids greater than INT_MAX";
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occured when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       }
       if (all_slots_type_[i] == "float") {
-        for (int i = 0; i < num; ++i) {
+        for (int j = 0; j < num; ++j) {
           strtof(endptr, &endptr);
           if (errno == ERANGE) {
             VLOG(0) << "error: the value is out of the range of "
                        "representable values for float";
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             return false;
           }
-          if (i + 1 != num && endptr - str == len) {
+          if (j + 1 != num && endptr - str == len) {
             VLOG(0) << "error: there is a wrong with the number of ids.";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
         }
       } else if (all_slots_type_[i] == "uint64") {
-        for (int i = 0; i < num; ++i) {
+        for (int j = 0; j < num; ++j) {
           strtoull(endptr, &endptr, 10);
           if (errno == ERANGE) {
             VLOG(0) << "error: the value is out of the range of "
                        "representable values for uint64_t";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
-          if (i + 1 != num && endptr - str == len) {
+          if (j + 1 != num && endptr - str == len) {
             VLOG(0) << "error: there is a wrong with the number of ids.";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
...

@@ -632,8 +658,13 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s",
-            str));
+            "characters.\nplease check this error line: %s,\n Specifically, "
+            "something wrong happened(the length of this slot's feasign is 0)"
+            "when we parse the %d th slots."
+            "Maybe something wrong around this slot"
+            "\nWe detect the feasign number of this slot is %d, "
+            "which is illegal.",
+            str, i, num));
       if (idx != -1) {
         (*instance)[idx].Init(all_slots_type_[i]);
         if ((*instance)[idx].GetType()[0] == 'f') {  // float
...

@@ -683,8 +714,13 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s.",
-            str));
+            "characters.\nplease check this error line: %s,\n Specifically, "
+            "something wrong happened(the length of this slot's feasign is 0)"
+            "when we parse the %d th slots."
+            "Maybe something wrong around this slot"
+            "\nWe detect the feasign number of this slot is %d, "
+            "which is illegal.",
+            str, i, num));
       if (idx != -1) {
         (*instance)[idx].Init(all_slots_type_[i]);
...

@@ -916,8 +952,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s.",
-            str));
+            "characters.\nplease check this error line: %s,\n Specifically, "
+            "something wrong happened(the length of this slot's feasign is 0)"
+            "when we parse the %d th slots."
+            "Maybe something wrong around this slot"
+            "\nWe detect the feasign number of this slot is %d, "
+            "which is illegal.",
+            str, i, num));
       if (idx != -1) {
         if (all_slots_type_[i][0] == 'f') {  // float
           for (int j = 0; j < num; ++j) {
...

@@ -982,8 +1023,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
             "The number of ids can not be zero, you need padding "
             "it in data generator; or if there is something wrong with "
             "the data, please check if the data contains unresolvable "
-            "characters.\nplease check this error line: %s.",
-            str));
+            "characters.\nplease check this error line: %s,\n Specifically, "
+            "something wrong happened(the length of this slot's feasign is 0)"
+            "when we parse the %d th slots."
+            "Maybe something wrong around this slot"
+            "\nWe detect the feasign number of this slot is %d, "
+            "which is illegal.",
+            str, i, num));
       if (idx != -1) {
         if (all_slots_type_[i][0] == 'f') {  // float
...
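The new messages point the user at the exact slot whose feasign count is zero. To make that case concrete, here is a minimal sketch of the MultiSlot text format the parser checks, written as illustrative Python; the helper name and the padding value 0 are assumptions for illustration, not part of this commit.

    # Hypothetical illustration of the MultiSlot text format checked above:
    # each slot is written as "<num_ids> <id_1> ... <id_num>", slots joined
    # on one line. A slot with num_ids == 0 triggers the new error message.

    def make_line(slots):
        """slots: list of lists of feasign ids; pad empty slots so num_ids > 0."""
        parts = []
        for ids in slots:
            if not ids:          # the case the new message complains about
                ids = [0]        # pad with a default feasign in the data generator
            parts.append(str(len(ids)) + " " + " ".join(str(i) for i in ids))
        return " ".join(parts)

    print(make_line([[17, 42], [], [7]]))  # -> "2 17 42 1 0 1 7"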
paddle/fluid/framework/fleet/gloo_wrapper.cc

@@ -19,6 +19,8 @@ limitations under the License. */
 namespace gloo {
 namespace rendezvous {

+constexpr int kNodeSize = 136;
+
 HdfsStore::HdfsStore(const std::string& path) {
   path_ = path;
   wait_sleep_ms_ = 10000;
...

@@ -213,12 +215,14 @@ void ParallelConnectContext::connectFullMesh(
   storeKey << rank;
   store.set(storeKey.str(), allBytes);

+  auto total_add_size = kNodeSize * (size - 1);
+
   std::vector<std::shared_ptr<std::thread>> connect_threads(thread_num_);
   // Connect every pair
   for (uint32_t i = 0; i < connect_threads.size(); ++i) {
     connect_threads[i].reset(new std::thread(
-        [&store, &transportContext, this](size_t thread_idx,
-                                          size_t thread_num) -> void {
+        [&store, &transportContext, total_add_size, this](
+            size_t thread_idx, size_t thread_num) -> void {
          for (int i = thread_idx; i < size; i += thread_num) {
            if (i == rank) {
              continue;
...

@@ -226,8 +230,23 @@ void ParallelConnectContext::connectFullMesh(
            // Wait for address of other side of this pair to become available
            std::string key = std::to_string(i);
            store.wait({key}, getTimeout());
+
+           std::vector<char> allAddrs;
+           auto max_retry_times = 5;
            // Connect to other side of this pair
-           auto allAddrs = store.get(key);
+           while (max_retry_times > 0) {
+             allAddrs = store.get(key);
+             VLOG(3) << "store get all address size: " << allAddrs.size()
+                     << " except: " << total_add_size;
+             if (allAddrs.size() == static_cast<size_t>(total_add_size)) {
+               break;
+             }
+             --max_retry_times;
+           }
+
            auto addr = extractAddress(allAddrs, i);
            transportContext->getPair(i)->connect(addr);
          }
...
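The connectFullMesh change replaces a single store.get(key) with a bounded retry that waits until the fetched payload reaches the expected size, kNodeSize * (size - 1) bytes. A minimal Python sketch of the same retry logic, with a hypothetical store object standing in for the gloo store:

    def get_full_addrs(store, key, total_add_size, max_retry_times=5):
        """Retry store.get until the payload reaches the expected byte size,
        mirroring the bounded while-loop added to connectFullMesh."""
        all_addrs = store.get(key)
        while max_retry_times > 0:
            all_addrs = store.get(key)
            if len(all_addrs) == total_add_size:
                break
            max_retry_times -= 1
        return all_addrs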
paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc

@@ -80,10 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
   nvinfer1::ILayer* layer = nullptr;

   if (engine_->with_dynamic_shape()) {
-    plugin::DynamicPluginTensorRT* plugin = nullptr;
-    plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
+    auto use_fp16 = engine_->WithFp16();
+    auto plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
         input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-        eps);
+        eps, use_fp16);
     layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
   } else {
     PADDLE_THROW(platform::errors::Fatal(
...
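The converter now reads the engine's fp16 flag via engine_->WithFp16() instead of hard-coding a float plugin. On the user side that flag ultimately comes from the inference config. A rough sketch of how a TensorRT fp16 engine was enabled through the Python inference API of this era; the model path is hypothetical and the exact method signature may differ between Paddle versions, so treat this as an assumption rather than a fixed API:

    from paddle.fluid.core import AnalysisConfig

    config = AnalysisConfig("./ernie_model")   # hypothetical model directory
    config.enable_use_gpu(100, 0)
    # Precision.Half is what makes engine_->WithFp16() true inside converters.
    config.enable_tensorrt_engine(
        workspace_size=1 << 30,
        max_batch_size=1,
        min_subgraph_size=3,
        precision_mode=AnalysisConfig.Precision.Half,
        use_static=False,
        use_calib_mode=False)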
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu

@@ -32,13 +32,34 @@ namespace plugin {
 #if IS_TRT_VERSION_GE(6000)

 template <typename T>
-int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
+EmbEltwiseLayernormPluginDynamicImpl<
+    T>::~EmbEltwiseLayernormPluginDynamicImpl() {
+  this->terminate();
+}
+
+inline half fp32tofp16(float x) { return static_cast<half>(x); }
+
+template <typename T>
+int EmbEltwiseLayernormPluginDynamicImpl<T>::initialize() {
   embs_gpu_.resize(embs_.size());
   for (int i = 0; i < embs_.size(); i++) {
     if (embs_[i]) {
-      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
-      cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float),
+      T* host_ptr;
+      auto size = emb_sizes_[i];
+      if (std::is_same<T, half>::value) {
+        host_ptr = new T[size];
+        std::transform(embs_[i], (embs_[i] + size), host_ptr, fp32tofp16);
+      } else {
+        host_ptr = reinterpret_cast<T*>(embs_[i]);
+      }
+      cudaMalloc(&embs_gpu_[i], sizeof(T) * size);
+      cudaMemcpy(embs_gpu_[i], host_ptr, size * sizeof(T),
                  cudaMemcpyHostToDevice);
+      if (std::is_same<T, half>::value) {
+        delete[] host_ptr;
+      }
     }
   }
...

@@ -53,11 +74,105 @@ int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
                cudaMemcpyHostToDevice);
   }

+  int input_num = embs_.size();
+  in_ptr_tensor_.Resize({input_num});
+  emb_ptr_tensor_.Resize({input_num});
+  cudaGetDevice(&device_id_);
+  auto emb_ptr_gpu_d =
+      emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
+  cudaMemcpy(emb_ptr_gpu_d, embs_gpu_.data(), sizeof(uintptr_t) * input_num,
+             cudaMemcpyHostToDevice);
+
   return 0;
 }

 template <typename T>
-nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
+void EmbEltwiseLayernormPluginDynamicImpl<T>::terminate() {
+  for (int i = 0; i < embs_gpu_.size(); ++i) {
+    if (embs_gpu_[i]) {
+      cudaFree(embs_gpu_[i]);
+      embs_gpu_[i] = nullptr;
+    }
+  }
+  if (bias_gpu_) {
+    cudaFree(bias_gpu_);
+    bias_gpu_ = nullptr;
+  }
+  if (scale_gpu_) {
+    cudaFree(scale_gpu_);
+    scale_gpu_ = nullptr;
+  }
+}
+
+template <typename T>
+int EmbEltwiseLayernormPluginDynamicImpl<T>::enqueue(
+    const nvinfer1::PluginTensorDesc* input_desc,
+    const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) {
+  auto id_dims = input_desc[0].dims;
+  int batch = id_dims.d[0];
+  int seq_len = id_dims.d[1];
+  int input_num = embs_.size();
+
+  auto in_ptr_gpu_d =
+      in_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
+  auto emb_ptr_gpu_d =
+      emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
+
+  auto new_input_ptr = reinterpret_cast<uintptr_t>(inputs[0]);
+  if (old_input_ptr_ != new_input_ptr) {
+    old_input_ptr_ = new_input_ptr;
+    cudaMemcpyAsync(in_ptr_gpu_d, reinterpret_cast<const void*>(inputs),
+                    sizeof(uintptr_t) * input_num, cudaMemcpyHostToDevice,
+                    stream);
+  }
+
+  auto out_type = output_desc[0].type;
+  if (std::is_same<T, float>::value) {
+    PADDLE_ENFORCE_EQ(
+        out_type == nvinfer1::DataType::kFLOAT, true,
+        platform::errors::InvalidArgument(
+            "The EmbEltwiseLayernorm Plugin only support fp32 input."));
+  } else if (std::is_same<T, half>::value) {
+    PADDLE_ENFORCE_EQ(
+        out_type == nvinfer1::DataType::kHALF, true,
+        platform::errors::InvalidArgument(
+            "The EmbEltwiseLayernorm Plugin only support fp16 input."));
+  } else {
+    PADDLE_THROW(platform::errors::Fatal(
+        "Unsupport data type, the out type of EmbEltwiseLayernorm should be "
+        "float or half."));
+  }
+
+  auto* output_d = reinterpret_cast<T*>(outputs[0]);
+  operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
+  emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
+                             scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
+                             eps_, input_num, stream);
+  return cudaGetLastError() != cudaSuccess;
+}
+
+template class EmbEltwiseLayernormPluginDynamicImpl<float>;
+#ifdef SUPPORTS_CUDA_FP16
+template class EmbEltwiseLayernormPluginDynamicImpl<half>;
+#endif  // SUPPORTS_CUDA_FP16
+
+int EmbEltwiseLayernormPluginDynamic::initialize() {
+  impl_->initialize();
+  return 0;
+}
+
+void EmbEltwiseLayernormPluginDynamic::terminate() { impl_->terminate(); }
+
+nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
     nvinfer1::IExprBuilder& expr_builder) {  // NOLINT
   PADDLE_ENFORCE_EQ(output_index, 0,
...

@@ -76,18 +191,7 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
   return ret;
 }

-template <typename T>
-void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
-  for (auto ptr : embs_gpu_) {
-    if (ptr) cudaFree(ptr);
-  }
-  if (bias_gpu_) cudaFree(bias_gpu_);
-  if (scale_gpu_) cudaFree(scale_gpu_);
-}
-
-template <typename T>
-bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
+bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs,
     int nb_outputs) {
   PADDLE_ENFORCE_NOT_NULL(
...

@@ -98,6 +202,11 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
                         "The EmbEltwiseLayerNorm's output should be one"
                         "but it's (%d) outputs.",
                         nb_outputs));
+  PADDLE_ENFORCE_EQ(nb_outputs, 1,
+                    platform::errors::InvalidArgument(
+                        "The EmbEltwiseLayerNorm's output should be one"
+                        "but it's (%d) outputs.",
+                        nb_outputs));
   PADDLE_ENFORCE_LT(
       pos, nb_inputs + nb_outputs,
       platform::errors::InvalidArgument("The pos(%d) should be less than the "
...

@@ -122,7 +231,7 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
   }

   if (pos == all_nums - 1) {
-    if (sizeof(T) == sizeof(float)) {
+    if (with_fp16_ == false) {
       return desc.type == nvinfer1::DataType::kFLOAT;
     } else {
       return desc.type == nvinfer1::DataType::kHALF;
...

@@ -131,84 +240,27 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
   return false;
 }

-template <typename T>
-nvinfer1::DataType EmbEltwiseLayernormPluginDynamic<T>::getOutputDataType(
+nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType* input_types, int nb_inputs) const {
   PADDLE_ENFORCE_EQ(
       index, 0,
       platform::errors::InvalidArgument(
           "The EmbEltwiseLayernorm Plugin only has one input, so the "
           "index value should be 0, but get %d.",
           index));
-  return nvinfer1::DataType::kFLOAT;
+  if (with_fp16_)
+    return nvinfer1::DataType::kHALF;
+  else
+    return nvinfer1::DataType::kFLOAT;
 }

-template <typename T>
-int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
+int EmbEltwiseLayernormPluginDynamic::enqueue(
     const nvinfer1::PluginTensorDesc* input_desc,
     const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs,
     void* const* outputs, void* workspace, cudaStream_t stream) {
-  auto id_dims = input_desc[0].dims;
-  int batch = id_dims.d[0];
-  int seq_len = id_dims.d[1];
-  int input_num = embs_.size();
-  framework::Tensor in_ptr_tensor, emb_ptr_tensor;
-  int device_id;
-  cudaGetDevice(&device_id);
-  in_ptr_tensor.Resize({input_num});
-  emb_ptr_tensor.Resize({input_num});
-  int64_t* in_ptr_gpu_d =
-      in_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-  int64_t* emb_ptr_gpu_d =
-      emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-  std::vector<uintptr_t> in_ptr, emb_ptr;
-  for (int i = 0; i < input_num; i++) {
-    in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
-    emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
-  }
-  cudaMemcpyAsync(in_ptr_gpu_d, in_ptr.data(), sizeof(int64_t) * input_num,
-                  cudaMemcpyHostToDevice, stream);
-  cudaMemcpyAsync(emb_ptr_gpu_d, emb_ptr.data(), sizeof(int64_t) * input_num,
-                  cudaMemcpyHostToDevice, stream);
-  auto out_type = output_desc[0].type;
-  const unsigned tpb = 256;
-  const dim3 grid(seq_len, batch, 1);
-  const dim3 block(tpb, 1, 1);
-  if (sizeof(T) == sizeof(float)) {
-    PADDLE_ENFORCE_EQ(
-        out_type == nvinfer1::DataType::kFLOAT, true,
-        platform::errors::InvalidArgument(
-            "The EmbEltwiseLayernorm Plugin only support fp32 input."));
-  } else if (sizeof(T) == sizeof(int16_t)) {
-    PADDLE_ENFORCE_EQ(
-        out_type == nvinfer1::DataType::kHALF, true,
-        platform::errors::InvalidArgument(
-            "The EmbEltwiseLayernorm Plugin only support fp16 input."));
-  } else {
-    PADDLE_THROW(platform::errors::Fatal(
-        "Unsupport data type, the out type of EmbEltwiseLayernorm should be "
-        "float or half."));
-  }
-  T* output_d = static_cast<T*>(outputs[0]);
-  operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
-  emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
-                             scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
-                             eps_, input_num, stream);
+  impl_->enqueue(input_desc, output_desc, inputs, outputs, workspace, stream);
   return cudaGetLastError() != cudaSuccess;
 }

-template class EmbEltwiseLayernormPluginDynamic<float>;
-#ifdef SUPPORTS_CUDA_FP16
-template class EmbEltwiseLayernormPluginDynamic<half>;
-#endif  // SUPPORTS_CUDA_FP16
-
 #endif
 }  // namespace plugin
...
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h

@@ -27,14 +27,76 @@ namespace tensorrt {
 namespace plugin {

 #if IS_TRT_VERSION_GE(6000)
+
+class EmbEltwiseLayernormPluginDynamicImplBase {
+ public:
+  EmbEltwiseLayernormPluginDynamicImplBase() {}
+  virtual ~EmbEltwiseLayernormPluginDynamicImplBase() {}
+
+  virtual int initialize() = 0;
+  virtual void terminate() = 0;
+  virtual int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                      const nvinfer1::PluginTensorDesc* outputDesc,
+                      const void* const* inputs, void* const* outputs,
+                      void* workspace, cudaStream_t stream) = 0;
+};
+
+template <typename T>
+class EmbEltwiseLayernormPluginDynamicImpl
+    : public EmbEltwiseLayernormPluginDynamicImplBase {
+ public:
+  explicit EmbEltwiseLayernormPluginDynamicImpl(std::vector<float*> input_embs,
+                                                float* bias, float* scale,
+                                                std::vector<int> emb_sizes,
+                                                int bias_size, int scale_size,
+                                                int hidden_size, float eps)
+      : embs_(input_embs),
+        bias_(bias),
+        scale_(scale),
+        emb_sizes_(emb_sizes),
+        bias_size_(bias_size),
+        scale_size_(scale_size),
+        hidden_size_(hidden_size),
+        eps_(eps) {}
+
+  ~EmbEltwiseLayernormPluginDynamicImpl();
+
+  int initialize();
+  void terminate();
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs,
+              void* workspace, cudaStream_t stream);
+
+ private:
+  std::vector<float*> embs_;
+  float* bias_{nullptr};
+  float* scale_{nullptr};
+
+  // data on devices
+  float* bias_gpu_{nullptr};
+  float* scale_gpu_{nullptr};
+  std::vector<T*> embs_gpu_;
+
+  std::vector<int> emb_sizes_;
+  int bias_size_;
+  int scale_size_;
+  int hidden_size_;
+  float eps_;
+
+  framework::Tensor in_ptr_tensor_, emb_ptr_tensor_;
+  int device_id_{0};
+  uintptr_t old_input_ptr_{0};
+};
+
 class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
  public:
   explicit EmbEltwiseLayernormPluginDynamic(std::vector<float*> input_embs,
                                             float* bias, float* scale,
                                             std::vector<int> emb_sizes,
                                             int bias_size, int scale_size,
-                                            int hidden_size, float eps)
+                                            int hidden_size, float eps,
+                                            bool with_fp16)
       : embs_(input_embs),
         bias_(bias),
         scale_(scale),
...

@@ -42,51 +104,81 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
         bias_size_(bias_size),
         scale_size_(scale_size),
         hidden_size_(hidden_size),
-        eps_(eps) {}
+        eps_(eps),
+        with_fp16_(with_fp16),
+        own_host_buff_(false) {
+    if (with_fp16) {
+#ifdef SUPPORTS_CUDA_FP16
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+#else
+      PADDLE_THROW(platform::errors::Fatal(
+          "Unsupported data type, current GPU doesn't support half."));
+#endif  // SUPPORTS_CUDA_FP16
+    } else {
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+    }
+  }

   EmbEltwiseLayernormPluginDynamic(void const* serial_data,
-                                   size_t serial_length) {
+                                   size_t serial_length)
+      : own_host_buff_(true) {
     DeserializeValue(&serial_data, &serial_length, &emb_sizes_);

-    embs_gpu_.resize(emb_sizes_.size());
+    embs_.resize(emb_sizes_.size());
     for (size_t i = 0; i < emb_sizes_.size(); i++) {
-      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
-      cudaMemcpy(embs_gpu_[i], serial_data, emb_sizes_[i] * sizeof(float),
-                 cudaMemcpyHostToDevice);
+      auto size = emb_sizes_[i];
+      auto ptr = new float[size];
+      memcpy(ptr, serial_data, sizeof(float) * size);
+      embs_[i] = ptr;
       reinterpret_cast<char const*&>(serial_data) +=
           emb_sizes_[i] * sizeof(float);
       serial_length -= emb_sizes_[i] * sizeof(float);
-      embs_[i] = nullptr;
     }
     DeserializeValue(&serial_data, &serial_length, &bias_size_);
     DeserializeValue(&serial_data, &serial_length, &scale_size_);

-    cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
-    cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float),
-               cudaMemcpyHostToDevice);
+    bias_ = nullptr;
+    if (bias_size_) {
+      bias_ = new float[bias_size_];
+      memcpy(bias_, serial_data, sizeof(float) * bias_size_);
+    }
     reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
     serial_length -= bias_size_ * sizeof(float);

-    cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
-    cudaMemcpy(scale_gpu_, serial_data, scale_size_ * sizeof(float),
-               cudaMemcpyHostToDevice);
+    scale_ = nullptr;
+    if (scale_size_) {
+      scale_ = new float[scale_size_];
+      memcpy(scale_, serial_data, sizeof(float) * scale_size_);
+    }
     reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float);
     serial_length -= scale_size_ * sizeof(float);

     DeserializeValue(&serial_data, &serial_length, &hidden_size_);
     DeserializeValue(&serial_data, &serial_length, &eps_);
+    DeserializeValue(&serial_data, &serial_length, &with_fp16_);
+
+    if (with_fp16_) {
+#ifdef SUPPORTS_CUDA_FP16
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+#else
+      PADDLE_THROW(platform::errors::Fatal(
+          "Unsupported data type, current GPU doesn't support half."));
+#endif  // SUPPORTS_CUDA_FP16
+    } else {
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+    }
   }

   nvinfer1::IPluginV2DynamicExt* clone() const override {
     auto ptr = new EmbEltwiseLayernormPluginDynamic(
-        embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
-        hidden_size_, eps_);
-    ptr->embs_gpu_ = embs_gpu_;
-    ptr->bias_gpu_ = bias_gpu_;
-    ptr->scale_gpu_ = scale_gpu_;
+        embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+        hidden_size_, eps_, with_fp16_);
     return ptr;
   }
...

@@ -95,6 +187,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
   }
   int getNbOutputs() const override { return 1; }
   int initialize() override;
+  void terminate() override;

   size_t getSerializationSize() const override {
     int sum_num = 0;
...

@@ -110,24 +203,32 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
     sum_num += (bias_size_ + scale_size_) * sizeof(float);
     sum_num += SerializedSize(hidden_size_);
     sum_num += SerializedSize(eps_);
-    // sum_num += SerializedSize(with_fp16_);
+    sum_num += SerializedSize(with_fp16_);

     return sum_num;
   }

-  void terminate() override;
   void serialize(void* buffer) const override {
-    // SerializeValue(&buffer, with_fp16_);
     SerializeValue(&buffer, emb_sizes_);
     for (size_t i = 0; i < emb_sizes_.size(); i++) {
-      SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]);
+      auto size = emb_sizes_[i];
+      for (int j = 0; j < size; ++j) {
+        SerializeValue(&buffer, embs_[i][j]);
+      }
     }
     SerializeValue(&buffer, bias_size_);
     SerializeValue(&buffer, scale_size_);
-    SerializeCudaPointer(&buffer, bias_gpu_, bias_size_);
-    SerializeCudaPointer(&buffer, scale_gpu_, scale_size_);
+    for (int i = 0; i < bias_size_; ++i) {
+      SerializeValue(&buffer, bias_[i]);
+    }
+    for (int i = 0; i < scale_size_; ++i) {
+      SerializeValue(&buffer, scale_[i]);
+    }
     SerializeValue(&buffer, hidden_size_);
     SerializeValue(&buffer, eps_);
+    SerializeValue(&buffer, with_fp16_);
   }

   nvinfer1::DimsExprs getOutputDimensions(
...

@@ -158,23 +259,33 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
       const nvinfer1::DataType* input_types,
       int nb_inputs) const override;

-  void destroy() override { delete this; }
+  void destroy() override {
+    if (own_host_buff_) {
+      for (auto ptr : embs_) {
+        delete[] ptr;
+      }
+      delete[] bias_;
+      delete[] scale_;
+    }
+    delete impl_;
+    delete this;
+  }

  private:
   std::vector<float*> embs_;
   float* bias_;
   float* scale_;

-  // data on devices
-  float* bias_gpu_;
-  float* scale_gpu_;
-  std::vector<float*> embs_gpu_;
-
   std::vector<int> emb_sizes_;
   int bias_size_;
   int scale_size_;
   int hidden_size_;
   float eps_;
+  bool with_fp16_;
+  bool own_host_buff_{false};
+
+  EmbEltwiseLayernormPluginDynamicImplBase* impl_{nullptr};
 };

 class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
...

@@ -198,8 +309,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
   nvinfer1::IPluginV2* deserializePlugin(const char* name,
                                          const void* serial_data,
                                          size_t serial_length) override {
-    return new EmbEltwiseLayernormPluginDynamic<float>(serial_data,
-                                                       serial_length);
+    return new EmbEltwiseLayernormPluginDynamic(serial_data, serial_length);
   }

   void setPluginNamespace(const char* lib_namespace) override {
...
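The plugin now keeps host-side copies of the embeddings, bias and scale and serializes them value by value, followed by the new with_fp16_ flag, instead of dumping device buffers with SerializeCudaPointer. The following Python struct packing is only an illustration of that field order as read off serialize() above; the concrete framing (vector length prefix, endianness, padding) used by Paddle's SerializeValue is an assumption here, not taken from this commit.

    import struct

    def pack_plugin(emb_sizes, embs, bias, scale, hidden_size, eps, with_fp16):
        """Illustrative layout only: [emb_sizes][emb data][bias_size][scale_size]
        [bias][scale][hidden_size][eps][with_fp16], mirroring serialize()."""
        buf = struct.pack("<q", len(emb_sizes))              # length prefix (assumed)
        buf += struct.pack("<%di" % len(emb_sizes), *emb_sizes)
        for e in embs:
            buf += struct.pack("<%df" % len(e), *e)          # host copies, not GPU dumps
        buf += struct.pack("<ii", len(bias), len(scale))
        buf += struct.pack("<%df" % len(bias), *bias)
        buf += struct.pack("<%df" % len(scale), *scale)
        buf += struct.pack("<if?", hidden_size, eps, with_fp16)  # flag now serialized
        return buf

    blob = pack_plugin([2, 2], [[0.1, 0.2], [0.3, 0.4]], [0.0], [1.0], 4, 1e-5, True)
    print(len(blob))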
paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc

@@ -151,7 +151,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
   run(config, &out_data);         // serialize
   run(*config_deser, &out_data);  // deserialize
   for (size_t i = 0; i < out_data.size(); i++) {
-    EXPECT_NEAR(result[i], out_data[i], 1e-6);
+    EXPECT_NEAR(result[i], out_data[i], 1e-2);
   }
 }
...

@@ -159,13 +159,11 @@ TEST(AnalysisPredictor, no_fp16) {
   std::vector<float> result = {0.597841, 0.219972, 0.182187};
   trt_ernie(false, result);
 }

-TEST(AnalysisPredictor, fp16) {
-#ifdef SUPPORTS_CUDA_FP16
-  std::vector<float> result = {0.598336, 0.219558, 0.182106};
+#ifdef SUPPORTS_CUDA_FP16
+TEST(AnalysisPredictor, fp16) {
+  std::vector<float> result = {0.59923654, 0.21923761, 0.18152587};
   trt_ernie(true, result);
-#endif
 }
+#endif  // SUPPORTS_CUDA_FP16

 }  // namespace inference
 }  // namespace paddle
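The tolerance loosens from 1e-6 to 1e-2 because the fp16 path now actually runs in half precision, which carries roughly three significant decimal digits. A quick numpy check of the per-value rounding magnitude, with one of the test's expected values as the sample input:

    import numpy as np

    x = np.float32(0.59923654)
    err = abs(np.float32(np.float16(x)) - x)
    print(err)  # on the order of 1e-4 for one value; error accumulated across
                # many fp16 layers motivates the looser 1e-2 test tolerance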
paddle/scripts/paddle_build.sh

@@ -121,6 +121,18 @@ function cmake_base() {
             else
                 exit 1
             fi
+        elif [ "$1" == "cp38-cp38" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3
+                -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.8/include/python3.8/
+                -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/libpython3.8.dylib"
+                pip3.8 install --user -r ${PADDLE_ROOT}/python/requirements.txt
+            else
+                exit 1
+            fi
         fi
         # delete `gym` to avoid modifying requirements.txt in *.whl
         sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt
...

@@ -176,6 +188,13 @@ function cmake_base() {
                            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
                            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
                 pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt
+            elif [ "$1" == "cp38-cp38" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.8.0/bin/:${PATH}
+                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.8.0/bin/python3.8
+                           -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8
+                           -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so"
+                pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt
             fi
         else
             pip install -r ${PADDLE_ROOT}/python/requirements.txt
...

@@ -514,6 +533,8 @@ EOF
             pip3.6 uninstall -y paddlepaddle
         elif [ "$1" == "cp37-cp37m" ]; then
             pip3.7 uninstall -y paddlepaddle
+        elif [ "$1" == "cp38-cp38" ]; then
+            pip3.8 uninstall -y paddlepaddle
         fi
         set -ex
...

@@ -527,6 +548,8 @@ EOF
             pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         elif [ "$1" == "cp37-cp37m" ]; then
             pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        elif [ "$1" == "cp38-cp38" ]; then
+            pip3.8 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         fi
         tmpfile_rand=`date +%s%N`
         tmpfile=$tmp_dir/$tmpfile_rand
...

@@ -666,7 +689,7 @@ function generate_api_spec() {
     awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc
     awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api
-    if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
+    if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" ]; then
         # Use sed to make python2 and python3 sepc keeps the same
         sed -i 's/arg0: str/arg0: unicode/g' $spec_path
         sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path
...

@@ -1244,21 +1267,25 @@ EOF
     ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
     ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
     ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
+    ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl

     ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
     ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
     ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
     ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
+    ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl

     if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then
         ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
         ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
         ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
         ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
+        ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
         ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
         ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
         ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
         ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
+        ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
     fi

     #ref_paddle2_mv1=""
...

@@ -1363,6 +1390,22 @@ EOF
     apt-get clean -y && \
     rm -f ${ref_paddle37} && \
     ldconfig
 EOF
+    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    # run paddle version to install python packages first
+    RUN apt-get update && ${NCCL_DEPS}
+    RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
+        libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
+        xz-utils tk-dev libffi-dev liblzma-dev
+    RUN wget -q https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz && \
+        tar -xzf Python-3.8.0.tgz && cd Python-3.8.0 && \
+        CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
+        make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.8.0.tgz
+    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \
+        pip3.8 install opencv-python && wget ${ref_web}/${ref_paddle38} && pip3.8 install ${ref_paddle38_whl}; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f ${ref_paddle38} && \
+        ldconfig
+EOF
     cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
     # run paddle version to install python packages first
...
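The script gains cp38-cp38 entries alongside the existing cp35/36/37 branches, including the new wheel names. A worked example of how ref_paddle38 expands, with hypothetical values for the shell variables:

    install_gpu = "_gpu"
    PADDLE_BRANCH = "2.0.0"
    ref_CUDA_MAJOR = "10"
    CUDNN_MAJOR = "7"

    # plain branch build
    print("paddlepaddle%s-%s-cp38-cp38-linux_x86_64.whl"
          % (install_gpu, PADDLE_BRANCH))
    # release build with MKL + GPU appends .post<cuda><cudnn>
    print("paddlepaddle%s-%s.post%s%s-cp38-cp38-linux_x86_64.whl"
          % (install_gpu, PADDLE_BRANCH, ref_CUDA_MAJOR, CUDNN_MAJOR))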
python/paddle/distributed/fleet/__init__.py

@@ -39,6 +39,7 @@ server_num = fleet.server_num
 server_index = fleet.server_index
 server_endpoints = fleet.server_endpoints
 is_server = fleet.is_server
+set_util = fleet.set_util
 util = fleet.util
 barrier_worker = fleet.barrier_worker
 init_worker = fleet.init_worker
...
python/paddle/distributed/fleet/base/fleet_base.py

@@ -180,6 +180,8 @@ class Fleet(object):
                 raise ValueError(
                     "`role_maker` should be subclass of `RoleMakerBase`, but got {}".
                     format(type(role_maker)))
+        self._role_maker.generate_role()
+
         self.strategy_compiler = StrategyCompiler()
         if paddle.fluid.framework.in_dygraph_mode():
             if parallel_helper._is_parallel_ctx_initialized():
...

@@ -187,7 +189,6 @@ class Fleet(object):
                     "The dygraph parallel environment has been initialized.")
             else:
                 paddle.distributed.init_parallel_env()
-        return None

     def is_first_worker(self):
         """
...

@@ -275,13 +276,10 @@ class Fleet(object):
             fleet.worker_endpoints()

         """
-        '''
         if to_string:
             return ",".join(self._role_maker.get_trainer_endpoints())
         else:
             return self._role_maker.get_trainer_endpoints()
-        '''
-        return ["127.0.0.1:1001", "127.0.0.1:1002"]

     def server_num(self):
         """
...

@@ -355,7 +353,9 @@ class Fleet(object):
         return self._role_maker.is_server(
         ) or self._role_maker._is_heter_worker()

-    @property
+    def set_util(self, util):
+        self._util = util
+
     def util(self):
         """
         Utility functions that can be used under certain runtime
...

@@ -376,16 +376,6 @@ class Fleet(object):
         """
         return self._util

-    @util.setter
-    def util(self, util):
-        """
-        Set Utility functions for userd-defined runtime
-
-        Returns:
-            None
-        """
-        self._util = util
-
     def barrier_worker(self):
         """
         barrier all workers
...

@@ -393,7 +383,7 @@ class Fleet(object):
         Returns:
             None
         """
-        self._role_maker.barrier_worker()
+        self._role_maker._barrier("worker")

     @is_non_distributed_check
     @inited_runtime_handler
...
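fleet.util changes from a property with a setter to a plain method paired with set_util(), and fleet.init() now calls generate_role() itself. A minimal usage sketch of the new surface; the custom util class is hypothetical, and the environment values are placeholders matching test_fleet_base.py's setUp, so this only runs where those suffice:

    import os
    # environment as in test_fleet_base.py's setUp (hypothetical local values)
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"

    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    class MyUtil(object):          # hypothetical user-defined util object
        def get_user_id(self):
            return 10

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)               # init() now calls generate_role() itself
    fleet.set_util(MyUtil())       # replaces the removed @util.setter
    print(fleet.util().get_user_id())   # util is now a method, not a property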
python/paddle/distributed/fleet/base/role_maker.py

(+397, -195; this diff is collapsed in the original page and its contents are not shown)
python/paddle/distributed/fleet/base/util_factory.py

@@ -57,34 +57,7 @@ class UtilBase(object):
         ), "fs_client must be the instance of paddle.distributed.fleet.utils.FS"
         self.fs_client = fs_client

-    def __check_comm_world(self, comm_world="worker"):
-        if not self.role_maker._role_is_generated:
-            self.role_maker.generate_role()
-
-        _comm_world = None
-        comm_world_upper = comm_world.upper()
-        if comm_world_upper == "WORKER":
-            if not self.role_maker.is_worker():
-                print(
-                    "warning: current role is not worker in collective_func(comm_world=\"worker\")"
-                )
-            _comm_world = self.role_maker._node_type_comm
-        elif comm_world_upper == "SERVER":
-            if not self.role_maker.is_server():
-                print(
-                    "warning: current role is not server in collective_func(comm_world=\"server\")"
-                )
-            _comm_world = self.role_maker._node_type_comm
-        elif comm_world_upper == "ALL":
-            _comm_world = self.role_maker._all_comm
-        else:
-            raise ValueError(
-                "not support comm_world, please choose one from [worker, server, all]"
-            )
-
-        return _comm_world
-
-    def all_reduce(self, input, mode, comm_world="worker"):
+    def all_reduce(self, input, mode="sum", comm_world="worker"):
         """
         All reduce `input` between specified collection. This is a distributed API.
...

@@ -130,8 +103,7 @@ class UtilBase(object):
         if __name__ == "__main__":
             train()
         """
-        _comm_world = self.__check_comm_world(comm_world)
-        return self.role_maker._all_reduce(_comm_world, input, mode)
+        return self.role_maker._all_reduce(input, mode, comm_world)

     def barrier(self, comm_world="worker"):
         """
...

@@ -170,8 +142,7 @@ class UtilBase(object):
         if __name__ == "__main__":
             train()
         """
-        _comm_world = self.__check_comm_world(comm_world)
-        self.role_maker._barrier(_comm_world)
+        self.role_maker._barrier(comm_world)

     def all_gather(self, input, comm_world="worker"):
         """
...

@@ -219,8 +190,8 @@ class UtilBase(object):
         if __name__ == "__main__":
             train()
         """
-        _comm_world = self.__check_comm_world(comm_world)
-        return self.role_maker._all_gather(_comm_world, input)
+        return self.role_maker._all_gather(input, comm_world)

     def _broadcast(self):
         pass
...
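The collectives no longer translate comm_world into a gloo communicator through __check_comm_world; the string is forwarded to the role maker as-is, and mode gains a "sum" default. A pure-Python stub of the new call surface, runnable on its own; both classes below are illustrations of the contract, not the real implementations:

    # Pure-Python stub of the new UtilBase call surface (illustration only;
    # the real class lives in python/paddle/distributed/fleet/base/util_factory.py).
    class StubUtil(object):
        def __init__(self, role_maker):
            self.role_maker = role_maker

        def all_reduce(self, input, mode="sum", comm_world="worker"):
            # comm_world is now forwarded as-is; no __check_comm_world translation
            return self.role_maker._all_reduce(input, mode, comm_world)

        def all_gather(self, input, comm_world="worker"):
            return self.role_maker._all_gather(input, comm_world)

        def barrier(self, comm_world="worker"):
            self.role_maker._barrier(comm_world)

    class SingleProcessRoleMaker(object):  # hypothetical single-process backend
        def _all_reduce(self, input, mode, comm_world):
            return input

        def _all_gather(self, input, comm_world):
            return [input]

        def _barrier(self, comm_world):
            pass

    util = StubUtil(SingleProcessRoleMaker())
    util.barrier("worker")
    print(util.all_reduce(1))                    # mode defaults to "sum" now
    print(util.all_gather(1, comm_world="all"))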
python/paddle/distributed/fleet/launch.py

@@ -55,7 +55,10 @@ launch a process on each of the given gpu card or cpu machine.
 """

 from __future__ import print_function
+import shutil
 import sys
+import tempfile
 from sys import version
 import subprocess
 import os
...

@@ -213,12 +216,20 @@ def launch_collective(args):
     cluster, pod = get_cluster_from_args(args, gpus)
     logger.debug("get cluster from args:{}".format(cluster))

+    global_envs = copy.copy(os.environ.copy())
+    gloo_rendezvous_dir = tempfile.mkdtemp()
+    # add gloo env
+    global_envs["PADDLE_WITH_GLOO"] = "1"
+    global_envs["PADDLE_GLOO_RENDEZVOUS"] = "2"
+    global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
+
     procs = start_local_trainers(
         cluster,
         pod,
         training_script=args.training_script,
         training_script_args=args.training_script_args,
-        log_dir=args.log_dir)
+        log_dir=args.log_dir,
+        envs=global_envs)

     while True:
         alive = watch_local_trainers(procs, cluster.trainers_nranks())
...

@@ -230,6 +241,9 @@ def launch_collective(args):

             time.sleep(3)

+    if os.path.exists(gloo_rendezvous_dir):
+        shutil.rmtree(gloo_rendezvous_dir)
+

 def launch_ps(args):
     ports = None
...

@@ -315,6 +329,13 @@ def launch_ps(args):
     default_env = os.environ.copy()
     current_env = copy.copy(default_env)
+
+    gloo_rendezvous_dir = tempfile.mkdtemp()
+    # add gloo env
+    current_env["PADDLE_WITH_GLOO"] = "1"
+    current_env["PADDLE_GLOO_RENDEZVOUS"] = "2"
+    current_env["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
+
     current_env.pop("http_proxy", None)
     current_env.pop("https_proxy", None)

     procs = []
...

@@ -419,6 +440,9 @@ def launch_ps(args):
             procs[i].proc.terminate()
         print("all parameter server are killed", file=sys.stderr)

+    if os.path.exists(gloo_rendezvous_dir):
+        shutil.rmtree(gloo_rendezvous_dir)
+

 def launch():
     args = _parse_args()
...
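Both launch paths now create a throwaway gloo rendezvous directory, hand it to the trainers through three environment variables, and remove it on exit. A self-contained sketch of the same env handoff; the child command here is a stub standing in for the real training script:

    import os
    import shutil
    import subprocess
    import sys
    import tempfile

    envs = os.environ.copy()
    gloo_rendezvous_dir = tempfile.mkdtemp()
    envs["PADDLE_WITH_GLOO"] = "1"
    envs["PADDLE_GLOO_RENDEZVOUS"] = "2"      # 2 = plain-filesystem rendezvous
    envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
    try:
        # stand-in for launching the trainer process with the gloo env attached
        subprocess.check_call(
            [sys.executable, "-c",
             "import os; print(os.environ['PADDLE_GLOO_FS_PATH'])"],
            env=envs)
    finally:
        if os.path.exists(gloo_rendezvous_dir):
            shutil.rmtree(gloo_rendezvous_dir)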
python/paddle/distributed/fleet/launch_utils.py

@@ -398,8 +398,14 @@ def start_local_trainers(cluster,
                          pod,
                          training_script,
                          training_script_args,
-                         log_dir=None):
-    current_env = copy.copy(os.environ.copy())
+                         log_dir=None,
+                         envs=None):
+
+    if envs is None:
+        current_env = copy.copy(os.environ.copy())
+    else:
+        current_env = copy.copy(envs)
+
     #paddle broadcast ncclUniqueId use socket, and
     #proxy maybe make trainers unreachable, so delete them.
     #if we set them to "", grpc will log error message "bad uri"
...
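start_local_trainers grows an optional envs argument so the launcher can pass the gloo variables down instead of always copying os.environ. A tiny runnable mirror of the new fallback, illustrative only:

    import copy
    import os

    def resolve_env(envs=None):
        """Mirror of the new fallback in start_local_trainers."""
        if envs is None:
            return copy.copy(os.environ.copy())
        return copy.copy(envs)

    print(resolve_env({"PADDLE_WITH_GLOO": "1"}))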
python/paddle/fluid/tests/unittests/test_fleet_base.py

@@ -27,7 +27,7 @@ class TestFleetBase(unittest.TestCase):
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
-        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-            "127.0.0.1:36001,127.0.0.2:36001"
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"

     def test_init(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
...

@@ -88,7 +88,7 @@ class TestFleetBase(unittest.TestCase):
     def test_util(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        self.assertEqual(fleet.util, None)
+        self.assertEqual(fleet.util(), None)

     def test_barrier_worker(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
...

@@ -99,20 +99,17 @@ class TestFleetBase(unittest.TestCase):
     def test_init_worker(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        if fleet.is_worker():
-            fleet.init_worker()

-    def test_run_server(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        if fleet.is_worker():
-            fleet.run_worker()
+        with self.assertRaises(ValueError):
+            if fleet.is_worker():
+                fleet.init_worker()

     def test_stop_worker(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        if fleet.is_worker():
-            fleet.stop_worker()
+
+        with self.assertRaises(ValueError):
+            if fleet.is_worker():
+                fleet.stop_worker()

     def test_distributed_optimizer(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
...
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py

@@ -15,7 +15,11 @@
 from __future__ import print_function
 import os
+import platform
+import shutil
+import tempfile
 import unittest

+import paddle
 import paddle.distributed.fleet.base.role_maker as role_maker
...

@@ -42,9 +46,9 @@ class TestRoleMakerBase(unittest.TestCase):
         self.assertTrue(len(pserver_endpoints) == 0)

         print(role.to_string())
-        self.assertTrue(role._all_gather(role._node_type_comm, 1) is None)
-        self.assertTrue(role._all_reduce(role._node_type_comm, 1) is None)
-        role._barrier(role._node_type_comm)
+        self.assertTrue(role._all_gather(1, "worker") is None)
+        self.assertTrue(role._all_reduce(1, "sum", "worker") is None)
+        role._barrier("worker")

 class TestCloudRoleMaker(unittest.TestCase):
...

@@ -72,8 +76,8 @@ class TestCloudRoleMaker(unittest.TestCase):
             print("warning: no netifaces, skip test_tr_rolemaker")
             return

-        ro = role_maker.PaddleCloudRoleMaker(
-            is_collective=False, init_gloo=False)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
         self.assertTrue(ro.is_worker())
         self.assertFalse(ro.is_server())
         self.assertEqual(ro.worker_num(), 2)
...

@@ -108,8 +112,9 @@ class TestCloudRoleMaker(unittest.TestCase):
         self.assertEqual(ro.server_num(), 2)
         pserver_endpoints = ro.get_pserver_endpoints()
         self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001')
-        self.assertTrue(ro._all_gather(ro._all_comm, 1) is None)
-        self.assertTrue(ro._all_reduce(ro._all_comm, 1) is None)
+
+        self.assertEqual(ro._all_gather(1, "worker"), 1)
+        self.assertEqual(ro._all_reduce(1, "sum", "worker"), 1)

     def test_traing_role(self):
         """Test training role."""
...

@@ -142,7 +147,7 @@ class TestUserDefinedRoleMaker(unittest.TestCase):
         ro = role_maker.UserDefinedRoleMaker(
             is_collective=False,
             init_gloo=False,
-            server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
+            server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
             role=role_maker.Role.SERVER,
             current_id=0,
             worker_num=2)
...

@@ -161,14 +166,274 @@ class TestUserDefinedRoleMaker(unittest.TestCase):
         ro = role_maker.UserDefinedRoleMaker(
             is_collective=False,
             init_gloo=False,
-            server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
+            server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
             role=role_maker.Role.WORKER,
             current_id=0,
             worker_num=2)
+        self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints())
         self.assertTrue(ro.is_worker())
         self.assertEqual(ro.role_id(), 0)

+
+class TestGlooWithCloudRoleMaker(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+
+    def case(self, role, comm_world):
+        role._barrier(comm_world)
+
+        gather = role._all_gather(1, comm_world)
+        self.assertEqual(gather[0], 1)
+
+        all_reduce = role._all_reduce(1, "sum", comm_world)
+        self.assertEqual(1, all_reduce)
+
+    def mkdir(self):
+        tmp = tempfile.mkdtemp()
+        return tmp
+
+    def clean(self, tmp):
+        shutil.rmtree(tmp)
+
+    def test_hdfs_gloo(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "worker")
+        self.clean(tmp)
+
+    def test_fs_gloo(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "worker")
+        self.clean(tmp)
+
+    def test_fs_gloo2(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "server")
+        self.clean(tmp)
+
+    def test_fs_gloo3(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "server")
+        self.clean(tmp)
+
+    def test_fs_gloo4(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3"
+        os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1"
+        os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019"
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        import time
+        time.sleep(3)
+
+    def test_fs_gloo5(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "server")
+        self.case(role, "all")
+        self.clean(tmp)
+
+    def test_fs_gloo6(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "server")
+        self.case(role, "all")
+        self.clean(tmp)
+
+    def test_fs_gloo7(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "5"
+
+        role = role_maker.PaddleCloudRoleMaker()
+        self.assertRaises(ValueError, role.generate_role)
+
+    def test_fs_gloo8(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        def net():
+            x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None)
+            y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
+            cost = paddle.fluid.layers.square_error_cost(
+                input=y_predict, label=y)
+            avg_cost = paddle.fluid.layers.mean(cost)
+            return avg_cost
+
+        from paddle.distributed import fleet
+
+        role = role_maker.PaddleCloudRoleMaker()
+        fleet.init(role)
+        avg_cost = net()
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = False
+
+        optimizer = paddle.optimizer.SGD(0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(avg_cost)
+
+        comm_world = "server"
+        fleet.util().barrier(comm_world)
+
+        gather = fleet.util().all_gather(1, comm_world)
+        self.assertEqual(gather[0], 1)
+
+        all_reduce = fleet.util().all_reduce(1, "sum", comm_world)
+        self.assertEqual(1, all_reduce)
+        self.clean(tmp)

 if __name__ == "__main__":
     unittest.main()
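The new TestGlooWithCloudRoleMaker cases exercise the rendezvous modes selected by PADDLE_GLOO_RENDEZVOUS. The mapping below is inferred from the tests above, not from documentation in this commit (mode "5" makes generate_role raise ValueError):

    # Inferred from the tests above: which extra variables each
    # PADDLE_GLOO_RENDEZVOUS mode reads.
    GLOO_RENDEZVOUS_MODES = {
        "1": ["PADDLE_GLOO_FS_NAME", "PADDLE_GLOO_FS_UGI",
              "PADDLE_GLOO_FS_PATH"],                          # HDFS store
        "2": ["PADDLE_GLOO_FS_PATH"],                          # local/shared FS
        "3": ["PADDLE_GLOO_HTTP_HOST", "PADDLE_GLOO_HTTP_PORT"],  # HTTP store
    }

    for mode, keys in sorted(GLOO_RENDEZVOUS_MODES.items()):
        print(mode, keys)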
python/paddle/fluid/tests/unittests/test_fleet_util.py
浏览文件 @
f529675c
...
...
@@ -59,7 +59,7 @@ class TestFleetUtil(unittest.TestCase):
         import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        default_util = fleet.util
+        default_util = fleet.util()
         self.assertEqual(default_util, None)
 
     def test_set_user_defined_util(self):
...
...
@@ -76,8 +76,8 @@ class TestFleetUtil(unittest.TestCase):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         my_util = UserDefinedUtil()
-        fleet.util = my_util
-        user_id = fleet.util.get_user_id()
+        fleet.set_util(my_util)
+        user_id = fleet.util().get_user_id()
         self.assertEqual(user_id, 10)
 
     def test_fs(self):
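These two hunks capture the user-facing API change in this commit: fleet.util is no longer a plain attribute but a callable accessor, and fleet.set_util(...) becomes the setter. A minimal sketch of the updated usage, assuming a fleet.UtilBase subclass like the test's UserDefinedUtil:

from paddle.distributed import fleet

class UserDefinedUtil(fleet.UtilBase):
    def get_user_id(self):
        return 10

fleet.set_util(UserDefinedUtil())      # replaces the old `fleet.util = my_util`
user_id = fleet.util().get_user_id()   # replaces the old `fleet.util.get_user_id()`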
...
...
@@ -88,97 +88,6 @@ class TestFleetUtil(unittest.TestCase):
         self.assertFalse(fs.need_upload_download())
         fleet_util._set_file_system(fs)
 
-    def test_barrier(self):
-        try:
-            import netifaces
-        except:
-            print("warning: no netifaces, skip test_barrier")
-            return
-
-        gloo = fluid.core.Gloo()
-        gloo.set_rank(0)
-        gloo.set_size(1)
-        gloo.set_prefix("123")
-        gloo.set_iface("lo")
-        gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "")
-        gloo.init()
-
-        role = role_maker.UserDefinedRoleMaker(
-            is_collective=False,
-            init_gloo=False,
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_endpoints=["127.0.0.1:6003"],
-            server_endpoints=["127.0.0.1:6001"])
-        role._node_type_comm = gloo
-        role._role_is_generated = True
-        fleet_util._set_role_maker(role)
-
-        fleet_util.barrier("worker")
-
-    def test_all_reduce(self):
-        try:
-            import netifaces
-        except:
-            print("warning: no netifaces, skip test_all_reduce")
-            return
-
-        gloo = fluid.core.Gloo()
-        gloo.set_rank(0)
-        gloo.set_size(1)
-        gloo.set_prefix("123")
-        gloo.set_iface("lo")
-        gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
-        gloo.init()
-
-        role = role_maker.UserDefinedRoleMaker(
-            is_collective=False,
-            init_gloo=False,
-            current_id=0,
-            role=role_maker.Role.WORKER,
-            worker_endpoints=["127.0.0.1:6003"],
-            server_endpoints=["127.0.0.1:6001"])
-        role._node_type_comm = gloo
-        role._role_is_generated = True
-        fleet_util._set_role_maker(role)
-
-        output = fleet_util.all_reduce(1, "sum", comm_world="server")
-        print(output)
-        # self.assertEqual(output, 1)
-
-    def test_all_gather(self):
-        try:
-            import netifaces
-        except:
-            print("warning: no netifaces, skip test_all_gather")
-            return
-
-        gloo = fluid.core.Gloo()
-        gloo.set_rank(0)
-        gloo.set_size(1)
-        gloo.set_prefix("123")
-        gloo.set_iface("lo")
-        gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
-        gloo.init()
-
-        role = role_maker.UserDefinedRoleMaker(
-            is_collective=False,
-            init_gloo=False,
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_endpoints=["127.0.0.1:6003"],
-            server_endpoints=["127.0.0.1:6001"])
-        role._node_type_comm = gloo
-        role._all_comm = gloo
-        role._role_is_generated = True
-        fleet_util._set_role_maker(role)
-
-        output = fleet_util.all_gather(1, comm_world="all")
-        print(output)
-        # self.assertTrue(len(output) == 1 and output[0] == 1)
-        self.assertRaises(Exception, fleet_util.all_gather, 1, "test")
 
     def download_files(self):
         path = download(self.proto_data_url, self.module_name,
                         self.proto_data_md5)
...
...
python/paddle/fluid/tests/unittests/test_transformer_api.py
View file @ f529675c
...
...
@@ -474,6 +474,141 @@ class TestTransformer(unittest.TestCase):
             trans_output = transformer(src, tgt, src_mask, tgt_mask,
                                        memory_mask)
 
+    def test_transformer_attr_1(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None],
+                bias_attr=[False])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_2(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None, None],
+                bias_attr=[False, False])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_3(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None, None, None],
+                bias_attr=[False, False, True])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_boolean(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                bias_attr=False)
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
 
 if __name__ == "__main__":
     unittest.main()
python/paddle/nn/layer/transformer.py
View file @ f529675c
...
...
@@ -53,7 +53,22 @@ def _convert_param_attr_to_list(param_attr, n):
     if isinstance(param_attr, (list, tuple)):
         assert len(param_attr) == n, (
             "length of param_attr should be %d when it is a list/tuple" % n)
-        param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
+        param_attrs = []
+        for attr in param_attr:
+            if isinstance(attr, bool):
+                if attr:
+                    param_attrs.append(ParamAttr._to_attr(None))
+                else:
+                    param_attrs.append(False)
+            else:
+                param_attrs.append(ParamAttr._to_attr(attr))
+        # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
+    elif isinstance(param_attr, bool):
+        param_attrs = []
+        if param_attr:
+            param_attrs = [ParamAttr._to_attr(None) for i in range(n)]
+        else:
+            param_attrs = [False] * n
     else:
         param_attrs = []
         attr = ParamAttr._to_attr(param_attr)
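For illustration, a sketch of what the new branches change (note `_convert_param_attr_to_list` is a private helper of `paddle.nn.layer.transformer`, imported here only for demonstration): boolean entries now map to a default `ParamAttr` (`True`) or pass through as `False`, rather than being handed to `ParamAttr._to_attr`:

from paddle.nn.layer.transformer import _convert_param_attr_to_list

# Mixed list: None -> default ParamAttr, False -> marker for "no trainable parameter".
attrs = _convert_param_attr_to_list([None, False, None], 3)
# attrs is roughly [ParamAttr(...), False, ParamAttr(...)]

# A bare bool is broadcast to all n positions.
assert _convert_param_attr_to_list(False, 3) == [False, False, False]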
...
...
@@ -417,7 +432,7 @@ class TransformerEncoderLayer(Layer):
         Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
         Default: None, which means the default weight parameter property is used.
         See usage for details in :code:`ParamAttr` .
-        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+        bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
         If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
         MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
         Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.
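Per the widened type above, a bare bool can now be passed straight through; a minimal sketch (dimensions arbitrary):

from paddle.nn import TransformerEncoderLayer

# bias_attr=False leaves both the MHA projections and the FFN linears
# without trainable bias parameters.
encoder_layer = TransformerEncoderLayer(
    d_model=128, nhead=2, dim_feedforward=512, bias_attr=False)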
...
...
@@ -986,22 +1001,31 @@ class Transformer(Layer):
         Otherwise, no pre-process and post-process includes dropout, residual
         connection, layer normalization. Default False
         weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
-            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
-            self attention, `weight_attr[1]` would be used as `weight_attr` for
-            cross attention, and `weight_attr[2]` would be used as `weight_attr`
-            for linear in FFN. Otherwise, the three sub-layers all uses it as
-            `weight_attr` to create parameters. Default: None, which means the
-            default weight parameter property is used. See usage for details
+            If it is a tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3,
+            `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]`
+            would be used as `weight_attr` for cross attention of `TransformerDecoder`,
+            and `weight_attr[2]` would be used as `weight_attr` for linear in FFN.
+            If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention
+            and cross attention, and `weight_attr[1]` would be used as `weight_attr` for
+            linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr`
+            for self attention, cross attention and linear in FFN. Otherwise,
+            the three sub-layers all use it as `weight_attr` to create parameters.
+            Default: None, which means the default weight parameter property is used.
+            See usage for details
             in :code:`ParamAttr` .
         bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
-            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
-            self attention, `bias_attr[1]` would be used as `bias_attr` for
-            cross attention, and `bias_attr[2]` would be used as `bias_attr`
-            for linear in FFN. Otherwise, the three sub-layers all uses it as
-            `bias_attr` to create parameters. The `False` value means the
-            corresponding layer would not have trainable bias parameter. See
-            usage for details in :code:`ParamAttr` . Default: None,which means
-            the default bias parameter property is used.
+            If it is a tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3,
+            `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]`
+            would be used as `bias_attr` for cross attention of `TransformerDecoder`,
+            and `bias_attr[2]` would be used as `bias_attr` for linear in FFN.
+            If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention
+            and cross attention, and `bias_attr[1]` would be used as `bias_attr` for
+            linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr`
+            for self attention, cross attention and linear in FFN. Otherwise,
+            the three sub-layers all use it as `bias_attr` to create parameters.
+            The `False` value means the corresponding layer would not have trainable
+            bias parameter. See usage for details in :code:`ParamAttr` .
+            Default: None, which means the default bias parameter property is used.
         custom_encoder (Layer): If custom encoder is provided, use it as the encoder.
             Default None
         custom_decoder (Layer): If custom decoder is provided, use it as the decoder.
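The length-1/2/3 semantics documented above are exercised by the new test cases earlier in this diff; a condensed sketch of the three spellings (dimensions arbitrary):

from paddle.nn import Transformer

# Length 3: [self attention, cross attention, FFN]; the encoder, which has
# no cross attention, reuses entries 0 and -1.
model = Transformer(d_model=64, nhead=8, dim_feedforward=128,
                    weight_attr=[None, None, None],
                    bias_attr=[False, False, True])

# Length 1: a single attr shared by all three sub-layer kinds.
model = Transformer(d_model=64, nhead=8, dim_feedforward=128,
                    bias_attr=[False])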
...
...
@@ -1049,13 +1073,51 @@ class Transformer(Layer):
                  custom_decoder=None):
         super(Transformer, self).__init__()
 
+        if isinstance(bias_attr, (list, tuple)):
+            if len(bias_attr) == 1:
+                encoder_bias_attr = [bias_attr[0]] * 2
+                decoder_bias_attr = [bias_attr[0]] * 3
+            elif len(bias_attr) == 2:
+                encoder_bias_attr = bias_attr
+                decoder_bias_attr = [bias_attr[0], bias_attr[0], bias_attr[-1]]
+            elif len(bias_attr) == 3:
+                encoder_bias_attr = [bias_attr[0], bias_attr[-1]]
+                decoder_bias_attr = bias_attr
+            else:
+                assert False, (
+                    "length of bias_attr should be 1 or 2 or 3 when it is a list/tuple"
+                )
+        else:
+            encoder_bias_attr = bias_attr
+            decoder_bias_attr = bias_attr
+
+        if isinstance(weight_attr, (list, tuple)):
+            if len(weight_attr) == 1:
+                encoder_weight_attr = [weight_attr[0]] * 2
+                decoder_weight_attr = [weight_attr[0]] * 3
+            elif len(weight_attr) == 2:
+                encoder_weight_attr = weight_attr
+                decoder_weight_attr = [
+                    weight_attr[0], weight_attr[0], weight_attr[-1]
+                ]
+            elif len(weight_attr) == 3:
+                encoder_weight_attr = [weight_attr[0], weight_attr[-1]]
+                decoder_weight_attr = weight_attr
+            else:
+                assert False, (
+                    "length of weight_attr should be 1 or 2 or 3 when it is a list/tuple"
+                )
+        else:
+            encoder_weight_attr = weight_attr
+            decoder_weight_attr = weight_attr
+
         if custom_encoder is not None:
             self.encoder = custom_encoder
         else:
             encoder_layer = TransformerEncoderLayer(
                 d_model, nhead, dim_feedforward, dropout, activation,
-                attn_dropout, act_dropout, normalize_before, weight_attr,
-                bias_attr)
+                attn_dropout, act_dropout, normalize_before,
+                encoder_weight_attr, encoder_bias_attr)
             encoder_norm = LayerNorm(d_model)
             self.encoder = TransformerEncoder(encoder_layer,
                                               num_encoder_layers, encoder_norm)
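Spelled out for the length-3 case, the split above amounts to the following (plain-Python sketch with placeholder values):

weight_attr = ["self_attn", "cross_attn", "ffn"]  # placeholders

encoder_weight_attr = [weight_attr[0], weight_attr[-1]]  # encoder: no cross attention
decoder_weight_attr = weight_attr                        # decoder: all three sub-layers

assert encoder_weight_attr == ["self_attn", "ffn"]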
...
...
@@ -1065,8 +1127,8 @@ class Transformer(Layer):
         else:
             decoder_layer = TransformerDecoderLayer(
                 d_model, nhead, dim_feedforward, dropout, activation,
-                attn_dropout, act_dropout, normalize_before, weight_attr,
-                bias_attr)
+                attn_dropout, act_dropout, normalize_before,
+                decoder_weight_attr, decoder_bias_attr)
             decoder_norm = LayerNorm(d_model)
             self.decoder = TransformerDecoder(decoder_layer,
                                               num_decoder_layers, decoder_norm)
...
...