Commit e7547ca7 (unverified)
Authored on Aug 02, 2022 by Yuang Liu; committed by GitHub on Aug 02, 2022

Pass NVIDIA_TF32_OVERRIDE to internal (#43646) (#44796)

Co-authored-by: gongweibao <gongweibao@baidu.com>

Parent: 6de20581
Changes: 2 files changed, 204 additions and 208 deletions (+204 -208)

python/paddle/fluid/tests/unittests/test_collective_api_base.py    +54 -63
python/paddle/fluid/tests/unittests/test_dist_base.py              +150 -145
python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -32,6 +32,7 @@ from paddle.fluid import core

class TestCollectiveAPIRunnerBase(object):

    def get_model(self, train_prog, startup_prog, rank, indata=None):
        raise NotImplementedError(
            "get model should be implemented by child class.")
@@ -91,6 +92,7 @@ from contextlib import closing

class TestDistBase(unittest.TestCase):

    def setUp(self):
        self._port_set = set()
        self._trainers = 2
@@ -104,6 +106,7 @@ class TestDistBase(unittest.TestCase):

        self.temp_dir.cleanup()

    def _find_free_port(self):

        def __free_port():
            with closing(socket.socket(socket.AF_INET,
                                       socket.SOCK_STREAM)) as s:
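The __free_port helper in this hunk relies on the standard trick of binding to port 0 so the operating system picks an unused ephemeral port. A self-contained sketch of that pattern, outside the test class (the function name here is illustrative):

import socket
from contextlib import closing

def find_free_port():
    # Binding to port 0 lets the OS choose an unused ephemeral port;
    # getsockname() then reports which port was assigned.
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        return s.getsockname()[1]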
@@ -168,17 +171,15 @@ class TestDistBase(unittest.TestCase):

        tr0_pipe = open(path0, "w")
        tr1_pipe = open(path1, "w")
        #print(tr0_cmd)
        tr0_proc = subprocess.Popen(tr0_cmd.strip().split(),
                                    stdout=subprocess.PIPE,
                                    stderr=tr0_pipe,
                                    env=env0)
        tr1_proc = subprocess.Popen(tr0_cmd.strip().split(),
                                    stdout=subprocess.PIPE,
                                    stderr=tr1_pipe,
                                    env=env1)

        tr0_out, tr0_err = tr0_proc.communicate()
        tr1_out, tr1_err = tr1_proc.communicate()
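The hunk above starts the two trainer processes with per-rank environments and separate stderr log files, then waits for both with communicate(). A stripped-down sketch of that launch-and-collect pattern, with hypothetical helper and argument names:

import subprocess

def run_two_trainers(cmd0, cmd1, env0, env1, log0_path, log1_path):
    # Each trainer writes stderr to its own log file; stdout is captured
    # through a pipe and collected by communicate(), which also waits
    # for the process to exit.
    with open(log0_path, "w") as log0, open(log1_path, "w") as log1:
        p0 = subprocess.Popen(cmd0.strip().split(),
                              stdout=subprocess.PIPE, stderr=log0, env=env0)
        p1 = subprocess.Popen(cmd1.strip().split(),
                              stdout=subprocess.PIPE, stderr=log1, env=env1)
        out0, _ = p0.communicate()
        out1, _ = p1.communicate()
    return out0, out1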
@@ -220,8 +221,14 @@ class TestDistBase(unittest.TestCase):

        required_envs["GLOG_v"] = "3"
        required_envs["GLOG_logtostderr"] = "1"
        required_envs["GLOO_LOG_LEVEL"] = "TRACE"
        if os.getenv('NVIDIA_TF32_OVERRIDE', '') is not None:
            required_envs['NVIDIA_TF32_OVERRIDE'] = os.getenv(
                'NVIDIA_TF32_OVERRIDE', '')

        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(
            model_file, required_envs)
        np.random.seed(pid0)
        input1 = np.random.random((10, 1000))
        np.random.seed(pid1)
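This hunk is the core of the commit: it copies the host's NVIDIA_TF32_OVERRIDE setting into required_envs so that the trainer subprocesses launched by _run_cluster see the same TF32 behaviour as the parent test process. A minimal sketch of the same forwarding idea outside the test class (the helper name is hypothetical; note that os.getenv with a '' default never returns None, so a membership check is the unambiguous way to test whether the variable was set):

import os

def forward_tf32_setting(child_env):
    # Copy the TF32 switch into the child environment only when the
    # caller actually set it on the host.
    if 'NVIDIA_TF32_OVERRIDE' in os.environ:
        child_env['NVIDIA_TF32_OVERRIDE'] = os.environ['NVIDIA_TF32_OVERRIDE']
    return child_env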
@@ -248,11 +255,9 @@ class TestDistBase(unittest.TestCase):

        elif col_type == "allreduce":
            need_result = input1 + input2
            self.assertTrue(
                np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05))
            self.assertTrue(
                np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05))
        elif col_type == "parallel_embedding":
            result_data = tr0_out[0]
            np.random.seed(2020)
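The assertions in this and the following hunks all follow the same pattern: np.allclose wrapped in assertTrue with explicit rtol and atol. As an aside (not part of this commit), numpy's np.testing.assert_allclose performs the same tolerance check but reports which elements differ on failure, which can make debugging collective results easier; a small standalone example:

import numpy as np

actual = np.array([1.0, 2.0, 3.0])
expected = np.array([1.0, 2.0, 3.0 + 1e-06])

# Same tolerance check as assertTrue(np.allclose(...)), but a failure
# would report the offending elements and their difference.
np.testing.assert_allclose(actual, expected, rtol=1e-05, atol=1e-05)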
@@ -260,24 +265,23 @@ class TestDistBase(unittest.TestCase):

            for i in range(result_data.shape[0]):
                for j in range(result_data.shape[1]):
                    data = result_data[i][j]
                    assert np.allclose(tr0_out[1][i][j],
                                       need_result[data],
                                       atol=1e-08)
        elif col_type == "row_parallel_linear":
            result_data = tr0_out[0]
            np.random.seed(2020)
            weight = np.random.rand(1000, 16)
            need_result = np.matmul(input1, weight)
            self.assertTrue(
                np.allclose(result_data, need_result, rtol=1e-05, atol=1e-05))
        elif col_type == "column_parallel_linear":
            result_data = tr0_out[0]
            np.random.seed(2020)
            weight = np.random.rand(1000, 16)
            need_result = np.matmul(input1, weight)
            self.assertTrue(
                np.allclose(result_data, need_result, rtol=1e-05, atol=1e-05))
        elif col_type == "alltoall":
            need_result1 = np.vstack(
                (input1[0:input1.shape[0] // 2, :],
                 input2[0:input2.shape[0] // 2, :]))
@@ -286,16 +290,13 @@ class TestDistBase(unittest.TestCase):

            tr0_out = np.vstack(tr0_out)
            tr1_out = np.vstack(tr1_out)
            self.assertTrue(
                np.allclose(tr0_out, need_result1, rtol=1e-05, atol=1e-05))
            self.assertTrue(
                np.allclose(tr1_out, need_result2, rtol=1e-05, atol=1e-05))
        elif col_type == "sendrecv":
            result_data = tr1_out[0]
            self.assertTrue(
                np.allclose(input1, result_data, rtol=1e-05, atol=1e-05))
        elif col_type == "global_gather":
            in_feat = 2
            n_expert = 2
@@ -372,15 +373,13 @@ class TestDistBase(unittest.TestCase):

            if result1 == []:
                output1 = np.array([])
            else:
                output1 = np.concatenate(result1, axis=0).reshape(
                    sum(local_expert_count1), in_feat)
            if result2 == []:
                output2 = np.array([])
            else:
                output2 = np.concatenate(result2, axis=0).reshape(
                    sum(local_expert_count2), in_feat)
            if tr0_out[0] is None or tr0_out[0].shape[0] == 0:
                tr0_out[0] = np.array([])
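The global_gather branch above builds the expected output by concatenating the per-expert result chunks and reshaping to (sum(local_expert_count), in_feat). A tiny standalone illustration of that step with made-up sizes:

import numpy as np

in_feat = 2
local_expert_count = [3, 5]
chunks = [np.ones((3, in_feat)), np.zeros((5, in_feat))]

# Stack the chunks row-wise, then make the total row count explicit.
output = np.concatenate(chunks, axis=0).reshape(sum(local_expert_count), in_feat)
assert output.shape == (8, in_feat)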
@@ -389,24 +388,20 @@ class TestDistBase(unittest.TestCase):

                tr1_out[0] = np.array([])
            self.assertTrue(
                np.allclose(tr0_out[0], output1, rtol=1e-05, atol=1e-05))
            self.assertTrue(
                np.allclose(tr1_out[0], output2, rtol=1e-05, atol=1e-05))
            if static_mode == 0:
                self.assertTrue(
                    np.allclose(tr0_out[1],
                                2 * local_input_buf1,
                                rtol=1e-05,
                                atol=1e-05))
                self.assertTrue(
                    np.allclose(tr1_out[1],
                                2 * local_input_buf2,
                                rtol=1e-05,
                                atol=1e-05))
        elif col_type == "global_scatter":
            np.random.seed(pid0)
@@ -460,23 +455,19 @@ class TestDistBase(unittest.TestCase):

                tr1_out[0] = np.array([])
            self.assertTrue(
                np.allclose(tr0_out[0], output1, rtol=1e-05, atol=1e-05))
            self.assertTrue(
                np.allclose(tr1_out[0], output2, rtol=1e-05, atol=1e-05))
            if static_mode == 0:
                self.assertTrue(
                    np.allclose(tr0_out[1],
                                2 * local_input_buf1,
                                rtol=1e-05,
                                atol=1e-05))
                self.assertTrue(
                    np.allclose(tr1_out[1],
                                2 * local_input_buf2,
                                rtol=1e-05,
                                atol=1e-05))
        else:
            pass
python/paddle/fluid/tests/unittests/test_dist_base.py
(The diff for this file is collapsed and not shown here.)