Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
e0bb8cce
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e0bb8cce
编写于
3月 22, 2019
作者:
S
sneaxiy
浏览文件
操作
浏览文件
下载
差异文件
fix conflict
上级
f79a3a83
e61d7245
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
196 addition
and
0 deletion
+196
-0
python/paddle/distributed/__init__.py
python/paddle/distributed/__init__.py
+13
-0
python/paddle/distributed/launch.py
python/paddle/distributed/launch.py
+135
-0
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
...id/tests/unittests/test_ir_memory_optimize_transformer.py
+48
-0
未找到文件。
python/paddle/distributed/__init__.py
0 → 100644
浏览文件 @
e0bb8cce
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python/paddle/distributed/launch.py
0 → 100644
浏览文件 @
e0bb8cce
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
subprocess
import
os
import
sys
import
time
import
argparse
default_envs
=
{
"PADDLE_TRAINER_ENDPOINTS"
:
"127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177"
,
"LD_LIBRARY_PATH"
:
os
.
getenv
(
"LD_LIBRARY_PATH"
,
""
),
"PATH"
:
os
.
getenv
(
"PATH"
),
"LD_PRELOAD"
:
os
.
getenv
(
"LD_PRELOAD"
,
""
),
"PADDLE_TRAINERS_NUM"
:
"8"
,
"NCCL_DEBUG"
:
"INFO"
,
"GLOG_v"
:
"0"
,
"NCCL_SOCKET_IFNAME"
:
"eth0"
,
"NCCL_IB_GID_INDEX"
:
"3"
,
"NCCL_IB_RETRY_CNT"
:
"0"
,
}
GPUS
=
8
def
start_procs
(
gpus
,
entrypoint
,
entrypoint_args
,
log_dir
):
procs
=
[]
log_fns
=
[]
os
.
system
(
"mkdir -p %s"
%
log_dir
)
# ======== update parent envs =======
for
k
,
v
in
os
.
environ
.
items
():
if
k
.
startswith
(
"FLAGS_"
)
or
k
.
startswith
(
"NCCL_"
)
or
\
k
.
startswith
(
"GLOG_"
):
default_envs
[
k
]
=
v
# ======== for dist training =======
node_trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
,
"0"
))
current_ip
=
os
.
getenv
(
"POD_IP"
,
"127.0.0.1"
)
trainer_ips
=
os
.
getenv
(
"PADDLE_TRAINERS"
,
current_ip
).
split
(
","
)
num_nodes
=
len
(
trainer_ips
)
all_nodes_devices_endpoints
=
""
for
n
in
trainer_ips
:
for
i
in
range
(
gpus
):
if
all_nodes_devices_endpoints
:
all_nodes_devices_endpoints
+=
","
all_nodes_devices_endpoints
+=
"%s:617%d"
%
(
n
,
i
)
nranks
=
num_nodes
*
gpus
# ======== for dist training =======
for
i
in
range
(
gpus
):
curr_env
=
{}
curr_env
.
update
(
default_envs
)
curr_env
.
update
({
"FLAGS_selected_gpus"
:
"%d"
%
i
,
"PADDLE_TRAINER_ID"
:
"%d"
%
(
node_trainer_id
*
gpus
+
i
),
"PADDLE_CURRENT_ENDPOINT"
:
"%s:617%d"
%
(
current_ip
,
i
),
# nranks
"PADDLE_TRAINERS_NUM"
:
"%d"
%
nranks
,
"PADDLE_TRAINER_ENDPOINTS"
:
all_nodes_devices_endpoints
})
print
(
"starting process "
,
i
,
entrypoint
,
entrypoint_args
,
curr_env
)
fn
=
open
(
"%s/workerlog.%d"
%
(
log_dir
,
i
),
"w"
)
log_fns
.
append
(
fn
)
cmd
=
[
sys
.
executable
,
"-u"
,
entrypoint
]
+
entrypoint_args
procs
.
append
(
subprocess
.
Popen
(
cmd
,
stdout
=
fn
,
stderr
=
fn
,
env
=
curr_env
))
for
i
in
range
(
gpus
):
try
:
procs
[
i
].
communicate
()
procs
[
i
].
terminate
()
log_fns
[
i
].
close
()
except
:
pass
def
parse_args
():
parser
=
argparse
.
ArgumentParser
(
description
=
'''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read environment variables below in order to let different
process init properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
'''
)
parser
.
add_argument
(
'--gpus'
,
type
=
int
,
default
=
8
,
help
=
'start number of processes for every gpu'
)
parser
.
add_argument
(
'--log_dir'
,
type
=
str
,
default
=
"mylog"
,
help
=
'directory to put logs per process.'
)
parser
.
add_argument
(
'entrypoint_script'
,
type
=
str
,
help
=
"The entrypoint script to be launched in parallel,"
"followed by all the arguments for each process,"
"e.g. train.py --lr 0.1"
)
parser
.
add_argument
(
'entrypoint_args'
,
nargs
=
argparse
.
REMAINDER
)
return
parser
.
parse_args
()
def
main
():
args
=
parse_args
()
# launch multiple training process
start_procs
(
args
.
gpus
,
args
.
entrypoint_script
,
args
.
entrypoint_args
,
args
.
log_dir
)
if
__name__
==
"__main__"
:
main
()
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
0 → 100644
浏览文件 @
e0bb8cce
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
unittest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
os
.
environ
[
'FLAGS_eager_delete_tensor_gb'
]
=
"0.0"
os
.
environ
[
'RECORDIO_FILENAME'
]
=
'/tmp/ir_memory_optimize_transformer.wmt16.recordio'
from
test_parallel_executor_transformer
import
TestTransformer
from
test_parallel_executor_transformer
import
transformer
# NOTE(dzhwinter): test diferent strategy colisions.
# open the eager delete tensor strategy by default.
class
TestTransformerWithIR
(
TestTransformer
):
def
test_main
(
self
):
if
core
.
is_compiled_with_cuda
():
# check python transpiler
self
.
check_network_convergence
(
transformer
,
use_cuda
=
True
,
memory_opt
=
True
,
use_ir_memory_optimize
=
False
)
# check IR memory optimize
self
.
check_network_convergence
(
transformer
,
use_cuda
=
True
,
memory_opt
=
False
,
use_ir_memory_optimize
=
True
)
if
__name__
==
'__main__'
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录