Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
FL1623863129
YOLOX
提交
f5331eaa
Y
YOLOX
项目概览
FL1623863129
/
YOLOX
与 Fork 源项目一致
从无法访问的项目Fork
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Y
YOLOX
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
未验证
提交
f5331eaa
编写于
7月 29, 2021
作者:
S
Songtao Liu
提交者:
GitHub
7月 29, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix bug of multi-machine training (#240)
fix(tools): fix bug of mm training
上级
d776311e
变更
6
隐藏空白更改
内联
并排
Showing 6 changed files with 38 additions and 32 deletions
+38
-32
README.md
README.md
+6
-0
tools/eval.py
tools/eval.py
+5
-10
tools/train.py
tools/train.py
+5
-7
yolox/core/launch.py
yolox/core/launch.py
+21
-9
yolox/core/trainer.py
yolox/core/trainer.py
+0
-5
yolox/evaluators/coco_evaluator.py
yolox/evaluators/coco_evaluator.py
+1
-1
未找到文件。
README.md
浏览文件 @
f5331eaa
...
...
@@ -106,6 +106,12 @@ python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o
* -b: total batch size, the recommended number for -b is num-gpu * 8
* --fp16: mixed precision training
**Multi Machine Training**
We also support multi-node training. Just add the following args:
* --num\_machines: num of your total training nodes
* --machine\_rank: specify the rank of each node
When using -f, the above commands are equivalent to:
```shell
...
...
tools/eval.py
浏览文件 @
f5331eaa
...
...
@@ -41,7 +41,7 @@ def make_parser():
"--local_rank"
,
default
=
0
,
type
=
int
,
help
=
"local rank for dist training"
)
parser
.
add_argument
(
"--num_machine"
,
default
=
1
,
type
=
int
,
help
=
"num of node for training"
"--num_machine
s
"
,
default
=
1
,
type
=
int
,
help
=
"num of node for training"
)
parser
.
add_argument
(
"--machine_rank"
,
default
=
0
,
type
=
int
,
help
=
"node rank for multi-node training"
...
...
@@ -104,9 +104,6 @@ def make_parser():
@
logger
.
catch
def
main
(
exp
,
args
,
num_gpu
):
if
not
args
.
experiment_name
:
args
.
experiment_name
=
exp
.
exp_name
if
args
.
seed
is
not
None
:
random
.
seed
(
args
.
seed
)
torch
.
manual_seed
(
args
.
seed
)
...
...
@@ -118,16 +115,11 @@ def main(exp, args, num_gpu):
is_distributed
=
num_gpu
>
1
# set environment variables for distributed training
configure_nccl
()
cudnn
.
benchmark
=
True
rank
=
args
.
local_rank
# rank = get_local_rank()
if
rank
==
0
:
if
os
.
path
.
exists
(
"./"
+
args
.
experiment_name
+
"ip_add.txt"
):
os
.
remove
(
"./"
+
args
.
experiment_name
+
"ip_add.txt"
)
file_name
=
os
.
path
.
join
(
exp
.
output_dir
,
args
.
experiment_name
)
if
rank
==
0
:
...
...
@@ -198,13 +190,16 @@ if __name__ == "__main__":
exp
=
get_exp
(
args
.
exp_file
,
args
.
name
)
exp
.
merge
(
args
.
opts
)
if
not
args
.
experiment_name
:
args
.
experiment_name
=
exp
.
exp_name
num_gpu
=
torch
.
cuda
.
device_count
()
if
args
.
devices
is
None
else
args
.
devices
assert
num_gpu
<=
torch
.
cuda
.
device_count
()
launch
(
main
,
num_gpu
,
args
.
num_machine
,
args
.
num_machine
s
,
args
.
machine_rank
,
backend
=
args
.
dist_backend
,
dist_url
=
args
.
dist_url
,
...
...
tools/train.py
浏览文件 @
f5331eaa
...
...
@@ -9,7 +9,6 @@ import torch.backends.cudnn as cudnn
from
yolox.core
import
Trainer
,
launch
from
yolox.exp
import
get_exp
from
yolox.utils
import
configure_nccl
import
argparse
import
random
...
...
@@ -57,7 +56,7 @@ def make_parser():
help
=
"resume training start epoch"
,
)
parser
.
add_argument
(
"--num_machine"
,
default
=
1
,
type
=
int
,
help
=
"num of node for training"
"--num_machine
s
"
,
default
=
1
,
type
=
int
,
help
=
"num of node for training"
)
parser
.
add_argument
(
"--machine_rank"
,
default
=
0
,
type
=
int
,
help
=
"node rank for multi-node training"
...
...
@@ -88,9 +87,6 @@ def make_parser():
@
logger
.
catch
def
main
(
exp
,
args
):
if
not
args
.
experiment_name
:
args
.
experiment_name
=
exp
.
exp_name
if
exp
.
seed
is
not
None
:
random
.
seed
(
exp
.
seed
)
torch
.
manual_seed
(
exp
.
seed
)
...
...
@@ -102,7 +98,6 @@ def main(exp, args):
)
# set environment variables for distributed training
configure_nccl
()
cudnn
.
benchmark
=
True
trainer
=
Trainer
(
exp
,
args
)
...
...
@@ -114,13 +109,16 @@ if __name__ == "__main__":
exp
=
get_exp
(
args
.
exp_file
,
args
.
name
)
exp
.
merge
(
args
.
opts
)
if
not
args
.
experiment_name
:
args
.
experiment_name
=
exp
.
exp_name
num_gpu
=
torch
.
cuda
.
device_count
()
if
args
.
devices
is
None
else
args
.
devices
assert
num_gpu
<=
torch
.
cuda
.
device_count
()
launch
(
main
,
num_gpu
,
args
.
num_machine
,
args
.
num_machine
s
,
args
.
machine_rank
,
backend
=
args
.
dist_backend
,
dist_url
=
args
.
dist_url
,
...
...
yolox/core/launch.py
浏览文件 @
f5331eaa
...
...
@@ -12,6 +12,7 @@ import torch.distributed as dist
import
torch.multiprocessing
as
mp
import
yolox.utils.dist
as
comm
from
yolox.utils
import
configure_nccl
import
os
import
subprocess
...
...
@@ -63,11 +64,13 @@ def launch(
os
.
environ
.
get
(
"MASTER_PORT"
,
"None"
),
)
local_rank
=
int
(
os
.
environ
.
get
(
"LOCAL_RANK"
,
"0"
))
world_size
=
int
(
os
.
environ
.
get
(
"WORLD_SIZE"
,
"1"
))
_distributed_worker
(
local_rank
,
main_func
,
world_size
,
num_gpus_per_machine
,
num_machines
,
machine_rank
,
backend
,
dist_url
,
...
...
@@ -99,29 +102,30 @@ def launch_by_subprocess(
assert
(
world_size
>
1
),
"subprocess mode doesn't support single GPU, use spawn mode instead"
machine_rank
=
int
(
os
.
getenv
(
"RLAUNCH_REPLICA"
,
machine_rank
))
if
dist_url
is
None
:
master_ip
=
subprocess
.
check_output
([
"hostname"
,
"--fqdn"
]).
decode
(
"utf-8"
)
master_ip
=
str
(
master_ip
).
strip
()
dist_url
=
"tcp://{}"
.
format
(
master_ip
)
# ------------------------hack for multi-machine training -------------------- #
if
num_machines
>
1
:
ip_add_file
=
"./"
+
args
[
1
].
experiment_name
+
"ip_add.txt"
master_ip
=
subprocess
.
check_output
([
"hostname"
,
"--fqdn"
]).
decode
(
"utf-8"
)
master_ip
=
str
(
master_ip
).
strip
()
dist_url
=
"tcp://{}"
.
format
(
master_ip
)
ip_add_file
=
"./"
+
args
[
1
].
experiment_name
+
"_ip_add.txt"
if
machine_rank
==
0
:
port
=
_find_free_port
()
with
open
(
ip_add_file
,
"w"
)
as
ip_add
:
ip_add
.
write
(
dist_url
)
ip_add
.
write
(
dist_url
+
'
\n
'
)
ip_add
.
write
(
str
(
port
))
else
:
while
not
os
.
path
.
exists
(
ip_add_file
):
time
.
sleep
(
0.5
)
with
open
(
ip_add_file
,
"r"
)
as
ip_add
:
dist_url
=
ip_add
.
readline
()
dist_url
=
ip_add
.
readline
().
strip
()
port
=
ip_add
.
readline
()
else
:
dist_url
=
"tcp://127.0.0.1"
port
=
_find_free_port
()
port
=
_find_free_port
()
# set PyTorch distributed related environmental variables
current_env
=
os
.
environ
.
copy
()
current_env
[
"MASTER_ADDR"
]
=
dist_url
...
...
@@ -166,6 +170,7 @@ def _distributed_worker(
main_func
,
world_size
,
num_gpus_per_machine
,
num_machines
,
machine_rank
,
backend
,
dist_url
,
...
...
@@ -174,6 +179,7 @@ def _distributed_worker(
assert
(
torch
.
cuda
.
is_available
()
),
"cuda is not available. Please check your installation."
configure_nccl
()
global_rank
=
machine_rank
*
num_gpus_per_machine
+
local_rank
logger
.
info
(
"Rank {} initialization finished."
.
format
(
global_rank
))
try
:
...
...
@@ -190,10 +196,16 @@ def _distributed_worker(
# See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
comm
.
synchronize
()
if
global_rank
==
0
and
os
.
path
.
exists
(
"./"
+
args
[
1
].
experiment_name
+
"_ip_add.txt"
):
os
.
remove
(
"./"
+
args
[
1
].
experiment_name
+
"_ip_add.txt"
)
assert
num_gpus_per_machine
<=
torch
.
cuda
.
device_count
()
torch
.
cuda
.
set_device
(
local_rank
)
args
[
1
].
local_rank
=
local_rank
args
[
1
].
num_machines
=
num_machines
# Setup the local process group (which contains ranks within the same machine)
# assert comm._LOCAL_PROCESS_GROUP is None
...
...
yolox/core/trainer.py
浏览文件 @
f5331eaa
...
...
@@ -55,11 +55,6 @@ class Trainer:
self
.
meter
=
MeterBuffer
(
window_size
=
exp
.
print_interval
)
self
.
file_name
=
os
.
path
.
join
(
exp
.
output_dir
,
args
.
experiment_name
)
if
self
.
rank
==
0
and
os
.
path
.
exists
(
"./"
+
args
.
experiment_name
+
"ip_add.txt"
):
os
.
remove
(
"./"
+
args
.
experiment_name
+
"ip_add.txt"
)
if
self
.
rank
==
0
:
os
.
makedirs
(
self
.
file_name
,
exist_ok
=
True
)
...
...
yolox/evaluators/coco_evaluator.py
浏览文件 @
f5331eaa
...
...
@@ -206,7 +206,7 @@ class COCOEvaluator:
try
:
from
yolox.layers
import
COCOeval_opt
as
COCOeval
except
ImportError
:
from
.cocoeval_mr
import
COCOeval
from
pycocotools
import
cocoeval
as
COCOeval
logger
.
warning
(
"Use standard COCOeval."
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录