weixin_41840029 / PaddleOCR (forked from PaddlePaddle / PaddleOCR)
Commit 3ffaf7f2
Authored on Dec 18, 2021 by 文幕地方

add distributed train support

Parent: b069a091
Showing 2 changed files with 71 additions and 72 deletions (+71 −72)
ppstructure/vqa/train_re.py   +34 −31
ppstructure/vqa/train_ser.py  +37 −41
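Both scripts apply the same pattern: read the process rank and world size once at the top of train(), initialize the parallel environment and wrap the model in paddle.DataParallel only when more than one process is running, and gate logging and checkpoint saving on rank 0. A minimal standalone sketch of that pattern follows; the helper name and the launch command in the comment are illustrative assumptions, not part of this commit.

import paddle


def setup_distributed(model):
    # Typical multi-GPU launch (assumed usage, not taken from this diff):
    #   python3 -m paddle.distributed.launch --gpus "0,1,2,3" train_ser.py ...
    rank = paddle.distributed.get_rank()
    distributed = paddle.distributed.get_world_size() > 1
    if distributed:
        # Join the collective communication group started by the launcher.
        paddle.distributed.init_parallel_env()
        # DataParallel synchronizes gradients across processes during backward.
        model = paddle.DataParallel(model)
    return model, rank, distributed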
ppstructure/vqa/train_re.py

@@ -36,6 +36,9 @@ from ppocr.utils.logging import get_logger
 def train(args):
     logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
+    rank = paddle.distributed.get_rank()
+    distributed = paddle.distributed.get_world_size() > 1
+
     print_arguments(args, logger)

     # Added here for reproducibility (even between python 2 and 3)
@@ -45,7 +48,7 @@ def train(args):
     pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

     # dist mode
-    if paddle.distributed.get_world_size() > 1:
+    if distributed:
         paddle.distributed.init_parallel_env()

     tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
@@ -59,8 +62,8 @@ def train(args):
         args.model_name_or_path)

     # dist mode
-    if paddle.distributed.get_world_size() > 1:
-        model = paddle.distributed.DataParallel(model)
+    if distributed:
+        model = paddle.DataParallel(model)

     train_dataset = XFUNDataset(
         tokenizer,
@@ -90,8 +93,7 @@ def train(args):
     train_sampler = paddle.io.DistributedBatchSampler(
         train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)

-    args.train_batch_size = args.per_gpu_train_batch_size * \
-        max(1, paddle.distributed.get_world_size())
+
     train_dataloader = paddle.io.DataLoader(
         train_dataset,
         batch_sampler=train_sampler,
@@ -136,7 +138,8 @@ def train(args):
             args.per_gpu_train_batch_size))
     logger.info(
         " Total train batch size (w. parallel, distributed & accumulation) = {}".
-        format(args.train_batch_size * paddle.distributed.get_world_size()))
+        format(args.per_gpu_train_batch_size *
+               paddle.distributed.get_world_size()))
     logger.info(" Total optimization steps = {}".format(t_total))

     global_step = 0
@@ -170,7 +173,7 @@ def train(args):
             global_step += 1
             total_samples += batch['image'].shape[0]

-            if step % print_step == 0:
+            if rank == 0 and step % print_step == 0:
                 logger.info(
                     "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                     format(epoch, args.num_train_epochs, step,
@@ -185,38 +188,38 @@ def train(args):
                 train_run_cost = 0.0
                 total_samples = 0

-            if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
-                    global_step % args.eval_steps == 0):
+            if rank == 0 and args.eval_steps > 0 and global_step % args.eval_steps == 0 and args.evaluate_during_training:
                 # Log metrics
-                if (paddle.distributed.get_rank() == 0 and
-                        args.evaluate_during_training):
-                    # Only evaluate when single GPU otherwise metrics may not average well
-                    results = evaluate(model, eval_dataloader, logger)
-                    if results['f1'] >= best_metirc['f1']:
-                        best_metirc = results
-                        output_dir = os.path.join(args.output_dir, "best_model")
-                        os.makedirs(output_dir, exist_ok=True)
+                # Only evaluate when single GPU otherwise metrics may not average well
+                results = evaluate(model, eval_dataloader, logger)
+                if results['f1'] >= best_metirc['f1']:
+                    best_metirc = results
+                    output_dir = os.path.join(args.output_dir, "best_model")
+                    os.makedirs(output_dir, exist_ok=True)
+                    if distributed:
+                        model._layers.save_pretrained(output_dir)
+                    else:
                         model.save_pretrained(output_dir)
-                        tokenizer.save_pretrained(output_dir)
-                        paddle.save(args,
-                                    os.path.join(output_dir,
-                                                 "training_args.bin"))
-                        logger.info("Saving model checkpoint to {}".format(
-                            output_dir))
-                    logger.info("eval results: {}".format(results))
-                    logger.info("best_metirc: {}".format(best_metirc))
+                    tokenizer.save_pretrained(output_dir)
+                    paddle.save(args,
+                                os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to {}".format(
+                        output_dir))
+                logger.info("eval results: {}".format(results))
+                logger.info("best_metirc: {}".format(best_metirc))
             reader_start = time.time()

-        if paddle.distributed.get_rank() == 0:
+        if rank == 0:
             # Save model checkpoint
             output_dir = os.path.join(args.output_dir, "latest_model")
             os.makedirs(output_dir, exist_ok=True)
-            if paddle.distributed.get_rank() == 0:
+            if distributed:
+                model._layers.save_pretrained(output_dir)
+            else:
                 model.save_pretrained(output_dir)
-                tokenizer.save_pretrained(output_dir)
-                paddle.save(args,
-                            os.path.join(output_dir, "training_args.bin"))
-                logger.info("Saving model checkpoint to {}".format(output_dir))
+            tokenizer.save_pretrained(output_dir)
+            paddle.save(args, os.path.join(output_dir, "training_args.bin"))
+            logger.info("Saving model checkpoint to {}".format(output_dir))
             reader_start = time.time()

     logger.info("best_metirc: {}".format(best_metirc))
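Both training scripts now save checkpoints through the same branch: once the model has been wrapped in paddle.DataParallel, the underlying network is reachable through the wrapper's _layers attribute, so save_pretrained must be called on model._layers in the multi-process case. A small illustrative helper (the function name is an assumption, not part of the diff):

import os


def save_checkpoint(model, tokenizer, output_dir, distributed):
    # Save model weights and tokenizer files, unwrapping DataParallel if needed.
    os.makedirs(output_dir, exist_ok=True)
    if distributed:
        # paddle.DataParallel keeps the wrapped network in `_layers`.
        model._layers.save_pretrained(output_dir)
    else:
        model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)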
ppstructure/vqa/train_ser.py

@@ -37,6 +37,9 @@ from ppocr.utils.logging import get_logger
 def train(args):
     os.makedirs(args.output_dir, exist_ok=True)
+    rank = paddle.distributed.get_rank()
+    distributed = paddle.distributed.get_world_size() > 1
+
     logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
     print_arguments(args, logger)
@@ -44,7 +47,7 @@ def train(args):
     pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

     # dist mode
-    if paddle.distributed.get_world_size() > 1:
+    if distributed:
         paddle.distributed.init_parallel_env()

     tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
@@ -59,7 +62,7 @@ def train(args):
         args.model_name_or_path)

     # dist mode
-    if paddle.distributed.get_world_size() > 1:
+    if distributed:
         model = paddle.DataParallel(model)

     train_dataset = XFUNDataset(
@@ -88,9 +91,6 @@ def train(args):
     train_sampler = paddle.io.DistributedBatchSampler(
         train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)

-    args.train_batch_size = args.per_gpu_train_batch_size * max(
-        1, paddle.distributed.get_world_size())
-
     train_dataloader = paddle.io.DataLoader(
         train_dataset,
         batch_sampler=train_sampler,
@@ -134,7 +134,7 @@ def train(args):
         args.per_gpu_train_batch_size)
     logger.info(
         " Total train batch size (w. parallel, distributed) = %d",
-        args.train_batch_size * paddle.distributed.get_world_size(), )
+        args.per_gpu_train_batch_size * paddle.distributed.get_world_size(), )
     logger.info(" Total optimization steps = %d", t_total)

     global_step = 0
@@ -168,7 +168,7 @@ def train(args):
             global_step += 1
             total_samples += batch['image'].shape[0]

-            if step % print_step == 0:
+            if rank == 0 and step % print_step == 0:
                 logger.info(
                     "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                     format(epoch_id, args.num_train_epochs, step,
@@ -183,47 +183,43 @@ def train(args):
                 train_run_cost = 0.0
                 total_samples = 0

-            if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
-                    global_step % args.eval_steps == 0):
+            if rank == 0 and args.eval_steps > 0 and global_step % args.eval_steps == 0 and args.evaluate_during_training:
                 # Log metrics
                 # Only evaluate when single GPU otherwise metrics may not average well
-                if paddle.distributed.get_rank(
-                ) == 0 and args.evaluate_during_training:
-                    results, _ = evaluate(args, model, tokenizer,
-                                          eval_dataloader, label2id_map,
-                                          id2label_map, pad_token_label_id,
-                                          logger)
-                    if best_metrics is None or results["f1"] >= best_metrics[
-                            "f1"]:
+                results, _ = evaluate(args, model, tokenizer, eval_dataloader,
+                                      label2id_map, id2label_map,
+                                      pad_token_label_id, logger)
+                if best_metrics is None or results["f1"] >= best_metrics["f1"]:
                     best_metrics = copy.deepcopy(results)
                     output_dir = os.path.join(args.output_dir, "best_model")
                     os.makedirs(output_dir, exist_ok=True)
-                    if paddle.distributed.get_rank() == 0:
+                    if distributed:
+                        model._layers.save_pretrained(output_dir)
+                    else:
                         model.save_pretrained(output_dir)
-                        tokenizer.save_pretrained(output_dir)
-                        paddle.save(args,
-                                    os.path.join(output_dir,
-                                                 "training_args.bin"))
-                        logger.info("Saving model checkpoint to %s",
-                                    output_dir)
+                    tokenizer.save_pretrained(output_dir)
+                    paddle.save(args,
+                                os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to %s", output_dir)
                 logger.info("[epoch {}/{}][iter: {}/{}] results: {}".format(
                     epoch_id, args.num_train_epochs, step,
                     len(train_dataloader), results))
                 if best_metrics is not None:
                     logger.info("best metrics: {}".format(best_metrics))
             reader_start = time.time()

-        if paddle.distributed.get_rank() == 0:
+        if rank == 0:
             # Save model checkpoint
             output_dir = os.path.join(args.output_dir, "latest_model")
             os.makedirs(output_dir, exist_ok=True)
-            if paddle.distributed.get_rank() == 0:
+            if distributed:
+                model._layers.save_pretrained(output_dir)
+            else:
                 model.save_pretrained(output_dir)
-                tokenizer.save_pretrained(output_dir)
-                paddle.save(args,
-                            os.path.join(output_dir, "training_args.bin"))
+            tokenizer.save_pretrained(output_dir)
+            paddle.save(args, os.path.join(output_dir, "training_args.bin"))
             logger.info("Saving model checkpoint to %s", output_dir)
             reader_start = time.time()

     return global_step, tr_loss / global_step
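The removed args.train_batch_size = args.per_gpu_train_batch_size * max(1, paddle.distributed.get_world_size()) assignments are no longer needed: paddle.io.DistributedBatchSampler already shards the dataset across processes and feeds each one per_gpu_train_batch_size samples per step, so the effective global batch size is the per-GPU batch size times the world size, which the logging lines now compute on the fly. A minimal runnable sketch of that data path, with placeholder data and batch size:

import paddle

# Placeholder data: 80 samples with 4 features each, plus integer labels.
feats = paddle.randn([80, 4])
labels = paddle.randint(0, 2, [80])
dataset = paddle.io.TensorDataset([feats, labels])

# Each process draws `batch_size` samples per step from its own shard,
# so the global batch is batch_size * world_size with no manual scaling.
sampler = paddle.io.DistributedBatchSampler(dataset, batch_size=8, shuffle=True)
loader = paddle.io.DataLoader(dataset, batch_sampler=sampler)

for batch_feats, batch_labels in loader:
    print(batch_feats.shape)  # [8, 4] on every process
    break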