Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
曾经的那一瞬间
Models
提交
b63030a3
M
Models
项目概览
曾经的那一瞬间
/
Models
大约 1 年 前同步成功
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
Models
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
b63030a3
编写于
10月 10, 2019
作者:
Y
Yeqing Li
提交者:
A. Unique TensorFlower
10月 10, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Perfzero RetinaLNet detection COCO AP test on 8 GPUs.
PiperOrigin-RevId: 274035928
上级
adc3e31f
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
237 addition
and
4 deletion
+237
-4
official/benchmark/retinanet_benchmark.py
official/benchmark/retinanet_benchmark.py
+231
-0
official/modeling/training/distributed_executor.py
official/modeling/training/distributed_executor.py
+4
-2
official/vision/detection/main.py
official/vision/detection/main.py
+2
-2
未找到文件。
official/benchmark/retinanet_benchmark.py
0 → 100644
浏览文件 @
b63030a3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes RetinaNet benchmarks and accuracy tests."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
# pylint: disable=g-bad-import-order
import
copy
import
json
import
os
import
time
from
absl
import
flags
from
absl.testing
import
flagsaver
import
tensorflow
as
tf
# pylint: enable=g-bad-import-order
from
official.utils.flags
import
core
as
flags_core
from
official.benchmark
import
bert_benchmark_utils
as
benchmark_utils
from
official.vision.detection
import
main
as
detection
# Handle on the global absl flag registry; the benchmark classes in this
# module assign flag values through it.
FLAGS = flags.FLAGS

# Locations (gs:// URLs) of the COCO inputs and the ResNet checkpoint that
# the RetinaNet benchmark classes store as their defaults.
# pylint: disable=line-too-long
COCO_TRAIN_DATA = 'gs://tf-perfzero-data/coco/train*'
COCO_EVAL_DATA = 'gs://tf-perfzero-data/coco/val*'
COCO_EVAL_JSON = 'gs://tf-perfzero-data/coco/instances_val2017.json'
RESNET_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07'
# pylint: enable=line-too-long
class DetectionBenchmarkBase(tf.test.Benchmark):
  """Base class to hold methods common to test classes.

  Handles the output directory, saving/restoring absl flag values between
  benchmark methods, and assembling the metrics passed to
  `self.report_benchmark`.
  """

  # Flag values captured the first time `_setup` runs. Stored on the class so
  # every later `_setup` call restores the same snapshot.
  local_flags = None

  def __init__(self, output_dir=None):
    """Initializes the benchmark.

    Args:
      output_dir: Directory under which model directories are created. When
        empty or None, '/tmp' is used.
    """
    self.num_gpus = 8

    if not output_dir:
      output_dir = '/tmp'
    self.output_dir = output_dir
    self.timer_callback = None

  def _get_model_dir(self, folder_name):
    """Returns directory to store info, e.g. saved model and event log."""
    return os.path.join(self.output_dir, folder_name)

  def _setup(self):
    """Sets up and resets flags before each test."""
    self.timer_callback = benchmark_utils.BenchmarkTimerCallback()

    if DetectionBenchmarkBase.local_flags is None:
      # Loads flags to get defaults to then override. List cannot be empty.
      flags.FLAGS(['foo'])
      saved_flag_values = flagsaver.save_flag_values()
      DetectionBenchmarkBase.local_flags = saved_flag_values
    else:
      flagsaver.restore_flag_values(DetectionBenchmarkBase.local_flags)

  def _report_benchmark(self,
                        stats,
                        wall_time_sec,
                        min_ap,
                        max_ap,
                        train_batch_size=None):
    """Report benchmark results by writing to local protobuf file.

    Args:
      stats: dict returned from Detection models with known entries. The keys
        'total_loss' and 'total_steps' are always read; 'AP' is read when
        'eval_metrics' is present.
      wall_time_sec: the duration of the benchmark execution in seconds
      min_ap: Minimum detection AP constraint to verify correctness of the
        model.
      max_ap: Maximum detection AP accuracy constraint to verify correctness of
        the model.
      train_batch_size: Train batch size. It is needed for computing
        exp_per_second. When None, `FLAGS.train_batch_size` is used instead.
    """
    metrics = [{
        'name': 'total_loss',
        'value': stats['total_loss'],
    }]

    if self.timer_callback:
      # Honor the explicit argument; fall back to the flag only when the
      # caller did not supply a batch size.
      batch_size = train_batch_size
      if batch_size is None:
        batch_size = FLAGS.train_batch_size
      metrics.append({
          'name': 'exp_per_second',
          'value': self.timer_callback.get_examples_per_sec(batch_size)
      })
    else:
      metrics.append({
          'name': 'exp_per_second',
          'value': 0.0,
      })

    # NOTE(review): the guard tests for 'eval_metrics' while the value read is
    # stats['AP'] -- confirm which key the eval results actually carry.
    if 'eval_metrics' in stats:
      metrics.append({
          'name': 'AP',
          'value': stats['AP'],
          'min_value': min_ap,
          'max_value': max_ap,
      })

    flags_str = flags_core.get_nondefault_flags_as_str()
    self.report_benchmark(
        iters=stats['total_steps'],
        wall_time=wall_time_sec,
        metrics=metrics,
        extras={'flags': flags_str})
class RetinanetBenchmarkBase(DetectionBenchmarkBase):
  """Shared plumbing for the RetinaNet benchmark classes in this module."""

  def __init__(self, output_dir=None, **kwargs):
    """Records the default data/checkpoint locations, then defers to the base.

    Args:
      output_dir: Forwarded unchanged to `DetectionBenchmarkBase.__init__`.
      **kwargs: Accepted but not used by this constructor.
    """
    (self.train_data_path,
     self.eval_data_path,
     self.eval_json_path,
     self.resnet_checkpoint_path) = (
         COCO_TRAIN_DATA,
         COCO_EVAL_DATA,
         COCO_EVAL_JSON,
         RESNET_CHECKPOINT_PATH,
     )
    super(RetinanetBenchmarkBase, self).__init__(output_dir=output_dir)

  def _run_detection_main(self):
    """Invokes the detection entry point and returns whatever it returns."""
    placeholder_argv = 'unused_argv'
    return detection.main(placeholder_argv)
class RetinanetAccuracy(RetinanetBenchmarkBase):
  """Accuracy test for RetinaNet model.

  Tests RetinaNet detection task model accuracy. Test case names follow the
  `benchmark_(number of gpus)_gpu_(dataset type)` format.
  """

  def __init__(self, output_dir=None, **kwargs):
    """Forwards `output_dir` to the parent; `**kwargs` is accepted but unused."""
    super(RetinanetAccuracy, self).__init__(output_dir=output_dir)

  def _run_and_report_benchmark(self, min_ap=0.325, max_ap=0.35):
    """Trains, evaluates, and reports one RetinaNet benchmark run.

    Only the training pass is included in the reported wall time.

    Args:
      min_ap: Lower AP bound handed to `_report_benchmark`.
      max_ap: Upper AP bound handed to `_report_benchmark`.
    """
    began_at = time.time()
    FLAGS.mode = 'train'
    run_stats, _ = self._run_detection_main()
    elapsed_sec = time.time() - began_at

    FLAGS.mode = 'eval'
    run_stats.update(self._run_detection_main())

    # Batch size and step count come from the overrides built in `_setup`.
    train_overrides = self.params_override['train']
    run_stats['train_batch_size'] = train_overrides['batch_size']
    run_stats['total_steps'] = train_overrides['total_steps']

    super(RetinanetAccuracy, self)._report_benchmark(
        stats=run_stats,
        wall_time_sec=elapsed_sec,
        min_ap=min_ap,
        max_ap=max_ap)

  def _setup(self):
    """Resets flags via the parent and installs the default param overrides."""
    super(RetinanetAccuracy, self)._setup()
    FLAGS.strategy_type = 'mirrored'
    FLAGS.model = 'retinanet'

    train_section = {
        'batch_size': 64,
        'iterations_per_loop': 100,
        'total_steps': 22500,
        'train_file_pattern': self.train_data_path,
    }
    eval_section = {
        'batch_size': 8,
        'eval_samples': 5000,
        'val_json_file': self.eval_json_path,
        'eval_file_pattern': self.eval_data_path,
    }
    self.params_override = {
        'train': train_section,
        'eval': eval_section,
    }

  @flagsaver.flagsaver
  def benchmark_8_gpu_coco(self):
    """Run RetinaNet model accuracy test with 8 GPUs."""
    self._setup()
    # Serialize a private copy so the instance's overrides stay untouched.
    FLAGS.params_override = json.dumps(copy.deepcopy(self.params_override))
    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_coco')
    # Sets timer_callback to None as we do not use it now.
    self.timer_callback = None
    self._run_and_report_benchmark()
class RetinanetBenchmarkReal(RetinanetAccuracy):
  """Short benchmark performance tests for RetinaNet model.

  Tests RetinaNet performance in different GPU configurations.
  The naming convention of below test cases follow
  `benchmark_(number of gpus)_gpu` format.
  """

  def __init__(self, output_dir=None, **kwargs):
    """Forwards `output_dir` to the parent; `**kwargs` is accepted but unused."""
    super(RetinanetBenchmarkReal, self).__init__(output_dir=output_dir)

  @flagsaver.flagsaver
  def benchmark_8_gpu_coco(self):
    """Run a shortened RetinaNet benchmark with 8 GPUs."""
    self._setup()
    params = copy.deepcopy(self.params_override)
    params['train']['total_steps'] = 1875  # One epoch.
    params['train']['iterations_per_loop'] = 125
    params['eval']['eval_samples'] = 8
    # `_run_and_report_benchmark` reads the reported step count and batch size
    # from `self.params_override`. Store the shortened settings there so the
    # report matches what was actually run; `_setup` rebuilds the attribute
    # at the start of every benchmark method, so this does not leak.
    self.params_override = params
    FLAGS.params_override = json.dumps(params)
    FLAGS.model_dir = self._get_model_dir('real_benchmark_8_gpu_coco')
    # Sets timer_callback to None as we do not use it now.
    self.timer_callback = None
    self._run_and_report_benchmark()
# Hand control to the TensorFlow test runner when executed as a script.
if __name__ == '__main__':
  tf.test.main()
official/modeling/training/distributed_executor.py
浏览文件 @
b63030a3
...
...
@@ -387,7 +387,7 @@ class DistributedExecutor(object):
save_config: bool. Whether to save params to model_dir.
Returns:
The train
ed keras model
.
The train
ing loss and eval metrics
.
"""
assert
train_input_fn
is
not
None
if
train_metric_fn
and
not
callable
(
train_metric_fn
):
...
...
@@ -407,6 +407,8 @@ class DistributedExecutor(object):
# To reduce unnecessary send/receive input pipeline operation, we place
# input pipeline ops in worker task.
train_iterator
=
self
.
_get_input_iterator
(
train_input_fn
,
strategy
)
train_loss
=
None
eval_metric_result
=
None
with
strategy
.
scope
():
# To correctly place the model weights on accelerators,
# model and optimizer should be created in scope.
...
...
@@ -520,7 +522,7 @@ class DistributedExecutor(object):
test_summary_writer
(
metrics
=
eval_metric_result
,
step
=
optimizer
.
iterations
)
return
model
return
train_loss
,
eval_metric_result
def
_run_evaluation
(
self
,
test_step
,
current_training_step
,
metric
,
test_iterator
):
...
...
official/vision/detection/main.py
浏览文件 @
b63030a3
...
...
@@ -121,7 +121,7 @@ def run_executor(params, train_input_fn=None, eval_input_fn=None):
logging
.
info
(
'Final eval metric %s: %f'
,
k
,
v
)
return
results
else
:
tf
.
logging
.
info
(
'Mode not found: %s.'
%
FLAGS
.
mode
)
raise
ValueError
(
'Mode not found: %s.'
%
FLAGS
.
mode
)
def
main
(
argv
):
...
...
@@ -170,7 +170,7 @@ def main(argv):
mode
=
input_reader
.
ModeKeys
.
PREDICT_WITH_GT
,
batch_size
=
params
.
eval
.
batch_size
,
num_examples
=
params
.
eval
.
eval_samples
)
run_executor
(
r
eturn
r
un_executor
(
params
,
train_input_fn
=
train_input_fn
,
eval_input_fn
=
eval_input_fn
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录