提交 1494c9a3 编写于 作者: Z zhengya01 提交者: jerrywgz

Add ce (#1603)

* add ce for faster_rcnn

* add ce for faster_rcnn

* add ce

* add ce

* update faster_rcnn

* update faster_rcnn
上级 665de4a3
#!/bin/bash
# Continuous-evaluation (CE) launcher: runs the same short training job twice,
# first on a single card and then on multiple cards, piping each training log
# into _ce.py.

# Limit MKL / OpenMP to one thread each.
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1

# Run one CE training pass on the GPU ids given in $1 (comma-separated list)
# and feed the training output to _ce.py through a pipe.
run_ce_training() {
    cudaid="$1"
    export CUDA_VISIBLE_DEVICES="$cudaid"
    FLAGS_benchmark=true python train.py --model_save_dir=output/ --data_dir=dataset/coco/ --max_iter=10 --enable_ce --pretrained_model=./imagenet_resnet50_fusebn | python _ce.py
}

# Single-card run: card list comes from $face_detection, defaulting to card 0.
run_ce_training "${face_detection:=0}"

# Multi-card run: card list comes from $face_detection_m, defaulting to 0,1,2,3.
run_ce_training "${face_detection_m:=0,1,2,3}"
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
# KPI objects, one duration KPI and one loss KPI for each of the "card1" and
# "card4" name suffixes.
# NOTE(review): the meaning of the positional arguments (0.08, 0) is defined by
# the external `kpi` module and is not visible here -- confirm before changing.
each_pass_duration_card1_kpi = DurationKpi(
    'each_pass_duration_card1', 0.08, 0, actived=True)
train_loss_card1_kpi = CostKpi(
    'train_loss_card1', 0.08, 0)
each_pass_duration_card4_kpi = DurationKpi(
    'each_pass_duration_card4', 0.08, 0, actived=True)
train_loss_card4_kpi = CostKpi(
    'train_loss_card4', 0.08, 0)

# Every KPI that log_to_ce() is able to update, looked up there by `.name`.
tracking_kpis = [
    each_pass_duration_card1_kpi,
    train_loss_card1_kpi,
    each_pass_duration_card4_kpi,
    train_loss_card4_kpi,
]
def parse_log(log):
    """Yield ``(kpi_name, kpi_value)`` pairs found in a training log.

    The log is split on newlines.  A line counts as a KPI record when, after
    surrounding whitespace is stripped, it has exactly three tab-separated
    fields and the first field is the literal ``kpis``, e.g.::

        kpis<TAB>train_loss_card1<TAB>1.0

    The field list of every line (record or not) is printed to stdout.

    Args:
        log: the complete log text as one string.

    Yields:
        Tuples of the KPI name (second field, as a string) and the KPI value
        (third field, converted with ``float``).

    Raises:
        ValueError: when the third field of a KPI record is not a valid float.
    """
    for raw_line in log.split('\n'):
        fields = raw_line.strip().split('\t')
        print(fields)
        if len(fields) != 3:
            continue
        tag, name, value = fields
        if tag != 'kpis':
            continue
        yield name, float(value)
def log_to_ce(log):
    """Record every KPI value parsed from *log* on its tracked KPI object.

    Builds a name -> KPI lookup from ``tracking_kpis``, then for each
    ``(name, value)`` pair produced by ``parse_log(log)`` prints the pair,
    adds the value to the matching KPI and persists that KPI right away.

    Args:
        log: the complete log text as one string.

    Raises:
        KeyError: when the log contains a KPI name that is not present in
            ``tracking_kpis``.
    """
    kpis_by_name = {kpi.name: kpi for kpi in tracking_kpis}

    for name, value in parse_log(log):
        print(name, value)
        tracked = kpis_by_name[name]
        tracked.add_record(value)
        # Persist after each record, not once at the end.
        tracked.persist()
if __name__ == '__main__':
    # Script entry point: read the entire log from stdin and record its KPIs.
    log_to_ce(sys.stdin.read())
...@@ -35,7 +35,7 @@ def train(): ...@@ -35,7 +35,7 @@ def train():
learning_rate = cfg.learning_rate learning_rate = cfg.learning_rate
image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size] image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]
if cfg.debug: if cfg.debug or cfg.enable_ce:
fluid.default_startup_program().random_seed = 1000 fluid.default_startup_program().random_seed = 1000
fluid.default_main_program().random_seed = 1000 fluid.default_main_program().random_seed = 1000
import random import random
...@@ -46,11 +46,14 @@ def train(): ...@@ -46,11 +46,14 @@ def train():
devices_num = len(devices.split(",")) devices_num = len(devices.split(","))
total_batch_size = devices_num * cfg.TRAIN.im_per_batch total_batch_size = devices_num * cfg.TRAIN.im_per_batch
use_random = True
if cfg.enable_ce:
use_random = False
model = model_builder.FasterRCNN( model = model_builder.FasterRCNN(
add_conv_body_func=resnet.add_ResNet50_conv4_body, add_conv_body_func=resnet.add_ResNet50_conv4_body,
add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head, add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
use_pyreader=cfg.use_pyreader, use_pyreader=cfg.use_pyreader,
use_random=True) use_random=use_random)
model.build_model(image_shape) model.build_model(image_shape)
loss_cls, loss_bbox, rpn_cls_loss, rpn_reg_loss = model.loss() loss_cls, loss_bbox, rpn_cls_loss, rpn_reg_loss = model.loss()
loss_cls.persistable = True loss_cls.persistable = True
...@@ -92,16 +95,19 @@ def train(): ...@@ -92,16 +95,19 @@ def train():
train_exe = fluid.ParallelExecutor( train_exe = fluid.ParallelExecutor(
use_cuda=bool(cfg.use_gpu), loss_name=loss.name) use_cuda=bool(cfg.use_gpu), loss_name=loss.name)
shuffle = True
if cfg.enable_ce:
shuffle = False
if cfg.use_pyreader: if cfg.use_pyreader:
train_reader = reader.train( train_reader = reader.train(
batch_size=cfg.TRAIN.im_per_batch, batch_size=cfg.TRAIN.im_per_batch,
total_batch_size=total_batch_size, total_batch_size=total_batch_size,
padding_total=cfg.TRAIN.padding_minibatch, padding_total=cfg.TRAIN.padding_minibatch,
shuffle=True) shuffle=shuffle)
py_reader = model.py_reader py_reader = model.py_reader
py_reader.decorate_paddle_reader(train_reader) py_reader.decorate_paddle_reader(train_reader)
else: else:
train_reader = reader.train(batch_size=total_batch_size, shuffle=True) train_reader = reader.train(batch_size=total_batch_size, shuffle=shuffle)
feeder = fluid.DataFeeder(place=place, feed_list=model.feeds()) feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
def save_model(postfix): def save_model(postfix):
...@@ -118,6 +124,8 @@ def train(): ...@@ -118,6 +124,8 @@ def train():
try: try:
start_time = time.time() start_time = time.time()
prev_start_time = start_time prev_start_time = start_time
total_time = 0
last_loss = 0
every_pass_loss = [] every_pass_loss = []
for iter_id in range(cfg.max_iter): for iter_id in range(cfg.max_iter):
prev_start_time = start_time prev_start_time = start_time
...@@ -131,9 +139,23 @@ def train(): ...@@ -131,9 +139,23 @@ def train():
iter_id, lr[0], iter_id, lr[0],
smoothed_loss.get_median_value( smoothed_loss.get_median_value(
), start_time - prev_start_time)) ), start_time - prev_start_time))
end_time = time.time()
total_time += end_time - start_time
last_loss = np.mean(np.array(losses[0]))
sys.stdout.flush() sys.stdout.flush()
if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0: if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
save_model("model_iter{}".format(iter_id)) save_model("model_iter{}".format(iter_id))
# only for ce
if cfg.enable_ce:
gpu_num = devices_num
epoch_idx = iter_id + 1
loss = last_loss
print("kpis\teach_pass_duration_card%s\t%s" %
(gpu_num, total_time / epoch_idx))
print("kpis\ttrain_loss_card%s\t%s" %
(gpu_num, loss))
except fluid.core.EOFException: except fluid.core.EOFException:
py_reader.reset() py_reader.reset()
return np.mean(every_pass_loss) return np.mean(every_pass_loss)
...@@ -142,6 +164,8 @@ def train(): ...@@ -142,6 +164,8 @@ def train():
start_time = time.time() start_time = time.time()
prev_start_time = start_time prev_start_time = start_time
start = start_time start = start_time
total_time = 0
last_loss = 0
every_pass_loss = [] every_pass_loss = []
smoothed_loss = SmoothedValue(cfg.log_window) smoothed_loss = SmoothedValue(cfg.log_window)
for iter_id, data in enumerate(train_reader()): for iter_id, data in enumerate(train_reader()):
...@@ -154,6 +178,9 @@ def train(): ...@@ -154,6 +178,9 @@ def train():
smoothed_loss.add_value(loss_v) smoothed_loss.add_value(loss_v)
lr = np.array(fluid.global_scope().find_var('learning_rate') lr = np.array(fluid.global_scope().find_var('learning_rate')
.get_tensor()) .get_tensor())
end_time = time.time()
total_time += end_time - start_time
last_loss = loss_v
print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format( print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
iter_id, lr[0], iter_id, lr[0],
smoothed_loss.get_median_value(), start_time - prev_start_time)) smoothed_loss.get_median_value(), start_time - prev_start_time))
...@@ -162,6 +189,16 @@ def train(): ...@@ -162,6 +189,16 @@ def train():
save_model("model_iter{}".format(iter_id)) save_model("model_iter{}".format(iter_id))
if (iter_id + 1) == cfg.max_iter: if (iter_id + 1) == cfg.max_iter:
break break
# only for ce
if cfg.enable_ce:
gpu_num = devices_num
epoch_idx = iter_id + 1
loss = last_loss
print("kpis\teach_pass_duration_card%s\t%s" %
(gpu_num, total_time / epoch_idx))
print("kpis\ttrain_loss_card%s\t%s" %
(gpu_num, loss))
return np.mean(every_pass_loss) return np.mean(every_pass_loss)
if cfg.use_pyreader: if cfg.use_pyreader:
......
...@@ -129,6 +129,9 @@ def parse_args(): ...@@ -129,6 +129,9 @@ def parse_args():
add_arg('draw_threshold', float, 0.8, "Confidence threshold to draw bbox.") add_arg('draw_threshold', float, 0.8, "Confidence threshold to draw bbox.")
add_arg('image_path', str, 'data/COCO17/val2017', "The image path used to inference and visualize.") add_arg('image_path', str, 'data/COCO17/val2017', "The image path used to inference and visualize.")
add_arg('image_name', str, '', "The single image used to inference and visualize.") add_arg('image_name', str, '', "The single image used to inference and visualize.")
# ce
parser.add_argument(
'--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
# yapf: enable # yapf: enable
args = parser.parse_args() args = parser.parse_args()
file_name = sys.argv[0] file_name = sys.argv[0]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册