提交 c6ef3323 编写于 作者: Z zhengya01

add ce for video

上级 e84d9715
#!/bin/bash
# Continuous-evaluation (CE) driver for the video models.
#
# Trains TSN and AttentionCluster twice -- first on the multi-card GPU set,
# then on the single-card GPU set -- and pipes every training log into
# _ce.py, which reads it from stdin.
#
# Optional environment variables:
#   video_4  comma-separated GPU ids for the multi-card run (default 0,1,2,3)
#   video    GPU id(s) for the single-card run (default 0)
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1

#######################################
# Train one model with CE logging enabled and feed its log to _ce.py.
# Arguments: $1 - model name, $2 - config file path, $3 - number of epochs
#######################################
run_ce_model() {
  local model_name=$1
  local config=$2
  local epoch_num=$3
  FLAGS_benchmark=true python train.py \
    --model-name="${model_name}" \
    --config="${config}" \
    --save-dir=checkpoints \
    --epoch-num="${epoch_num}" \
    --valid-interval=0 \
    --enable_ce \
    | python _ce.py
}

#######################################
# Run both CE models on the given GPU set, pausing 10s between them.
# Globals:   CUDA_VISIBLE_DEVICES (written, exported)
# Arguments: $1 - comma-separated GPU ids
#######################################
run_ce_suite() {
  export CUDA_VISIBLE_DEVICES="$1"
  run_ce_model TSN ./configs/tsn.txt 6
  sleep 10
  run_ce_model AttentionCluster ./configs/attention_cluster.txt 3
}

# ':=' also assigns the default to the variable when it is unset or empty.
cudaid=${video_4:=0,1,2,3} # use 0,1,2,3-th card as default
run_ce_suite "${cudaid}"
sleep 10
cudaid=${video:=0} # use 0-th card as default
run_ce_suite "${cudaid}"
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
from kpi import AccKpi
# KPI objects for the continuous-evaluation run. Each KPI is constructed with
# its name, the constant 0.08 and an `actived` flag.
# NOTE(review): 0.08 is presumably the tolerated relative deviation and
# `actived` presumably controls whether the KPI is enforced -- confirm against
# the `kpi` module, which is not visible here.
# Only the per-pass duration KPIs are created with actived=True.

# --- AttentionCluster on youtube8m, 1 card ---
AttentionCluster_youtube8m_each_pass_duration_card1_kpi = DurationKpi('AttentionCluster_youtube8m_each_pass_duration_card1', 0.08, actived=True)
train_AttentionCluster_youtube8m_loss_card1_kpi = CostKpi('train_AttentionCluster_youtube8m_loss_card1', 0.08, actived=False)
# NOTE(review): hit_at_one and gap are declared as CostKpi while perr is an
# AccKpi -- verify this is the intended KPI class for these metrics.
train_AttentionCluster_youtube8m_hit_at_one_card1_kpi = CostKpi('train_AttentionCluster_youtube8m_hit_at_one_card1', 0.08, actived=False)
train_AttentionCluster_youtube8m_gap_card1_kpi = CostKpi('train_AttentionCluster_youtube8m_gap_card1', 0.08, actived=False)
train_AttentionCluster_youtube8m_perr_card1_kpi = AccKpi('train_AttentionCluster_youtube8m_perr_card1', 0.08, actived=False)
# --- AttentionCluster on youtube8m, 4 cards ---
AttentionCluster_youtube8m_each_pass_duration_card4_kpi = DurationKpi('AttentionCluster_youtube8m_each_pass_duration_card4', 0.08, actived=True)
train_AttentionCluster_youtube8m_loss_card4_kpi = CostKpi('train_AttentionCluster_youtube8m_loss_card4', 0.08, actived=False)
train_AttentionCluster_youtube8m_hit_at_one_card4_kpi = CostKpi('train_AttentionCluster_youtube8m_hit_at_one_card4', 0.08, actived=False)
train_AttentionCluster_youtube8m_gap_card4_kpi = CostKpi('train_AttentionCluster_youtube8m_gap_card4', 0.08, actived=False)
train_AttentionCluster_youtube8m_perr_card4_kpi = AccKpi('train_AttentionCluster_youtube8m_perr_card4', 0.08, actived=False)
# --- TSN on kinetics400, 1 card ---
TSN_kinetics400_each_pass_duration_card1_kpi = DurationKpi('TSN_kinetics400_each_pass_duration_card1', 0.08, actived=True)
train_TSN_kinetics400_acc1_card1_kpi = AccKpi('train_TSN_kinetics400_acc1_card1', 0.08, actived=False)
train_TSN_kinetics400_acc5_card1_kpi = AccKpi('train_TSN_kinetics400_acc5_card1', 0.08, actived=False)
train_TSN_kinetics400_loss_card1_kpi = CostKpi('train_TSN_kinetics400_loss_card1', 0.08, actived=False)
# --- TSN on kinetics400, 4 cards ---
TSN_kinetics400_each_pass_duration_card4_kpi = DurationKpi('TSN_kinetics400_each_pass_duration_card4', 0.08, actived=True)
train_TSN_kinetics400_acc1_card4_kpi = AccKpi('train_TSN_kinetics400_acc1_card4', 0.08, actived=False)
train_TSN_kinetics400_acc5_card4_kpi = AccKpi('train_TSN_kinetics400_acc5_card4', 0.08, actived=False)
train_TSN_kinetics400_loss_card4_kpi = CostKpi('train_TSN_kinetics400_loss_card4', 0.08, actived=False)
# Every KPI that log_to_ce() can record; it looks KPIs up by their `name`, so
# a KPI line in the log must use one of the names registered here.
tracking_kpis = [
AttentionCluster_youtube8m_each_pass_duration_card1_kpi,
train_AttentionCluster_youtube8m_loss_card1_kpi,
train_AttentionCluster_youtube8m_hit_at_one_card1_kpi,
train_AttentionCluster_youtube8m_gap_card1_kpi,
train_AttentionCluster_youtube8m_perr_card1_kpi,
AttentionCluster_youtube8m_each_pass_duration_card4_kpi,
train_AttentionCluster_youtube8m_loss_card4_kpi,
train_AttentionCluster_youtube8m_hit_at_one_card4_kpi,
train_AttentionCluster_youtube8m_gap_card4_kpi,
train_AttentionCluster_youtube8m_perr_card4_kpi,
TSN_kinetics400_each_pass_duration_card1_kpi,
train_TSN_kinetics400_acc1_card1_kpi,
train_TSN_kinetics400_acc5_card1_kpi,
train_TSN_kinetics400_loss_card1_kpi,
TSN_kinetics400_each_pass_duration_card4_kpi,
train_TSN_kinetics400_acc1_card4_kpi,
train_TSN_kinetics400_acc5_card4_kpi,
train_TSN_kinetics400_loss_card4_kpi,
]
def parse_log(log):
    '''
    Extract KPI records from a training log.

    A line is a KPI record only when, after stripping surrounding
    whitespace, it consists of exactly three tab-separated fields and the
    first field is the literal "kpis", for example:
    "
    kpis\ttrain_TSN_kinetics400_loss_card1\t1.0
    kpis\tTSN_kinetics400_each_pass_duration_card1\t12.5
    "
    All other lines are ignored as records. Every line's split fields are
    printed to stdout, so the piped log stays visible in the CE output.

    Args:
        log: the whole log as one string; lines are separated by a newline.

    Yields:
        (kpi_name, kpi_value) tuples, with kpi_value converted to float.

    Raises:
        ValueError: if the third field of a "kpis" line is not a number.
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value
def log_to_ce(log):
    """Record every KPI found in `log` on its tracked KPI object.

    Each (name, value) pair yielded by parse_log() is printed, added to the
    KPI in `tracking_kpis` with the same name, and persisted right away.

    A KPI name that is not registered in `tracking_kpis` is reported on
    stderr and skipped rather than raising KeyError, so one unexpected
    record does not prevent the remaining KPIs from being recorded.

    Args:
        log: the whole training log as one string.
    """
    kpi_tracker = {kpi.name: kpi for kpi in tracking_kpis}

    for (kpi_name, kpi_value) in parse_log(log):
        print(kpi_name, kpi_value)
        kpi = kpi_tracker.get(kpi_name)
        if kpi is None:
            sys.stderr.write(
                'log_to_ce: skip untracked kpi %s\n' % kpi_name)
            continue
        kpi.add_record(kpi_value)
        kpi.persist()
if __name__ == '__main__':
    # The training log is piped in on stdin; read it whole and record the
    # KPIs it contains.
    log_to_ce(sys.stdin.read())
......@@ -36,6 +36,10 @@ class Metrics(object):
"""Not implemented"""
pass
def calculate(self, loss, pred, label, info=''):
    """Base-class placeholder: does nothing and returns None.

    The subclass overrides visible in this file (Youtube8mMetrics,
    Kinetics400Metrics, MulticropMetrics) return a dict holding a 'type'
    key plus the computed metric values.
    """
    pass
def accumulate(self, loss, pred, label, info=''):
"""Not implemented"""
pass
......@@ -67,6 +71,20 @@ class Youtube8mMetrics(Metrics):
logger.info(info + ' , loss = {0}, Hit@1 = {1}, PERR = {2}, GAP = {3}'.format(\
'%.6f' % loss, '%.2f' % hit_at_one, '%.2f' % perr, '%.2f' % gap))
def calculate(self, loss, pred, label, info=''):
    """Compute the youtube8m metrics for one batch and return them.

    Args:
        loss: loss value(s); averaged with np.mean.
        pred: predictions handed to the youtube8m_metrics helpers.
        label: labels handed to the youtube8m_metrics helpers.
        info: accepted for interface compatibility; not used here.

    Returns:
        dict with keys 'type' (always 'youtube8m'), 'loss', 'hit_at_one',
        'perr' and 'gap'.
    """
    mean_loss = np.mean(np.array(loss))
    top1_hit = youtube8m_metrics.calculate_hit_at_one(pred, label)
    equal_recall_precision = \
        youtube8m_metrics.calculate_precision_at_equal_recall_rate(pred, label)
    global_ap = youtube8m_metrics.calculate_gap(pred, label)
    return {
        'type': 'youtube8m',
        'loss': mean_loss,
        'hit_at_one': top1_hit,
        'perr': equal_recall_precision,
        'gap': global_ap,
    }
def accumulate(self, loss, pred, label, info=''):
self.calculator.accumulate(loss, pred, label)
......@@ -95,6 +113,19 @@ class Kinetics400Metrics(Metrics):
logger.info(info + '\tLoss: {},\ttop1_acc: {}, \ttop5_acc: {}'.format('%.6f' % loss, \
'%.2f' % acc1, '%.2f' % acc5))
def calculate(self, loss, pred, label, info=''):
    """Compute the kinetics400 metrics for one batch and return them.

    Args:
        loss: loss value(s), or None; averaged with np.mean, 0. when None.
        pred: predictions passed to self.calculator.calculate_metrics.
        label: labels passed to self.calculator.calculate_metrics.
        info: accepted for interface compatibility; not used here.

    Returns:
        dict with keys 'type' (always 'kinetics400'), 'loss', 'acc1' and
        'acc5'.
    """
    mean_loss = 0. if loss is None else np.mean(np.array(loss))
    top1, top5 = self.calculator.calculate_metrics(mean_loss, pred, label)
    return {
        'type': 'kinetics400',
        'loss': mean_loss,
        'acc1': top1,
        'acc5': top5,
    }
def accumulate(self, loss, pred, label, info=''):
self.calculator.accumulate(loss, pred, label)
......@@ -140,6 +171,27 @@ class MulticropMetrics(Metrics):
logger.info(info + '\tLoss: {},\ttop1_acc: {}, \ttop5_acc: {}'.format('%.6f' % loss, \
'%.2f' % acc1, '%.2f' % acc5))
def calculate(self, loss, pred, label, info=''):
    """Compute the multicrop metrics for one batch and return them.

    In 'test' mode nothing is computed and all metric values are 0.

    Args:
        loss: loss value(s), or None; averaged with np.mean, 0. when None.
        pred: predictions passed to self.calculator.calculate_metrics.
        label: labels passed to self.calculator.calculate_metrics.
        info: accepted for interface compatibility; not used here.

    Returns:
        dict with keys 'type' (always 'multicrop'), 'loss', 'acc1' and
        'acc5'.
    """
    if self.mode == 'test':
        # Test mode: hand back the all-zero placeholder result.
        return {'type': 'multicrop', 'loss': 0, 'acc1': 0, 'acc5': 0}

    mean_loss = 0. if loss is None else np.mean(np.array(loss))
    top1, top5 = self.calculator.calculate_metrics(mean_loss, pred, label)
    return {
        'type': 'multicrop',
        'loss': mean_loss,
        'acc1': top1,
        'acc5': top5,
    }
def accumulate(self, loss, pred, label):
self.calculator.accumulate(loss, pred, label)
......
......@@ -59,7 +59,10 @@ def train_without_pyreader(exe, train_prog, train_exe, train_reader, train_feede
train_fetch_list, train_metrics, epochs = 10, \
log_interval = 0, valid_interval = 0, save_dir = './', \
save_model_name = 'model', test_exe = None, test_reader = None, \
test_feeder = None, test_fetch_list = None, test_metrics = None):
test_feeder = None, test_fetch_list = None, test_metrics = None, \
model_name = '' ,enable_ce = True):
total_time = 0
ce_info = []
for epoch in range(epochs):
epoch_periods = []
for train_iter, data in enumerate(train_reader()):
......@@ -71,6 +74,8 @@ def train_without_pyreader(exe, train_prog, train_exe, train_reader, train_feede
loss = np.array(train_outs[0])
pred = np.array(train_outs[1])
label = np.array(train_outs[-1])
total_time += period
ce_info.append([loss, pred, label])
if log_interval > 0 and (train_iter % log_interval == 0):
# eval here
train_metrics.calculate_and_log_out(loss, pred, label, \
......@@ -83,7 +88,8 @@ def train_without_pyreader(exe, train_prog, train_exe, train_reader, train_feede
if test_exe and valid_interval > 0 and (epoch + 1) % valid_interval == 0:
test_without_pyreader(test_exe, test_reader, test_feeder,
test_fetch_list, test_metrics, log_interval)
if enable_ce:
print_ce_info(model_name, ce_info, total_time, epochs, train_metrics)
def train_with_pyreader(exe, train_prog, train_exe, train_pyreader, \
......@@ -91,9 +97,12 @@ def train_with_pyreader(exe, train_prog, train_exe, train_pyreader, \
log_interval = 0, valid_interval = 0, \
save_dir = './', save_model_name = 'model', \
test_exe = None, test_pyreader = None, \
test_fetch_list = None, test_metrics = None):
test_fetch_list = None, test_metrics = None, \
model_name = '', enable_ce = True):
if not train_pyreader:
logger.error("[TRAIN] get pyreader failed.")
total_time = 0
ce_info = []
for epoch in range(epochs):
train_pyreader.start()
train_metrics.reset()
......@@ -108,6 +117,8 @@ def train_with_pyreader(exe, train_prog, train_exe, train_pyreader, \
loss = np.array(train_outs[0])
pred = np.array(train_outs[1])
label = np.array(train_outs[-1])
total_time += period
ce_info.append([loss, pred, label])
if log_interval > 0 and (train_iter % log_interval == 0):
# eval here
train_metrics.calculate_and_log_out(loss, pred, label, \
......@@ -125,6 +136,8 @@ def train_with_pyreader(exe, train_prog, train_exe, train_pyreader, \
finally:
epoch_period = []
train_pyreader.reset()
if enable_ce:
print_ce_info(model_name, ce_info, total_time, epochs, train_metrics)
def save_model(exe, program, save_dir, model_name, postfix=None):
......@@ -132,3 +145,49 @@ def save_model(exe, program, save_dir, model_name, postfix=None):
if os.path.isdir(model_path):
shutil.rmtree(model_path)
fluid.io.save_persistables(exe, model_path, main_program=program)
def _print_ce_kpis(model_name, ce_type, gpu_num, pass_duration, metrics):
    """Print the duration KPI line plus one KPI line per metric in `metrics`."""
    print("kpis\t%s_%s_each_pass_duration_card%s\t%s" %
          (model_name, ce_type, gpu_num, pass_duration))
    for k in metrics:
        if k == 'type':
            # 'type' only names the metric family; it is not a KPI value.
            continue
        print('kpis\ttrain_%s_%s_%s_card%s\t%s' %
              (model_name, ce_type, k, gpu_num, metrics[k]))


def print_ce_info(model_name, ce_info, total_time, epochs, train_metrics):
    """Print the continuous-evaluation KPI lines for a finished training run.

    The metrics are computed from the second-to-last recorded batch of
    `ce_info`. When no usable result is available (fewer than two batches
    recorded, or `train_metrics.calculate` returns nothing with a 'type'
    key), zero-valued placeholder KPIs are printed for the kinetics400,
    multicrop and youtube8m metric families instead.

    Args:
        model_name: model name embedded in every KPI name.
        ce_info: list of [loss, pred, label] entries, one per training batch.
        total_time: accumulated training time over all batches.
        epochs: number of epochs; used to average `total_time` per pass.
        train_metrics: metrics object providing calculate(loss, pred, label,
            info=...).
    """
    gpu_num = get_cards()
    # Avoid ZeroDivisionError when no epoch was run.
    pass_duration = total_time / epochs if epochs else 0

    ce_res = None
    try:
        ce_loss, ce_pred, ce_label = ce_info[-2][:3]
    except (IndexError, KeyError, TypeError, ValueError):
        # Not enough recorded batches: fall through to the placeholder KPIs
        # instead of using unbound variables.
        logger.error('ce infor error')
    else:
        ce_res = train_metrics.calculate(ce_loss, ce_pred, ce_label, info='ce')

    if ce_res and 'type' in ce_res:
        _print_ce_kpis(model_name, ce_res['type'], gpu_num, pass_duration,
                       ce_res)
    else:
        _print_ce_kpis(model_name, 'kinetics400', gpu_num, pass_duration,
                       {'loss': 0, 'acc1': 0, 'acc5': 0})
        _print_ce_kpis(model_name, 'multicrop', gpu_num, pass_duration,
                       {'loss': 0, 'acc1': 0, 'acc5': 0})
        _print_ce_kpis(model_name, 'youtube8m', gpu_num, pass_duration,
                       {'loss': 0, 'hit_at_one': 0, 'perr': 0, 'gap': 0})
def get_cards():
    """Return the number of GPU ids listed in CUDA_VISIBLE_DEVICES.

    Returns:
        The count of non-empty comma-separated entries, or 0 when the
        variable is unset or empty (previously an unset variable raised
        AttributeError and an empty one counted as one card).
    """
    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    return len([card for card in cards.split(",") if card.strip()])
......@@ -99,6 +99,10 @@ def parse_args():
type=int,
default=10,
help='mini-batch interval to log.')
parser.add_argument(
'--enable_ce',
action='store_true',
help='If set, run the task with continuous evaluation logs.')
args = parser.parse_args()
return args
......@@ -114,6 +118,10 @@ def train(args):
# build model
startup = fluid.Program()
train_prog = fluid.Program()
if args.enable_ce:
SEED = 102
startup.random_seed = SEED
train_prog.random_seed = SEED
with fluid.program_guard(train_prog, startup):
with fluid.unique_name.guard():
train_model.build_input(not args.no_use_pyreader)
......@@ -204,7 +212,8 @@ def train(args):
log_interval = args.log_interval, valid_interval = args.valid_interval,
save_dir = args.save_dir, save_model_name = args.model_name,
test_exe = valid_exe, test_reader = valid_reader, test_feeder = valid_feeder,
test_fetch_list = valid_fetch_list, test_metrics = valid_metrics)
test_fetch_list = valid_fetch_list, test_metrics = valid_metrics,
model_name = args.model_name, enable_ce=args.enable_ce)
else:
train_pyreader.decorate_paddle_reader(train_reader)
valid_pyreader.decorate_paddle_reader(valid_reader)
......@@ -213,7 +222,8 @@ def train(args):
valid_interval = args.valid_interval,
save_dir = args.save_dir, save_model_name = args.model_name,
test_exe = valid_exe, test_pyreader = valid_pyreader,
test_fetch_list = valid_fetch_list, test_metrics = valid_metrics)
test_fetch_list = valid_fetch_list, test_metrics = valid_metrics,
model_name = args.model_name, enable_ce=args.enable_ce)
if __name__ == "__main__":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册