提交 6d72139a 编写于 作者: G guosheng

Merge branch 'develop' of https://github.com/PaddlePaddle/models into fix-ce-transformer-data

#!/bin/bash
# This file is only used for continuous evaluation.
# Remove any *_factor.txt files left in the working directory by an earlier run.
rm -rf *_factor.txt
# Capture the training run's stdout in ./log, then feed that log to the
# KPI parser on stdin.
python train.py --use_gpu=True > log
python _ce.py < log
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
# NOTE: kpi.py should be shared across models in some way.
# KPI objects tracked for this model. The first argument is the KPI name that
# log_to_ce() uses as its lookup key.
# NOTE(review): the numeric argument is presumably the allowed deviation
# between runs -- confirm against kpi.py, which is not part of this file.
train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.06, actived=True)
# log_to_ce() indexes its tracker by the KPI name parsed from the log, so a
# "kpis" log line whose name is not listed here raises KeyError.
tracking_kpis = [
train_cost_kpi,
train_duration_kpi,
]
def parse_log(log):
    '''
    Yield (kpi_name, kpi_value) pairs found in a captured training log.

    This method should be implemented by model developers.

    A line counts as a KPI record only when, after stripping surrounding
    whitespace, it has exactly three TAB-separated fields and the first
    field is the literal "kpis", for example (<TAB> is a tab character):
    "
    kpis<TAB>train_cost<TAB>1.0
    kpis<TAB>train_duration<TAB>30.5
    "
    Every other line is ignored.

    Args:
        log: the whole log as one string; lines are separated by newlines.

    Yields:
        (name, value) tuples, with value converted by float().

    Raises:
        ValueError: if the third field of a KPI record is not a valid float.
    '''
    for raw_line in log.split('\n'):
        fields = raw_line.strip().split('\t')
        # Debug echo of every parsed line, kept from the original behaviour.
        print(fields)
        if len(fields) != 3 or fields[0] != 'kpis':
            continue
        _, kpi_name, raw_value = fields
        yield kpi_name, float(raw_value)
def log_to_ce(log):
    '''
    Feed every KPI record found in `log` to the matching tracked KPI.

    Each (name, value) pair produced by parse_log() is echoed, added to the
    KPI object registered under that name in tracking_kpis, and that KPI is
    persisted straight away. A name that is not registered raises KeyError.
    '''
    kpi_tracker = {kpi.name: kpi for kpi in tracking_kpis}
    for kpi_name, kpi_value in parse_log(log):
        print(kpi_name, kpi_value)
        tracked = kpi_tracker[kpi_name]
        tracked.add_record(kpi_value)
        tracked.persist()
if __name__ == '__main__':
    # The whole training log is read from stdin and handed to the KPI tracker.
    log_to_ce(sys.stdin.read())
...@@ -20,12 +20,12 @@ add_arg('use_gpu', bool, True, "Whether use GPU to test.") ...@@ -20,12 +20,12 @@ add_arg('use_gpu', bool, True, "Whether use GPU to test.")
def cal_mean_iou(wrong, correct):
    """Average correct / (wrong + correct) over the entries with a non-zero total.

    Args:
        wrong: array of counts; expected to be a NumPy array (element-wise
            arithmetic is applied to it).
        correct: array of counts with the same shape as `wrong`; must support
            `.astype`.

    Returns:
        The sum of the per-entry ratios divided by the number of entries
        whose total (wrong + correct) is non-zero. If every total is zero
        the final division is 0 / 0.
    """
    # `total` is a fresh array, so the caller's inputs are never modified.
    total = wrong + correct
    true_num = (total != 0).sum()
    # Entries with no samples would divide by zero; force their denominator
    # to 1 so they contribute a ratio of 0 to the sum.
    total[total == 0] = 1
    return (correct.astype("float64") / total).sum() / true_num
def create_iou(predict, label, mask, num_classes, image_shape): def create_iou(predict, label, mask, num_classes, image_shape):
...@@ -84,6 +84,7 @@ def eval(args): ...@@ -84,6 +84,7 @@ def eval(args):
sys.stdout.flush() sys.stdout.flush()
iou = cal_mean_iou(out_wrong, out_right) iou = cal_mean_iou(out_wrong, out_right)
print "\nmean iou: %.3f" % iou print "\nmean iou: %.3f" % iou
print "kpis test_acc %f" % iou
def main(): def main():
......
...@@ -184,7 +184,7 @@ def res_block(input, filter_num, padding=0, dilation=None, name=None): ...@@ -184,7 +184,7 @@ def res_block(input, filter_num, padding=0, dilation=None, name=None):
tmp = conv(tmp, 1, 1, filter_num, 1, 1, name=name + "_1_1_increase") tmp = conv(tmp, 1, 1, filter_num, 1, 1, name=name + "_1_1_increase")
tmp = bn(tmp, relu=False) tmp = bn(tmp, relu=False)
tmp = input + tmp tmp = input + tmp
tmp = fluid.layers.relu(tmp, name=name + "_relu") tmp = fluid.layers.relu(tmp)
return tmp return tmp
...@@ -227,7 +227,7 @@ def proj_block(input, filter_num, padding=0, dilation=None, stride=1, ...@@ -227,7 +227,7 @@ def proj_block(input, filter_num, padding=0, dilation=None, stride=1,
tmp = conv(tmp, 1, 1, filter_num, 1, 1, name=name + "_1_1_increase") tmp = conv(tmp, 1, 1, filter_num, 1, 1, name=name + "_1_1_increase")
tmp = bn(tmp, relu=False) tmp = bn(tmp, relu=False)
tmp = proj_bn + tmp tmp = proj_bn + tmp
tmp = fluid.layers.relu(tmp, name=name + "_relu") tmp = fluid.layers.relu(tmp)
return tmp return tmp
......
...@@ -11,6 +11,10 @@ from utils import add_arguments, print_arguments, get_feeder_data ...@@ -11,6 +11,10 @@ from utils import add_arguments, print_arguments, get_feeder_data
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu from paddle.fluid.initializer import init_on_cpu
SEED = 90
# random seed must set before configuring the network.
fluid.default_startup_program().random_seed = SEED
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser) add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable # yapf: disable
...@@ -27,9 +31,9 @@ LAMBDA2 = 0.4 ...@@ -27,9 +31,9 @@ LAMBDA2 = 0.4
LAMBDA3 = 1.0 LAMBDA3 = 1.0
LEARNING_RATE = 0.003 LEARNING_RATE = 0.003
POWER = 0.9 POWER = 0.9
LOG_PERIOD = 1 LOG_PERIOD = 100
CHECKPOINT_PERIOD = 1000 CHECKPOINT_PERIOD = 100
TOTAL_STEP = 60000 TOTAL_STEP = 100
no_grad_set = [] no_grad_set = []
...@@ -97,10 +101,13 @@ def train(args): ...@@ -97,10 +101,13 @@ def train(args):
sub124_loss = 0. sub124_loss = 0.
train_reader = cityscape.train( train_reader = cityscape.train(
args.batch_size, flip=args.random_mirror, scaling=args.random_scaling) args.batch_size, flip=args.random_mirror, scaling=args.random_scaling)
start_time = time.time()
while True: while True:
# train a pass # train a pass
for data in train_reader(): for data in train_reader():
if iter_id > TOTAL_STEP: if iter_id > TOTAL_STEP:
end_time = time.time()
print "kpis train_duration %f" % (end_time - start_time)
return return
iter_id += 1 iter_id += 1
results = exe.run( results = exe.run(
...@@ -115,13 +122,15 @@ def train(args): ...@@ -115,13 +122,15 @@ def train(args):
print "Iter[%d]; train loss: %.3f; sub4_loss: %.3f; sub24_loss: %.3f; sub124_loss: %.3f" % ( print "Iter[%d]; train loss: %.3f; sub4_loss: %.3f; sub24_loss: %.3f; sub124_loss: %.3f" % (
iter_id, t_loss / LOG_PERIOD, sub4_loss / LOG_PERIOD, iter_id, t_loss / LOG_PERIOD, sub4_loss / LOG_PERIOD,
sub24_loss / LOG_PERIOD, sub124_loss / LOG_PERIOD) sub24_loss / LOG_PERIOD, sub124_loss / LOG_PERIOD)
print "kpis train_cost %f" % (t_loss / LOG_PERIOD)
t_loss = 0. t_loss = 0.
sub4_loss = 0. sub4_loss = 0.
sub24_loss = 0. sub24_loss = 0.
sub124_loss = 0. sub124_loss = 0.
sys.stdout.flush() sys.stdout.flush()
if iter_id % CHECKPOINT_PERIOD == 0: if iter_id % CHECKPOINT_PERIOD == 0 and args.checkpoint_path is not None:
dir_name = args.checkpoint_path + "/" + str(iter_id) dir_name = args.checkpoint_path + "/" + str(iter_id)
fluid.io.save_persistables(exe, dirname=dir_name) fluid.io.save_persistables(exe, dirname=dir_name)
print "Saved checkpoint: %s" % (dir_name) print "Saved checkpoint: %s" % (dir_name)
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
# This file is only used for continuous evaluation. # This file is only used for continuous evaluation.
cudaid=${object_detection_cudaid:=0} cudaid=${object_detection_cudaid:=0}
export CUDA_VISIBLE_DEVICES=$cudaid export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=64 --num_passes=10 --total_images=6149 --enable_ce=True | python _ce.py python train.py --batch_size=64 --num_epochs=10 --total_images=6149 --enable_ce=True | python _ce.py
cudaid=${object_detection_cudaid:=0, 1, 2, 3} cudaid=${object_detection_cudaid:=0, 1, 2, 3}
export CUDA_VISIBLE_DEVICES=$cudaid export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=64 --num_passes=10 --total_images=6149 --enable_ce=True | python _ce.py python train.py --batch_size=64 --num_epochs=10 --total_images=6149 --enable_ce=True | python _ce.py
...@@ -10,14 +10,14 @@ from kpi import CostKpi, DurationKpi, AccKpi ...@@ -10,14 +10,14 @@ from kpi import CostKpi, DurationKpi, AccKpi
train_acc_top1_kpi = AccKpi('train_acc_top1', 0.05, 0, desc='TOP1 ACC') train_acc_top1_kpi = AccKpi('train_acc_top1', 0.05, 0, desc='TOP1 ACC')
train_acc_top5_kpi = AccKpi( train_acc_top5_kpi = AccKpi(
'train_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC') 'train_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC')
train_cost_kpi = CostKpi('train_cost', 0.3, 0, actived=True, desc='train cost') train_cost_kpi = CostKpi('train_cost', 0.5, 0, actived=True, desc='train cost')
test_acc_top1_kpi = AccKpi('test_acc_top1', 0.05, 0, desc='TOP1 ACC') test_acc_top1_kpi = AccKpi('test_acc_top1', 0.05, 0, desc='TOP1 ACC')
test_acc_top5_kpi = AccKpi( test_acc_top5_kpi = AccKpi(
'test_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC') 'test_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC')
test_cost_kpi = CostKpi('test_cost', 1.0, 0, actived=True, desc='train cost') test_cost_kpi = CostKpi('test_cost', 1.0, 0, actived=True, desc='train cost')
train_speed_kpi = AccKpi( train_speed_kpi = AccKpi(
'train_speed', 'train_speed',
0.05, 0.5,
0, 0,
actived=True, actived=True,
unit_repr='seconds/image', unit_repr='seconds/image',
...@@ -36,7 +36,7 @@ test_cost_card4_kpi = CostKpi( ...@@ -36,7 +36,7 @@ test_cost_card4_kpi = CostKpi(
'test_cost_card4', 1.0, 0, actived=True, desc='train cost') 'test_cost_card4', 1.0, 0, actived=True, desc='train cost')
train_speed_card4_kpi = AccKpi( train_speed_card4_kpi = AccKpi(
'train_speed_card4', 'train_speed_card4',
0.05, 0.5,
0, 0,
actived=True, actived=True,
unit_repr='seconds/image', unit_repr='seconds/image',
...@@ -89,6 +89,6 @@ def log_to_ce(log): ...@@ -89,6 +89,6 @@ def log_to_ce(log):
if __name__ == '__main__':
    # Echo the raw log between marker lines before parsing it, so the output
    # shows exactly what was handed to log_to_ce().
    log = sys.stdin.read()
    print("*****")
    # Call form of print, matching the two marker prints around it.
    print(log)
    print("****")
    log_to_ce(log)
...@@ -193,13 +193,16 @@ def train(args, ...@@ -193,13 +193,16 @@ def train(args,
total_time += end_time - start_time total_time += end_time - start_time
train_avg_loss = np.mean(every_pass_loss) train_avg_loss = np.mean(every_pass_loss)
if devices_num == 1: if devices_num == 1:
print ("kpis train_cost %s" % train_avg_loss) print ("kpis train_cost %s" % train_avg_loss)
print ("kpis test_acc %s" % mean_map) print ("kpis test_acc %s" % mean_map)
print ("kpis train_speed %s" % (total_time / epoch_idx)) print ("kpis train_speed %s" % (total_time / epoch_idx))
else: else:
print ("kpis train_cost_card%s %s" % (devices_num, train_avg_loss)) print ("kpis train_cost_card%s %s" %
print ("kpis test_acc_card%s %s" % (devices_num, mean_map)) (devices_num, train_avg_loss))
print ("kpis train_speed_card%s %f" % (devices_num, total_time / epoch_idx)) print ("kpis test_acc_card%s %s" %
(devices_num, mean_map))
print ("kpis train_speed_card%s %f" %
(devices_num, total_time / epoch_idx))
if pass_id % 10 == 0 or pass_id == num_passes - 1: if pass_id % 10 == 0 or pass_id == num_passes - 1:
......
# Plain training run; output is not parsed.
python ctc_train.py --batch_size=128 --total_step=10000 --eval_period=10000 --log_period=10000 --use_gpu=True
# CE run: same arguments, with stdout piped into the KPI parser.
# "--eval_period" previously had a single leading dash here, which did not
# match the option spelling used by the command above.
python ctc_train.py --batch_size=128 --total_step=10000 --eval_period=10000 --log_period=10000 --use_gpu=True | python _ce.py
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
# NOTE: kpi.py should be shared across models in some way.
# KPI objects tracked for this model. The first argument is the KPI name that
# log_to_ce() uses as its lookup key.
# NOTE(review): the two numeric arguments are presumably a deviation
# threshold and a count of leading records to skip -- confirm against kpi.py,
# which is not part of this file.
train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
test_acc_kpi = AccKpi('test_acc', 0.005, 0, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True)
train_acc_kpi = AccKpi('train_acc', 0.005, 0, actived=True)
# log_to_ce() indexes its tracker by the KPI name parsed from the log, so a
# "kpis" log line whose name is not listed here raises KeyError.
tracking_kpis = [
train_acc_kpi,
train_cost_kpi,
test_acc_kpi,
train_duration_kpi,
]
def parse_log(log):
    '''
    Extract KPI records from a captured training log.

    This method should be implemented by model developers.

    Only lines of the form (<TAB> is a tab character)
    "
    kpis<TAB>train_cost<TAB>1.0
    kpis<TAB>test_acc<TAB>0.9
    "
    are recognised: exactly three TAB-separated fields after stripping
    surrounding whitespace, the first being the literal "kpis". All other
    lines are skipped.

    Args:
        log: the complete log text; lines are separated by newlines.

    Yields:
        (kpi_name, kpi_value) tuples, where kpi_value is float(third field).

    Raises:
        ValueError: if the third field of a recognised line is not a float.
    '''
    for text in log.split('\n'):
        parts = text.strip().split('\t')
        # Debug echo of every parsed line, kept from the original behaviour.
        print(parts)
        is_kpi_record = len(parts) == 3 and parts[0] == 'kpis'
        if is_kpi_record:
            yield parts[1], float(parts[2])
def log_to_ce(log):
    '''
    Record the KPI values found in `log` on the tracked KPI objects.

    Builds a name -> KPI lookup from tracking_kpis, then for each
    (name, value) pair yielded by parse_log() echoes the pair, adds the value
    to the KPI registered under that name and persists that KPI. A name that
    is not registered raises KeyError.
    '''
    by_name = dict((kpi.name, kpi) for kpi in tracking_kpis)
    for record in parse_log(log):
        kpi_name, kpi_value = record
        print(kpi_name, kpi_value)
        target = by_name[kpi_name]
        target.add_record(kpi_value)
        target.persist()
if __name__ == '__main__':
    # The whole training log is read from stdin and handed to the KPI tracker.
    log_to_ce(sys.stdin.read())
...@@ -103,6 +103,10 @@ def train(args, data_reader=ctc_reader): ...@@ -103,6 +103,10 @@ def train(args, data_reader=ctc_reader):
print "\nTime: %s; Iter[%d]; Test seq error: %s.\n" % ( print "\nTime: %s; Iter[%d]; Test seq error: %s.\n" % (
time.time(), iter_num, str(test_seq_error[0])) time.time(), iter_num, str(test_seq_error[0]))
#Note: The following logs are special for CE monitoring.
#Other situations do not need to care about these logs.
print "kpis test_acc %f" % (1 - test_seq_error[0])
def save_model(args, exe, iter_num): def save_model(args, exe, iter_num):
filename = "model_%05d" % iter_num filename = "model_%05d" % iter_num
fluid.io.save_params( fluid.io.save_params(
...@@ -111,6 +115,7 @@ def train(args, data_reader=ctc_reader): ...@@ -111,6 +115,7 @@ def train(args, data_reader=ctc_reader):
iter_num = 0 iter_num = 0
stop = False stop = False
start_time = time.time()
while not stop: while not stop:
total_loss = 0.0 total_loss = 0.0
total_seq_error = 0.0 total_seq_error = 0.0
...@@ -139,11 +144,15 @@ def train(args, data_reader=ctc_reader): ...@@ -139,11 +144,15 @@ def train(args, data_reader=ctc_reader):
time.time(), iter_num, time.time(), iter_num,
total_loss / (args.log_period * args.batch_size), total_loss / (args.log_period * args.batch_size),
total_seq_error / (args.log_period * args.batch_size)) total_seq_error / (args.log_period * args.batch_size))
print "kpis train_cost %f" % (total_loss / (args.log_period *
args.batch_size))
print "kpis train_acc %f" % (
1 - total_seq_error / (args.log_period * args.batch_size))
sys.stdout.flush() sys.stdout.flush()
total_loss = 0.0 total_loss = 0.0
total_seq_error = 0.0 total_seq_error = 0.0
# evaluate # evaluate
if not args.skip_test and iter_num % args.eval_period == 0: if not args.skip_test and iter_num % args.eval_period == 0:
if model_average: if model_average:
with model_average.apply(exe): with model_average.apply(exe):
...@@ -158,6 +167,8 @@ def train(args, data_reader=ctc_reader): ...@@ -158,6 +167,8 @@ def train(args, data_reader=ctc_reader):
save_model(args, exe, iter_num) save_model(args, exe, iter_num)
else: else:
save_model(args, exe, iter_num) save_model(args, exe, iter_num)
end_time = time.time()
print "kpis train_duration %f" % (end_time - start_time)
# Postprocess benchmark data # Postprocess benchmark data
latencies = batch_times[args.skip_batch_num:] latencies = batch_times[args.skip_batch_num:]
latency_avg = np.average(latencies) latency_avg = np.average(latencies)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册