diff --git a/fluid/icnet/.run_ce.sh b/fluid/icnet/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a46081c7978395697b843c5fef95e6091b47e4e5
--- /dev/null
+++ b/fluid/icnet/.run_ce.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# This file is only used for continuous evaluation.
+
+rm -rf *_factor.txt
+python train.py --use_gpu=True 1> log
+cat log | python _ce.py
diff --git a/fluid/icnet/_ce.py b/fluid/icnet/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..3844eefde620f9587d747594ad0d5351999859c8
--- /dev/null
+++ b/fluid/icnet/_ce.py
@@ -0,0 +1,57 @@
+# This file is only used for continuous evaluation tests!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+# NOTE: kpi.py should be shared across models in some way!
+
+train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.06, actived=True)
+
+tracking_kpis = [
+    train_cost_kpi,
+    train_duration_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    Suggested format: each line in the log is a tab-separated
+    key/value pair, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/icnet/eval.py b/fluid/icnet/eval.py
index d3253c3cb63b8bb58d8a1bdad3318de1c1441142..bdebe7ad72d799e709bd529711d600a9d692a838 100644
--- a/fluid/icnet/eval.py
+++ b/fluid/icnet/eval.py
@@ -20,12 +20,12 @@ add_arg('use_gpu', bool, True, "Whether use GPU to test.")
 
 
 def cal_mean_iou(wrong, correct):
-    sum = wrong + cerroct
+    sum = wrong + correct
     true_num = (sum != 0).sum()
-    for i in len(sum):
+    for i in range(len(sum)):
         if sum[i] == 0:
             sum[i] = 1
-    return (cerroct.astype("float64") / sum).sum() / true_num
+    return (correct.astype("float64") / sum).sum() / true_num
 
 
 def create_iou(predict, label, mask, num_classes, image_shape):
@@ -84,6 +84,7 @@ def eval(args):
         sys.stdout.flush()
     iou = cal_mean_iou(out_wrong, out_right)
     print "\nmean iou: %.3f" % iou
+    print "kpis\ttest_acc\t%f" % iou
 
 
 def main():
diff --git a/fluid/icnet/icnet.py b/fluid/icnet/icnet.py
index 14eaa5fa25c8570cc8747842333c7ca72f104fd1..afe3fa9d352bd8fbf6b2fad46f24ad4c9841a1ff 100644
--- a/fluid/icnet/icnet.py
+++ b/fluid/icnet/icnet.py
@@ -184,7 +184,7 @@ def res_block(input, filter_num, padding=0, dilation=None, name=None):
     tmp = conv(tmp, 1, 1, filter_num, 1, 1, name=name + "_1_1_increase")
     tmp = bn(tmp, relu=False)
     tmp = input + tmp
-    tmp = fluid.layers.relu(tmp, name=name + "_relu")
+    tmp = fluid.layers.relu(tmp)
     return tmp
 
 
@@ -227,7 +227,7 @@ def proj_block(input, filter_num, padding=0, dilation=None, stride=1,
     tmp = conv(tmp, 1, 1, filter_num, 1, 1, name=name + "_1_1_increase")
     tmp = bn(tmp, relu=False)
     tmp = proj_bn + tmp
-    tmp = fluid.layers.relu(tmp, name=name + "_relu")
+    tmp = fluid.layers.relu(tmp)
     return tmp
 
 
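Reviewer note: the `cal_mean_iou` fix above is easiest to sanity-check outside the training pipeline. Below is a minimal numpy sketch of the same computation; the function name and the per-class counts are illustrative, not from the repo.

```python
import numpy as np

def mean_iou(wrong, correct):
    # Same computation as the fixed cal_mean_iou, without shadowing
    # the built-in `sum`.
    total = wrong + correct                  # per-class denominator
    present = (total != 0).sum()             # classes that actually appear
    total = np.where(total == 0, 1, total)   # avoid division by zero
    return (correct.astype("float64") / total).sum() / present

wrong = np.array([2, 0, 3])
correct = np.array([8, 0, 7])
print(mean_iou(wrong, correct))  # (0.8 + 0.7) / 2 = 0.75
```

With `wrong + correct` as the per-class denominator, a class that never appears neither contributes to the numerator sum nor inflates the divisor `present`.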
diff --git a/fluid/icnet/train.py b/fluid/icnet/train.py
index 298a2113a15614641d573551e67006f9abbe751a..b38f08258b9b3e1bd28d808b2779416259f9d827 100644
--- a/fluid/icnet/train.py
+++ b/fluid/icnet/train.py
@@ -11,6 +11,10 @@ from utils import add_arguments, print_arguments, get_feeder_data
 from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
 from paddle.fluid.initializer import init_on_cpu
 
+SEED = 90
+# The random seed must be set before configuring the network.
+fluid.default_startup_program().random_seed = SEED
+
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
@@ -27,9 +31,9 @@ LAMBDA2 = 0.4
 LAMBDA3 = 1.0
 LEARNING_RATE = 0.003
 POWER = 0.9
-LOG_PERIOD = 1
-CHECKPOINT_PERIOD = 1000
-TOTAL_STEP = 60000
+LOG_PERIOD = 100
+CHECKPOINT_PERIOD = 100
+TOTAL_STEP = 100
 
 no_grad_set = []
 
@@ -97,10 +101,13 @@ def train(args):
     sub124_loss = 0.
     train_reader = cityscape.train(
         args.batch_size, flip=args.random_mirror, scaling=args.random_scaling)
+    start_time = time.time()
     while True:
         # train a pass
         for data in train_reader():
             if iter_id > TOTAL_STEP:
+                end_time = time.time()
+                print "kpis\ttrain_duration\t%f" % (end_time - start_time)
                 return
             iter_id += 1
             results = exe.run(
@@ -115,13 +122,15 @@ def train(args):
             print "Iter[%d]; train loss: %.3f; sub4_loss: %.3f; sub24_loss: %.3f; sub124_loss: %.3f" % (
                 iter_id, t_loss / LOG_PERIOD, sub4_loss / LOG_PERIOD,
                 sub24_loss / LOG_PERIOD, sub124_loss / LOG_PERIOD)
+            print "kpis\ttrain_cost\t%f" % (t_loss / LOG_PERIOD)
+
             t_loss = 0.
             sub4_loss = 0.
             sub24_loss = 0.
             sub124_loss = 0.
             sys.stdout.flush()
-        if iter_id % CHECKPOINT_PERIOD == 0:
+        if iter_id % CHECKPOINT_PERIOD == 0 and args.checkpoint_path is not None:
             dir_name = args.checkpoint_path + "/" + str(iter_id)
             fluid.io.save_persistables(exe, dirname=dir_name)
             print "Saved checkpoint: %s" % (dir_name)
 
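Reviewer note: the `kpis\t<name>\t<value>` lines added here are the entire contract between the training scripts and `_ce.py`: `parse_log` splits each line on `'\t'` and keeps only three-field lines whose first field is `kpis`, so tab separators are mandatory. A minimal sketch of an emitter; the `emit_kpi` helper is hypothetical, not in the diff.

```python
import time

def emit_kpi(name, value):
    # Tabs are mandatory: _ce.py drops lines that do not split into
    # exactly ['kpis', name, value] on '\t'.
    print("kpis\t%s\t%f" % (name, value))

start = time.time()
# ... a training loop would run here ...
emit_kpi("train_cost", 0.42)                     # hypothetical loss value
emit_kpi("train_duration", time.time() - start)
```

Piping such output through `python _ce.py` is exactly what the `.run_ce.sh` scripts in this diff do.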
diff --git a/fluid/image_classification/.run_ce.sh b/fluid/image_classification/.run_ce.sh
old mode 100644
new mode 100755
index e54fcc8b61d4e01befe7fa845a3bc7a5e315dc94..f06fdf769024aef68ab5278fafb956f859228038
--- a/fluid/image_classification/.run_ce.sh
+++ b/fluid/image_classification/.run_ce.sh
@@ -3,8 +3,8 @@
 # This file is only used for continuous evaluation.
 cudaid=${object_detection_cudaid:=0}
 export CUDA_VISIBLE_DEVICES=$cudaid
-python train.py --batch_size=64 --num_passes=10 --total_images=6149 --enable_ce=True | python _ce.py
+python train.py --batch_size=64 --num_epochs=10 --total_images=6149 --enable_ce=True | python _ce.py
 
 cudaid=${object_detection_cudaid:=0, 1, 2, 3}
 export CUDA_VISIBLE_DEVICES=$cudaid
-python train.py --batch_size=64 --num_passes=10 --total_images=6149 --enable_ce=True | python _ce.py
+python train.py --batch_size=64 --num_epochs=10 --total_images=6149 --enable_ce=True | python _ce.py
diff --git a/fluid/image_classification/_ce.py b/fluid/image_classification/_ce.py
index 9905ecd8043deb5bc040b6b2de481f65bae7c914..0030bed1759390f2dad0843d10488f91b04f42b7 100644
--- a/fluid/image_classification/_ce.py
+++ b/fluid/image_classification/_ce.py
@@ -10,14 +10,14 @@ from kpi import CostKpi, DurationKpi, AccKpi
 train_acc_top1_kpi = AccKpi('train_acc_top1', 0.05, 0, desc='TOP1 ACC')
 train_acc_top5_kpi = AccKpi(
     'train_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC')
-train_cost_kpi = CostKpi('train_cost', 0.3, 0, actived=True, desc='train cost')
+train_cost_kpi = CostKpi('train_cost', 0.5, 0, actived=True, desc='train cost')
 test_acc_top1_kpi = AccKpi('test_acc_top1', 0.05, 0, desc='TOP1 ACC')
 test_acc_top5_kpi = AccKpi(
     'test_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC')
 test_cost_kpi = CostKpi('test_cost', 1.0, 0, actived=True, desc='train cost')
 train_speed_kpi = AccKpi(
     'train_speed',
-    0.05,
+    0.5,
     0,
     actived=True,
     unit_repr='seconds/image',
@@ -36,7 +36,7 @@ test_cost_card4_kpi = CostKpi(
     'test_cost_card4', 1.0, 0, actived=True, desc='train cost')
 train_speed_card4_kpi = AccKpi(
     'train_speed_card4',
-    0.05,
+    0.5,
     0,
     actived=True,
     unit_repr='seconds/image',
@@ -89,6 +89,6 @@ def log_to_ce(log):
 if __name__ == '__main__':
     log = sys.stdin.read()
     print("*****")
-    print log
+    print(log)
     print("****")
     log_to_ce(log)
diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py
index aadcc904f55f077c06630a1f8e27a6bf4b422c05..46af235ff7f6c3067e1cc2d35de76ebaf59be885 100644
--- a/fluid/object_detection/train.py
+++ b/fluid/object_detection/train.py
@@ -193,13 +193,16 @@ def train(args,
         total_time += end_time - start_time
         train_avg_loss = np.mean(every_pass_loss)
         if devices_num == 1:
-            print ("kpis train_cost %s" % train_avg_loss)
-            print ("kpis test_acc %s" % mean_map)
-            print ("kpis train_speed %s" % (total_time / epoch_idx))
+            print ("kpis\ttrain_cost\t%s" % train_avg_loss)
+            print ("kpis\ttest_acc\t%s" % mean_map)
+            print ("kpis\ttrain_speed\t%s" % (total_time / epoch_idx))
         else:
-            print ("kpis train_cost_card%s %s" % (devices_num, train_avg_loss))
-            print ("kpis test_acc_card%s %s" % (devices_num, mean_map))
-            print ("kpis train_speed_card%s %f" % (devices_num, total_time / epoch_idx))
+            print ("kpis\ttrain_cost_card%s\t%s" %
+                   (devices_num, train_avg_loss))
+            print ("kpis\ttest_acc_card%s\t%s" %
+                   (devices_num, mean_map))
+            print ("kpis\ttrain_speed_card%s\t%f" %
+                   (devices_num, total_time / epoch_idx))
 
 
         if pass_id % 10 == 0 or pass_id == num_passes - 1:
diff --git a/fluid/ocr_recognition/.run.sh b/fluid/ocr_recognition/.run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6fd313d34afc8441fa8256e909991263317f1dc6
--- /dev/null
+++ b/fluid/ocr_recognition/.run.sh
@@ -0,0 +1 @@
+python ctc_train.py --batch_size=128 --total_step=10000 --eval_period=10000 --log_period=10000 --use_gpu=True
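Reviewer note: the loosened thresholds above (0.3 → 0.5 for `train_cost`, 0.05 → 0.5 for `train_speed`) presumably widen how far a new record may drift from the stored baseline before CE fails. `kpi.py` is not part of this diff, so the sketch below only assumes a relative-deviation check; the real class may normalize or average differently.

```python
# ASSUMPTION: kpi.py (not included in this diff) compares new records
# against a stored baseline with a relative tolerance.
def within_tolerance(baseline, record, threshold):
    # threshold=0.5 would tolerate up to 50% relative deviation
    return abs(record - baseline) <= threshold * abs(baseline)

print(within_tolerance(1.0, 1.4, 0.5))  # True: 40% deviation passes
print(within_tolerance(1.0, 1.6, 0.5))  # False: 60% deviation fails
```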
diff --git a/fluid/ocr_recognition/.run_ce.sh b/fluid/ocr_recognition/.run_ce.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f7773d320549cf7d3ffda5810c0699ff88e0e1f3
--- /dev/null
+++ b/fluid/ocr_recognition/.run_ce.sh
@@ -0,0 +1 @@
+python ctc_train.py --batch_size=128 --total_step=10000 --eval_period=10000 --log_period=10000 --use_gpu=True | python _ce.py
diff --git a/fluid/ocr_recognition/_ce.py b/fluid/ocr_recognition/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..365639f6dac48862fbd3d5f6da32a16c4135a20b
--- /dev/null
+++ b/fluid/ocr_recognition/_ce.py
@@ -0,0 +1,61 @@
+# This file is only used for continuous evaluation tests!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+# NOTE: kpi.py should be shared across models in some way!
+
+train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
+test_acc_kpi = AccKpi('test_acc', 0.005, 0, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True)
+train_acc_kpi = AccKpi('train_acc', 0.005, 0, actived=True)
+
+tracking_kpis = [
+    train_acc_kpi,
+    train_cost_kpi,
+    test_acc_kpi,
+    train_duration_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    Suggested format: each line in the log is a tab-separated
+    key/value pair, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
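Reviewer note: a quick way to check the stdin pipeline locally is to feed `parse_log` a fabricated log. The sketch below condenses the `parse_log` added in `_ce.py` (the debug `print(fs)` is dropped) and shows that malformed, space-separated lines are ignored; the sample string is made up.

```python
def parse_log(log):
    # Condensed from _ce.py's parse_log (debug print removed).
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        if len(fs) == 3 and fs[0] == 'kpis':
            yield fs[1], float(fs[2])

sample = "some banner line\nkpis\ttrain_acc\t0.91\nkpis train_cost 0.5\n"
# [('train_acc', 0.91)] -- the space-separated line is dropped
print(list(parse_log(sample)))
```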
+ print "kpis test_acc %f" % (1 - test_seq_error[0]) + def save_model(args, exe, iter_num): filename = "model_%05d" % iter_num fluid.io.save_params( @@ -111,6 +115,7 @@ def train(args, data_reader=ctc_reader): iter_num = 0 stop = False + start_time = time.time() while not stop: total_loss = 0.0 total_seq_error = 0.0 @@ -139,11 +144,15 @@ def train(args, data_reader=ctc_reader): time.time(), iter_num, total_loss / (args.log_period * args.batch_size), total_seq_error / (args.log_period * args.batch_size)) + print "kpis train_cost %f" % (total_loss / (args.log_period * + args.batch_size)) + print "kpis train_acc %f" % ( + 1 - total_seq_error / (args.log_period * args.batch_size)) sys.stdout.flush() total_loss = 0.0 total_seq_error = 0.0 - # evaluate +# evaluate if not args.skip_test and iter_num % args.eval_period == 0: if model_average: with model_average.apply(exe): @@ -158,6 +167,8 @@ def train(args, data_reader=ctc_reader): save_model(args, exe, iter_num) else: save_model(args, exe, iter_num) + end_time = time.time() + print "kpis train_duration %f" % (end_time - start_time) # Postprocess benchmark data latencies = batch_times[args.skip_batch_num:] latency_avg = np.average(latencies)