diff --git a/01.fit_a_line/.run_ce.sh b/01.fit_a_line/.run_ce.sh new file mode 100644 index 0000000000000000000000000000000000000000..7e96905dd77f8a24f16c229fe5c522a9f5a8c8d5 --- /dev/null +++ b/01.fit_a_line/.run_ce.sh @@ -0,0 +1,4 @@ +#!/bin/bash +#This file is only used for continuous evaluation. +python train.py --enable_ce | python _ce.py + diff --git a/01.fit_a_line/_ce.py b/01.fit_a_line/_ce.py new file mode 100644 index 0000000000000000000000000000000000000000..444a2020468f14bd430b626e674d622f26c8ca79 --- /dev/null +++ b/01.fit_a_line/_ce.py @@ -0,0 +1,39 @@ +### This file is only used for continuous evaluation test! +from __future__ import print_function +from __future__ import division +from __future__ import absolute_import +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi + +train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True, desc='train cost') +test_cost_kpi = CostKpi('test_cost', 0.02, 0, actived=True, desc='test cost') +tracking_kpis = [train_cost_kpi, test_cost_kpi] + + +def parse_log(log): + for line in log.split('\n'): + fs = line.strip().split('\t') + print(fs) + if len(fs) == 3 and fs[0] == 'kpis': + print("-----%s" % fs) + kpi_name = fs[1] + kpi_value = float(fs[2]) + yield kpi_name, kpi_value + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + + for (kpi_name, kpi_value) in parse_log(log): + print(kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + log_to_ce(log) diff --git a/01.fit_a_line/image/prediction_gt.png b/01.fit_a_line/image/prediction_gt.png index 69dee8cb479aa878a4ff10b0bbeb97a4774aa2ac..e029d2a75fee63dd14e53a1b6be29611362293a5 100644 Binary files a/01.fit_a_line/image/prediction_gt.png and b/01.fit_a_line/image/prediction_gt.png differ diff --git a/01.fit_a_line/image/ranges.png b/01.fit_a_line/image/ranges.png index c6a9e182df89a905a922de63dccaeec028616d42..916337f0720ef221851e89456c5c295e2e13445f 100644 Binary files a/01.fit_a_line/image/ranges.png and b/01.fit_a_line/image/ranges.png differ diff --git a/01.fit_a_line/train.py b/01.fit_a_line/train.py index dcae9ac51ce529da5b59b0d34e997c3b2a5d6716..9b4f6e6ba3d115e29107622b018910dda64caac1 100644 --- a/01.fit_a_line/train.py +++ b/01.fit_a_line/train.py @@ -15,6 +15,7 @@ from __future__ import print_function import sys +import argparse import math import numpy @@ -23,6 +24,23 @@ import paddle import paddle.fluid as fluid +def parse_args(): + parser = argparse.ArgumentParser("fit_a_line") + parser.add_argument( + '--enable_ce', + action='store_true', + help="If set, run the task with continuous evaluation logs.") + parser.add_argument( + '--use_gpu', + type=bool, + default=False, + help="Whether to use GPU or not.") + parser.add_argument( + '--num_epochs', type=int, default=100, help="number of epochs.") + args = parser.parse_args() + return args + + # For training test cost def train_test(executor, program, reader, feeder, fetch_list): accumulated = 1 * [0] @@ -52,21 +70,34 @@ def save_result(points1, points2): def main(): batch_size = 20 - train_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500), - batch_size=batch_size) - test_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.uci_housing.test(), buf_size=500), - batch_size=batch_size) + + if args.enable_ce: + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.uci_housing.test(), batch_size=batch_size) + else: + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=batch_size) + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.test(), buf_size=500), + batch_size=batch_size) # feature vector of length 13 x = fluid.layers.data(name='x', shape=[13], dtype='float32') y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) main_program = fluid.default_main_program() startup_program = fluid.default_startup_program() + if args.enable_ce: + main_program.random_seed = 90 + startup_program.random_seed = 90 + + y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_loss = fluid.layers.mean(cost) @@ -76,13 +107,13 @@ def main(): test_program = main_program.clone(for_test=True) # can use CPU or GPU - use_cuda = False + use_cuda = args.use_gpu place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) # Specify the directory to save the parameters params_dirname = "fit_a_line.inference.model" - num_epochs = 100 + num_epochs = args.num_epochs # main train loop. feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) @@ -126,6 +157,10 @@ def main(): fluid.io.save_inference_model(params_dirname, ['x'], [y_predict], exe) + if args.enable_ce and pass_id == args.num_epochs - 1: + print("kpis\ttrain_cost\t%f" % avg_loss_value[0]) + print("kpis\ttest_cost\t%f" % test_metics[0]) + infer_exe = fluid.Executor(place) inference_scope = fluid.core.Scope() @@ -162,4 +197,5 @@ def main(): if __name__ == '__main__': + args = parse_args() main() diff --git a/02.recognize_digits/.run_ce.sh b/02.recognize_digits/.run_ce.sh new file mode 100644 index 0000000000000000000000000000000000000000..4c5ae210ff5485c4c9266de73614ee0f4c4d6d6e --- /dev/null +++ b/02.recognize_digits/.run_ce.sh @@ -0,0 +1,4 @@ +#!/bin/bash +#This file is only used for continuous evaluation. +python train.py --enable_ce | python _ce.py + diff --git a/02.recognize_digits/_ce.py b/02.recognize_digits/_ce.py new file mode 100644 index 0000000000000000000000000000000000000000..512f387872b7cef7f20dc9b549ececa2e8909c76 --- /dev/null +++ b/02.recognize_digits/_ce.py @@ -0,0 +1,39 @@ +### This file is only used for continuous evaluation test! +from __future__ import print_function +from __future__ import division +from __future__ import absolute_import +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi +from kpi import AccKpi + +train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True, desc='train cost') +test_cost_kpi = CostKpi('test_cost', 0.02, 0, actived=True, desc='test cost') +test_acc_kpi = AccKpi('test_acc', 0.02, 0, actived=True, desc='test acc') +tracking_kpis = [train_cost_kpi, test_cost_kpi, test_acc_kpi] + + +def parse_log(log): + for line in log.split('\n'): + fs = line.strip().split('\t') + print(fs) + if len(fs) == 3 and fs[0] == 'kpis': + kpi_name = fs[1] + kpi_value = float(fs[2]) + yield kpi_name, kpi_value + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + for (kpi_name, kpi_value) in parse_log(log): + print(kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + log_to_ce(log) diff --git a/02.recognize_digits/train.py b/02.recognize_digits/train.py index 552584598378631f13c90341b8f5a0eec0a2759f..6ebb1775122324ec9b8aff860a101fbbebbcc0f2 100644 --- a/02.recognize_digits/train.py +++ b/02.recognize_digits/train.py @@ -15,13 +15,28 @@ from __future__ import print_function import os +import argparse from PIL import Image import numpy import paddle import paddle.fluid as fluid -BATCH_SIZE = 64 -PASS_NUM = 5 + +def parse_args(): + parser = argparse.ArgumentParser("mnist") + parser.add_argument( + '--enable_ce', + action='store_true', + help="If set, run the task with continuous evaluation logs.") + parser.add_argument( + '--use_gpu', + type=bool, + default=False, + help="Whether to use GPU or not.") + parser.add_argument( + '--num_epochs', type=int, default=5, help="number of epochs.") + args = parser.parse_args() + return args def loss_net(hidden, label): @@ -69,6 +84,23 @@ def train(nn_type, if use_cuda and not fluid.core.is_compiled_with_cuda(): return + startup_program = fluid.default_startup_program() + main_program = fluid.default_main_program() + + if args.enable_ce: + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) + startup_program.random_seed = 90 + main_program.random_seed = 90 + else: + train_reader = paddle.batch( + paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') @@ -81,8 +113,7 @@ def train(nn_type, prediction, avg_loss, acc = net_conf(img, label) - test_program = fluid.default_main_program().clone(for_test=True) - + test_program = main_program.clone(for_test=True) optimizer = fluid.optimizer.Adam(learning_rate=0.001) optimizer.minimize(avg_loss) @@ -105,15 +136,8 @@ def train(nn_type, exe = fluid.Executor(place) - train_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=500), - batch_size=BATCH_SIZE) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) feeder = fluid.DataFeeder(feed_list=[img, label], place=place) - - exe.run(fluid.default_startup_program()) - main_program = fluid.default_main_program() + exe.run(startup_program) epochs = [epoch_id for epoch_id in range(PASS_NUM)] lists = [] @@ -144,6 +168,11 @@ def train(nn_type, model_filename=model_filename, params_filename=params_filename) + if args.enable_ce: + print("kpis\ttrain_cost\t%f" % metrics[0]) + print("kpis\ttest_cost\t%s" % avg_loss_val) + print("kpis\ttest_acc\t%s" % acc_val) + # find the best pass best = sorted(lists, key=lambda list: float(list[1]))[0] print('Best pass is %s, testing Avgcost is %s' % (best[0], best[1])) @@ -210,7 +239,10 @@ def main(use_cuda, nn_type): if __name__ == '__main__': - use_cuda = False + args = parse_args() + BATCH_SIZE = 64 + PASS_NUM = args.num_epochs + use_cuda = args.use_gpu # predict = 'softmax_regression' # uncomment for Softmax # predict = 'multilayer_perceptron' # uncomment for MLP predict = 'convolutional_neural_network' # uncomment for LeNet5 diff --git a/04.word2vec/.run_ce.sh b/04.word2vec/.run_ce.sh new file mode 100644 index 0000000000000000000000000000000000000000..4c5ae210ff5485c4c9266de73614ee0f4c4d6d6e --- /dev/null +++ b/04.word2vec/.run_ce.sh @@ -0,0 +1,4 @@ +#!/bin/bash +#This file is only used for continuous evaluation. +python train.py --enable_ce | python _ce.py + diff --git a/04.word2vec/_ce.py b/04.word2vec/_ce.py new file mode 100644 index 0000000000000000000000000000000000000000..da2a3722cef6667af5f2bddf84cbe97ebc84996e --- /dev/null +++ b/04.word2vec/_ce.py @@ -0,0 +1,36 @@ +### This file is only used for continuous evaluation test! +from __future__ import print_function +from __future__ import division +from __future__ import absolute_import +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi + +train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True, desc='train cost') +tracking_kpis = [train_cost_kpi] + + +def parse_log(log): + for line in log.split('\n'): + fs = line.strip().split('\t') + print(fs) + if len(fs) == 3 and fs[0] == 'kpis': + kpi_name = fs[1] + kpi_value = float(fs[2]) + yield kpi_name, kpi_value + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + for (kpi_name, kpi_value) in parse_log(log): + print(kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + log_to_ce(log) diff --git a/04.word2vec/train.py b/04.word2vec/train.py index f296768324917ea9f7affaeea3c3b08683914a10..eebf62234ff408e24159e6bf6895ca5f85ef1a2d 100644 --- a/04.word2vec/train.py +++ b/04.word2vec/train.py @@ -18,19 +18,31 @@ import six import numpy import sys import math +import argparse EMBED_SIZE = 32 HIDDEN_SIZE = 256 N = 5 BATCH_SIZE = 100 -PASS_NUM = 100 - -use_cuda = False # set to True if training with GPU word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) +def parse_args(): + parser = argparse.ArgumentParser("word2vec") + parser.add_argument( + '--enable_ce', + action='store_true', + help='If set, run the task with continuous evaluation logs.') + parser.add_argument( + '--use_gpu', type=int, default=0, help='whether to use gpu') + parser.add_argument( + '--num_epochs', type=int, default=100, help='number of epoch') + args = parser.parse_args() + return args + + def inference_program(words, is_sparse): embed_first = fluid.layers.embedding( @@ -102,6 +114,10 @@ def train(if_use_cuda, params_dirname, is_sparse=True): main_program = fluid.default_main_program() star_program = fluid.default_startup_program() + if args.enable_ce: + main_program.random_seed = 90 + star_program.random_seed = 90 + predict_word = inference_program(word_list, is_sparse) avg_cost = train_program(predict_word) test_program = main_program.clone(for_test=True) @@ -153,6 +169,9 @@ def train(if_use_cuda, params_dirname, is_sparse=True): # Note 5.8 is a relatively high value. In order to get a better model, one should # aim for avg_cost lower than 3.5. But the training could take longer time. if outs[0] < 5.8: + if args.enable_ce: + print("kpis\ttrain_cost\t%f" % outs[0]) + if params_dirname is not None: fluid.io.save_inference_model(params_dirname, [ 'firstw', 'secondw', 'thirdw', 'fourthw' @@ -161,7 +180,6 @@ def train(if_use_cuda, params_dirname, is_sparse=True): step += 1 if math.isnan(float(avg_cost_np[0])): sys.exit("got NaN loss, training failed.") - raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0])) train_loop() @@ -245,4 +263,7 @@ def main(use_cuda, is_sparse): if __name__ == '__main__': + args = parse_args() + PASS_NUM = args.num_epochs + use_cuda = args.use_gpu # set to True if training with GPU main(use_cuda=use_cuda, is_sparse=True)