diff --git a/fluid/language_model/.run_ce.sh b/fluid/language_model/.run_ce.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5ee2d8aa0582b2b8504f9ba645b6252aa75f23bf
--- /dev/null
+++ b/fluid/language_model/.run_ce.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+
+cudaid=${language_model:=0} # use the 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py --enable_ce | python _ce.py
+
+cudaid=${language_model_m:=0,1,2,3} # use cards 0,1,2,3 as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py --enable_ce | python _ce.py
diff --git a/fluid/language_model/_ce.py b/fluid/language_model/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4999d7a1e14e333f1c7056b3dc2c5b506682ec6
--- /dev/null
+++ b/fluid/language_model/_ce.py
@@ -0,0 +1,62 @@
+# This file is only used for continuous evaluation tests!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0)
+imikolov_20_pass_duration_kpi = DurationKpi(
+    'imikolov_20_pass_duration', 0.02, 0, actived=True)
+imikolov_20_avg_ppl_kpi_card4 = CostKpi('imikolov_20_avg_ppl_card4', 0.2, 0)
+imikolov_20_pass_duration_kpi_card4 = DurationKpi(
+    'imikolov_20_pass_duration_card4', 0.03, 0, actived=True)
+
+tracking_kpis = [
+    imikolov_20_avg_ppl_kpi,
+    imikolov_20_pass_duration_kpi,
+    imikolov_20_avg_ppl_kpi_card4,
+    imikolov_20_pass_duration_kpi_card4,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    Suggested format:
+
+    each line in the log should be a tab-separated key/value pair, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/language_model/train.py b/fluid/language_model/train.py
index 59fc3a987746af7aec9b61b5c817400b6b6546d0..f3e7a7398bf13e14c74ce1d10d90b7bf34031698 100644
--- a/fluid/language_model/train.py
+++ b/fluid/language_model/train.py
@@ -1,14 +1,28 @@
+import os
 import sys
 import time
 
 import numpy as np
 import math
-
+import argparse
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 import utils
 
+SEED = 102
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("language_model benchmark.")
+    parser.add_argument(
+        '--enable_ce',
+        action='store_true',
+        help='If set, run the task with '
+        'continuous evaluation logs.')
+    args = parser.parse_args()
+    return args
+
 
 def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
     """ network definition """
@@ -63,31 +77,26 @@ def train(train_reader,
           init_low_bound=-0.04,
          init_high_bound=0.04):
     """ train network """
+
+    args = parse_args()
+    if args.enable_ce:
+        # The random seed must be set before configuring the network.
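+        # Fixing it keeps CE runs reproducible from run to run.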
+        fluid.default_startup_program().random_seed = SEED
 
     vocab_size = len(vocab)
+    # Input data
     src_wordseq = fluid.layers.data(
         name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
     dst_wordseq = fluid.layers.data(
         name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)
 
+    # Train program
     avg_cost = None
-    if not parallel:
-        cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
-                       init_low_bound, init_high_bound)
-        avg_cost = fluid.layers.mean(x=cost)
-    else:
-        places = fluid.layers.get_places()
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            cost = network(
-                pd.read_input(src_wordseq),
-                pd.read_input(dst_wordseq), vocab_size, hid_size,
-                init_low_bound, init_high_bound)
-            pd.write_output(cost)
-
-        cost = pd()
-        avg_cost = fluid.layers.mean(x=cost)
+    cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
+                   init_low_bound, init_high_bound)
+    avg_cost = fluid.layers.mean(x=cost)
 
+    # Optimization to minimize loss
     sgd_optimizer = fluid.optimizer.SGD(
         learning_rate=fluid.layers.exponential_decay(
             learning_rate=base_lr,
@@ -96,39 +105,56 @@ def train(train_reader,
             staircase=True))
     sgd_optimizer.minimize(avg_cost)
 
+    # Initialize executor
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
+
+    train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=avg_cost.name)
 
+    total_time = 0.0
+    fetch_list = [avg_cost.name]
     for pass_idx in xrange(pass_num):
         epoch_idx = pass_idx + 1
         print "epoch_%d start" % epoch_idx
 
         t0 = time.time()
         i = 0
+        newest_ppl = 0
         for data in train_reader():
             i += 1
             lod_src_wordseq = utils.to_lodtensor(
                 map(lambda x: x[0], data), place)
             lod_dst_wordseq = utils.to_lodtensor(
                 map(lambda x: x[1], data), place)
-            ret_avg_cost = exe.run(fluid.default_main_program(),
-                                   feed={
-                                       "src_wordseq": lod_src_wordseq,
-                                       "dst_wordseq": lod_dst_wordseq
-                                   },
-                                   fetch_list=[avg_cost],
-                                   use_program_cache=True)
-            avg_ppl = math.exp(ret_avg_cost[0])
+            ret_avg_cost = train_exe.run(feed={
+                "src_wordseq": lod_src_wordseq,
+                "dst_wordseq": lod_dst_wordseq
+            },
+                                         fetch_list=fetch_list)
+            avg_ppl = np.exp(ret_avg_cost[0])
+            newest_ppl = np.mean(avg_ppl)
             if i % 100 == 0:
-                print "step:%d ppl:%.3f" % (i, avg_ppl)
+                print "step:%d ppl:%.3f" % (i, newest_ppl)
 
         t1 = time.time()
         total_time += t1 - t0
         print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i,
                                                          total_time / epoch_idx)
 
+        if pass_idx == pass_num - 1 and args.enable_ce:
+            # Note: The following logs are used only for CE monitoring.
+            # Other runs can ignore them.
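+            # KPI lines must be tab-separated ("kpis\t<name>\t<value>") so
+            # that parse_log() in _ce.py can pick them up.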
+            gpu_num = get_cards(args.enable_ce)
+            if gpu_num == 1:
+                print("kpis\timikolov_20_pass_duration\t%s" %
+                      (total_time / epoch_idx))
+                print("kpis\timikolov_20_avg_ppl\t%s" % newest_ppl)
+            else:
+                print("kpis\timikolov_20_pass_duration_card%s\t%s" %
+                      (gpu_num, total_time / epoch_idx))
+                print("kpis\timikolov_20_avg_ppl_card%s\t%s" %
+                      (gpu_num, newest_ppl))
 
         save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
         feed_var_names = ["src_wordseq", "dst_wordseq"]
         fetch_vars = [avg_cost]
@@ -138,11 +164,22 @@ def train(train_reader,
     print("finish training")
 
 
+def get_cards(enable_ce):
+    if enable_ce:
+        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
+        num = len(cards.split(","))
+        return num
+    else:
+        return fluid.core.get_cuda_device_count()
+
+
 def train_net():
     """ do training """
     batch_size = 20
+    args = parse_args()
     vocab, train_reader, test_reader = utils.prepare_data(
-        batch_size=batch_size, buffer_size=1000, word_freq_threshold=0)
+        batch_size=batch_size * get_cards(args.enable_ce), buffer_size=1000,
+        word_freq_threshold=0, enable_ce=args.enable_ce)
     train(
         train_reader=train_reader,
         vocab=vocab,
@@ -152,7 +189,7 @@ def train_net():
         batch_size=batch_size,
         pass_num=12,
         use_cuda=True,
-        parallel=False,
+        parallel=True,
         model_dir="model",
         init_low_bound=-0.1,
         init_high_bound=0.1)
diff --git a/fluid/language_model/utils.py b/fluid/language_model/utils.py
index c5909046176586556a2aedba5dd5d12810b3ea8d..dd03a89835e620dc8432a6ca16392fc5173a12d4 100644
--- a/fluid/language_model/utils.py
+++ b/fluid/language_model/utils.py
@@ -3,7 +3,7 @@ import time
 import numpy as np
 
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 
 
 def to_lodtensor(data, place):
@@ -22,17 +22,28 @@ def to_lodtensor(data, place):
     return res
 
 
-def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0):
+def prepare_data(batch_size,
+                 buffer_size=1000,
+                 word_freq_threshold=0,
+                 enable_ce=False):
     """ prepare the English Pann Treebank (PTB) data """
     vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold)
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
+    if enable_ce:
+        train_reader = paddle.batch(
             paddle.dataset.imikolov.train(
                 vocab,
                 buffer_size,
                 data_type=paddle.dataset.imikolov.DataType.SEQ),
-            buf_size=buffer_size),
-        batch_size)
+            batch_size)
+    else:
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.imikolov.train(
+                    vocab,
+                    buffer_size,
+                    data_type=paddle.dataset.imikolov.DataType.SEQ),
+                buf_size=buffer_size),
+            batch_size)
     test_reader = paddle.batch(
         paddle.dataset.imikolov.test(
             vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ),
diff --git a/fluid/neural_machine_translation/rnn_search/_ce.py b/fluid/neural_machine_translation/rnn_search/_ce.py
index 32b13c66a2bbb5bed7e71c7924dea9da4e4182fe..e948336e82141c4a2072a02f73b51cb7b4396ca0 100644
--- a/fluid/neural_machine_translation/rnn_search/_ce.py
+++ b/fluid/neural_machine_translation/rnn_search/_ce.py
@@ -7,9 +7,9 @@ from kpi import CostKpi, DurationKpi, AccKpi
 
 #### NOTE kpi.py should shared in models in some way!!!!
 
-train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
-test_cost_kpi = CostKpi('test_cost', 0.005, actived=True)
-train_duration_kpi = DurationKpi('train_duration', 0.06, actived=True)
+train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
+test_cost_kpi = CostKpi('test_cost', 0.005, 0, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True)
 
 tracking_kpis = [
     train_cost_kpi,
diff --git a/fluid/neural_machine_translation/rnn_search/train.py b/fluid/neural_machine_translation/rnn_search/train.py
index 989d8bb686a561f4903b4346cd0bb9b29a23e8e7..ade0dd751af1a2e83bb99da22281061dce44fbd1 100644
--- a/fluid/neural_machine_translation/rnn_search/train.py
+++ b/fluid/neural_machine_translation/rnn_search/train.py
@@ -151,9 +151,9 @@ def train():
 
         # This log is for continuous evaluation only
         if args.enable_ce:
-            print("kpis train_cost %f" % avg_cost_train)
-            print("kpis test_cost %f" % test_loss)
-            print("kpis train_duration %f" % time_consumed)
+            print("kpis\ttrain_cost\t%f" % avg_cost_train)
+            print("kpis\ttest_cost\t%f" % test_loss)
+            print("kpis\ttrain_duration\t%f" % time_consumed)
 
         if pass_id % args.save_interval == 0:
             model_path = os.path.join(args.save_dir, str(pass_id))
diff --git a/fluid/object_detection/.move.sh b/fluid/object_detection/.move.sh
deleted file mode 100644
index d27f72663727d89e487a3eac9eec110818b28a97..0000000000000000000000000000000000000000
--- a/fluid/object_detection/.move.sh
+++ /dev/null
@@ -1 +0,0 @@
-cp -r ./data/pascalvoc/. /home/.cache/paddle/dataset/pascalvoc
diff --git a/fluid/object_detection/.run.sh b/fluid/object_detection/.run.sh
deleted file mode 100644
index 7be97aaf3dfa5344e872e8d12b0ff0d8f5405df3..0000000000000000000000000000000000000000
--- a/fluid/object_detection/.run.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-export MKL_NUM_THREADS=1
-export OMP_NUM_THREADS=1
-cudaid=${object_detection_cudaid:=0} # use 0-th card as default
-export CUDA_VISIBLE_DEVICES=$cudaid
-
-if [ ! -d "/root/.cache/paddle/dataset/pascalvoc" ];then
-    mkdir -p /root/.cache/paddle/dataset/pascalvoc
-    ./data/pascalvoc/download.sh
-    bash ./.move.sh
-fi
-FLAGS_benchmark=true python train.py --batch_size=64 --num_passes=2 --for_model_ce=True --data_dir=/root/.cache/paddle/dataset/pascalvoc/
diff --git a/fluid/object_detection/.run_ce.sh b/fluid/object_detection/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..50809e77043e0eb0bb5f6bf5a9904d8113c85756
--- /dev/null
+++ b/fluid/object_detection/.run_ce.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# This file is only used for continuous evaluation.
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+
+if [ ! -d "/root/.cache/paddle/dataset/pascalvoc" ];then
+    mkdir -p /root/.cache/paddle/dataset/pascalvoc
+    ./data/pascalvoc/download.sh
+    cp -r ./data/pascalvoc/. /root/.cache/paddle/dataset/pascalvoc
+fi
+
+cudaid=${object_detection_cudaid:=0}
+export CUDA_VISIBLE_DEVICES=$cudaid
+FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py
+
+cudaid=${object_detection_cudaid_m:=0,1,2,3}
+export CUDA_VISIBLE_DEVICES=$cudaid
+FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py
diff --git a/fluid/object_detection/_ce.py b/fluid/object_detection/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f17ff324d8c4bb1d0cecca2401e584a7ec5e3af
--- /dev/null
+++ b/fluid/object_detection/_ce.py
@@ -0,0 +1,72 @@
+#### This file is only used for continuous evaluation tests!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE kpi.py should be shared among models in some way!!!!
+
+train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
+test_acc_kpi = AccKpi('test_acc', 0.01, 0, actived=True)
+train_speed_kpi = AccKpi('train_speed', 0.2, 0, actived=True)
+train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, 0, actived=True)
+test_acc_card4_kpi = AccKpi('test_acc_card4', 0.01, 0, actived=True)
+train_speed_card4_kpi = AccKpi('train_speed_card4', 0.2, 0, actived=True)
+
+tracking_kpis = [
+    train_cost_kpi,
+    test_acc_kpi,
+    train_speed_kpi,
+    train_cost_card4_kpi,
+    test_acc_card4_kpi,
+    train_speed_card4_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    Suggested format:
+
+    each line in the log should be a tab-separated key/value pair, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    #kpi_map = {}
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            #kpi_map[kpi_name] = kpi_value
+            yield kpi_name, kpi_value
+    #return kpi_map
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/fluid/object_detection/mobilenet_ssd.py b/fluid/object_detection/mobilenet_ssd.py
index c39883196056aede5d410554e14a0198e540d754..b87c0558447397e0a5b6a7a1e689a316d1ee8e14 100644
--- a/fluid/object_detection/mobilenet_ssd.py
+++ b/fluid/object_detection/mobilenet_ssd.py
@@ -1,4 +1,3 @@
-import paddle.v2 as paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
 from paddle.fluid.param_attr import ParamAttr
diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py
index 24225ab31d063cd55df86d7a31d46434578b4b6a..aadcc904f55f077c06630a1f8e27a6bf4b422c05 100644
--- a/fluid/object_detection/train.py
+++ b/fluid/object_detection/train.py
@@ -23,7 +23,7 @@ add_arg('dataset',          str,   'pascalvoc', "coco2014, coco2017, and pascalv
 add_arg('model_save_dir',   str,   'model',     "The path to save model.")
 add_arg('pretrained_model', str,   'pretrained/ssd_mobilenet_v1_coco/', "The init model path.")
 add_arg('apply_distort',    bool,  True,   "Whether apply distort.")
-add_arg('apply_expand',     bool,  True,   "Whether appley expand.")
+add_arg('apply_expand',     bool,  True,   "Whether apply expand.")
 add_arg('nms_threshold',    float, 0.45,   "NMS threshold.")
 add_arg('ap_version',       str,   '11point',           "integral, 11point.")
 add_arg('resize_h',         int,   300,    "The resized image height.")
@@ -32,10 +32,8 @@ add_arg('mean_value_B',     float, 127.5, "Mean value for B channel which will
 add_arg('mean_value_G',     float, 127.5, "Mean value for G channel which will be subtracted.")  #116.78
 add_arg('mean_value_R',     float, 127.5, "Mean value for R channel which will be subtracted.")  #103.94
 add_arg('is_toy',           int,   0, "Toy for quick debug, 0 means using all data, while n means using only n sample.")
-add_arg('for_model_ce',     bool,  False,  "Use CE to evaluate the model")
 add_arg('data_dir',         str,   'data/pascalvoc', "data directory")
-add_arg('skip_batch_num',   int,   5,      "the num of minibatch to skip.")
-add_arg('iterations',       int,   120,    "mini batchs.")
+add_arg('enable_ce',        bool,  False,  "Whether to use CE to evaluate the model.")
 
 #yapf: enable
@@ -48,6 +46,9 @@ def train(args,
           num_passes,
           model_save_dir,
           pretrained_model=None):
+    if args.enable_ce:
+        fluid.framework.default_startup_program().random_seed = 111
+
     image_shape = [3, data_args.resize_h, data_args.resize_w]
     if 'coco' in data_args.dataset:
         num_classes = 91
@@ -121,8 +122,12 @@ def train(args,
     train_exe = fluid.ParallelExecutor(
         use_cuda=args.use_gpu, loss_name=loss.name)
 
-    train_reader = paddle.batch(
-        reader.train(data_args, train_file_list), batch_size=batch_size)
+    if not args.enable_ce:
+        train_reader = paddle.batch(
+            reader.train(data_args, train_file_list), batch_size=batch_size)
+    else:
+        train_reader = paddle.batch(
+            reader.train(data_args, train_file_list, False), batch_size=batch_size)
     test_reader = paddle.batch(
         reader.test(data_args, val_file_list), batch_size=batch_size)
     feeder = fluid.DataFeeder(
@@ -140,32 +145,32 @@ def train(args,
     def test(pass_id, best_map):
         _, accum_map = map_eval.get_map_var()
         map_eval.reset(exe)
+        every_pass_map = []
         for batch_id, data in enumerate(test_reader()):
             test_map, = exe.run(test_program,
                                 feed=feeder.feed(data),
                                 fetch_list=[accum_map])
             if batch_id % 20 == 0:
+                every_pass_map.append(test_map)
                 print("Batch {0}, map {1}".format(batch_id, test_map))
+        mean_map = np.mean(every_pass_map)
         if test_map[0] > best_map:
             best_map = test_map[0]
             save_model('best_model')
         print("Pass {0}, test map {1}".format(pass_id, test_map))
-        return best_map
+        return best_map, mean_map
 
-    train_num = 0
-    total_train_time = 0.0
+    total_time = 0.0
     for pass_id in range(num_passes):
+        epoch_idx = pass_id + 1
         start_time = time.time()
         prev_start_time = start_time
-        # end_time = 0
         every_pass_loss = []
         iter = 0
         pass_duration = 0.0
         for batch_id, data in enumerate(train_reader()):
             prev_start_time = start_time
             start_time = time.time()
-            if args.for_model_ce and iter == args.iterations:
-                break
             if len(data) < (devices_num * 2):
                 print("There are too few data to train on all devices.")
                 continue
@@ -176,34 +181,31 @@ def train(args,
             loss_v, = exe.run(fluid.default_main_program(),
                               feed=feeder.feed(data),
                               fetch_list=[loss])
-            # end_time = time.time()
             loss_v = np.mean(np.array(loss_v))
+            every_pass_loss.append(loss_v)
             if batch_id % 20 == 0:
                 print("Pass {0}, batch {1}, loss {2}, time {3}".format(
                     pass_id, batch_id, loss_v, start_time - prev_start_time))
-            if args.for_model_ce and iter >= args.skip_batch_num or pass_id != 0:
-                batch_duration = time.time() - start_time
-                pass_duration += batch_duration
-                train_num += len(data)
-            every_pass_loss.append(loss_v)
-            iter += 1
-        total_train_time += pass_duration
-
-        if args.for_model_ce and pass_id == num_passes - 1:
-            examples_per_sec = train_num / total_train_time
-            cost = np.mean(every_pass_loss)
-            with open("train_speed_factor.txt", 'w') as f:
-                f.write('{:f}\n'.format(examples_per_sec))
-            with open("train_cost_factor.txt", 'a+') as f:
-                f.write('{:f}\n'.format(cost))
-
-        best_map = test(pass_id, best_map)
+        end_time = time.time()
+        best_map, mean_map = test(pass_id, best_map)
+        if args.enable_ce and pass_id == 1:
+            total_time += end_time - start_time
+            train_avg_loss = np.mean(every_pass_loss)
+            if devices_num == 1:
+                print("kpis\ttrain_cost\t%s" % train_avg_loss)
+                print("kpis\ttest_acc\t%s" % mean_map)
+                print("kpis\ttrain_speed\t%s" % (total_time / epoch_idx))
+            else:
+                print("kpis\ttrain_cost_card%s\t%s" % (devices_num, train_avg_loss))
+                print("kpis\ttest_acc_card%s\t%s" % (devices_num, mean_map))
+                print("kpis\ttrain_speed_card%s\t%f" % (devices_num, total_time / epoch_idx))
+
         if pass_id % 10 == 0 or pass_id == num_passes - 1:
             save_model(str(pass_id))
     print("Best test map {0}".format(best_map))
 
-
 if __name__ == '__main__':
     args = parser.parse_args()
     print_arguments(args)
diff --git a/fluid/sequence_tagging_for_ner/train.py b/fluid/sequence_tagging_for_ner/train.py
index 6ed77cd5ca1d504a8b79b4f87349242b5051c539..8aaf660f35aa63b7d298d1d380b579a66f421de7 100644
--- a/fluid/sequence_tagging_for_ner/train.py
+++ b/fluid/sequence_tagging_for_ner/train.py
@@ -53,7 +53,7 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
         chunk_scheme="IOB",
         num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
 
-    inference_program = fluid.default_main_program().clone()
+    inference_program = fluid.default_main_program().clone(for_test=True)
     with fluid.program_guard(inference_program):
         test_target = chunk_evaluator.metrics + chunk_evaluator.states
         inference_program = fluid.io.get_inference_program(test_target)
diff --git a/fluid/text_classification/.run_ce.sh b/fluid/text_classification/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..777ba02e8c78760452a6fb7d4ac1dc4a82d62594
--- /dev/null
+++ b/fluid/text_classification/.run_ce.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# This file is only used for continuous evaluation.
+
+export CE_MODE_X=1
+python train.py cnn | python _ce.py
diff --git a/fluid/text_classification/_ce.py b/fluid/text_classification/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..100357204db7f3a8d0c1d3cbcbdc707410b20023
--- /dev/null
+++ b/fluid/text_classification/_ce.py
@@ -0,0 +1,48 @@
+#### This file is only used for continuous evaluation tests!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE kpi.py should be shared among models in some way!!!!
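+#### The KPI names below must match the names that train.py prints as
+#### tab-separated "kpis\t<name>\t<value>" lines.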
+
+train_acc_kpi = AccKpi('train_acc', 0.005, 0, actived=True)
+train_cost_kpi = CostKpi('train_cost', 0.005, 0, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.05, 0, actived=True)
+
+tracking_kpis = [
+    train_acc_kpi,
+    train_cost_kpi,
+    train_duration_kpi,
+]
+
+
+def parse_log(log):
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/fluid/text_classification/train.py b/fluid/text_classification/train.py
index dc164671e785b758365885b98788fae71d5f8a87..698e4dc0788f2e185810a4f782ac4dcff1f60c81 100644
--- a/fluid/text_classification/train.py
+++ b/fluid/text_classification/train.py
@@ -1,3 +1,4 @@
+import os
 import sys
 import time
 import unittest
@@ -53,8 +54,12 @@ def train(train_reader,
     exe = fluid.Executor(place)
     feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
 
+    # For internal continuous evaluation
+    if 'CE_MODE_X' in os.environ:
+        fluid.default_startup_program().random_seed = 110
     exe.run(fluid.default_startup_program())
     for pass_id in xrange(pass_num):
+        pass_start = time.time()
         data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
         for data in train_reader():
             avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(),
@@ -73,6 +78,13 @@ def train(train_reader,
         epoch_model = save_dirname + "/" + "epoch" + str(pass_id)
         fluid.io.save_inference_model(epoch_model, ["words", "label"], acc, exe)
+        pass_end = time.time()
+        # For internal continuous evaluation
+        if 'CE_MODE_X' in os.environ:
+            print("kpis\ttrain_acc\t%f" % avg_acc)
+            print("kpis\ttrain_cost\t%f" % avg_cost)
+            print("kpis\ttrain_duration\t%f" % (pass_end - pass_start))
+
 
 def train_net():
     word_dict, train_reader, test_reader = utils.prepare_data(
diff --git a/fluid/text_classification/utils.py b/fluid/text_classification/utils.py
index fba14dde63d27ada07d8fcd69cacfb631559e613..874679c3e2f9fe0c640d6da4f25d503023adcb65 100644
--- a/fluid/text_classification/utils.py
+++ b/fluid/text_classification/utils.py
@@ -1,3 +1,4 @@
+import os
 import sys
 import time
 import numpy as np
@@ -64,15 +65,22 @@ def prepare_data(data_type="imdb",
         raise RuntimeError("No such dataset")
 
     if data_type == "imdb":
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.imdb.train(word_dict), buf_size=buf_size),
-            batch_size=batch_size)
+        if 'CE_MODE_X' in os.environ:
+            train_reader = paddle.batch(
+                paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
 
-        test_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.imdb.test(word_dict), buf_size=buf_size),
-            batch_size=batch_size)
+            test_reader = paddle.batch(
+                paddle.dataset.imdb.test(word_dict), batch_size=batch_size)
+        else:
+            train_reader = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.imdb.train(word_dict), buf_size=buf_size),
+                batch_size=batch_size)
+
+            test_reader = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.imdb.test(word_dict), buf_size=buf_size),
+                batch_size=batch_size)
     else:
         raise RuntimeError("no such dataset")