From 0c84765734752fd98e2c44ee5e0799348db915c2 Mon Sep 17 00:00:00 2001 From: guochaorong Date: Tue, 7 Aug 2018 21:01:16 +0800 Subject: [PATCH] set ce flag for language_model --- fluid/language_model/.run_ce.sh | 2 +- fluid/language_model/train.py | 58 ++++++++++++++++++++++----------- fluid/language_model/utils.py | 22 ++++++++++--- 3 files changed, 57 insertions(+), 25 deletions(-) diff --git a/fluid/language_model/.run_ce.sh b/fluid/language_model/.run_ce.sh index 1a6eb82b..c0cb8f54 100644 --- a/fluid/language_model/.run_ce.sh +++ b/fluid/language_model/.run_ce.sh @@ -8,7 +8,7 @@ export CUDA_VISIBLE_DEVICES=$cudaid FLAGS_benchmark=true python train.py | python _ce.py -cudaid=${language_model_m:=0,1,2,3} # use 0-th card as default +cudaid=${language_model_m:=0,1,2,3} # use 0,1,2,3 card as default export CUDA_VISIBLE_DEVICES=$cudaid FLAGS_benchmark=true python train.py | python _ce.py diff --git a/fluid/language_model/train.py b/fluid/language_model/train.py index fdbfbe81..ac8dcd29 100644 --- a/fluid/language_model/train.py +++ b/fluid/language_model/train.py @@ -4,14 +4,25 @@ import time import numpy as np import math - +import argparse import paddle.fluid as fluid import paddle import utils -# random seed must set before configuring the network. -fluid.default_startup_program().random_seed = 102 +SEED = 102 + + +def parse_args(): + parser = argparse.ArgumentParser("language_model benchmark.") + parser.add_argument( + '--enable_ce', + action='store_true', + help='If set, run \ + the task with continuous evaluation logs.') + args = parser.parse_args() + return args + def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound): """ network definition """ @@ -66,6 +77,11 @@ def train(train_reader, init_low_bound=-0.04, init_high_bound=0.04): """ train network """ + + args = parse_args() + if args.enable_ce: + # random seed must set before configuring the network. + fluid.default_startup_program().random_seed = SEED vocab_size = len(vocab) #Input data @@ -77,7 +93,7 @@ def train(train_reader, # Train program avg_cost = None cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size, - init_low_bound, init_high_bound) + init_low_bound, init_high_bound) avg_cost = fluid.layers.mean(x=cost) # Optimization to minimize lost @@ -97,7 +113,7 @@ def train(train_reader, train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) total_time = 0.0 - fetch_list=[avg_cost.name] + fetch_list = [avg_cost.name] for pass_idx in xrange(pass_num): epoch_idx = pass_idx + 1 print "epoch_%d start" % epoch_idx @@ -111,12 +127,11 @@ def train(train_reader, map(lambda x: x[0], data), place) lod_dst_wordseq = utils.to_lodtensor( map(lambda x: x[1], data), place) - ret_avg_cost = train_exe.run( - feed={ - "src_wordseq": lod_src_wordseq, - "dst_wordseq": lod_dst_wordseq - }, - fetch_list=fetch_list) + ret_avg_cost = train_exe.run(feed={ + "src_wordseq": lod_src_wordseq, + "dst_wordseq": lod_dst_wordseq + }, + fetch_list=fetch_list) avg_ppl = np.exp(ret_avg_cost[0]) newest_ppl = np.mean(avg_ppl) if i % 100 == 0: @@ -124,39 +139,44 @@ def train(train_reader, t1 = time.time() total_time += t1 - t0 - print "epoch:%d num_steps:%d time_cost(s):%f" % ( - epoch_idx, i, total_time / epoch_idx) + print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i, + total_time / epoch_idx) - if pass_idx == pass_num - 1: + if pass_idx == pass_num - 1 and args.enable_ce: #Note: The following logs are special for CE monitoring. #Other situations do not need to care about these logs. gpu_num = get_cards() if gpu_num == 1: - print("kpis imikolov_20_pass_duration %s" % (total_time / epoch_idx)) + print("kpis imikolov_20_pass_duration %s" % + (total_time / epoch_idx)) print("kpis imikolov_20_avg_ppl %s" % newest_ppl) else: print("kpis imikolov_20_pass_duration_card%s %s" % \ (gpu_num, total_time / epoch_idx)) - print("kpis imikolov_20_avg_ppl_card%s %s" % (gpu_num, newest_ppl)) + print("kpis imikolov_20_avg_ppl_card%s %s" % + (gpu_num, newest_ppl)) save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) feed_var_names = ["src_wordseq", "dst_wordseq"] fetch_vars = [avg_cost] - fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, - exe) + fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe) print("model saved in %s" % save_dir) print("finish training") + def get_cards(): cards = os.environ.get('CUDA_VISIBLE_DEVICES') num = len(cards.split(",")) return num + def train_net(): """ do training """ batch_size = 20 + args = parse_args() vocab, train_reader, test_reader = utils.prepare_data( - batch_size=batch_size * get_cards(), buffer_size=1000, word_freq_threshold=0) + batch_size=batch_size * get_cards(), buffer_size=1000, \ + word_freq_threshold=0, enable_ce = args.enable_ce) train( train_reader=train_reader, vocab=vocab, diff --git a/fluid/language_model/utils.py b/fluid/language_model/utils.py index 59803cc1..dd03a898 100644 --- a/fluid/language_model/utils.py +++ b/fluid/language_model/utils.py @@ -5,6 +5,7 @@ import numpy as np import paddle.fluid as fluid import paddle + def to_lodtensor(data, place): """ convert to LODtensor """ seq_lens = [len(seq) for seq in data] @@ -21,17 +22,28 @@ def to_lodtensor(data, place): return res -def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0): +def prepare_data(batch_size, + buffer_size=1000, + word_freq_threshold=0, + enable_ce=False): """ prepare the English Pann Treebank (PTB) data """ vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold) - train_reader = paddle.batch( - paddle.reader.shuffle( + if enable_ce: + train_reader = paddle.batch( paddle.dataset.imikolov.train( vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ), - buf_size=buffer_size), - batch_size) + batch_size) + else: + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imikolov.train( + vocab, + buffer_size, + data_type=paddle.dataset.imikolov.DataType.SEQ), + buf_size=buffer_size), + batch_size) test_reader = paddle.batch( paddle.dataset.imikolov.test( vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ), -- GitLab