diff --git a/fluid/PaddleNLP/machine_reading_comprehension/.run_ce.sh b/fluid/PaddleNLP/machine_reading_comprehension/.run_ce.sh new file mode 100644 index 0000000000000000000000000000000000000000..22e0a73478b15e08b5e45b8086af9cadc8e74bc3 --- /dev/null +++ b/fluid/PaddleNLP/machine_reading_comprehension/.run_ce.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +DATA_PATH=./data +if [ ! -e $DATA_PATH/demo ] ; then + mkdir -p $DATA_PATH/demo + if [ ! -e $DATA_PATH/demo.tgz ] ; then + cd $DATA_PATH + wget -c --no-check-certificate http://dureader.gz.bcebos.com/demo.tgz + cd - + fi + tar -zxf $DATA_PATH/demo.tgz -C $DATA_PATH/demo +fi + +train(){ +python -u run.py \ + --trainset 'data/demo/search.train.json' \ + --devset 'data/demo/search.dev.json' \ + --testset 'data/demo/search.test.json' \ + --vocab_dir 'data/demo/' \ + --use_gpu true \ + --save_dir ./models \ + --pass_num 1 \ + --learning_rate 0.001 \ + --batch_size 32 \ + --embed_size 300 \ + --hidden_size 150 \ + --max_p_num 5 \ + --max_p_len 500 \ + --max_q_len 60 \ + --max_a_len 200 \ + --drop_rate 0.2 \ + --log_interval 1 \ + --enable_ce \ + --train +} + +cudaid=${single:=0} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +train | python _ce.py + +cudaid=${multi:=0,1,2,3} # use 0,1,2,3 card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +train | python _ce.py diff --git a/fluid/PaddleNLP/machine_reading_comprehension/_ce.py b/fluid/PaddleNLP/machine_reading_comprehension/_ce.py new file mode 100644 index 0000000000000000000000000000000000000000..cff13c8722007987a3cd82f1298206248963e45a --- /dev/null +++ b/fluid/PaddleNLP/machine_reading_comprehension/_ce.py @@ -0,0 +1,68 @@ +####this file is only used for continuous evaluation test! + +import os +import sys +#sys.path.insert(0, os.environ['ceroot']) +from kpi import CostKpi, DurationKpi, AccKpi + +#### NOTE kpi.py should shared in models in some way!!!! + +train_cost_card1_kpi = CostKpi('train_cost_card1', 0.02, 0, actived=True) +test_cost_card1_kpi = CostKpi('test_cost_card1', 0.005, 0, actived=True) +train_duration_card1_kpi = DurationKpi( + 'train_duration_card1', 0.06, 0, actived=True) +train_cost_card4_kpi = CostKpi('train_cost_card4', 0.01, 0, actived=True) +test_cost_card4_kpi = CostKpi('test_cost_card4', 0.005, 0, actived=True) +train_duration_card4_kpi = DurationKpi( + 'train_duration_card4', 0.06, 0, actived=True) + +tracking_kpis = [ + train_cost_card1_kpi, + test_cost_card1_kpi, + train_duration_card1_kpi, + train_cost_card4_kpi, + test_cost_card4_kpi, + train_duration_card4_kpi, +] + + +def parse_log(log): + ''' + This method should be implemented by model developers. + The suggestion: + each line in the log should be key, value, for example: + " + train_cost\t1.0 + test_cost\t1.0 + train_cost\t1.0 + train_cost\t1.0 + train_acc\t1.2 + " + ''' + for line in log.split('\n'): + fs = line.strip().split('\t') + print(fs) + if len(fs) == 3 and fs[0] == 'kpis': + print("-----%s" % fs) + kpi_name = fs[1] + kpi_value = float(fs[2]) + yield kpi_name, kpi_value + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + + for (kpi_name, kpi_value) in parse_log(log): + print(kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + print("*****") + print(log) + print("****") + log_to_ce(log) diff --git a/fluid/PaddleNLP/machine_reading_comprehension/args.py b/fluid/PaddleNLP/machine_reading_comprehension/args.py index e37aad9a929ba2aaa122d328206a040779b4527f..53812252dea9b7855cd09142de89b6f0a1e2ac5c 100644 --- a/fluid/PaddleNLP/machine_reading_comprehension/args.py +++ b/fluid/PaddleNLP/machine_reading_comprehension/args.py @@ -120,5 +120,9 @@ def parse_args(): '--result_name', default='test_result', help='the file name of the results') + parser.add_argument( + "--enable_ce", + action='store_true', + help="If set, run the task with continuous evaluation logs.") args = parser.parse_args() return args diff --git a/fluid/PaddleNLP/machine_reading_comprehension/data/download.sh b/fluid/PaddleNLP/machine_reading_comprehension/data/download.sh index 41f79dd0cb492d95691a2240e807ed613fd11c8d..bcba3c7ed8957ee987aa13f6c99fa60f8c8494b6 100644 --- a/fluid/PaddleNLP/machine_reading_comprehension/data/download.sh +++ b/fluid/PaddleNLP/machine_reading_comprehension/data/download.sh @@ -21,6 +21,7 @@ if [[ -d preprocessed ]] && [[ -d raw ]]; then exit 0 else wget -c --no-check-certificate http://dureader.gz.bcebos.com/dureader_preprocessed.zip + wget -c --no-check-certificate http://dureader.gz.bcebos.com/demo.tgz fi if md5sum --status -c md5sum.txt; then diff --git a/fluid/PaddleNLP/machine_reading_comprehension/dataset.py b/fluid/PaddleNLP/machine_reading_comprehension/dataset.py index 7e50b7e0e2996b58e56e8e725549502bb1dfbe52..3aaf87be9a7b0659fa9e79eb8329911cbea73c55 100644 --- a/fluid/PaddleNLP/machine_reading_comprehension/dataset.py +++ b/fluid/PaddleNLP/machine_reading_comprehension/dataset.py @@ -152,7 +152,7 @@ class BRCDataset(object): batch_data['passage_token_ids'].append(passage_token_ids) batch_data['passage_length'].append( min(len(passage_token_ids), self.max_p_len)) - # record the start passage index of current doc + # record the start passage index of current sample passade_idx_offset = sum(batch_data['passage_num']) batch_data['passage_num'].append(count) gold_passage_offset = 0 diff --git a/fluid/PaddleNLP/machine_reading_comprehension/run.py b/fluid/PaddleNLP/machine_reading_comprehension/run.py index 8871ab9998942d92516035b78d5e327d4e2271ef..dbe3a4b9a296fdaf089d55be3f0c9845422f0ce5 100644 --- a/fluid/PaddleNLP/machine_reading_comprehension/run.py +++ b/fluid/PaddleNLP/machine_reading_comprehension/run.py @@ -248,18 +248,18 @@ def validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order, n_batch_loss / n_batch_cnt))) n_batch_loss = 0.0 n_batch_cnt = 0 - + batch_offset = 0 for idx, batch in enumerate(batch_list): #one batch batch_size = len(batch['raw_data']) - batch_range = match_lod[0][idx * batch_size:(idx + 1) * batch_size + + batch_range = match_lod[0][batch_offset:batch_offset + batch_size + 1] batch_lod = [[batch_range[x], batch_range[x + 1]] for x in range(len(batch_range[:-1]))] - start_prob_batch = start_probs_m[idx * batch_size:(idx + 1) * - batch_size] - end_prob_batch = end_probs_m[idx * batch_size:(idx + 1) * - batch_size] + start_prob_batch = start_probs_m[batch_offset:batch_offset + + batch_size + 1] + end_prob_batch = end_probs_m[batch_offset:batch_offset + batch_size + + 1] for sample, start_prob_inst, end_prob_inst, inst_range in zip( batch['raw_data'], start_prob_batch, end_prob_batch, batch_lod): @@ -284,6 +284,7 @@ def validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order, 'yesno_answers': [] } ref_answers.append(ref) + batch_offset = batch_offset + batch_size result_dir = args.result_dir result_prefix = args.result_name @@ -346,8 +347,9 @@ def train(logger, args): # build model main_program = fluid.Program() startup_prog = fluid.Program() - main_program.random_seed = args.random_seed - startup_prog.random_seed = args.random_seed + if args.enable_ce: + main_program.random_seed = args.random_seed + startup_prog.random_seed = args.random_seed with fluid.program_guard(main_program, startup_prog): with fluid.unique_name.guard(): avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model( @@ -405,7 +407,10 @@ def train(logger, args): for pass_id in range(1, args.pass_num + 1): pass_start_time = time.time() pad_id = vocab.get_id(vocab.pad_token) - train_reader = lambda:brc_data.gen_mini_batches('train', args.batch_size, pad_id, shuffle=False) + if args.enable_ce: + train_reader = lambda:brc_data.gen_mini_batches('train', args.batch_size, pad_id, shuffle=False) + else: + train_reader = lambda:brc_data.gen_mini_batches('train', args.batch_size, pad_id, shuffle=True) train_reader = read_multiple(train_reader, dev_count) log_every_n_batch, n_batch_loss = args.log_interval, 0 total_num, total_loss = 0, 0 @@ -420,6 +425,8 @@ def train(logger, args): n_batch_loss += cost_train total_loss += cost_train * args.batch_size * dev_count + if args.enable_ce and batch_id >= 100: + break if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0: print_para(main_program, parallel_executor, logger, args) @@ -464,6 +471,14 @@ def train(logger, args): executor=exe, dirname=model_path, main_program=main_program) + if args.enable_ce: # For CE + print("kpis\ttrain_cost_card%d\t%f" % + (dev_count, total_loss / total_num)) + if brc_data.dev_set is not None: + print("kpis\ttest_cost_card%d\t%f" % + (dev_count, eval_loss)) + print("kpis\ttrain_duration_card%d\t%f" % + (dev_count, time_consumed)) def evaluate(logger, args): @@ -481,8 +496,6 @@ def evaluate(logger, args): # build model main_program = fluid.Program() startup_prog = fluid.Program() - main_program.random_seed = args.random_seed - startup_prog.random_seed = args.random_seed with fluid.program_guard(main_program, startup_prog): with fluid.unique_name.guard(): avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model( @@ -530,8 +543,6 @@ def predict(logger, args): # build model main_program = fluid.Program() startup_prog = fluid.Program() - main_program.random_seed = args.random_seed - startup_prog.random_seed = args.random_seed with fluid.program_guard(main_program, startup_prog): with fluid.unique_name.guard(): avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model( @@ -599,8 +610,9 @@ def prepare(logger, args): if __name__ == '__main__': args = parse_args() - random.seed(args.random_seed) - np.random.seed(args.random_seed) + if args.enable_ce: + random.seed(args.random_seed) + np.random.seed(args.random_seed) logger = logging.getLogger("brc") logger.setLevel(logging.INFO)