From 448c59aac7db247909604cd95f5f57dc397c2ea6 Mon Sep 17 00:00:00 2001 From: Paddle CI Date: Thu, 2 Aug 2018 01:07:54 +0800 Subject: [PATCH] language_model_for_ce --- fluid/language_model/.run_ce.sh | 14 +++++++ fluid/language_model/_ce.py | 62 +++++++++++++++++++++++++++++ fluid/language_model/train.py | 70 ++++++++++++++++++++------------- fluid/language_model/utils.py | 3 +- 4 files changed, 119 insertions(+), 30 deletions(-) create mode 100644 fluid/language_model/.run_ce.sh create mode 100644 fluid/language_model/_ce.py diff --git a/fluid/language_model/.run_ce.sh b/fluid/language_model/.run_ce.sh new file mode 100644 index 00000000..1a6eb82b --- /dev/null +++ b/fluid/language_model/.run_ce.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +cudaid=${language_model:=0} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +FLAGS_benchmark=true python train.py | python _ce.py + +cudaid=${language_model_m:=0,1,2,3} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +FLAGS_benchmark=true python train.py | python _ce.py diff --git a/fluid/language_model/_ce.py b/fluid/language_model/_ce.py new file mode 100644 index 00000000..1cc25485 --- /dev/null +++ b/fluid/language_model/_ce.py @@ -0,0 +1,62 @@ +# this file is only used for continuous evaluation test! + +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi +from kpi import DurationKpi + +imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0) +imikolov_20_pass_duration_kpi = DurationKpi('imikolov_20_pass_duration', 0.02, + 0, actived=True) +imikolov_20_avg_ppl_kpi_card4 = CostKpi('imikolov_20_avg_ppl_card4', 0.2, 0) +imikolov_20_pass_duration_kpi_card4 = DurationKpi('imikolov_20_pass_duration_card4', 0.03, + 0, actived=True) + +tracking_kpis = [ + imikolov_20_avg_ppl_kpi, + imikolov_20_pass_duration_kpi, + imikolov_20_avg_ppl_kpi_card4, + imikolov_20_pass_duration_kpi_card4, +] + +def parse_log(log): + ''' + This method should be implemented by model developers. + + The suggestion: + + each line in the log should be key, value, for example: + + " + train_cost\t1.0 + test_cost\t1.0 + train_cost\t1.0 + train_cost\t1.0 + train_acc\t1.2 + " + ''' + for line in log.split('\n'): + fs = line.strip().split('\t') + print (fs) + if len(fs) == 3 and fs[0] == 'kpis': + kpi_name = fs[1] + kpi_value = float(fs[2]) + yield kpi_name, kpi_value + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + + for (kpi_name, kpi_value) in parse_log(log): + print (kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + log_to_ce(log) + diff --git a/fluid/language_model/train.py b/fluid/language_model/train.py index 59fc3a98..fdbfbe81 100644 --- a/fluid/language_model/train.py +++ b/fluid/language_model/train.py @@ -1,3 +1,4 @@ +import os import sys import time @@ -5,10 +6,12 @@ import numpy as np import math import paddle.fluid as fluid -import paddle.v2 as paddle +import paddle import utils +# random seed must set before configuring the network. +fluid.default_startup_program().random_seed = 102 def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound): """ network definition """ @@ -65,29 +68,19 @@ def train(train_reader, """ train network """ vocab_size = len(vocab) + #Input data src_wordseq = fluid.layers.data( name="src_wordseq", shape=[1], dtype="int64", lod_level=1) dst_wordseq = fluid.layers.data( name="dst_wordseq", shape=[1], dtype="int64", lod_level=1) + # Train program avg_cost = None - if not parallel: - cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size, + cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size, init_low_bound, init_high_bound) - avg_cost = fluid.layers.mean(x=cost) - else: - places = fluid.layers.get_places() - pd = fluid.layers.ParallelDo(places) - with pd.do(): - cost = network( - pd.read_input(src_wordseq), - pd.read_input(dst_wordseq), vocab_size, hid_size, - init_low_bound, init_high_bound) - pd.write_output(cost) - - cost = pd() - avg_cost = fluid.layers.mean(x=cost) + avg_cost = fluid.layers.mean(x=cost) + # Optimization to minimize lost sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.exponential_decay( learning_rate=base_lr, @@ -96,53 +89,74 @@ def train(train_reader, staircase=True)) sgd_optimizer.minimize(avg_cost) + # Initialize executor place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) + total_time = 0.0 + fetch_list=[avg_cost.name] for pass_idx in xrange(pass_num): epoch_idx = pass_idx + 1 print "epoch_%d start" % epoch_idx t0 = time.time() i = 0 + newest_ppl = 0 for data in train_reader(): i += 1 lod_src_wordseq = utils.to_lodtensor( map(lambda x: x[0], data), place) lod_dst_wordseq = utils.to_lodtensor( map(lambda x: x[1], data), place) - ret_avg_cost = exe.run(fluid.default_main_program(), + ret_avg_cost = train_exe.run( feed={ "src_wordseq": lod_src_wordseq, "dst_wordseq": lod_dst_wordseq }, - fetch_list=[avg_cost], - use_program_cache=True) - avg_ppl = math.exp(ret_avg_cost[0]) + fetch_list=fetch_list) + avg_ppl = np.exp(ret_avg_cost[0]) + newest_ppl = np.mean(avg_ppl) if i % 100 == 0: - print "step:%d ppl:%.3f" % (i, avg_ppl) + print "step:%d ppl:%.3f" % (i, newest_ppl) t1 = time.time() total_time += t1 - t0 - print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i, - total_time / epoch_idx) - + print "epoch:%d num_steps:%d time_cost(s):%f" % ( + epoch_idx, i, total_time / epoch_idx) + + if pass_idx == pass_num - 1: + #Note: The following logs are special for CE monitoring. + #Other situations do not need to care about these logs. + gpu_num = get_cards() + if gpu_num == 1: + print("kpis imikolov_20_pass_duration %s" % (total_time / epoch_idx)) + print("kpis imikolov_20_avg_ppl %s" % newest_ppl) + else: + print("kpis imikolov_20_pass_duration_card%s %s" % \ + (gpu_num, total_time / epoch_idx)) + print("kpis imikolov_20_avg_ppl_card%s %s" % (gpu_num, newest_ppl)) save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) feed_var_names = ["src_wordseq", "dst_wordseq"] fetch_vars = [avg_cost] - fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe) + fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, + exe) print("model saved in %s" % save_dir) print("finish training") +def get_cards(): + cards = os.environ.get('CUDA_VISIBLE_DEVICES') + num = len(cards.split(",")) + return num def train_net(): """ do training """ batch_size = 20 vocab, train_reader, test_reader = utils.prepare_data( - batch_size=batch_size, buffer_size=1000, word_freq_threshold=0) + batch_size=batch_size * get_cards(), buffer_size=1000, word_freq_threshold=0) train( train_reader=train_reader, vocab=vocab, @@ -152,7 +166,7 @@ def train_net(): batch_size=batch_size, pass_num=12, use_cuda=True, - parallel=False, + parallel=True, model_dir="model", init_low_bound=-0.1, init_high_bound=0.1) diff --git a/fluid/language_model/utils.py b/fluid/language_model/utils.py index c5909046..59803cc1 100644 --- a/fluid/language_model/utils.py +++ b/fluid/language_model/utils.py @@ -3,8 +3,7 @@ import time import numpy as np import paddle.fluid as fluid -import paddle.v2 as paddle - +import paddle def to_lodtensor(data, place): """ convert to LODtensor """ -- GitLab