提交 b362e71a 编写于 作者: G guochaorong

Support continuous evaluation (CE) for language_model

上级 53937db0
#!/bin/bash
# Continuous-evaluation (CE) driver for the language model.
# Runs train.py twice (once on a single card, once on several cards) and pipes
# each training log into _ce.py, which picks out the "kpis" records.

# Restrict MKL and OpenMP to one thread each.
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1

# Single-card run: cards come from $language_model; card 0 is the default.
cudaid=${language_model:=0}
export CUDA_VISIBLE_DEVICES="$cudaid"
FLAGS_benchmark=true python train.py | python _ce.py

# Multi-card run: cards come from $language_model_m; cards 0,1,2,3 are the default.
cudaid=${language_model_m:=0,1,2,3}
export CUDA_VISIBLE_DEVICES="$cudaid"
FLAGS_benchmark=true python train.py | python _ce.py
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
# KPIs recorded for the single-card run.
# NOTE(review): the numeric arguments are passed straight to the kpi module;
# their meaning (presumably a tolerance and a skip count) should be confirmed
# against CostKpi / DurationKpi.
imikolov_20_avg_ppl_kpi = CostKpi(
    'imikolov_20_avg_ppl', 0.2, 0)
imikolov_20_pass_duration_kpi = DurationKpi(
    'imikolov_20_pass_duration', 0.02, 0, actived=True)

# KPIs recorded for the 4-card run; the names carry a "_card4" suffix.
imikolov_20_avg_ppl_kpi_card4 = CostKpi(
    'imikolov_20_avg_ppl_card4', 0.2, 0)
imikolov_20_pass_duration_kpi_card4 = DurationKpi(
    'imikolov_20_pass_duration_card4', 0.03, 0, actived=True)

# Every KPI object that log_to_ce() is allowed to update, looked up by name.
tracking_kpis = [
    imikolov_20_avg_ppl_kpi,
    imikolov_20_pass_duration_kpi,
    imikolov_20_avg_ppl_kpi_card4,
    imikolov_20_pass_duration_kpi_card4,
]
def parse_log(log):
    '''
    Extract KPI records from the text of a training log.

    A line is treated as a KPI record only when, after stripping surrounding
    whitespace, it consists of exactly three TAB-separated fields and the
    first field is the literal marker "kpis", for example:
    "
    kpis<TAB>train_cost<TAB>1.0
    kpis<TAB>test_cost<TAB>1.0
    "
    Every other line is ignored.

    The split fields of every line (record or not) are printed to stdout.

    Args:
        log: the complete log as one string, lines separated by newlines.

    Yields:
        (kpi_name, kpi_value) pairs, where kpi_name is the second field and
        kpi_value is the third field converted to float.

    Raises:
        ValueError: if the third field of a "kpis" line is not a valid float.
    '''
    for line in log.split('\n'):
        # strip() also removes a trailing '\r' and any leading/trailing tabs.
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value
def log_to_ce(log):
    """Record every KPI value found in `log` on its tracked KPI object.

    The objects in `tracking_kpis` are indexed by their `name` attribute.
    Each (name, value) pair yielded by parse_log() is printed, added to the
    KPI of that name with add_record(), and that KPI is then persisted.

    Args:
        log: the complete training log as one string.

    Raises:
        KeyError: if the log reports a KPI name that is not in `tracking_kpis`.
    """
    trackers = {tracked.name: tracked for tracked in tracking_kpis}

    for name, value in parse_log(log):
        print (name, value)
        target = trackers[name]
        target.add_record(value)
        # Persist after every record, exactly as records arrive.
        target.persist()
if __name__ == '__main__':
    # Script entry point: the whole training log arrives on stdin.
    log_to_ce(sys.stdin.read())
import os
import sys
import time
......@@ -5,10 +6,12 @@ import numpy as np
import math
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
import utils
# random seed must set before configuring the network.
fluid.default_startup_program().random_seed = 102
def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
""" network definition """
......@@ -65,29 +68,19 @@ def train(train_reader,
""" train network """
vocab_size = len(vocab)
#Input data
src_wordseq = fluid.layers.data(
name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
dst_wordseq = fluid.layers.data(
name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)
# Train program
avg_cost = None
if not parallel:
cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
init_low_bound, init_high_bound)
avg_cost = fluid.layers.mean(x=cost)
else:
places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places)
with pd.do():
cost = network(
pd.read_input(src_wordseq),
pd.read_input(dst_wordseq), vocab_size, hid_size,
init_low_bound, init_high_bound)
pd.write_output(cost)
cost = pd()
avg_cost = fluid.layers.mean(x=cost)
avg_cost = fluid.layers.mean(x=cost)
# Optimization to minimize lost
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=base_lr,
......@@ -96,53 +89,74 @@ def train(train_reader,
staircase=True))
sgd_optimizer.minimize(avg_cost)
# Initialize executor
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
total_time = 0.0
fetch_list=[avg_cost.name]
for pass_idx in xrange(pass_num):
epoch_idx = pass_idx + 1
print "epoch_%d start" % epoch_idx
t0 = time.time()
i = 0
newest_ppl = 0
for data in train_reader():
i += 1
lod_src_wordseq = utils.to_lodtensor(
map(lambda x: x[0], data), place)
lod_dst_wordseq = utils.to_lodtensor(
map(lambda x: x[1], data), place)
ret_avg_cost = exe.run(fluid.default_main_program(),
ret_avg_cost = train_exe.run(
feed={
"src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq
},
fetch_list=[avg_cost],
use_program_cache=True)
avg_ppl = math.exp(ret_avg_cost[0])
fetch_list=fetch_list)
avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl)
if i % 100 == 0:
print "step:%d ppl:%.3f" % (i, avg_ppl)
print "step:%d ppl:%.3f" % (i, newest_ppl)
t1 = time.time()
total_time += t1 - t0
print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i,
total_time / epoch_idx)
print "epoch:%d num_steps:%d time_cost(s):%f" % (
epoch_idx, i, total_time / epoch_idx)
if pass_idx == pass_num - 1:
#Note: The following logs are special for CE monitoring.
#Other situations do not need to care about these logs.
gpu_num = get_cards()
if gpu_num == 1:
print("kpis imikolov_20_pass_duration %s" % (total_time / epoch_idx))
print("kpis imikolov_20_avg_ppl %s" % newest_ppl)
else:
print("kpis imikolov_20_pass_duration_card%s %s" % \
(gpu_num, total_time / epoch_idx))
print("kpis imikolov_20_avg_ppl_card%s %s" % (gpu_num, newest_ppl))
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["src_wordseq", "dst_wordseq"]
fetch_vars = [avg_cost]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars,
exe)
print("model saved in %s" % save_dir)
print("finish training")
def get_cards():
    """Return the number of GPU cards named in CUDA_VISIBLE_DEVICES.

    The variable is read as a comma-separated list and its non-empty entries
    are counted, so a stray trailing comma is not counted as an extra card.

    Returns:
        int: the number of listed cards, never less than 1. When the variable
        is unset or empty the result is 1.
        NOTE(review): with the variable unset, the real number of visible
        devices may be larger than 1 - confirm that 1 is the wanted fallback.
    """
    # Default to '' so an unset variable does not raise on None.split().
    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    num = len([card for card in cards.split(",") if card.strip()])
    # The result is used as a batch-size multiplier, so it must stay >= 1.
    return max(num, 1)
def train_net():
""" do training """
batch_size = 20
vocab, train_reader, test_reader = utils.prepare_data(
batch_size=batch_size, buffer_size=1000, word_freq_threshold=0)
batch_size=batch_size * get_cards(), buffer_size=1000, word_freq_threshold=0)
train(
train_reader=train_reader,
vocab=vocab,
......@@ -152,7 +166,7 @@ def train_net():
batch_size=batch_size,
pass_num=12,
use_cuda=True,
parallel=False,
parallel=True,
model_dir="model",
init_low_bound=-0.1,
init_high_bound=0.1)
......
......@@ -3,8 +3,7 @@ import time
import numpy as np
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
def to_lodtensor(data, place):
""" convert to LODtensor """
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册