Commit 3ed1a0c0 authored by zhengya01, committed by Hongyu Liu

add ce for neural_machine_translation/rnn_search (#2702)

* add ce for neural_machine_translation/rnn_search

* add rnn_search

* add ce for rnn_search

* add ce for rnn_search

* add ce for rnn_search

* add ce for rnn_search
Parent 16f754fe
#!/bin/bash
DATA_PATH=./data/en-vi/
train(){
    python train.py \
        --src_lang en --tar_lang vi \
        --attention True \
        --num_layers 2 \
        --hidden_size 512 \
        --src_vocab_size 17191 \
        --tar_vocab_size 7709 \
        --batch_size 128 \
        --dropout 0.2 \
        --init_scale 0.1 \
        --max_grad_norm 5.0 \
        --train_data_prefix ${DATA_PATH}/train \
        --eval_data_prefix ${DATA_PATH}/tst2012 \
        --test_data_prefix ${DATA_PATH}/tst2013 \
        --vocab_prefix ${DATA_PATH}/vocab \
        --use_gpu True \
        --max_epoch 2 \
        --enable_ce
}
cudaid=${transformer:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
train | python _ce.py
#cudaid=${transformer_m:=0,1,2,3} # use 0,1,2,3 card as default
#export CUDA_VISIBLE_DEVICES=$cudaid
#train | python _ce.py
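The `train | python _ce.py` pipe above works because train.py, when run with --enable_ce, prints tab-separated "kpis" lines to stdout (see the train.py diff below), which _ce.py reads from stdin. As a rough sketch, the plumbing can be smoke-tested without a full GPU run by piping hand-written KPI lines into _ce.py, assuming the `ceroot` environment variable that _ce.py needs is set; the helper name fake_train_log.py and the numeric values are made up for illustration only:

# fake_train_log.py -- hypothetical stand-in for train.py's CE output.
# Prints KPI lines in the same "kpis\t<name>\t<value>" format that
# train.py emits with --enable_ce, so the pipeline can be smoke-tested:
#     python fake_train_log.py | python _ce.py
card_num = 1           # pretend a single card was used
fake_duration = 123.4  # made-up seconds for one epoch
fake_ppl = 10.5        # made-up training perplexity

print("kpis\ttrain_duration_card%s\t%s" % (card_num, fake_duration))
print("kpis\ttrain_ppl_card%s\t%s" % (card_num, fake_ppl))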
#### This file is only used for the continuous evaluation test!

import os
import sys
sys.path.insert(0, os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi

#### NOTE kpi.py should be shared across models in some way!!!!

train_ppl_card1_kpi = CostKpi('train_ppl_card1', 0.02, 0, actived=True)
train_duration_card1_kpi = DurationKpi(
    'train_duration_card1', 0.02, 0, actived=True)
#train_ppl_card4_kpi = CostKpi('train_ppl_card4', 0.02, 0, actived=True)
#train_duration_card4_kpi = DurationKpi(
#    'train_duration_card4', 0.02, 0, actived=True)

tracking_kpis = [
    train_ppl_card1_kpi,
    train_duration_card1_kpi,
]
def parse_log(log):
    '''
    This method should be implemented by model developers.

    The suggestion: each line in the log should be a key and a value
    separated by a tab, for example:

    "
    train_cost\t1.0
    test_cost\t1.0
    train_cost\t1.0
    train_cost\t1.0
    train_acc\t1.2
    "

    This implementation only extracts lines of the form
    "kpis\t<kpi_name>\t<kpi_value>".
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            print("-----%s" % fs)
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value
def log_to_ce(log):
    kpi_tracker = {}
    for kpi in tracking_kpis:
        kpi_tracker[kpi.name] = kpi

    for (kpi_name, kpi_value) in parse_log(log):
        print(kpi_name, kpi_value)
        kpi_tracker[kpi_name].add_record(kpi_value)
        kpi_tracker[kpi_name].persist()


if __name__ == '__main__':
    log = sys.stdin.read()
    print("*****")
    print(log)
    print("****")
    log_to_ce(log)
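For reference, the snippet below (log text and values made up) applies the same field-splitting rule as parse_log to a fragment of training output, showing which lines survive the `len(fs) == 3 and fs[0] == 'kpis'` filter:

# Illustration only: replicate parse_log's filtering on a made-up log fragment.
sample_log = ("epoch id 0\n"
              "kpis\ttrain_ppl_card1\t10.5\n"
              "kpis\ttrain_duration_card1\t95.3\n")

for line in sample_log.split('\n'):
    fs = line.strip().split('\t')
    if len(fs) == 3 and fs[0] == 'kpis':
        print(fs[1], float(fs[2]))
# Expected output:
#   train_ppl_card1 10.5
#   train_duration_card1 95.3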
@@ -113,5 +113,11 @@ def parse_args():
         default=False,
         help='Whether using gpu [True|False]')
 
+    parser.add_argument(
+        "--enable_ce",
+        action='store_true',
+        help="The flag indicating whether to run the task "
+        "for continuous evaluation.")
+
     args = parser.parse_args()
     return args
@@ -143,14 +143,14 @@ def raw_mono_data(vocab_file, file_path):
     return (test_src, test_tar)
 
 
-def get_data_iter(raw_data, batch_size, mode='train'):
+def get_data_iter(raw_data, batch_size, mode='train', enable_ce=False):
 
     src_data, tar_data = raw_data
 
     data_len = len(src_data)
 
     index = np.arange(data_len)
-    if mode == "train":
+    if mode == "train" and not enable_ce:
         np.random.shuffle(index)
 
     def to_pad_np(data, source=False):
......
@@ -56,6 +56,11 @@ def train():
     init_scale = args.init_scale
     max_grad_norm = args.max_grad_norm
     hidden_size = args.hidden_size
 
+    if args.enable_ce:
+        fluid.default_main_program().random_seed = 102
+        framework.default_startup_program().random_seed = 102
+
     # Training process
     if args.attention:
@@ -155,11 +160,17 @@ def train():
 
         return ppl
 
+    ce_time = []
+    ce_ppl = []
     max_epoch = args.max_epoch
     for epoch_id in range(max_epoch):
+        start_time = time.time()
         print("epoch id", epoch_id)
-        train_data_iter = reader.get_data_iter(train_data, batch_size)
+        if args.enable_ce:
+            train_data_iter = reader.get_data_iter(train_data, batch_size, enable_ce=True)
+        else:
+            train_data_iter = reader.get_data_iter(train_data, batch_size)
 
         total_loss = 0
         word_count = 0.0
@@ -177,8 +188,12 @@ def train():
 
             if batch_id > 0 and batch_id % 100 == 0:
                 print("ppl", batch_id, np.exp(total_loss / word_count))
+                ce_ppl.append(np.exp(total_loss / word_count))
                 total_loss = 0.0
                 word_count = 0.0
 
+        end_time = time.time()
+        time_gap = end_time - start_time
+        ce_time.append(time_gap)
         dir_name = args.model_path + "/epoch_" + str(epoch_id)
         print("begin to save", dir_name)
@@ -189,6 +204,28 @@ def train():
         test_ppl = eval(test_data)
         print("test ppl", test_ppl)
 
+    if args.enable_ce:
+        card_num = get_cards()
+        _ppl = 0
+        _time = 0
+        try:
+            _time = ce_time[-1]
+            _ppl = ce_ppl[-1]
+        except:
+            print("ce info error")
+        print("kpis\ttrain_duration_card%s\t%s" %
+              (card_num, _time))
+        print("kpis\ttrain_ppl_card%s\t%f" %
+              (card_num, _ppl))
+
+
+def get_cards():
+    num = 0
+    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+    if cards != '':
+        num = len(cards.split(","))
+    return num
+
+
 if __name__ == '__main__':
     train()