提交 a703e023 编写于 作者: Z zhengya01

add ce for ELMo

上级 3a4d6312
# Launch ELMo training with the fixed settings used for continuous
# evaluation (CE).
#
# Arguments given to train() are forwarded to train.py unchanged. They are
# inserted right after `--local True`, so the CE-specific flags
# (--enable_ce, --shuffle, --random_seed) still follow them on the command
# line. "$@" is quoted so that forwarded arguments containing spaces or glob
# characters reach train.py exactly as the caller wrote them.
#
# The data path patterns are single-quoted on purpose: the shell must not
# expand the `*`; the pattern itself is handed to train.py.
train() {
    python train.py \
        --train_path='data/train/sentence_file_*' \
        --test_path='data/dev/sentence_file_*' \
        --vocab_path data/vocabulary_min5k.txt \
        --learning_rate 0.2 \
        --use_gpu True \
        --all_train_tokens 35479 \
        --max_epoch 10 \
        --log_interval 5 \
        --dev_interval 20 \
        --local True "$@" \
        --enable_ce \
        --shuffle false \
        --random_seed 100
}
# Run the CE job once per device list: first with one visible GPU (0), then
# with four (0,1,2,3). The training output of each run is piped into _ce.py.
for devices in 0 0,1,2,3; do
    export CUDA_VISIBLE_DEVICES="$devices"
    train | python _ce.py
done
#### This file is only used for the continuous evaluation (CE) test!
import os
import sys
#sys.path.insert(0, os.environ['ceroot'])
sys.path.append('.')
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across the models in some way!
# KPIs recorded for the run that sees a single card.
# NOTE(review): the meaning of the positional arguments (0.02 / 0.06 and 0)
# is defined by kpi.py, which is not visible here -- confirm it there.
train_loss_card1_kpi = CostKpi('train_loss_card1', 0.02, 0, actived=True)
train_duration_card1_kpi = DurationKpi(
    'train_duration_card1',
    0.06,
    0,
    actived=True,
)

# KPIs recorded for the run that sees four cards.
train_loss_card4_kpi = CostKpi('train_loss_card4', 0.02, 0, actived=True)
train_duration_card4_kpi = DurationKpi(
    'train_duration_card4',
    0.06,
    0,
    actived=True,
)

# Every KPI this script can record; a KPI is found by its `name` attribute.
tracking_kpis = [
    train_loss_card1_kpi,
    train_duration_card1_kpi,
    train_loss_card4_kpi,
    train_duration_card4_kpi,
]
def parse_log(log):
    '''
    Yield the KPI records found in a captured training log.

    The log is split on newlines and each line is stripped and split on
    tabs. A line counts as a record only when it has exactly three fields
    and the first one is the literal "kpis", for example:
    "
    kpis\ttrain_loss_card1\t1.0
    kpis\ttrain_duration_card1\t12.5
    "
    Every line's field list is printed for debugging; matching lines are
    printed a second time with a "-----" prefix.

    Yields (kpi_name, kpi_value) tuples where kpi_value is a float.
    Raises ValueError if the third field of a record is not a number.
    '''
    for raw_line in log.split('\n'):
        fields = raw_line.strip().split('\t')
        print(fields)
        # Anything that is not a three-field "kpis" line is ordinary log
        # output and is ignored.
        if len(fields) != 3 or fields[0] != 'kpis':
            continue
        print("-----%s" % fields)
        _, kpi_name, raw_value = fields
        yield kpi_name, float(raw_value)
def log_to_ce(log):
    '''
    Record every KPI found in `log` on its tracked KPI object.

    Each (name, value) pair produced by parse_log() is printed, added to
    the KPI in `tracking_kpis` that has the same name, and that KPI is
    persisted straight away.

    Raises KeyError when the log reports a KPI name that is not tracked.
    '''
    trackers_by_name = {kpi.name: kpi for kpi in tracking_kpis}
    for kpi_name, kpi_value in parse_log(log):
        print(kpi_name, kpi_value)
        tracker = trackers_by_name[kpi_name]
        tracker.add_record(kpi_value)
        tracker.persist()
if __name__ == '__main__':
    # Read the complete training log from stdin, echo it between two marker
    # lines, then record the KPIs it contains.
    captured_log = sys.stdin.read()
    for chunk in ("*****", captured_log, "****"):
        print(chunk)
    log_to_ce(captured_log)
......@@ -29,6 +29,7 @@ import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor
import data
from args import *
from utils.cards import get_cards
import lm_model
import logging
......@@ -502,6 +503,7 @@ def train_loop(args,
n_batches_per_epoch = int(args.all_train_tokens / n_tokens_per_batch)
n_batches_total = args.max_epoch * n_batches_per_epoch
begin_time = time.time()
ce_info = []
for batch_id, batch_list in enumerate(train_reader(), 1):
if batch_id > n_batches_total:
break
......@@ -549,6 +551,7 @@ def train_loop(args,
"[train] step:{}, loss:{:.3f}, ppl:{:.3f}, smoothed_ppl:{:.3f}, speed:{:.3f}".
format(batch_id, n_batch_loss / n_batch_cnt, ppl, smoothed_ppl,
speed))
ce_info.append([n_batch_loss / n_batch_cnt, used_time])
n_batch_loss = 0.0
n_batch_cnt = 0
begin_time = time.time()
......@@ -564,6 +567,21 @@ def train_loop(args,
fluid.io.save_persistables(
executor=exe, dirname=model_path, main_program=train_prog)
if args.enable_ce:
card_num = get_cards()
ce_loss = 0
ce_time = 0
try:
ce_loss = ce_info[-2][0]
ce_time = ce_info[-2][1]
except:
print("ce info error")
print("kpis\ttrain_duration_card%s\t%s" %
(card_num, ce_time))
print("kpis\ttrain_loss_card%s\t%f" %
(card_num, ce_loss))
end_time = time.time()
total_time += end_time - start_time
epoch_id = int(batch_id / n_batches_per_epoch)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
def get_cards():
    """
    Return the number of GPU cards listed in CUDA_VISIBLE_DEVICES.

    The variable is read as a comma-separated list. Empty or
    whitespace-only entries are not counted, so a trailing comma
    ("0,1,") or a blank value does not inflate the result.

    Returns:
        int: number of non-empty entries; 0 when the variable is unset
        or contains no entries.
    """
    visible = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    return len([card for card in visible.split(',') if card.strip()])
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册