From a703e023a1552878fcadf9ee6d50f47c272eeb66 Mon Sep 17 00:00:00 2001
From: zhengya01
Date: Thu, 11 Jul 2019 10:06:24 +0000
Subject: [PATCH] add ce for ELMo

---
 ELMo/.run_ce.sh        | 22 +++++++++++++++
 ELMo/__init__.py       |  0
 ELMo/_ce.py            | 64 ++++++++++++++++++++++++++++++++++++++++++
 ELMo/train.py          | 18 ++++++++++++
 ELMo/utils/__init__.py |  0
 ELMo/utils/cards.py    | 28 ++++++++++++++++++
 6 files changed, 132 insertions(+)
 create mode 100755 ELMo/.run_ce.sh
 create mode 100644 ELMo/__init__.py
 create mode 100644 ELMo/_ce.py
 create mode 100644 ELMo/utils/__init__.py
 create mode 100644 ELMo/utils/cards.py

diff --git a/ELMo/.run_ce.sh b/ELMo/.run_ce.sh
new file mode 100755
index 0000000..d5bce67
--- /dev/null
+++ b/ELMo/.run_ce.sh
@@ -0,0 +1,22 @@
+train() {
+python train.py \
+--train_path='data/train/sentence_file_*' \
+--test_path='data/dev/sentence_file_*' \
+--vocab_path data/vocabulary_min5k.txt \
+--learning_rate 0.2 \
+--use_gpu True \
+--all_train_tokens 35479 \
+--max_epoch 10 \
+--log_interval 5 \
+--dev_interval 20 \
+--local True "$@" \
+--enable_ce \
+--shuffle false \
+--random_seed 100
+}
+
+export CUDA_VISIBLE_DEVICES=0
+train | python _ce.py
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+train | python _ce.py
diff --git a/ELMo/__init__.py b/ELMo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ELMo/_ce.py b/ELMo/_ce.py
new file mode 100644
index 0000000..ca8dc09
--- /dev/null
+++ b/ELMo/_ce.py
@@ -0,0 +1,64 @@
+####this file is only used for continuous evaluation test!
+
+import os
+import sys
+#sys.path.insert(0, os.environ['ceroot'])
+sys.path.append('.')
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE kpi.py should be shared in models in some way!!!!
+
+train_loss_card1_kpi = CostKpi('train_loss_card1', 0.02, 0, actived=True)
+train_duration_card1_kpi = DurationKpi(
+    'train_duration_card1', 0.06, 0, actived=True)
+train_loss_card4_kpi = CostKpi('train_loss_card4', 0.02, 0, actived=True)
+train_duration_card4_kpi = DurationKpi(
+    'train_duration_card4', 0.06, 0, actived=True)
+
+tracking_kpis = [
+    train_loss_card1_kpi,
+    train_duration_card1_kpi,
+    train_loss_card4_kpi,
+    train_duration_card4_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+    Only tab-separated lines with exactly three fields are parsed:
+    the literal tag kpis, the KPI name, and a float value, for example:
+    "
+    kpis\ttrain_loss_card1\t1.0
+    kpis\ttrain_duration_card1\t1.0
+    kpis\ttrain_loss_card4\t1.0
+    kpis\ttrain_duration_card4\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/ELMo/train.py b/ELMo/train.py
index 1bd8f36..1e455bf 100755
--- a/ELMo/train.py
+++ b/ELMo/train.py
@@ -29,6 +29,7 @@ import paddle.fluid.framework as framework
 from paddle.fluid.executor import Executor
 import data
 from args import *
+from utils.cards import get_cards
 import lm_model
 import logging
 
@@ -502,6 +503,7 @@ def train_loop(args,
     n_batches_per_epoch = int(args.all_train_tokens / n_tokens_per_batch)
     n_batches_total = args.max_epoch * n_batches_per_epoch
     begin_time = time.time()
+    ce_info = []
     for batch_id, batch_list in enumerate(train_reader(), 1):
         if batch_id > n_batches_total:
             break
@@ -549,6 +551,7 @@ def train_loop(args,
                 "[train] step:{}, loss:{:.3f}, ppl:{:.3f}, smoothed_ppl:{:.3f}, speed:{:.3f}".
                 format(batch_id, n_batch_loss / n_batch_cnt, ppl, smoothed_ppl,
                        speed))
+            ce_info.append([n_batch_loss / n_batch_cnt, used_time])
             n_batch_loss = 0.0
             n_batch_cnt = 0
             begin_time = time.time()
@@ -564,6 +567,21 @@ def train_loop(args,
             fluid.io.save_persistables(
                 executor=exe, dirname=model_path, main_program=train_prog)
 
+    if args.enable_ce:
+        card_num = get_cards()
+        ce_loss = 0
+        ce_time = 0
+        try:
+            ce_loss = ce_info[-2][0]
+            ce_time = ce_info[-2][1]
+        except IndexError:
+            print("ce info error")
+        print("kpis\ttrain_duration_card%s\t%s" %
+              (card_num, ce_time))
+        print("kpis\ttrain_loss_card%s\t%f" %
+              (card_num, ce_loss))
+
+
     end_time = time.time()
     total_time += end_time - start_time
     epoch_id = int(batch_id / n_batches_per_epoch)
diff --git a/ELMo/utils/__init__.py b/ELMo/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ELMo/utils/cards.py b/ELMo/utils/cards.py
new file mode 100644
index 0000000..9ba9aa6
--- /dev/null
+++ b/ELMo/utils/cards.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+def get_cards():
+    """
+    Return the number of comma-separated entries in CUDA_VISIBLE_DEVICES (0 if unset or empty).
+    """
+    num = 0
+    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+    if cards != '':
+        num = len(cards.split(","))
+    return num
+
+
-- 
GitLab