diff --git a/ELMo/.run_ce.sh b/ELMo/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d5bce67b7cd414f2d7be70258e5090f93731cab2
--- /dev/null
+++ b/ELMo/.run_ce.sh
@@ -0,0 +1,24 @@
+# Continuous-evaluation (CE) run: train with a fixed seed and shuffling off,
+# piping the training log into _ce.py, first on 1 GPU and then on 4 GPUs.
+train() {
+python train.py \
+--train_path='data/train/sentence_file_*' \
+--test_path='data/dev/sentence_file_*' \
+--vocab_path data/vocabulary_min5k.txt \
+--learning_rate 0.2 \
+--use_gpu True \
+--all_train_tokens 35479 \
+--max_epoch 10 \
+--log_interval 5 \
+--dev_interval 20 \
+--local True "$@" \
+--enable_ce \
+--shuffle false \
+--random_seed 100
+}
+
+export CUDA_VISIBLE_DEVICES=0
+train | python _ce.py
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+train | python _ce.py
diff --git a/ELMo/__init__.py b/ELMo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ELMo/_ce.py b/ELMo/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..44f1527feab45cb345f27489dbb4019f65254867
--- /dev/null
+++ b/ELMo/_ce.py
@@ -0,0 +1,66 @@
+####this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.insert(0, os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE kpi.py should be shared in models in some way!!!!
+
+train_loss_card1_kpi = CostKpi('train_loss_card1', 0.005, 0, actived=True)
+train_duration_card1_kpi = DurationKpi(
+    'train_duration_card1', 0.01, 0, actived=True)
+train_loss_card4_kpi = CostKpi('train_loss_card4', 0.01, 0, actived=True)
+train_duration_card4_kpi = DurationKpi(
+    'train_duration_card4', 0.01, 0, actived=True)
+
+tracking_kpis = [
+    train_loss_card1_kpi,
+    train_duration_card1_kpi,
+    train_loss_card4_kpi,
+    train_duration_card4_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    Yield (kpi_name, kpi_value) pairs found in a training log.
+
+    Only lines with exactly three tab-separated fields whose first field is
+    'kpis' are used; the third field is converted to float. For example:
+
+    "
+    kpis\ttrain_loss_card1\t1.0
+    kpis\ttrain_duration_card1\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    '''
+    Add every KPI record parsed from log to its tracker and persist it.
+
+    Raises KeyError when the log names a KPI missing from tracking_kpis.
+    '''
+    kpi_tracker = {kpi.name: kpi for kpi in tracking_kpis}
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/ELMo/train.py b/ELMo/train.py
index 1bd8f36eb3c63d619d3f093a35754c48d8cc5a0d..1e455bff1293c6bec046592038c2109dc057c943 100755
--- a/ELMo/train.py
+++ b/ELMo/train.py
@@ -29,6 +29,7 @@ import paddle.fluid.framework as framework
 from paddle.fluid.executor import Executor
 import data
 from args import *
+from utils.cards import get_cards
 import lm_model
 import logging
 
@@ -502,6 +503,7 @@ def train_loop(args,
     n_batches_per_epoch = int(args.all_train_tokens / n_tokens_per_batch)
     n_batches_total = args.max_epoch * n_batches_per_epoch
     begin_time = time.time()
+    ce_info = []
     for batch_id, batch_list in enumerate(train_reader(), 1):
         if batch_id > n_batches_total:
             break
@@ -549,6 +551,7 @@ def train_loop(args,
                 "[train] step:{}, loss:{:.3f}, ppl:{:.3f}, smoothed_ppl:{:.3f}, speed:{:.3f}".
                 format(batch_id, n_batch_loss / n_batch_cnt, ppl, smoothed_ppl,
                        speed))
+            ce_info.append([n_batch_loss / n_batch_cnt, used_time])
             n_batch_loss = 0.0
             n_batch_cnt = 0
             begin_time = time.time()
@@ -564,6 +567,22 @@ def train_loop(args,
             fluid.io.save_persistables(
                 executor=exe, dirname=model_path, main_program=train_prog)
 
+    if args.enable_ce:
+        card_num = get_cards()
+        ce_loss = 0
+        ce_time = 0
+        try:
+            ce_loss = ce_info[-2][0]
+            ce_time = ce_info[-2][1]
+        except IndexError:
+            # fewer than two intervals were logged; the zero defaults are reported
+            print("ce info error")
+        print("kpis\ttrain_duration_card%s\t%s" %
+              (card_num, ce_time))
+        print("kpis\ttrain_loss_card%s\t%f" %
+              (card_num, ce_loss))
+
+
     end_time = time.time()
     total_time += end_time - start_time
     epoch_id = int(batch_id / n_batches_per_epoch)
diff --git a/ELMo/utils/__init__.py b/ELMo/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ELMo/utils/cards.py b/ELMo/utils/cards.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ba9aa6d2ee81eebfc8c02bdef5d50dff7d96f6e
--- /dev/null
+++ b/ELMo/utils/cards.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+
+def get_cards():
+    """
+    Return the number of GPU cards listed in CUDA_VISIBLE_DEVICES.
+
+    Entries are comma-separated and blank ones (e.g. after a trailing comma)
+    are not counted; the result is 0 when the variable is unset or empty.
+    """
+    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+    return len([card for card in cards.split(',') if card.strip()])