From a703e023a1552878fcadf9ee6d50f47c272eeb66 Mon Sep 17 00:00:00 2001
From: zhengya01 <zhengya01@baidu.com>
Date: Thu, 11 Jul 2019 10:06:24 +0000
Subject: [PATCH] add ce for ELMo

---
 ELMo/.run_ce.sh        | 22 +++++++++++++++
 ELMo/__init__.py       |  0
 ELMo/_ce.py            | 64 ++++++++++++++++++++++++++++++++++++++++++
 ELMo/train.py          | 18 ++++++++++++
 ELMo/utils/__init__.py |  0
 ELMo/utils/cards.py    | 28 ++++++++++++++++++
 6 files changed, 132 insertions(+)
 create mode 100755 ELMo/.run_ce.sh
 create mode 100644 ELMo/__init__.py
 create mode 100644 ELMo/_ce.py
 create mode 100644 ELMo/utils/__init__.py
 create mode 100644 ELMo/utils/cards.py

diff --git a/ELMo/.run_ce.sh b/ELMo/.run_ce.sh
new file mode 100755
index 0000000..d5bce67
--- /dev/null
+++ b/ELMo/.run_ce.sh
@@ -0,0 +1,22 @@
+train() {  # run ELMo training with the fixed CE settings; extra args are forwarded to train.py
+python  train.py \
+--train_path='data/train/sentence_file_*'  \
+--test_path='data/dev/sentence_file_*'  \
+--vocab_path data/vocabulary_min5k.txt \
+--learning_rate 0.2 \
+--use_gpu True \
+--all_train_tokens 35479 \
+--max_epoch 10 \
+--log_interval 5 \
+--dev_interval 20 \
+--local True "$@" \
+--enable_ce \
+--shuffle false \
+--random_seed 100
+}
+
+export CUDA_VISIBLE_DEVICES=0  # one visible card: train.py prints the *_card1 KPIs
+train | python _ce.py  # _ce.py reads the training log from stdin
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3  # four visible cards: train.py prints the *_card4 KPIs
+train | python _ce.py
diff --git a/ELMo/__init__.py b/ELMo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ELMo/_ce.py b/ELMo/_ce.py
new file mode 100644
index 0000000..ca8dc09
--- /dev/null
+++ b/ELMo/_ce.py
@@ -0,0 +1,64 @@
+# This file is only used for the continuous evaluation (CE) test.
+
+import os
+import sys
+#sys.path.insert(0, os.environ['ceroot'])
+sys.path.append('.')
+from kpi import CostKpi, DurationKpi, AccKpi
+
+# NOTE: kpi.py should be shared across the models in some way.
+# Each KPI name below must match a "kpis\t<name>\t<value>" line printed by train.py.
+train_loss_card1_kpi = CostKpi('train_loss_card1', 0.02, 0, actived=True)
+train_duration_card1_kpi = DurationKpi(
+    'train_duration_card1', 0.06, 0, actived=True)
+train_loss_card4_kpi = CostKpi('train_loss_card4', 0.02, 0, actived=True)
+train_duration_card4_kpi = DurationKpi(
+    'train_duration_card4', 0.06, 0, actived=True)
+
+tracking_kpis = [  # looked up by name in log_to_ce()
+        train_loss_card1_kpi,
+        train_duration_card1_kpi,
+        train_loss_card4_kpi,
+        train_duration_card4_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    Extract KPI records from the raw training log.
+
+    Only lines made of exactly three tab-separated fields whose first
+    field is "kpis" are used, for example:
+    "
+    kpis\ttrain_loss_card1\t1.0
+    kpis\ttrain_duration_card1\t1.2
+    "
+    Yields one (kpi_name, kpi_value) pair, value as float, per such line.
+    '''
+    for raw_line in log.split('\n'):
+        fields = raw_line.strip().split('\t')
+        print(fields)
+        if len(fields) != 3 or fields[0] != 'kpis':
+            continue
+        print("-----%s" % fields)
+        _, kpi_name, raw_value = fields
+        yield kpi_name, float(raw_value)
+
+
+def log_to_ce(log):
+    '''Feed each KPI value parsed from `log` to its tracked KPI, then persist it.'''
+    kpi_tracker = {kpi.name: kpi for kpi in tracking_kpis}
+
+    for kpi_name, kpi_value in parse_log(log):
+        print(kpi_name, kpi_value)
+        tracked = kpi_tracker[kpi_name]
+        tracked.add_record(kpi_value)
+        tracked.persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()  # the whole log is read from stdin before parsing
+    print("*****")
+    print(log)  # echo the raw log between the two marker lines
+    print("****")
+    log_to_ce(log)
diff --git a/ELMo/train.py b/ELMo/train.py
index 1bd8f36..1e455bf 100755
--- a/ELMo/train.py
+++ b/ELMo/train.py
@@ -29,6 +29,7 @@ import paddle.fluid.framework as framework
 from paddle.fluid.executor import Executor
 import data
 from args import *
+from utils.cards import get_cards
 import lm_model
 import logging
 
@@ -502,6 +503,7 @@ def train_loop(args,
     n_batches_per_epoch = int(args.all_train_tokens / n_tokens_per_batch)
     n_batches_total = args.max_epoch * n_batches_per_epoch
     begin_time = time.time()
+    ce_info = []
     for batch_id, batch_list in enumerate(train_reader(), 1):
         if batch_id > n_batches_total:
             break
@@ -549,6 +551,7 @@ def train_loop(args,
                 "[train] step:{}, loss:{:.3f}, ppl:{:.3f}, smoothed_ppl:{:.3f}, speed:{:.3f}".
                 format(batch_id, n_batch_loss / n_batch_cnt, ppl, smoothed_ppl,
                        speed))
+            ce_info.append([n_batch_loss / n_batch_cnt, used_time])
             n_batch_loss = 0.0
             n_batch_cnt = 0
             begin_time = time.time()
@@ -564,6 +567,21 @@ def train_loop(args,
             fluid.io.save_persistables(
                 executor=exe, dirname=model_path, main_program=train_prog)
 
+    if args.enable_ce:
+        card_num = get_cards()
+        ce_loss = 0
+        ce_time = 0
+        try:  # report the second-to-last logged interval; needs >= 2 entries
+            ce_loss = ce_info[-2][0]
+            ce_time = ce_info[-2][1]
+        except IndexError:
+            print("ce info error")
+        print("kpis\ttrain_duration_card%s\t%s" %
+            (card_num, ce_time))
+        print("kpis\ttrain_loss_card%s\t%f" %
+            (card_num, ce_loss))
+
+
     end_time = time.time()
     total_time += end_time - start_time
     epoch_id = int(batch_id / n_batches_per_epoch)
diff --git a/ELMo/utils/__init__.py b/ELMo/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ELMo/utils/cards.py b/ELMo/utils/cards.py
new file mode 100644
index 0000000..9ba9aa6
--- /dev/null
+++ b/ELMo/utils/cards.py
@@ -0,0 +1,28 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+def get_cards():
+    """
+    Return the number of GPU cards listed in CUDA_VISIBLE_DEVICES.
+
+    Returns 0 when the variable is unset or empty. Blank entries, such as
+    the one left by a trailing comma in "0,1,", are not counted.
+    """
+    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+    return len([card for card in cards.split(',') if card.strip()])
+
+
-- 
GitLab