set ce flag for language_model

0c847657 · guochaorong · 3812d044 · 0c847657 · 0c847657 · 0c847657
Showing 3 changed files with 57 additions and 25 deletions

fluid/language_model/.run_ce.sh fluid/language_model/.run_ce.sh +1 -1

fluid/language_model/train.py fluid/language_model/train.py +39 -19

fluid/language_model/utils.py fluid/language_model/utils.py +17 -5

未找到文件。
--- a/fluid/language_model/.run_ce.sh
+++ b/fluid/language_model/.run_ce.sh
@@ -8,7 +8,7 @@ export CUDA_VISIBLE_DEVICES=$cudaid

 FLAGS_benchmark=true  python train.py | python _ce.py

-cudaid=${language_model_m:=0,1,2,3} # use 0-th card as default
+cudaid=${language_model_m:=0,1,2,3} # use 0,1,2,3 card as default
 export CUDA_VISIBLE_DEVICES=$cudaid

 FLAGS_benchmark=true  python train.py | python _ce.py
--- a/fluid/language_model/train.py
+++ b/fluid/language_model/train.py
@@ -4,14 +4,25 @@ import time

 import numpy as np
 import math
-
+import argparse
 import paddle.fluid as fluid
 import paddle

 import utils

-# random seed must set before configuring the network.
-fluid.default_startup_program().random_seed = 102
+SEED = 102
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("language_model benchmark.")
+    parser.add_argument(
+        '--enable_ce',
+        action='store_true',
+        help='If set, run \
+        the task with continuous evaluation logs.')
+    args = parser.parse_args()
+    return args
+

 def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
    """ network definition """
@@ -66,6 +77,11 @@ def train(train_reader,
          init_low_bound=-0.04,
          init_high_bound=0.04):
    """ train network """
+
+    args = parse_args()
+    if args.enable_ce:
+        # random seed must set before configuring the network.
+        fluid.default_startup_program().random_seed = SEED
    vocab_size = len(vocab)

    #Input data
@@ -77,7 +93,7 @@ def train(train_reader,
    # Train program
    avg_cost = None
    cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
-                       init_low_bound, init_high_bound)
+                   init_low_bound, init_high_bound)
    avg_cost = fluid.layers.mean(x=cost)

    # Optimization to minimize lost
@@ -97,7 +113,7 @@ def train(train_reader,
    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)

    total_time = 0.0
-    fetch_list=[avg_cost.name]
+    fetch_list = [avg_cost.name]
    for pass_idx in xrange(pass_num):
        epoch_idx = pass_idx + 1
        print "epoch_%d start" % epoch_idx
@@ -111,12 +127,11 @@ def train(train_reader,
                map(lambda x: x[0], data), place)
            lod_dst_wordseq = utils.to_lodtensor(
                map(lambda x: x[1], data), place)
-            ret_avg_cost = train_exe.run(
-                                   feed={
-                                       "src_wordseq": lod_src_wordseq,
-                                       "dst_wordseq": lod_dst_wordseq
-                                   },
-                                   fetch_list=fetch_list)
+            ret_avg_cost = train_exe.run(feed={
+                "src_wordseq": lod_src_wordseq,
+                "dst_wordseq": lod_dst_wordseq
+            },
+                                         fetch_list=fetch_list)
            avg_ppl = np.exp(ret_avg_cost[0])
            newest_ppl = np.mean(avg_ppl)
            if i % 100 == 0:
@@ -124,39 +139,44 @@ def train(train_reader,

        t1 = time.time()
        total_time += t1 - t0
-        print "epoch:%d num_steps:%d time_cost(s):%f" % (
-            epoch_idx, i, total_time / epoch_idx)
+        print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i,
+                                                         total_time / epoch_idx)

-        if pass_idx == pass_num - 1:
+        if pass_idx == pass_num - 1 and args.enable_ce:
            #Note: The following logs are special for CE monitoring.
            #Other situations do not need to care about these logs.
            gpu_num = get_cards()
            if gpu_num == 1:
-                print("kpis	imikolov_20_pass_duration	%s" % (total_time / epoch_idx))
+                print("kpis	imikolov_20_pass_duration	%s" %
+                      (total_time / epoch_idx))
                print("kpis	imikolov_20_avg_ppl	%s" % newest_ppl)
            else:
                print("kpis	imikolov_20_pass_duration_card%s	%s" % \
                                (gpu_num, total_time / epoch_idx))
-                print("kpis	imikolov_20_avg_ppl_card%s	%s" % (gpu_num, newest_ppl))
+                print("kpis	imikolov_20_avg_ppl_card%s	%s" %
+                      (gpu_num, newest_ppl))
        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        feed_var_names = ["src_wordseq", "dst_wordseq"]
        fetch_vars = [avg_cost]
-        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars,
-                                      exe)
+        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
        print("model saved in %s" % save_dir)

    print("finish training")

+
 def get_cards():
    cards = os.environ.get('CUDA_VISIBLE_DEVICES')
    num = len(cards.split(","))
    return num

+
 def train_net():
    """ do training """
    batch_size = 20
+    args = parse_args()
    vocab, train_reader, test_reader = utils.prepare_data(
-        batch_size=batch_size * get_cards(), buffer_size=1000, word_freq_threshold=0)
+        batch_size=batch_size * get_cards(), buffer_size=1000, \
+        word_freq_threshold=0, enable_ce = args.enable_ce)
    train(
        train_reader=train_reader,
        vocab=vocab,

--- a/fluid/language_model/utils.py
+++ b/fluid/language_model/utils.py
@@ -5,6 +5,7 @@ import numpy as np
 import paddle.fluid as fluid
 import paddle

+
 def to_lodtensor(data, place):
    """ convert to LODtensor """
    seq_lens = [len(seq) for seq in data]
@@ -21,17 +22,28 @@ def to_lodtensor(data, place):
    return res


-def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0):
+def prepare_data(batch_size,
+                 buffer_size=1000,
+                 word_freq_threshold=0,
+                 enable_ce=False):
    """ prepare the English Pann Treebank (PTB) data """
    vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold)
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
+    if enable_ce:
+        train_reader = paddle.batch(
            paddle.dataset.imikolov.train(
                vocab,
                buffer_size,
                data_type=paddle.dataset.imikolov.DataType.SEQ),
-            buf_size=buffer_size),
-        batch_size)
+            batch_size)
+    else:
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.imikolov.train(
+                    vocab,
+                    buffer_size,
+                    data_type=paddle.dataset.imikolov.DataType.SEQ),
+                buf_size=buffer_size),
+            batch_size)
    test_reader = paddle.batch(
        paddle.dataset.imikolov.test(
            vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ),