Add ce to CycleGAN (#2807)

* Update mnist_dygraph.py to fix a bug * add multi-card support for se_resnext * add some description to readme.md * add ce for cyclegan * fix code style * add ce for ptb_lm

Add ce to CycleGAN (#2807)
* Update mnist_dygraph.py to fix a bug * add multi-card support for se_resnext * add some description to readme.md * add ce for cyclegan * fix code style * add ce for ptb_lm
4b5d8b42 · Divano · Hongyu Liu · 0f4ef113 · 4b5d8b42 · 4b5d8b42
7 changed files
--- a/dygraph/cycle_gan/.run_ce.sh
+++ b/dygraph/cycle_gan/.run_ce.sh
+#!/bin/bash
+# This file is only used for continuous evaluation.
+# dygraph single card
+# NOTE(review): presumably requests deterministic cuDNN kernels so runs are
+# reproducible -- confirm against the Paddle flag documentation.
+export FLAGS_cudnn_deterministic=True
+# Expose only GPU 0 to the training process (single-card run).
+export CUDA_VISIBLE_DEVICES=0
+# Pipe the training stdout into _ce.py, which reads the whole log from stdin
+# and records the tab-separated "kpis" lines it finds.
+python train.py --ce --epoch 1 | python _ce.py
--- a/dygraph/cycle_gan/_ce.py
+++ b/dygraph/cycle_gan/_ce.py
+#### This file is only used for continuous evaluation tests!
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+#### NOTE: kpi.py should be shared across models in some way!
+# One CostKpi per CycleGAN loss. The first argument is the KPI name and must
+# match the name field of the "kpis" lines in the training log, because
+# log_to_ce looks records up by kpi.name.
+# NOTE(review): the meaning of the positional 0.3 and 0 arguments is defined
+# in kpi.py, which is not visible here -- confirm before changing them.
+g_loss = CostKpi('g_loss', 0.3, 0, actived=True, desc="g loss")
+g_A_loss = CostKpi('g_A_loss', 0.3, 0, actived=True, desc="g A loss")
+g_B_loss = CostKpi('g_B_loss', 0.3, 0, actived=True, desc="g B loss")
+d_A_loss = CostKpi('d_A_loss', 0.3, 0, actived=True, desc="d A loss")
+d_B_loss = CostKpi('d_B_loss', 0.3, 0, actived=True, desc="d B loss")
+# Every KPI that log_to_ce is allowed to receive records for.
+tracking_kpis = [g_loss, g_A_loss, g_B_loss,
+                 d_A_loss, d_B_loss]
+def parse_log(log):
+    '''
+    Yield (kpi_name, kpi_value) pairs extracted from a training log.
+
+    The log text is split on newlines; each line is stripped and then split
+    on tab characters. Only lines that have exactly three fields and whose
+    first field is the literal tag "kpis" are treated as KPI records, for
+    example (<TAB> stands for a tab character):
+    "
+    kpis<TAB>g_loss<TAB>1.000
+    kpis<TAB>d_A_loss<TAB>0.250
+    "
+    The second field is yielded as the KPI name and the third field is
+    converted with float(), so a non-numeric third field raises ValueError.
+    All other lines are ignored. The split fields of every line are also
+    printed to stdout as debugging output.
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        # Debug echo of every line's fields, matched or not.
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+def log_to_ce(log):
+    '''
+    Record every KPI value found in `log` on the matching tracked KPI.
+
+    Builds a name -> KPI lookup from `tracking_kpis`, then for each
+    (name, value) pair yielded by parse_log calls add_record(value) followed
+    by persist() on the KPI with that name.
+
+    Raises KeyError if the log contains a "kpis" record whose name is not in
+    `tracking_kpis`.
+    '''
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        # persist() is invoked after every single record, not once at the end.
+        kpi_tracker[kpi_name].persist()
+if __name__ == '__main__':
+    # Read the complete training log from stdin (the script is meant to sit
+    # at the end of a pipe), so nothing is processed until the producer exits.
+    log = sys.stdin.read()
+    # Echo the raw log between marker lines before extracting the KPIs.
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
--- a/dygraph/cycle_gan/train.py
+++ b/dygraph/cycle_gan/train.py
@@ -17,6 +17,7 @@ from trainer import *
 from paddle.fluid.dygraph.base import to_variable
 import six
 parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument("--ce", action="store_true", help="run ce")
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('batch_size',        int,   1,          "Minibatch size.")
@@ -26,6 +27,7 @@ add_arg('init_model',        str,   None,       "The init model file of director
 add_arg('save_checkpoints',  bool,  True,       "Whether to save checkpoints.")
 # yapf: enable
 lambda_A = 10.0
 lambda_B = 10.0
 lambda_identity = 0.5
@@ -51,10 +53,17 @@ def train(args):
        shuffle = True
        data_shape = [-1] + data_reader.image_shape()
        print(data_shape)
+        if args.ce:
+            print("ce mode")
+            seed = 33
+            random.seed(seed)
+            np.random.seed(seed)
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            shuffle = False
        A_pool = ImagePool()
        B_pool = ImagePool()
        A_reader = paddle.batch(
            data_reader.a_reader(shuffle=shuffle), args.batch_size)()
        B_reader = paddle.batch(
@@ -154,6 +163,14 @@ def train(args):
                losses[1].append(d_loss_A[0])
                sys.stdout.flush()
                batch_id += 1
+                if args.ce and batch_id == 500:
+                    print("kpis\tg_loss\t%0.3f" % g_loss_out[0])
+                    print("kpis\tg_A_loss\t%0.3f" % g_A_loss.numpy()[0])
+                    print("kpis\tg_B_loss\t%0.3f" % g_B_loss.numpy()[0])
+                    print("kpis\td_A_loss\t%0.3f" % d_loss_A.numpy()[0])
+                    print("kpis\td_B_loss\t%0.3f" % d_loss_B.numpy()[0])
+                    break
            if args.save_checkpoints:
                fluid.dygraph.save_persistables(cycle_gan.state_dict(),args.output+"/checkpoints/{}".format(epoch))

--- a/dygraph/ptb_lm/.run_ce.sh
+++ b/dygraph/ptb_lm/.run_ce.sh
+#!/bin/bash
+# This file is only used for continuous evaluation.
+# dygraph single card
+# NOTE(review): presumably requests deterministic cuDNN kernels so runs are
+# reproducible -- confirm against the Paddle flag documentation.
+export FLAGS_cudnn_deterministic=True
+# Expose only GPU 0 to the training process (single-card run).
+export CUDA_VISIBLE_DEVICES=0
+# Pipe the training stdout into _ce.py, which reads the whole log from stdin
+# and records the tab-separated "kpis" lines it finds.
+python ptb_dy.py --data_path data/simple-examples/data/ \
+               --ce --model_type small | python _ce.py
--- a/dygraph/ptb_lm/_ce.py
+++ b/dygraph/ptb_lm/_ce.py
+#### This file is only used for continuous evaluation tests!
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+#### NOTE: kpi.py should be shared across models in some way!
+# Perplexity KPIs. The first argument is the KPI name and must match the name
+# field of the "kpis" lines in the training log, because log_to_ce looks
+# records up by kpi.name.
+# NOTE(review): perplexity is normally a lower-is-better metric, yet it is
+# tracked with AccKpi here rather than CostKpi -- confirm against kpi.py that
+# this is intended. The meaning of the positional 3 and 0 arguments is also
+# defined in kpi.py, which is not visible here.
+train_ppl = AccKpi('train_ppl', 3, 0, actived=True, desc="train ppl")
+test_ppl = AccKpi('test_ppl', 3, 0, actived=True, desc='test ppl')
+# Disabled speed KPI, kept for reference; it is not part of tracking_kpis.
+#train_speed_kpi = DurationKpi(
+#    'train_speed',
+#    0.05,
+#    0,
+#    actived=True,
+#    unit_repr='seconds/image',
+#    desc='train speed in one GPU card')
+# Every KPI that log_to_ce is allowed to receive records for.
+tracking_kpis = [train_ppl, test_ppl]
+def parse_log(log):
+    '''
+    Yield (kpi_name, kpi_value) pairs extracted from a training log.
+
+    The log text is split on newlines; each line is stripped and then split
+    on tab characters. Only lines that have exactly three fields and whose
+    first field is the literal tag "kpis" are treated as KPI records, for
+    example (<TAB> stands for a tab character):
+    "
+    kpis<TAB>train_ppl<TAB>123.456
+    kpis<TAB>test_ppl<TAB>130.002
+    "
+    The second field is yielded as the KPI name and the third field is
+    converted with float(), so a non-numeric third field raises ValueError.
+    All other lines are ignored. The split fields of every line are also
+    printed to stdout as debugging output.
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        # Debug echo of every line's fields, matched or not.
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+def log_to_ce(log):
+    '''
+    Record every KPI value found in `log` on the matching tracked KPI.
+
+    Builds a name -> KPI lookup from `tracking_kpis`, then for each
+    (name, value) pair yielded by parse_log calls add_record(value) followed
+    by persist() on the KPI with that name.
+
+    Raises KeyError if the log contains a "kpis" record whose name is not in
+    `tracking_kpis`.
+    '''
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        # persist() is invoked after every single record, not once at the end.
+        kpi_tracker[kpi_name].persist()
+if __name__ == '__main__':
+    # Read the complete training log from stdin (the script is meant to sit
+    # at the end of a pipe), so nothing is processed until the producer exits.
+    log = sys.stdin.read()
+    # Echo the raw log between marker lines before extracting the KPIs.
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
--- a/dygraph/ptb_lm/args.py
+++ b/dygraph/ptb_lm/args.py
@@ -40,6 +40,6 @@ def parse_args():
    parser.add_argument(
        '--log_path',
        help='path of the log file. If not set, logs are printed to console')
-    parser.add_argument('--enable_ce', action='store_true')
+    parser.add_argument('--ce', action='store_true', help="run ce")
    args = parser.parse_args()
    return args
--- a/dygraph/ptb_lm/ptb_dy.py
+++ b/dygraph/ptb_lm/ptb_dy.py
@@ -292,6 +292,13 @@ def train_ptb_lm():
        return
    with fluid.dygraph.guard(core.CUDAPlace(0)):
+        if args.ce:
+            print("ce mode")
+            seed = 33
+            np.random.seed(seed)
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            max_epoch = 1
        ptb_model = PtbModel(
            "ptb_model",
            hidden_size=hidden_size,
@@ -315,7 +322,7 @@ def train_ptb_lm():
        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
-        log_interval = total_batch_size // 10
+        log_interval = total_batch_size // 20
        bd = []
        lr_arr = [1.0]
@@ -361,6 +368,8 @@ def train_ptb_lm():
            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])
+            if args.ce:
+                print("kpis\ttest_ppl\t%0.3f" % ppl[0])
        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
        for epoch_id in range(max_epoch):
@@ -407,6 +416,8 @@ def train_ptb_lm():
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("ppl ", epoch_id, ppl[0])
+            if args.ce:
+                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])
        eval(ptb_model, test_data)