From 4b5d8b426ef35c863fba4db9741f2ca104db96bf Mon Sep 17 00:00:00 2001
From: Divano
Date: Wed, 17 Jul 2019 15:25:18 +0800
Subject: [PATCH] Add ce to CycleGAN (#2807)

* Update mnist_dygraph.py: fix bug

* add multi-card support for se_resnext

* add some description to readme.md

* add ce for cyclegan

* fix code style

* add ce for ptb_lm
---
 dygraph/cycle_gan/.run_ce.sh |  8 +++++
 dygraph/cycle_gan/_ce.py     | 62 ++++++++++++++++++++++++++++++++++
 dygraph/cycle_gan/train.py   | 19 ++++++++++-
 dygraph/ptb_lm/.run_ce.sh    |  9 +++++
 dygraph/ptb_lm/_ce.py        | 65 ++++++++++++++++++++++++++++++++++++
 dygraph/ptb_lm/args.py       |  2 +-
 dygraph/ptb_lm/ptb_dy.py     | 13 +++++++-
 7 files changed, 175 insertions(+), 3 deletions(-)
 create mode 100755 dygraph/cycle_gan/.run_ce.sh
 create mode 100644 dygraph/cycle_gan/_ce.py
 create mode 100755 dygraph/ptb_lm/.run_ce.sh
 create mode 100644 dygraph/ptb_lm/_ce.py

diff --git a/dygraph/cycle_gan/.run_ce.sh b/dygraph/cycle_gan/.run_ce.sh
new file mode 100755
index 00000000..cfae7788
--- /dev/null
+++ b/dygraph/cycle_gan/.run_ce.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# This file is only used for continuous evaluation.
+# dygraph single card
+export FLAGS_cudnn_deterministic=True
+export CUDA_VISIBLE_DEVICES=0
+python train.py --ce --epoch 1 | python _ce.py
+
diff --git a/dygraph/cycle_gan/_ce.py b/dygraph/cycle_gan/_ce.py
new file mode 100644
index 00000000..1d2d4b89
--- /dev/null
+++ b/dygraph/cycle_gan/_ce.py
@@ -0,0 +1,62 @@
+#### This file is only used for continuous evaluation tests!
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE: kpi.py should be shared across models in some way!
+
+g_loss = CostKpi('g_loss', 0.3, 0, actived=True, desc="g loss")
+g_A_loss = CostKpi('g_A_loss', 0.3, 0, actived=True, desc="g A loss")
+g_B_loss = CostKpi('g_B_loss', 0.3, 0, actived=True, desc="g B loss")
+d_A_loss = CostKpi('d_A_loss', 0.3, 0, actived=True, desc="d A loss")
+d_B_loss = CostKpi('d_B_loss', 0.3, 0, actived=True, desc="d B loss")
+tracking_kpis = [g_loss, g_A_loss, g_B_loss,
+                 d_A_loss, d_B_loss]
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    The suggestion:
+
+    each line in the log should be a tab-separated record of the form
+    'kpis\t<key>\t<value>', for example:
+
+    "
+    kpis\ttrain_cost\t1.0
+    kpis\ttest_cost\t1.0
+    kpis\ttrain_cost\t1.0
+    kpis\ttrain_cost\t1.0
+    kpis\ttrain_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/dygraph/cycle_gan/train.py b/dygraph/cycle_gan/train.py
index ab6b4b88..ff74979e 100644
--- a/dygraph/cycle_gan/train.py
+++ b/dygraph/cycle_gan/train.py
@@ -17,6 +17,7 @@ from trainer import *
 from paddle.fluid.dygraph.base import to_variable
 import six
 parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument("--ce", action="store_true", help="run ce")
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('batch_size', int, 1, "Minibatch size.")
@@ -26,6 +27,7 @@ add_arg('init_model', str, None, "The init model file of director
 add_arg('save_checkpoints', bool, True, "Whether to save checkpoints.")
 # yapf: enable
 
+
 lambda_A = 10.0
 lambda_B = 10.0
 lambda_identity = 0.5
@@ -51,10 +53,17 @@ def train(args):
     shuffle = True
     data_shape = [-1] + data_reader.image_shape()
     print(data_shape)
+    if args.ce:
+        print("ce mode")
+        seed = 33
+        random.seed(seed)
+        np.random.seed(seed)
+        fluid.default_startup_program().random_seed = seed
+        fluid.default_main_program().random_seed = seed
+        shuffle = False
 
     A_pool = ImagePool()
     B_pool = ImagePool()
-
     A_reader = paddle.batch(
         data_reader.a_reader(shuffle=shuffle), args.batch_size)()
     B_reader = paddle.batch(
@@ -154,6 +163,14 @@ def train(args):
             losses[1].append(d_loss_A[0])
             sys.stdout.flush()
             batch_id += 1
+            if args.ce and batch_id == 500:
+                print("kpis\tg_loss\t%0.3f" % g_loss_out[0])
+                print("kpis\tg_A_loss\t%0.3f" % g_A_loss.numpy()[0])
+                print("kpis\tg_B_loss\t%0.3f" % g_B_loss.numpy()[0])
+                print("kpis\td_A_loss\t%0.3f" % d_loss_A.numpy()[0])
+                print("kpis\td_B_loss\t%0.3f" % d_loss_B.numpy()[0])
+                break
+
         if args.save_checkpoints:
             fluid.dygraph.save_persistables(cycle_gan.state_dict(),args.output+"/checkpoints/{}".format(epoch))
diff --git a/dygraph/ptb_lm/.run_ce.sh b/dygraph/ptb_lm/.run_ce.sh
new file mode 100755
index 00000000..2476c8ba
--- /dev/null
+++ b/dygraph/ptb_lm/.run_ce.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# This file is only used for continuous evaluation.
+# dygraph single card
+export FLAGS_cudnn_deterministic=True
+export CUDA_VISIBLE_DEVICES=0
+python ptb_dy.py --data_path data/simple-examples/data/ \
+    --ce --model_type small | python _ce.py
+
diff --git a/dygraph/ptb_lm/_ce.py b/dygraph/ptb_lm/_ce.py
new file mode 100644
index 00000000..46cd850e
--- /dev/null
+++ b/dygraph/ptb_lm/_ce.py
@@ -0,0 +1,65 @@
+#### This file is only used for continuous evaluation tests!
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE: kpi.py should be shared across models in some way!
+
+train_ppl = AccKpi('train_ppl', 3, 0, actived=True, desc="train ppl")
+test_ppl = AccKpi('test_ppl', 3, 0, actived=True, desc='test ppl')
+#train_speed_kpi = DurationKpi(
+#    'train_speed',
+#    0.05,
+#    0,
+#    actived=True,
+#    unit_repr='seconds/image',
+#    desc='train speed in one GPU card')
tracking_kpis = [train_ppl, test_ppl]
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    The suggestion:
+
+    each line in the log should be a tab-separated record of the form
+    'kpis\t<key>\t<value>', for example:
+
+    "
+    kpis\ttrain_cost\t1.0
+    kpis\ttest_cost\t1.0
+    kpis\ttrain_cost\t1.0
+    kpis\ttrain_cost\t1.0
+    kpis\ttrain_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/dygraph/ptb_lm/args.py b/dygraph/ptb_lm/args.py
index 1c6957b2..294373bd 100644
--- a/dygraph/ptb_lm/args.py
+++ b/dygraph/ptb_lm/args.py
@@ -40,6 +40,6 @@ def parse_args():
     parser.add_argument(
         '--log_path',
         help='path of the log file. If not set, logs are printed to console')
-    parser.add_argument('--enable_ce', action='store_true')
+    parser.add_argument('--ce', action='store_true', help="run ce")
     args = parser.parse_args()
     return args
diff --git a/dygraph/ptb_lm/ptb_dy.py b/dygraph/ptb_lm/ptb_dy.py
index f93a8008..3722a1ce 100644
--- a/dygraph/ptb_lm/ptb_dy.py
+++ b/dygraph/ptb_lm/ptb_dy.py
@@ -292,6 +292,13 @@ def train_ptb_lm():
         return
 
     with fluid.dygraph.guard(core.CUDAPlace(0)):
+        if args.ce:
+            print("ce mode")
+            seed = 33
+            np.random.seed(seed)
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            max_epoch = 1
         ptb_model = PtbModel(
             "ptb_model",
             hidden_size=hidden_size,
@@ -315,7 +322,7 @@ def train_ptb_lm():
         batch_len = len(train_data) // batch_size
         total_batch_size = (batch_len - 1) // num_steps
 
-        log_interval = total_batch_size // 10
+        log_interval = total_batch_size // 20
 
         bd = []
         lr_arr = [1.0]
@@ -361,6 +368,8 @@ def train_ptb_lm():
             print("eval finished")
             ppl = np.exp(total_loss / iters)
             print("ppl ", batch_id, ppl[0])
+            if args.ce:
+                print("kpis\ttest_ppl\t%0.3f" % ppl[0])
 
         grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
         for epoch_id in range(max_epoch):
@@ -407,6 +416,8 @@ def train_ptb_lm():
             print("time cost ", time.time() - start_time)
             ppl = np.exp(total_loss / iters)
             print("ppl ", epoch_id, ppl[0])
+            if args.ce:
+                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])
 
             eval(ptb_model, test_data)
--
GitLab
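Note on the CE hookup above: the contract between train.py / ptb_dy.py and _ce.py is just a line protocol on stdout. The training script prints tab-separated records of the form kpis\t<name>\t<value>, .run_ce.sh pipes stdout into _ce.py, and _ce.py picks out the well-formed records and hands them to the KPI trackers. Below is a minimal, self-contained sketch of that round trip; SimpleKpi is a hypothetical stand-in for the CostKpi/AccKpi classes, which live in the CE repo pointed to by $ceroot and additionally persist records and compare them against stored baselines.

# Minimal sketch of the CE log protocol used in this patch.
# SimpleKpi is a hypothetical stand-in for kpi.CostKpi / kpi.AccKpi.

sample_log = "\n".join([
    "epoch 0 batch 500 ...",   # ordinary training output, ignored
    "kpis\tg_loss\t1.234",     # well-formed KPI record
    "kpis\td_A_loss\t0.567",
])


def parse_log(log):
    # Yield (name, value) for every line of the form 'kpis\t<name>\t<value>'.
    for line in log.split("\n"):
        fields = line.strip().split("\t")
        if len(fields) == 3 and fields[0] == "kpis":
            yield fields[1], float(fields[2])


class SimpleKpi(object):
    # Hypothetical stand-in: only collects values; the real Kpi classes
    # also persist them and check them against a tolerance/baseline.
    def __init__(self, name):
        self.name = name
        self.records = []

    def add_record(self, value):
        self.records.append(value)


trackers = {n: SimpleKpi(n) for n in ("g_loss", "d_A_loss")}
for name, value in parse_log(sample_log):
    trackers[name].add_record(value)

assert trackers["g_loss"].records == [1.234]
assert trackers["d_A_loss"].records == [0.567]

This is also why the patch sets FLAGS_cudnn_deterministic, fixes the random seeds, and disables shuffling in --ce mode: the recorded values only make sense as regression signals if repeated CE runs produce comparable numbers.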