diff --git a/dygraph/cycle_gan/.run_ce.sh b/dygraph/cycle_gan/.run_ce.sh new file mode 100755 index 0000000000000000000000000000000000000000..cfae7788f4ccf6659877d11ea6b51e1373be2452 --- /dev/null +++ b/dygraph/cycle_gan/.run_ce.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# This file is only used for continuous evaluation. +# dygraph single card +export FLAGS_cudnn_deterministic=True +export CUDA_VISIBLE_DEVICES=0 +python train.py --ce --epoch 1 | python _ce.py + diff --git a/dygraph/cycle_gan/_ce.py b/dygraph/cycle_gan/_ce.py new file mode 100644 index 0000000000000000000000000000000000000000..1d2d4b89b85579c3443a1847528aabb76da7d0ba --- /dev/null +++ b/dygraph/cycle_gan/_ce.py @@ -0,0 +1,62 @@ +#### This file is only used for the continuous evaluation test! +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi, DurationKpi, AccKpi + +#### NOTE: kpi.py should be shared across models in some way! + +g_loss = CostKpi('g_loss', 0.3, 0, actived=True, desc="g loss") +g_A_loss = CostKpi('g_A_loss', 0.3, 0, actived=True, desc="g A loss") +g_B_loss = CostKpi('g_B_loss', 0.3, 0, actived=True, desc="g B loss") +d_A_loss = CostKpi('d_A_loss', 0.3, 0, actived=True, desc="d A loss") +d_B_loss = CostKpi('d_B_loss', 0.3, 0, actived=True, desc="d B loss") +tracking_kpis = [g_loss, g_A_loss, g_B_loss, + d_A_loss, d_B_loss] + +def parse_log(log): + ''' + Yield (kpi_name, float kpi_value) pairs parsed from the training log text.
+ + A line is a KPI record only if it has exactly three tab-separated fields and the first is kpis. + + All other lines are printed and otherwise ignored. Example KPI records: + + " + kpis\tg_loss\t1.0 + kpis\tg_A_loss\t1.0 + kpis\tg_B_loss\t1.0 + kpis\td_A_loss\t1.0 + kpis\td_B_loss\t1.0 + " + ''' + for line in log.split('\n'): + fs = line.strip().split('\t') + print(fs) + if len(fs) == 3 and fs[0] == 'kpis': + print("-----%s" % fs) + kpi_name = fs[1] + kpi_value = float(fs[2]) + yield kpi_name, kpi_value + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + + for (kpi_name, kpi_value) in parse_log(log): + print(kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + print("*****") + print(log) + print("****") + log_to_ce(log) diff --git a/dygraph/cycle_gan/train.py b/dygraph/cycle_gan/train.py index ab6b4b882c9c074cbe0edfc10d269106d6323894..ff74979e25f537a648ba6a788f1387fd32d6a2eb 100644 --- a/dygraph/cycle_gan/train.py +++ b/dygraph/cycle_gan/train.py @@ -17,6 +17,7 @@ from trainer import * from paddle.fluid.dygraph.base import to_variable import six parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument("--ce", action="store_true", help="run ce") add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('batch_size', int, 1, "Minibatch size.") @@ -26,6 +27,7 @@ add_arg('init_model', str, None, "The init model file of director add_arg('save_checkpoints', bool, True, "Whether to save checkpoints.") # yapf: enable + lambda_A = 10.0 lambda_B = 10.0 lambda_identity = 0.5 @@ -51,10 +53,17 @@ def train(args): shuffle = True data_shape = [-1] + data_reader.image_shape() print(data_shape) + if args.ce: + print("ce mode") + seed = 33 + random.seed(seed) + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + shuffle = False A_pool = ImagePool() B_pool = ImagePool() - A_reader = paddle.batch(
data_reader.a_reader(shuffle=shuffle), args.batch_size)() B_reader = paddle.batch( @@ -154,6 +163,14 @@ def train(args): losses[1].append(d_loss_A[0]) sys.stdout.flush() batch_id += 1 + if args.ce and batch_id == 500: + print("kpis\tg_loss\t%0.3f" % g_loss_out[0]) + print("kpis\tg_A_loss\t%0.3f" % g_A_loss.numpy()[0]) + print("kpis\tg_B_loss\t%0.3f" % g_B_loss.numpy()[0]) + print("kpis\td_A_loss\t%0.3f" % d_loss_A.numpy()[0]) + print("kpis\td_B_loss\t%0.3f" % d_loss_B.numpy()[0]) + break + if args.save_checkpoints: fluid.dygraph.save_persistables(cycle_gan.state_dict(),args.output+"/checkpoints/{}".format(epoch)) diff --git a/dygraph/ptb_lm/.run_ce.sh b/dygraph/ptb_lm/.run_ce.sh new file mode 100755 index 0000000000000000000000000000000000000000..2476c8ba2b884ba91ea64aa5c7f8d35714a5d339 --- /dev/null +++ b/dygraph/ptb_lm/.run_ce.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# This file is only used for continuous evaluation. +# dygraph single card +export FLAGS_cudnn_deterministic=True +export CUDA_VISIBLE_DEVICES=0 +python ptb_dy.py --data_path data/simple-examples/data/ \ + --ce --model_type small | python _ce.py + diff --git a/dygraph/ptb_lm/_ce.py b/dygraph/ptb_lm/_ce.py new file mode 100644 index 0000000000000000000000000000000000000000..46cd850e2c114f5f47517f41d19872647c76cdfc --- /dev/null +++ b/dygraph/ptb_lm/_ce.py @@ -0,0 +1,65 @@ +#### This file is only used for the continuous evaluation test! +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi, DurationKpi, AccKpi + +#### NOTE: kpi.py should be shared across models in some way!
+# Perplexity is lower-is-better, so it is tracked as a cost (CostKpi) rather than as an accuracy (AccKpi). +train_ppl = CostKpi('train_ppl', 3, 0, actived=True, desc="train ppl") +test_ppl = CostKpi('test_ppl', 3, 0, actived=True, desc='test ppl') +#train_speed_kpi = DurationKpi( +# 'train_speed', +# 0.05, +# 0, +# actived=True, +# unit_repr='seconds/image', +# desc='train speed in one GPU card') +tracking_kpis = [train_ppl, test_ppl] + +def parse_log(log): + ''' + Yield (kpi_name, float kpi_value) pairs parsed from the training log text. + + A line is a KPI record only if it has exactly three tab-separated fields and the first is kpis. + + All other lines are printed and otherwise ignored. Example KPI records: + + " + kpis\ttrain_ppl\t1.0 + kpis\ttest_ppl\t1.0 + kpis\ttrain_ppl\t1.0 + kpis\ttest_ppl\t1.0 + kpis\ttrain_ppl\t1.2 + " + ''' + for line in log.split('\n'): + fs = line.strip().split('\t') + print(fs) + if len(fs) == 3 and fs[0] == 'kpis': + print("-----%s" % fs) + kpi_name = fs[1] + kpi_value = float(fs[2]) + yield kpi_name, kpi_value + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + + for (kpi_name, kpi_value) in parse_log(log): + print(kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + print("*****") + print(log) + print("****") + log_to_ce(log) diff --git a/dygraph/ptb_lm/args.py b/dygraph/ptb_lm/args.py index 1c6957b2bf90eb45bf009e022e7ee27cc4742f4a..294373bd012aadd9421d9bdfa67bb6059c6d839f 100644 --- a/dygraph/ptb_lm/args.py +++ b/dygraph/ptb_lm/args.py @@ -40,6 +40,6 @@ def parse_args(): parser.add_argument( '--log_path', help='path of the log file.
If not set, logs are printed to console') - parser.add_argument('--enable_ce', action='store_true') + parser.add_argument('--ce', action='store_true', help="run ce") args = parser.parse_args() return args diff --git a/dygraph/ptb_lm/ptb_dy.py b/dygraph/ptb_lm/ptb_dy.py index f93a80085be5704d3ed970f05475c136005e73de..3722a1cebd2947b9dfccd73bd304aa4c6add1cf7 100644 --- a/dygraph/ptb_lm/ptb_dy.py +++ b/dygraph/ptb_lm/ptb_dy.py @@ -292,6 +292,13 @@ def train_ptb_lm(): return with fluid.dygraph.guard(core.CUDAPlace(0)): + if args.ce: + print("ce mode") + seed = 33 + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + max_epoch = 1 ptb_model = PtbModel( "ptb_model", hidden_size=hidden_size, @@ -315,7 +322,7 @@ def train_ptb_lm(): batch_len = len(train_data) // batch_size total_batch_size = (batch_len - 1) // num_steps - log_interval = total_batch_size // 10 + log_interval = total_batch_size // 20 bd = [] lr_arr = [1.0] @@ -361,6 +368,8 @@ def train_ptb_lm(): print("eval finished") ppl = np.exp(total_loss / iters) print("ppl ", batch_id, ppl[0]) + if args.ce: + print("kpis\ttest_ppl\t%0.3f" % ppl[0]) grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm) for epoch_id in range(max_epoch): @@ -407,6 +416,8 @@ def train_ptb_lm(): print("time cost ", time.time() - start_time) ppl = np.exp(total_loss / iters) print("ppl ", epoch_id, ppl[0]) + if args.ce: + print("kpis\ttrain_ppl\t%0.3f" % ppl[0]) eval(ptb_model, test_data)