diff --git a/dygraph/mnist/.run_ce.sh b/dygraph/mnist/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..cfae7788f4ccf6659877d11ea6b51e1373be2452
--- /dev/null
+++ b/dygraph/mnist/.run_ce.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# This file is only used for continuous evaluation.
+# dygraph single card
+export FLAGS_cudnn_deterministic=True
+export CUDA_VISIBLE_DEVICES=0
+python train.py --ce --epoch 1 | python _ce.py
+
diff --git a/dygraph/mnist/README.md b/dygraph/mnist/README.md
index 9b0bffaa275efa3c1b005b1d7ea4759fc7a9168d..d9cbb99ca9add2ce75cf998356122e4a083c655a 100644
--- a/dygraph/mnist/README.md
+++ b/dygraph/mnist/README.md
@@ -15,11 +15,11 @@
 ## Training
 The tutorial uses the `paddle.dataset.mnist` dataset as training data; training can be started as follows:
 ```
-env CUDA_VISIBLE_DEVICES=0 python mnist_dygraph.py
+env CUDA_VISIBLE_DEVICES=0 python train.py
 ```
 Paddle dygraph supports multi-process, multi-GPU model training, which can be launched with:
 ```
-python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog mnist_dygraph.py --use_data_parallel 1
+python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train.py --use_data_parallel 1
 ```
 The program will then write each process's output log to the `./mylog` directory:
 ```
diff --git a/dygraph/mnist/_ce.py b/dygraph/mnist/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..aaa34525862308b616ff71d06d82097ca8140f32
--- /dev/null
+++ b/dygraph/mnist/_ce.py
@@ -0,0 +1,65 @@
+#### This file is only used for continuous evaluation tests!
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE: kpi.py should be shared across models in some way!
+
+test_acc = AccKpi('test_acc', 0.001, 0, actived=True, desc="test acc")
+test_cost = CostKpi('test_cost', 0.001, 0, actived=True, desc='test cost')
+#train_speed_kpi = DurationKpi(
+#    'train_speed',
+#    0.05,
+#    0,
+#    actived=True,
+#    unit_repr='seconds/image',
+#    desc='train speed in one GPU card')
+tracking_kpis = [test_acc, test_cost]
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    The suggestion:
+
+    each line in the log should be key, value, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/dygraph/mnist/mnist_dygraph.py b/dygraph/mnist/train.py
similarity index 93%
rename from dygraph/mnist/mnist_dygraph.py
rename to dygraph/mnist/train.py
index d80a307c70618ed743bde11be62082b347a9957e..f8f095c5680b1f93cd9ec5767c798c50cfcc1f66 100644
--- a/dygraph/mnist/mnist_dygraph.py
+++ b/dygraph/mnist/train.py
@@ -32,6 +32,8 @@ def parse_args():
         type=ast.literal_eval,
         default=False,
         help="The flag indicating whether to shuffle instances in each pass.")
+    parser.add_argument("-e", "--epoch", default=5, type=int, help="set epoch")
+    parser.add_argument("--ce", action="store_true", help="run ce")
     args = parser.parse_args()
     return args
 
@@ -170,13 +172,20 @@ def inference_mnist():
 
 
 def train_mnist(args):
-    epoch_num = 5
+    epoch_num = args.epoch
     BATCH_SIZE = 64
     trainer_count = fluid.dygraph.parallel.Env().nranks
     place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
         if args.use_data_parallel else fluid.CUDAPlace(0)
     with fluid.dygraph.guard(place):
+        if args.ce:
+            print("ce mode")
+            seed = 33
+            np.random.seed(seed)
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
         if args.use_data_parallel:
             strategy = fluid.dygraph.parallel.prepare_context()
         mnist = MNIST("mnist")
@@ -226,6 +235,9 @@ def train_mnist(args):
             mnist.eval()
             test_cost, test_acc = test_mnist(test_reader, mnist, BATCH_SIZE)
             mnist.train()
+            if args.ce:
+                print("kpis\ttest_acc\t%s" % test_acc)
+                print("kpis\ttest_cost\t%s" % test_cost)
             print("Loss at epoch {} , Test avg_loss is: {}, acc is: {}".format(
                 epoch, test_cost, test_acc))
 
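For reference, the CE hook added above is a plain text protocol: `train.py --ce` prints lines of the form `kpis\t<name>\t<value>` to stdout, `.run_ce.sh` pipes that output into `_ce.py`, and `parse_log` extracts the `(name, value)` pairs for the KPI trackers. The standalone sketch below replays just the parsing step on a hypothetical log fragment (the numbers are illustrative, not real training output) and does not depend on the internal `kpi` module:

```python
# Minimal sketch of the CE log protocol; the sample log and its values are
# made up for illustration and are not real training results.
sample_log = "\n".join([
    "Loss at epoch 0 , Test avg_loss is: 0.05, acc is: 0.98",  # ordinary log line, ignored
    "kpis\ttest_acc\t0.98",   # picked up by _ce.py
    "kpis\ttest_cost\t0.05",  # picked up by _ce.py
])


def parse_log(log):
    """Yield (kpi_name, kpi_value) pairs from 'kpis\\t<name>\\t<value>' lines,
    mirroring the parser in dygraph/mnist/_ce.py."""
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        if len(fs) == 3 and fs[0] == 'kpis':
            yield fs[1], float(fs[2])


if __name__ == '__main__':
    for name, value in parse_log(sample_log):
        print(name, value)  # -> test_acc 0.98 / test_cost 0.05
```

In the full pipeline, `_ce.py` then hands each value to the matching `AccKpi`/`CostKpi` tracker via `add_record` and `persist`, which is how the continuous-evaluation system records the metrics for later comparison.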