未验证 提交 83b367d7 编写于 作者: D Divano 提交者: GitHub

add dygraph mnist CE (#2453)

* add ce for dygraph mnist

* add ce for dygraph mnist

* del mnist_dygraph.py

* change mnist_dygraph to train

* fix print style
上级 dbc27b84
#!/bin/bash
# This file is only used for continuous evaluation.
# dygraph single card
# NOTE(review): presumably asks cuDNN to use deterministic algorithms so that
# repeated CE runs produce comparable KPIs -- confirm against the Paddle flags docs.
export FLAGS_cudnn_deterministic=True
# Expose only GPU 0 to the training process (single-card run).
export CUDA_VISIBLE_DEVICES=0
# Train for one epoch in CE mode and pipe the training output into _ce.py.
python train.py --ce --epoch 1 | python _ce.py
......@@ -15,11 +15,11 @@
## 训练
教程中使用`paddle.dataset.mnist`数据集作为训练数据,可以通过如下的方式启动训练:
```
env CUDA_VISIBLE_DEVICES=0 python mnist_dygraph.py
env CUDA_VISIBLE_DEVICES=0 python train.py
```
Paddle动态图支持多进程多卡进行模型训练,启动训练的方式:
```
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog mnist_dygraph.py --use_data_parallel 1
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train.py --use_data_parallel 1
```
此时,程序会将每个进程的输出log导入到`./mylog`路径下:
```
......
#### This file is only used for continuous evaluation tests!
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across the models in some way.
# KPI trackers for the test accuracy and test cost reported by the training run.
# NOTE(review): the positional arguments 0.001 and 0 are presumably the allowed
# difference threshold and the number of leading records to skip -- confirm
# against kpi.py, which is not part of this file.
test_acc = AccKpi('test_acc', 0.001, 0, actived=True, desc="test acc")
test_cost = CostKpi('test_cost', 0.001, 0, actived=True, desc='test cost')
# Disabled training-speed KPI, kept for reference.
#train_speed_kpi = DurationKpi(
# 'train_speed',
# 0.05,
# 0,
# actived=True,
# unit_repr='seconds/image',
# desc='train speed in one GPU card')
# KPIs that log_to_ce() looks up by name when recording parsed log values.
tracking_kpis = [test_acc, test_cost]
def parse_log(log):
    '''
    Extract KPI records from the captured training log.

    A line is treated as a KPI record only when, after stripping surrounding
    whitespace, it consists of exactly three tab-separated fields and the
    first field is the literal marker "kpis", for example:
    "
    kpis<TAB>test_acc<TAB>0.97
    kpis<TAB>test_cost<TAB>0.08
    "
    Every other line is ignored (it is still echoed for debugging).

    Args:
        log: the whole log as a single string, lines separated by newlines.

    Yields:
        (kpi_name, kpi_value) tuples, where kpi_name is the second field as a
        string and kpi_value is the third field converted to float.

    Raises:
        ValueError: if the third field of a KPI record is not a valid float.
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        # Debug echo of every tokenized line, matched or not.
        print(fs)
        if len(fs) != 3 or fs[0] != 'kpis':
            continue
        print("-----%s" % fs)
        kpi_name = fs[1]
        kpi_value = float(fs[2])
        yield kpi_name, kpi_value
def log_to_ce(log):
    """Record every KPI value found in ``log`` on its tracker and persist it.

    The trackers listed in ``tracking_kpis`` are indexed by their ``name``;
    each (name, value) pair produced by ``parse_log`` is echoed, added to the
    tracker with that name and persisted right away. A KPI name that has no
    tracker raises ``KeyError``.
    """
    trackers_by_name = {tracker.name: tracker for tracker in tracking_kpis}
    for name, value in parse_log(log):
        print(name, value)
        tracker = trackers_by_name[name]
        tracker.add_record(value)
        tracker.persist()
if __name__ == '__main__':
    # The training log arrives on stdin; echo it between two marker lines
    # before extracting and recording the KPI values it contains.
    captured_log = sys.stdin.read()
    for chunk in ("*****", captured_log, "****"):
        print(chunk)
    log_to_ce(captured_log)
......@@ -32,6 +32,8 @@ def parse_args():
type=ast.literal_eval,
default=False,
help="The flag indicating whether to shuffle instances in each pass.")
parser.add_argument("-e", "--epoch", default=5, type=int, help="set epoch")
parser.add_argument("--ce", action="store_true", help="run ce")
args = parser.parse_args()
return args
......@@ -170,13 +172,20 @@ def inference_mnist():
def train_mnist(args):
epoch_num = 5
epoch_num = args.epoch
BATCH_SIZE = 64
trainer_count = fluid.dygraph.parallel.Env().nranks
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
if args.use_data_parallel else fluid.CUDAPlace(0)
with fluid.dygraph.guard(place):
if args.ce:
print("ce mode")
seed = 33
np.random.seed(seed)
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
if args.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
mnist = MNIST("mnist")
......@@ -226,6 +235,9 @@ def train_mnist(args):
mnist.eval()
test_cost, test_acc = test_mnist(test_reader, mnist, BATCH_SIZE)
mnist.train()
if args.ce:
print("kpis\ttest_acc\t%s" % test_acc)
print("kpis\ttest_cost\t%s" % test_cost)
print("Loss at epoch {} , Test avg_loss is: {}, acc is: {}".format(
epoch, test_cost, test_acc))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册