Unverified commit c25124db authored by Divano, committed by GitHub

add Resnet ce (#2502)

* add ce for dygraph mnist

* add ce for dygraph mnist

* del mnist_dygraph.py

* change mnist_dygraph to train

* fix print style

* add resnet

* fix ce bug

* fix ce description
Parent 4bb42e25
#!/bin/bash
# This file is only used for continuous evaluation.
# dygraph single card
export FLAGS_cudnn_deterministic=True
export CUDA_VISIBLE_DEVICES=0
python train.py --ce --epoch 1 --batch_size 128 | python _ce.py
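The pipe in the last line works because `train.py --ce` writes its metrics to stdout as tab-separated KPI records, which `_ce.py` (shown next) reads back from stdin. A minimal sketch of the producing side; the metric values here are made up for illustration:

# Illustration only: the "kpis\t<name>\t<value>" records train.py emits
# under --ce, and which _ce.py parses from stdin. Values are hypothetical.
print("kpis\ttrain_acc1\t%0.3f" % 0.113)
print("kpis\ttrain_loss\t%0.3f" % 6.542)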
#### This file is only used for continuous evaluation test!
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared among models in some way!
train_acc1 = AccKpi('train_acc1', 0.01, 0, actived=True, desc="train acc1")
train_acc5 = AccKpi('train_acc5', 0.01, 0, actived=True, desc="train acc5")
train_loss = CostKpi('train_loss', 0.01, 0, actived=True, desc="train loss")
test_acc1 = AccKpi('test_acc1', 0.01, 0, actived=True, desc='test acc1')
test_acc5 = AccKpi('test_acc5', 0.01, 0, actived=True, desc='test acc5')
test_loss = CostKpi('test_loss', 0.01, 0, actived=True, desc='test loss')
#train_speed_kpi = DurationKpi(
# 'train_speed',
# 0.05,
# 0,
# actived=True,
# unit_repr='seconds/image',
# desc='train speed in one GPU card')
tracking_kpis = [train_acc1, train_acc5, train_loss,
                 test_acc1, test_acc5, test_loss]
def parse_log(log):
    '''
    This method should be implemented by model developers.
    Each KPI record in the log must be a tab-separated line of the form
    "kpis\t<kpi_name>\t<value>", for example:
    "
    kpis\ttrain_loss\t1.0
    kpis\ttrain_acc1\t1.2
    "
    Lines that do not match this shape are ignored.
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            print("-----%s" % fs)
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value
def log_to_ce(log):
    kpi_tracker = {}
    for kpi in tracking_kpis:
        kpi_tracker[kpi.name] = kpi

    for (kpi_name, kpi_value) in parse_log(log):
        print(kpi_name, kpi_value)
        kpi_tracker[kpi_name].add_record(kpi_value)
        kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
    log = sys.stdin.read()
    print("*****")
    print(log)
    print("****")
    log_to_ce(log)
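As a quick offline check of the consumer side, one can feed `parse_log` a captured snippet directly; the log contents below are invented for illustration:

# Hypothetical worked example using the parse_log defined above.
sample = "kpis\ttrain_acc1\t0.113\nsome unrelated line\nkpis\ttrain_loss\t6.542"
for name, value in parse_log(sample):
    print(name, value)  # yields train_acc1 0.113, then train_loss 6.542
# parse_log also prints its own debug output for every line it inspects.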
@@ -26,8 +26,6 @@ from paddle.fluid import framework
 import math
 import sys
-batch_size = 32
-epoch = 120
 IMAGENET1000 = 1281167
 base_lr = 0.1
 momentum_rate = 0.9
@@ -35,18 +33,21 @@ l2_decay = 1e-4
 
 
 def parse_args():
-    parser = argparse.ArgumentParser("Training for Mnist.")
+    parser = argparse.ArgumentParser("Training for Resnet.")
     parser.add_argument(
         "--use_data_parallel",
         type=ast.literal_eval,
         default=False,
         help="The flag indicating whether to shuffle instances in each pass.")
+    parser.add_argument("-e", "--epoch", default=120, type=int, help="set epoch")
+    parser.add_argument("-b", "--batch_size", default=32, type=int, help="set batch size")
+    parser.add_argument("--ce", action="store_true", help="run ce")
     args = parser.parse_args()
     return args
 
 
 args = parse_args()
+batch_size = args.batch_size
 
 
 def optimizer_setting():
@@ -263,16 +264,28 @@ def eval(model, data):
             print("test | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \
                   (batch_id, total_loss / total_sample, \
                    total_acc1 / total_sample, total_acc5 / total_sample))
+    if args.ce:
+        print("kpis\ttest_acc1\t%0.3f" % (total_acc1 / total_sample))
+        print("kpis\ttest_acc5\t%0.3f" % (total_acc5 / total_sample))
+        print("kpis\ttest_loss\t%0.3f" % (total_loss / total_sample))
     print("final eval loss %0.3f acc1 %0.3f acc5 %0.3f" % \
           (total_loss / total_sample, \
           total_acc1 / total_sample, total_acc5 / total_sample))
 
 
 def train_resnet():
+    epoch = args.epoch
     trainer_count = fluid.dygraph.parallel.Env().nranks
     place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
         if args.use_data_parallel else fluid.CUDAPlace(0)
     with fluid.dygraph.guard(place):
+        if args.ce:
+            print("ce mode")
+            seed = 33
+            np.random.seed(seed)
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
         if args.use_data_parallel:
             strategy = fluid.dygraph.parallel.prepare_context()
@@ -340,24 +353,27 @@ def train_resnet():
                 optimizer.minimize(avg_loss)
                 resnet.clear_gradients()
-                framework._dygraph_tracer_._clear_ops()
 
                 total_loss += dy_out
                 total_acc1 += acc_top1.numpy()
                 total_acc5 += acc_top5.numpy()
                 total_sample += 1
 
                 #print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out))
                 if batch_id % 10 == 0:
                     print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \
                           (eop, batch_id, total_loss / total_sample, \
                            total_acc1 / total_sample, total_acc5 / total_sample))
 
+            if args.ce:
+                print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
+                print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
+                print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
             print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \
                   (eop, batch_id, total_loss / total_sample, \
                    total_acc1 / total_sample, total_acc5 / total_sample))
 
             resnet.eval()
             eval(resnet, test_reader)
 
+        fluid.dygraph.save_persistables(resnet.state_dict(), 'resnet_params')
 
 if __name__ == '__main__':
...