Unverified commit 7e82766e authored by Bai Yifan, committed by GitHub

Continuous Evaluation for image classification. (#1118)

* Continuous Evaluation for image classification.

* Add 4-card KPIs

* Update batch_size
Parent a66a83f7
# saved model
output/
# ImageNet (ILSVRC2012) data
data/ILSVRC2012/ILSVRC2012_img_val.tar
data/ILSVRC2012/ILSVRC2012_img_train.tar
data/ILSVRC2012/ImageNet_label.tgz
data/ILSVRC2012/train_list.txt
data/ILSVRC2012/val_list.txt
#!/bin/bash
# This file is only used for continuous evaluation.
# Single-card run.
cudaid=${object_detection_cudaid:=0}
export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=64 --num_passes=10 --total_images=6149 --enable_ce=True | python _ce.py

# 4-card run. Use a separate variable so the single-card default above does not
# shadow it, and a comma-separated list without spaces for CUDA_VISIBLE_DEVICES.
cudaid=${object_detection_cudaid_m:=0,1,2,3}
export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=64 --num_passes=10 --total_images=6149 --enable_ce=True | python _ce.py
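For a quick, standalone check of the pipe itself (illustrative only; like _ce.py below, it needs the CE framework's kpi.py importable via the ceroot environment variable), hand-written kpi lines can be fed straight into the evaluator:

# Hypothetical smoke test of the CE pipe; the values are made up.
printf 'kpis\ttrain_cost\t0.95\nkpis\ttrain_acc_top1\t0.62\n' | python _ce.py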
#### This file is only used for the continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across models in some way!
train_acc_top1_kpi = AccKpi('train_acc_top1', 0.05, 0, desc='TOP1 ACC')
train_acc_top5_kpi = AccKpi(
    'train_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC')
train_cost_kpi = CostKpi('train_cost', 0.3, 0, actived=True, desc='train cost')
test_acc_top1_kpi = AccKpi('test_acc_top1', 0.05, 0, desc='TOP1 ACC')
test_acc_top5_kpi = AccKpi(
    'test_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC')
test_cost_kpi = CostKpi('test_cost', 1.0, 0, actived=True, desc='test cost')
train_speed_kpi = AccKpi(
    'train_speed',
    0.05,
    0,
    actived=True,
    unit_repr='seconds/image',
    desc='train speed on one GPU card')
# NOTE: each KPI name must match the name printed by train.py exactly,
# otherwise log_to_ce() below raises a KeyError.
train_acc_top1_card4_kpi = AccKpi(
    'train_acc_top1_card4', 0.05, 0, desc='TOP1 ACC')
train_acc_top5_card4_kpi = AccKpi(
    'train_acc_top5_card4', 0.05, 0, actived=False, desc='TOP5 ACC')
train_cost_card4_kpi = CostKpi(
    'train_cost_card4', 0.3, 0, actived=True, desc='train cost')
test_acc_top1_card4_kpi = AccKpi(
    'test_acc_top1_card4', 0.05, 0, desc='TOP1 ACC')
test_acc_top5_card4_kpi = AccKpi(
    'test_acc_top5_card4', 0.05, 0, actived=False, desc='TOP5 ACC')
test_cost_card4_kpi = CostKpi(
    'test_cost_card4', 1.0, 0, actived=True, desc='test cost')
train_speed_card4_kpi = AccKpi(
    'train_speed_card4',
    0.05,
    0,
    actived=True,
    unit_repr='seconds/image',
    desc='train speed on four GPU cards')
tracking_kpis = [
    train_acc_top1_kpi, train_acc_top5_kpi, train_cost_kpi, test_acc_top1_kpi,
    test_acc_top5_kpi, test_cost_kpi, train_speed_kpi, train_acc_top1_card4_kpi,
    train_acc_top5_card4_kpi, train_cost_card4_kpi, test_acc_top1_card4_kpi,
    test_acc_top5_card4_kpi, test_cost_card4_kpi, train_speed_card4_kpi
]
def parse_log(log):
    '''
    This method should be implemented by model developers.

    Each KPI line in the log must be tab-separated as
    "kpis\t<kpi_name>\t<kpi_value>", for example:

        kpis\ttrain_cost\t1.0
        kpis\ttest_cost\t1.0
        kpis\ttrain_acc_top1\t1.2
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            print("-----%s" % fs)
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value
def log_to_ce(log):
    kpi_tracker = {}
    for kpi in tracking_kpis:
        kpi_tracker[kpi.name] = kpi

    for (kpi_name, kpi_value) in parse_log(log):
        print(kpi_name, kpi_value)
        kpi_tracker[kpi_name].add_record(kpi_value)
        kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
    log = sys.stdin.read()
    print("*****")
    print(log)
    print("****")
    log_to_ce(log)
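As a minimal illustration of the contract between train.py and this script (the snippet below is not part of the repository and the values are invented), parse_log only picks up tab-separated lines whose first field is the literal string kpis:

# Illustrative only; assumes the parse_log definition above is in scope.
sample_log = "plain log line\nkpis\ttrain_cost\t0.95\nkpis\ttrain_acc_top1\t0.62"
print(list(parse_log(sample_log)))
# [('train_cost', 0.95), ('train_acc_top1', 0.62)]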
......@@ -11,6 +11,7 @@ train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"dropout_seed": None,
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
......@@ -101,7 +102,9 @@ class SE_ResNeXt():
        pool = fluid.layers.pool2d(
            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
        drop = fluid.layers.dropout(x=pool, dropout_prob=0.5)
        # Do not set the seed for normal training; it is only used to make
        # continuous-evaluation (debug) runs reproducible.
        drop = fluid.layers.dropout(
            x=pool, dropout_prob=0.5, seed=self.params["dropout_seed"])
        stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
        out = fluid.layers.fc(input=drop,
                              size=class_dim,
......
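As a minimal sketch of why the seed is exposed (illustrative only; it relies on the same fluid.layers.dropout call used above), a fixed integer seed makes the dropout mask deterministic across runs, so two CE runs of the same commit report comparable training cost, while seed=None keeps the usual random behaviour:

import paddle.fluid as fluid

# Graph construction only; a fixed seed gives a reproducible dropout mask.
x = fluid.layers.data(name='x', shape=[8], dtype='float32')
drop = fluid.layers.dropout(x=x, dropout_prob=0.5, seed=10)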
......@@ -4,6 +4,7 @@ import time
import sys
import paddle
import paddle.fluid as fluid
import paddle.dataset.flowers as flowers
import models
import reader
import argparse
......@@ -28,6 +29,7 @@ add_arg('checkpoint', str, None, "Whether to resume chec
add_arg('lr', float, 0.1, "Set the learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('enable_ce', bool, False, "If set to True, enable the continuous evaluation job.")
# yapf: enable
model_list = [m for m in dir(models) if "__" not in m]
......@@ -100,6 +102,9 @@ def train(args):
    # model definition
    model = models.__dict__[model_name]()

    if args.enable_ce:
        # CE pins the model so that the KPI baselines stay comparable.
        assert model_name == "SE_ResNeXt50_32x4d"

    if model_name == "GoogleNet":
        out0, out1, out2 = model.net(input=image, class_dim=class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
......@@ -129,6 +134,8 @@ def train(args):
params["num_epochs"] = args.num_epochs
params["learning_strategy"]["batch_size"] = args.batch_size
params["learning_strategy"]["name"] = args.lr_strategy
if args.enable_ce:
params["dropout_seed"] = 10
# initialize optimizer
optimizer = optimizer_setting(params)
......@@ -137,6 +144,9 @@ def train(args):
    if with_memory_optimization:
        fluid.memory_optimize(fluid.default_main_program())

    if args.enable_ce:
        # Fix the parameter-initialization seed for continuous evaluation.
        fluid.default_startup_program().random_seed = 1000

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
......@@ -153,8 +163,20 @@ def train(args):
    train_batch_size = args.batch_size
    test_batch_size = 16
    train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
    test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
    if not args.enable_ce:
        train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
        test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
    else:
        # Use the flowers dataset for CE, with use_xmap=False so the data
        # order stays deterministic. This is slow; a smaller dataset would
        # be needed for faster CE runs.
        import random
        random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False), batch_size=train_batch_size)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    train_exe = fluid.ParallelExecutor(
......@@ -162,9 +184,12 @@ def train(args):
    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]

    gpu = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    gpu_nums = len(gpu.split(","))

    for pass_id in range(params["num_epochs"]):
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        for batch_id, data in enumerate(train_reader()):
            t1 = time.time()
            loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
......@@ -176,6 +201,7 @@ def train(args):
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            train_time.append(period)
            if batch_id % 10 == 0:
                print("Pass {0}, trainbatch {1}, loss {2}, \
                    acc1 {3}, acc5 {4} time {5}"
......@@ -187,6 +213,7 @@ def train(args):
        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        # Average seconds per image over the pass (train_time holds per-batch times).
        train_speed = np.array(train_time).mean() / train_batch_size

        cnt = 0
        for test_batch_id, data in enumerate(test_reader()):
            t1 = time.time()
......@@ -226,6 +253,36 @@ def train(args):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path)

        # This is for continuous evaluation only.
        if args.enable_ce and pass_id == args.num_epochs - 1:
            # The kpi lines are tab-separated so that parse_log() in _ce.py
            # can pick them out of the training log.
            if gpu_nums == 1:
                # Use the last pass's cost/acc for training.
                print("kpis\ttrain_cost\t%s" % train_loss)
                print("kpis\ttrain_acc_top1\t%s" % train_acc1)
                print("kpis\ttrain_acc_top5\t%s" % train_acc5)
                # Use the mean cost/acc over the test set.
                print("kpis\ttest_cost\t%s" % test_loss)
                print("kpis\ttest_acc_top1\t%s" % test_acc1)
                print("kpis\ttest_acc_top5\t%s" % test_acc5)
                print("kpis\ttrain_speed\t%s" % train_speed)
            else:
                # Use the last pass's cost/acc for training.
                print("kpis\ttrain_cost_card%s\t%s" % (gpu_nums, train_loss))
                print("kpis\ttrain_acc_top1_card%s\t%s" % (gpu_nums, train_acc1))
                print("kpis\ttrain_acc_top5_card%s\t%s" % (gpu_nums, train_acc5))
                # Use the mean cost/acc over the test set.
                print("kpis\ttest_cost_card%s\t%s" % (gpu_nums, test_loss))
                print("kpis\ttest_acc_top1_card%s\t%s" % (gpu_nums, test_acc1))
                print("kpis\ttest_acc_top5_card%s\t%s" % (gpu_nums, test_acc5))
                print("kpis\ttrain_speed_card%s\t%s" % (gpu_nums, train_speed))
def main():
    args = parser.parse_args()
......
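To make the naming convention concrete (the numbers are invented for illustration), a four-card CE pass ends with tab-separated stdout lines such as the following, which _ce.py then matches against the *_card4 KPIs defined earlier:

kpis	train_cost_card4	0.83
kpis	train_acc_top1_card4	0.71
kpis	test_acc_top5_card4	0.93
kpis	train_speed_card4	0.0042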