未验证 提交 7e82766e 编写于 作者: B Bai Yifan 提交者: GitHub

Continuous Evaluation for image classification. (#1118)

* Continuous Evaluation for image classification.

* add 4card kpis

* update batch_size
上级 a66a83f7
# saved model
output/
# ImageNet (ILSVRC2012) data
data/ILSVRC2012/ILSVRC2012_img_val.tar
data/ILSVRC2012/ILSVRC2012_img_train.tar
data/ILSVRC2012/ImageNet_label.tgz
data/ILSVRC2012/train_list.txt
data/ILSVRC2012/val_list.txt
#!/bin/bash
# This file is only used for continuous evaluation.

# Single-GPU run.
cudaid=${object_detection_cudaid:=0}
export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=64 --num_passes=10 --total_images=6149 --enable_ce=True | python _ce.py

# Four-GPU run.
# BUG FIX: the original reused ${object_detection_cudaid:=0, 1, 2, 3}; since
# ":=" only assigns when the variable is unset and the first expansion above
# already set it to "0", the multi-card job silently ran on one GPU.  Use a
# separate variable, and CUDA_VISIBLE_DEVICES must be a comma-separated list
# with no spaces.
cudaid=${object_detection_cudaid_m:=0,1,2,3}
export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=64 --num_passes=10 --total_images=6149 --enable_ce=True | python _ce.py
####this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!

# KPI trackers for continuous evaluation.  Each KPI's registered name must
# match exactly the name printed by train.py as "kpis <name> <value>";
# otherwise log_to_ce() raises a KeyError when it looks the name up.

# Single-card KPIs.
train_acc_top1_kpi = AccKpi('train_acc_top1', 0.05, 0, desc='TOP1 ACC')
train_acc_top5_kpi = AccKpi(
    'train_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC')
train_cost_kpi = CostKpi('train_cost', 0.3, 0, actived=True, desc='train cost')
test_acc_top1_kpi = AccKpi('test_acc_top1', 0.05, 0, desc='TOP1 ACC')
test_acc_top5_kpi = AccKpi(
    'test_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC')
# BUG FIX: desc said 'train cost' for the test-cost KPI.
test_cost_kpi = CostKpi('test_cost', 1.0, 0, actived=True, desc='test cost')
train_speed_kpi = AccKpi(
    'train_speed',
    0.05,
    0,
    actived=True,
    unit_repr='seconds/image',
    desc='train speed in one GPU card')

# Four-card KPIs (names carry the "_card4" suffix printed by train.py when
# CUDA_VISIBLE_DEVICES lists four devices).
train_acc_top1_card4_kpi = AccKpi(
    'train_acc_top1_card4', 0.05, 0, desc='TOP1 ACC')
train_acc_top5_card4_kpi = AccKpi(
    'train_acc_top5_card4', 0.05, 0, actived=False, desc='TOP5 ACC')
# BUG FIX: this KPI was registered as 'train_cost_kpi', but train.py prints
# "kpis train_cost_card4 ...", so log_to_ce() would KeyError on the lookup.
train_cost_card4_kpi = CostKpi(
    'train_cost_card4', 0.3, 0, actived=True, desc='train cost')
test_acc_top1_card4_kpi = AccKpi(
    'test_acc_top1_card4', 0.05, 0, desc='TOP1 ACC')
test_acc_top5_card4_kpi = AccKpi(
    'test_acc_top5_card4', 0.05, 0, actived=False, desc='TOP5 ACC')
# BUG FIX: desc said 'train cost' for the test-cost KPI.
test_cost_card4_kpi = CostKpi(
    'test_cost_card4', 1.0, 0, actived=True, desc='test cost')
train_speed_card4_kpi = AccKpi(
    'train_speed_card4',
    0.05,
    0,
    actived=True,
    unit_repr='seconds/image',
    desc='train speed in four GPU card')

# Every KPI that log_to_ce() knows how to persist.
tracking_kpis = [
    train_acc_top1_kpi, train_acc_top5_kpi, train_cost_kpi, test_acc_top1_kpi,
    test_acc_top5_kpi, test_cost_kpi, train_speed_kpi, train_acc_top1_card4_kpi,
    train_acc_top5_card4_kpi, train_cost_card4_kpi, test_acc_top1_card4_kpi,
    test_acc_top5_card4_kpi, test_cost_card4_kpi, train_speed_card4_kpi
]
def parse_log(log):
    """Parse a captured training log and yield (kpi_name, kpi_value) pairs.

    train.py tags each KPI line with the prefix ``kpis``, for example::

        kpis train_cost 1.0
        kpis train_acc_top1 0.9

    Any line that does not consist of exactly three whitespace-separated
    fields starting with ``kpis`` is ignored.

    Args:
        log (str): full stdout of a training run.

    Yields:
        tuple(str, float): KPI name and its numeric value.
    """
    for line in log.split('\n'):
        # BUG FIX: the original split on '\t' only, but train.py emits the
        # fields separated by spaces ("kpis train_cost %s" % ...), so no KPI
        # line was ever matched.  split() handles tabs and spaces alike.
        fs = line.strip().split()
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            print("-----%s" % fs)
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value
def log_to_ce(log):
    """Persist every KPI value found in ``log`` to the CE system.

    Looks each parsed KPI name up in ``tracking_kpis``; an unknown name
    raises KeyError.
    """
    tracker_by_name = {kpi.name: kpi for kpi in tracking_kpis}
    for name, value in parse_log(log):
        print(name, value)
        tracker = tracker_by_name[name]
        tracker.add_record(value)
        tracker.persist()
if __name__ == '__main__':
    # train.py pipes its stdout into this script; read the whole log from
    # stdin and push the parsed KPIs to the CE system.
    log = sys.stdin.read()
    print("*****")
    # BUG FIX: "print log" is Python-2-only statement syntax and is a
    # SyntaxError under Python 3; the rest of the file uses print() calls.
    print(log)
    print("****")
    log_to_ce(log)
...@@ -11,6 +11,7 @@ train_parameters = { ...@@ -11,6 +11,7 @@ train_parameters = {
"input_size": [3, 224, 224], "input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406], "input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225], "input_std": [0.229, 0.224, 0.225],
"dropout_seed": None,
"learning_strategy": { "learning_strategy": {
"name": "piecewise_decay", "name": "piecewise_decay",
"batch_size": 256, "batch_size": 256,
...@@ -101,7 +102,9 @@ class SE_ResNeXt(): ...@@ -101,7 +102,9 @@ class SE_ResNeXt():
pool = fluid.layers.pool2d( pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True) input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.5) # do not set seed when training, it is only used for debug
drop = fluid.layers.dropout(
x=pool, dropout_prob=0.5, seed=self.params["dropout_seed"])
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(input=drop, out = fluid.layers.fc(input=drop,
size=class_dim, size=class_dim,
......
...@@ -4,6 +4,7 @@ import time ...@@ -4,6 +4,7 @@ import time
import sys import sys
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.dataset.flowers as flowers
import models import models
import reader import reader
import argparse import argparse
...@@ -28,6 +29,7 @@ add_arg('checkpoint', str, None, "Whether to resume chec ...@@ -28,6 +29,7 @@ add_arg('checkpoint', str, None, "Whether to resume chec
add_arg('lr', float, 0.1, "set learning rate.") add_arg('lr', float, 0.1, "set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.") add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.") add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('enable_ce', bool, False, "If set True, enable continuous evaluation job.")
# yapf: enable # yapf: enable
model_list = [m for m in dir(models) if "__" not in m] model_list = [m for m in dir(models) if "__" not in m]
...@@ -100,6 +102,9 @@ def train(args): ...@@ -100,6 +102,9 @@ def train(args):
# model definition # model definition
model = models.__dict__[model_name]() model = models.__dict__[model_name]()
if args.enable_ce:
assert model_name == "SE_ResNeXt50_32x4d"
if model_name is "GoogleNet": if model_name is "GoogleNet":
out0, out1, out2 = model.net(input=image, class_dim=class_dim) out0, out1, out2 = model.net(input=image, class_dim=class_dim)
cost0 = fluid.layers.cross_entropy(input=out0, label=label) cost0 = fluid.layers.cross_entropy(input=out0, label=label)
...@@ -129,6 +134,8 @@ def train(args): ...@@ -129,6 +134,8 @@ def train(args):
params["num_epochs"] = args.num_epochs params["num_epochs"] = args.num_epochs
params["learning_strategy"]["batch_size"] = args.batch_size params["learning_strategy"]["batch_size"] = args.batch_size
params["learning_strategy"]["name"] = args.lr_strategy params["learning_strategy"]["name"] = args.lr_strategy
if args.enable_ce:
params["dropout_seed"] = 10
# initialize optimizer # initialize optimizer
optimizer = optimizer_setting(params) optimizer = optimizer_setting(params)
...@@ -137,6 +144,9 @@ def train(args): ...@@ -137,6 +144,9 @@ def train(args):
if with_memory_optimization: if with_memory_optimization:
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program())
if args.enable_ce:
fluid.default_startup_program().random_seed = 1000
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
...@@ -153,8 +163,20 @@ def train(args): ...@@ -153,8 +163,20 @@ def train(args):
train_batch_size = args.batch_size train_batch_size = args.batch_size
test_batch_size = 16 test_batch_size = 16
train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
test_reader = paddle.batch(reader.val(), batch_size=test_batch_size) if not args.enable_ce:
train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
else:
# use flowers dataset for CE and set use_xmap False to avoid disorder data
# but it is time consuming. For faster speed, need another dataset.
import random
random.seed(0)
train_reader = paddle.batch(
flowers.train(use_xmap=False), batch_size=train_batch_size)
test_reader = paddle.batch(
flowers.test(use_xmap=False), batch_size=test_batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_exe = fluid.ParallelExecutor( train_exe = fluid.ParallelExecutor(
...@@ -162,9 +184,12 @@ def train(args): ...@@ -162,9 +184,12 @@ def train(args):
fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name] fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
gpu = os.getenv("CUDA_VISIBLE_DEVICES") or ""
gpu_nums = len(gpu.split(","))
for pass_id in range(params["num_epochs"]): for pass_id in range(params["num_epochs"]):
train_info = [[], [], []] train_info = [[], [], []]
test_info = [[], [], []] test_info = [[], [], []]
train_time = []
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
t1 = time.time() t1 = time.time()
loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data)) loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
...@@ -176,6 +201,7 @@ def train(args): ...@@ -176,6 +201,7 @@ def train(args):
train_info[0].append(loss) train_info[0].append(loss)
train_info[1].append(acc1) train_info[1].append(acc1)
train_info[2].append(acc5) train_info[2].append(acc5)
train_time.append(period)
if batch_id % 10 == 0: if batch_id % 10 == 0:
print("Pass {0}, trainbatch {1}, loss {2}, \ print("Pass {0}, trainbatch {1}, loss {2}, \
acc1 {3}, acc5 {4} time {5}" acc1 {3}, acc5 {4} time {5}"
...@@ -187,6 +213,7 @@ def train(args): ...@@ -187,6 +213,7 @@ def train(args):
train_loss = np.array(train_info[0]).mean() train_loss = np.array(train_info[0]).mean()
train_acc1 = np.array(train_info[1]).mean() train_acc1 = np.array(train_info[1]).mean()
train_acc5 = np.array(train_info[2]).mean() train_acc5 = np.array(train_info[2]).mean()
train_speed = np.array(train_time).mean() / train_batch_size
cnt = 0 cnt = 0
for test_batch_id, data in enumerate(test_reader()): for test_batch_id, data in enumerate(test_reader()):
t1 = time.time() t1 = time.time()
...@@ -226,6 +253,36 @@ def train(args): ...@@ -226,6 +253,36 @@ def train(args):
os.makedirs(model_path) os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path) fluid.io.save_persistables(exe, model_path)
# This is for continuous evaluation only
if args.enable_ce and pass_id == args.num_epochs - 1:
if gpu_nums == 1:
# Use the last cost/acc for training
print("kpis train_cost %s" % train_loss)
print("kpis train_acc_top1 %s" % train_acc1)
print("kpis train_acc_top5 %s" % train_acc5)
# Use the mean cost/acc for testing
print("kpis test_cost %s" % test_loss)
print("kpis test_acc_top1 %s" % test_acc1)
print("kpis test_acc_top5 %s" % test_acc5)
print("kpis train_speed %s" % train_speed)
else:
# Use the last cost/acc for training
print("kpis train_cost_card%s %s" %
(gpu_nums, train_loss))
print("kpis train_acc_top1_card%s %s" %
(gpu_nums, train_acc1))
print("kpis train_acc_top5_card%s %s" %
(gpu_nums, train_acc5))
# Use the mean cost/acc for testing
print("kpis test_cost_card%s %s" %
(gpu_nums, test_loss))
print("kpis test_acc_top1_card%s %s" %
(gpu_nums, test_acc1))
print("kpis test_acc_top5_card%s %s" %
(gpu_nums, test_acc5))
print("kpis train_speed_card%s %s" %
(gpu_nums, train_speed))
def main(): def main():
args = parser.parse_args() args = parser.parse_args()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册