提交 7b7385e3 编写于 作者: Z zhengya01

Add CE (continuous evaluation) support

上级 e6ed31ef
#!/bin/bash
# Continuous-evaluation (CE) driver: launches one-epoch train.py runs on
# different GPU sets and pipes each training log into _ce.py via stdin.
#
# Optional environment overrides (comma-separated GPU ids):
#   video_classification    cards for the first run  (default: 0)
#   video_classification_4  cards for the second run (default: 0,1,2,3)
#   video_classification_8  cards for the third run  (default: 0,1,2,3,4,5,6,7)

export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
export FLAGS_fraction_of_gpu_memory_to_use=0.5

# Command-line arguments shared by every CE training run.
train_args=(
  --batch_size=16
  --total_videos=9537
  --class_dim=101
  --num_epochs=1
  --image_shape=3,224,224
  --model_save_dir=output/
  --with_mem_opt=True
  --lr_init=0.01
  --num_layers=50
  --seg_num=7
  --enable_ce=True
)

# Train on the GPU ids given in $1 and hand the log to _ce.py.
run_ce() {
  cudaid=$1
  export CUDA_VISIBLE_DEVICES=$cudaid
  FLAGS_benchmark=true python train.py "${train_args[@]}" | python _ce.py
}

run_ce "${video_classification:=0}" # use 0-th card as default

#export FLAGS_fraction_of_gpu_memory_to_use=0.92
run_ce "${video_classification_4:=0,1,2,3}" # use 0,1,2,3 card as default

# The script stops here, so the 8-card run below is never reached.
exit 0

run_ce "${video_classification_8:=0,1,2,3,4,5,6,7}" # use 0,1,2,3,4,5,6,7 card as default
# this file is only used for continuous evaluation test!
import os
import sys
os.environ['ceroot'] = "./"
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
# KPI trackers: one duration KPI and one loss KPI for each card configuration
# (card1, card4, card8). The names passed as the first argument match the
# "kpis\t<name>\t<value>" records that parse_log() extracts from the log.
# NOTE(review): the meaning of the positional arguments 0.08 and 0, and of
# actived=True, is defined by the project's kpi module, which is not visible
# here -- confirm against that module before changing them.
each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
train_loss_card1_kpi = CostKpi('train_loss_card1', 0.08, 0)
each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.08, 0, actived=True)
train_loss_card4_kpi = CostKpi('train_loss_card4', 0.08, 0)
each_pass_duration_card8_kpi = DurationKpi('each_pass_duration_card8', 0.08, 0, actived=True)
train_loss_card8_kpi = CostKpi('train_loss_card8', 0.08, 0)
# Every KPI that log_to_ce() indexes by its .name attribute; a KPI name seen in
# the log that is missing from this list makes the lookup there fail.
tracking_kpis = [
each_pass_duration_card1_kpi,
train_loss_card1_kpi,
each_pass_duration_card4_kpi,
train_loss_card4_kpi,
each_pass_duration_card8_kpi,
train_loss_card8_kpi,
]
def parse_log(log):
    '''
    Extract KPI records from a training log.

    The log is split on newlines; every line is stripped, split on tabs and
    the resulting field list is printed. A line produces a record only when it
    has exactly three fields and the first field is 'kpis', for example:
    "
    kpis\ttrain_loss_card1\t1.0
    kpis\teach_pass_duration_card1\t20.5
    "
    Yields (kpi_name, kpi_value) pairs, with kpi_value converted by float().
    '''
    for raw_line in log.split('\n'):
        fields = raw_line.strip().split('\t')
        print(fields)
        # Anything that is not a three-field 'kpis' record is ignored.
        if len(fields) != 3 or fields[0] != 'kpis':
            continue
        _, name, value = fields
        yield name, float(value)
def log_to_ce(log):
    """Record every KPI found in `log` on its matching tracker.

    Builds a name -> KPI lookup from tracking_kpis, then for each
    (name, value) pair produced by parse_log() prints the pair, adds the
    value to the tracker with that name and calls persist() on it.
    A KPI name absent from tracking_kpis raises KeyError.
    """
    trackers = {kpi.name: kpi for kpi in tracking_kpis}
    for record in parse_log(log):
        name, value = record
        print(name, value)
        tracker = trackers[name]
        tracker.add_record(value)
        tracker.persist()
if __name__ == '__main__':
    # Read the whole training log from stdin and record its KPIs.
    log_to_ce(sys.stdin.read())
...@@ -14,6 +14,7 @@ import paddle ...@@ -14,6 +14,7 @@ import paddle
from PIL import Image, ImageEnhance from PIL import Image, ImageEnhance
random.seed(0) random.seed(0)
np.random.seed(0)
THREAD = 8 THREAD = 8
BUF_SIZE = 1024 BUF_SIZE = 1024
...@@ -27,6 +28,7 @@ img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) ...@@ -27,6 +28,7 @@ img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
python_ver = sys.version_info python_ver = sys.version_info
def imageloader(buf): def imageloader(buf):
if isinstance(buf, str): if isinstance(buf, str):
img = Image.open(StringIO(buf)) img = Image.open(StringIO(buf))
...@@ -149,7 +151,7 @@ def decode_pickle(sample, mode, seg_num, short_size, target_size): ...@@ -149,7 +151,7 @@ def decode_pickle(sample, mode, seg_num, short_size, target_size):
imgs -= img_mean imgs -= img_mean
imgs /= img_std imgs /= img_std
if mode == 'train' or mode == 'test': if mode == 'train' or mode == 'test' or mode == 'train_ce':
return imgs, label return imgs, label
elif mode == 'infer': elif mode == 'infer':
return imgs, vid return imgs, vid
...@@ -208,3 +210,13 @@ def infer(seg_num): ...@@ -208,3 +210,13 @@ def infer(seg_num):
seg_num=seg_num, seg_num=seg_num,
short_size=256, short_size=256,
target_size=224) target_size=224)
def train_ce(seg_num):
    """Create the training reader used by continuous-evaluation (CE) runs.

    Delegates to _reader_creator over TRAIN_LIST in 'train_ce' mode with
    shuffling disabled; short_size and target_size are fixed at 256 and 224.

    Args:
        seg_num: forwarded unchanged to _reader_creator as seg_num.

    Returns:
        Whatever _reader_creator returns for these settings.
    """
    reader_options = dict(
        shuffle=False,
        seg_num=seg_num,
        short_size=256,
        target_size=224)
    return _reader_creator(TRAIN_LIST, 'train_ce', **reader_options)
...@@ -26,6 +26,9 @@ add_arg('model_save_dir', str, "output", "Model save directory.") ...@@ -26,6 +26,9 @@ add_arg('model_save_dir', str, "output", "Model save directory.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.") add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
add_arg('total_videos', int, 9537, "Training video number.") add_arg('total_videos', int, 9537, "Training video number.")
add_arg('lr_init', float, 0.01, "Set initial learning rate.") add_arg('lr_init', float, 0.01, "Set initial learning rate.")
# Continuous-evaluation (CE) options. The help texts describe how train.py
# uses each value: num_batches bounds the batch index of a training pass in CE
# mode, and num_devices is what get_cards() returns when CE is disabled.
add_arg('enable_ce', bool, False, "If set True, enable continuous evaluation job.")
add_arg('num_devices', int, 1, "Number of devices (cards); used when enable_ce is False.")
add_arg('num_batches', int, 100, "Stop each training pass after this batch index when enable_ce is True.")
# yapf: enable # yapf: enable
...@@ -55,6 +58,11 @@ def train(args): ...@@ -55,6 +58,11 @@ def train(args):
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
if args.enable_ce:
SEED = 102
fluid.default_main_program().random_seed = SEED
fluid.default_startup_program().random_seed = SEED
# for test # for test
inference_program = fluid.default_main_program().clone(for_test=True) inference_program = fluid.default_main_program().clone(for_test=True)
...@@ -92,6 +100,9 @@ def train(args): ...@@ -92,6 +100,9 @@ def train(args):
# reader # reader
train_reader = paddle.batch(reader.train(seg_num), batch_size=batch_size, drop_last=True) train_reader = paddle.batch(reader.train(seg_num), batch_size=batch_size, drop_last=True)
if args.enable_ce:
train_reader = paddle.batch(reader.train_ce(seg_num), batch_size=batch_size, drop_last=False)
# test in single GPU # test in single GPU
test_reader = paddle.batch(reader.test(seg_num), batch_size=batch_size / 16) test_reader = paddle.batch(reader.test(seg_num), batch_size=batch_size / 16)
feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
...@@ -100,15 +111,20 @@ def train(args): ...@@ -100,15 +111,20 @@ def train(args):
fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name] fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
total_time = 0
# train # train
for pass_id in range(num_epochs): for pass_id in range(num_epochs):
train_info = [[], [], []] train_info = [[], [], []]
test_info = [[], [], []] test_info = [[], [], []]
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
if args.enable_ce:
if batch_id > args.num_batches:
break
t1 = time.time() t1 = time.time()
loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data)) loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
t2 = time.time() t2 = time.time()
period = t2 - t1 period = t2 - t1
total_time += period
loss = np.mean(np.array(loss)) loss = np.mean(np.array(loss))
acc1 = np.mean(np.array(acc1)) acc1 = np.mean(np.array(acc1))
acc5 = np.mean(np.array(acc5)) acc5 = np.mean(np.array(acc5))
...@@ -130,6 +146,8 @@ def train(args): ...@@ -130,6 +146,8 @@ def train(args):
# test # test
cnt = 0 cnt = 0
for batch_id, data in enumerate(test_reader()): for batch_id, data in enumerate(test_reader()):
if args.enable_ce and batch_id > 3:
break
t1 = time.time() t1 = time.time()
loss, acc1, acc5 = exe.run(inference_program, loss, acc1, acc5 = exe.run(inference_program,
fetch_list=fetch_list, fetch_list=fetch_list,
...@@ -169,6 +187,23 @@ def train(args): ...@@ -169,6 +187,23 @@ def train(args):
os.makedirs(model_path) os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path) fluid.io.save_persistables(exe, model_path)
if args.enable_ce:
gpu_num = get_cards(args)
epoch_idx = num_epochs
print("kpis\teach_pass_duration_card%s\t%s" %
(gpu_num, total_time / epoch_idx))
print("kpis\ttrain_loss_card%s\t%s" %
(gpu_num, train_loss))
def get_cards(args):
    """Return the number of cards (devices) the current run uses.

    With args.enable_ce set, the count is taken from the comma-separated
    CUDA_VISIBLE_DEVICES environment variable. Empty entries (for example a
    trailing comma) are not counted. If the variable is unset or lists no
    device at all, args.num_devices is returned instead of failing.
    With args.enable_ce unset, args.num_devices is returned directly.

    Args:
        args: parsed command-line arguments providing `enable_ce` and
            `num_devices`.

    Returns:
        The card count (an int when derived from the environment, otherwise
        the value of args.num_devices).
    """
    if not args.enable_ce:
        return args.num_devices

    visible = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible is None:
        # Variable not exported: nothing to count, use the configured value.
        return args.num_devices

    card_ids = [card for card in visible.split(",") if card.strip()]
    if not card_ids:
        return args.num_devices
    return len(card_ids)
def main(): def main():
args = parser.parse_args() args = parser.parse_args()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册