Commit fe7043f9 authored by xiegegege, committed by Kaipeng Deng

add tsm ce (#2318)

* add tsm ce

* add tsm ce

* add enable_ce in configs

* remove enable_ce in val config
Parent 061b58e2
# this file is only used for continuous evaluation test!

import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi

train_cost_card1_kpi = CostKpi('train_cost_card1', 0.08, 0, actived=True, desc='train cost')
train_speed_card1_kpi = DurationKpi('train_speed_card1', 0.08, 0, actived=True, desc='train speed in one GPU card')
train_cost_card4_kpi = CostKpi('train_cost_card4', 0.08, 0, actived=True, desc='train cost')
train_speed_card4_kpi = DurationKpi('train_speed_card4', 0.3, 0, actived=True, desc='train speed in four GPU card')

tracking_kpis = [
    train_cost_card1_kpi, train_speed_card1_kpi, train_cost_card4_kpi,
    train_speed_card4_kpi
]


def parse_log(log):
    '''
    This method should be implemented by model developers.

    The suggestion:
    each line in the log should be key, value, for example:

    "
    train_cost\t1.0
    test_cost\t1.0
    train_cost\t1.0
    train_cost\t1.0
    train_acc\t1.2
    "
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value


def log_to_ce(log):
    kpi_tracker = {}
    for kpi in tracking_kpis:
        kpi_tracker[kpi.name] = kpi

    for (kpi_name, kpi_value) in parse_log(log):
        print(kpi_name, kpi_value)
        kpi_tracker[kpi_name].add_record(kpi_value)
        kpi_tracker[kpi_name].persist()


if __name__ == '__main__':
    log = sys.stdin.read()
    log_to_ce(log)
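For reference, _ce.py reads the training log from stdin and only acts on lines of the exact form kpis<TAB>kpi_name<TAB>value; everything else is ignored. A minimal sketch of that contract, reusing parse_log from the file above (the loss and speed numbers below are made up for illustration, not real results):

    # illustrative only: feed a fake log through parse_log (defined above)
    sample_log = '\n'.join([
        '[TRAIN] Epoch 0, iter 10 \tLoss: 6.812345,\ttop1_acc: 0.01, \ttop5_acc: 0.05',
        'kpis\ttrain_cost_card1\t6.812345',
        'kpis\ttrain_speed_card1\t123.4',
    ])
    # parse_log echoes each split line as it scans, then yields only the two 'kpis' entries
    print(list(parse_log(sample_log)))
    # -> [('train_cost_card1', 6.812345), ('train_speed_card1', 123.4)]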
@@ -23,7 +23,8 @@ learning_rate_decay = 0.1
decay_epochs = [40, 60]
l2_weight_decay = 1e-4
momentum = 0.9
total_videos = 239781
total_videos = 239781
enable_ce = False
[VALID]
short_size = 256
......
@@ -65,6 +65,7 @@ class KineticsReader(DataReader):
        self.target_size = self.get_config_from_sec(mode, 'target_size')
        self.num_reader_threads = self.get_config_from_sec(mode, 'num_reader_threads')
        self.buf_size = self.get_config_from_sec(mode, 'buf_size')
        self.enable_ce = self.get_config_from_sec(mode, 'enable_ce')

        self.img_mean = np.array(cfg.MODEL.image_mean).reshape(
            [3, 1, 1]).astype(np.float32)
@@ -73,6 +74,9 @@ class KineticsReader(DataReader):
        # set batch size and file list
        self.batch_size = cfg[mode.upper()]['batch_size']
        self.filelist = cfg[mode.upper()]['filelist']
        if self.enable_ce:
            random.seed(0)
            np.random.seed(0)

    def create_reader(self):
        _reader = self._reader_creator(self.filelist, self.mode, seg_num=self.seg_num, seglen = self.seglen, \
......
@@ -94,6 +94,7 @@ class Kinetics400Metrics(Metrics):
        acc1, acc5 = self.calculator.calculate_metrics(loss, pred, label)
        logger.info(info + '\tLoss: {},\ttop1_acc: {}, \ttop5_acc: {}'.format('%.6f' % loss, \
                    '%.2f' % acc1, '%.2f' % acc5))
        return loss

    def accumulate(self, loss, pred, label, info=''):
        self.calculator.accumulate(loss, pred, label)
......
#!/bin/bash
export FLAGS_fast_eager_deletion_mode=1
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_fraction_of_gpu_memory_to_use=0.98
export CUDA_VISIBLE_DEVICES=0
python train.py --model_name="TSM" --config=./configs/tsm.txt --epoch=1 --log_interval=10 --batch_size=128 --enable_ce=True | python _ce.py
export CUDA_VISIBLE_DEVICES=0,1,2,3
python train.py --model_name="TSM" --config=./configs/tsm.txt --epoch=1 --log_interval=10 --batch_size=128 --enable_ce=True | python _ce.py
@@ -113,12 +113,14 @@ def train_without_pyreader(exe, train_prog, train_exe, train_reader, train_feede
def train_with_pyreader(exe, train_prog, train_exe, train_pyreader, \
                        train_fetch_list, train_metrics, epochs = 10, \
                        log_interval = 0, valid_interval = 0, \
                        save_dir = './', save_model_name = 'model', \
                        log_interval = 0, valid_interval = 0, save_dir = './', \
                        save_model_name = 'model', enable_ce = False, \
                        test_exe = None, test_pyreader = None, \
                        test_fetch_list = None, test_metrics = None):
    if not train_pyreader:
        logger.error("[TRAIN] get pyreader failed.")
    epoch_periods = []
    train_loss = 0
    for epoch in range(epochs):
        log_lr_and_step()
        train_pyreader.start()
@@ -136,7 +138,7 @@ def train_with_pyreader(exe, train_prog, train_exe, train_pyreader, \
                label = np.array(train_outs[-1])
                if log_interval > 0 and (train_iter % log_interval == 0):
                    # eval here
                    train_metrics.calculate_and_log_out(loss, pred, label, \
                    train_loss = train_metrics.calculate_and_log_out(loss, pred, label, \
                                info = '[TRAIN] Epoch {}, iter {} '.format(epoch, train_iter))
                train_iter += 1
        except fluid.core.EOFException:
@@ -152,7 +154,12 @@ def train_with_pyreader(exe, train_prog, train_exe, train_pyreader, \
        finally:
            epoch_period = []
            train_pyreader.reset()

    #only for ce
    if enable_ce:
        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
        gpu_num = len(cards.split(","))
        print("kpis\ttrain_cost_card{}\t{}".format(gpu_num, train_loss))
        print("kpis\ttrain_speed_card{}\t{}".format(gpu_num, np.mean(epoch_periods)))


def save_model(exe, program, save_dir, model_name, postfix=None):
    model_path = os.path.join(save_dir, model_name + postfix)
......
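Tying the hunk above back to run_ce.sh and _ce.py: when enable_ce is on, the card count in the KPI name is derived from CUDA_VISIBLE_DEVICES, so the single-card and four-card runs report to different KPIs (train_*_card1 vs train_*_card4). A standalone sketch of that naming step; the loss and timing numbers are made up:

    # illustrative sketch of the two 'kpis' lines emitted after training
    import os
    import numpy as np

    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'   # as exported by run_ce.sh for the 4-card job
    train_loss = 6.8                                 # made-up value of the last logged loss
    epoch_periods = [95.0]                           # made-up epoch timing(s)

    gpu_num = len(os.environ.get('CUDA_VISIBLE_DEVICES').split(','))
    print("kpis\ttrain_cost_card{}\t{}".format(gpu_num, train_loss))
    print("kpis\ttrain_speed_card{}\t{}".format(gpu_num, np.mean(epoch_periods)))
    # prints (tab-separated): kpis  train_cost_card4  6.8   and   kpis  train_speed_card4  95.0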
@@ -98,6 +98,11 @@ def parse_args():
        type=int,
        default=10,
        help='mini-batch interval to log.')
    parser.add_argument(
        '--enable_ce',
        type=bool,
        default=False,
        help='If set True, enable continuous evaluation job.')

    args = parser.parse_args()
    return args
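One caveat with the flag as declared above: argparse's type=bool turns any non-empty string into True, so --enable_ce=False on the command line would still evaluate to True; this is harmless here because run_ce.sh only ever passes --enable_ce=True and the default stays False when the flag is omitted. If explicit string-to-bool parsing were wanted, a sketch (not part of this commit) could look like:

    # hypothetical alternative, not in this commit: parse 'True'/'False' strings explicitly
    import argparse
    from distutils.util import strtobool

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--enable_ce',
        type=lambda x: bool(strtobool(x)),
        default=False,
        help='If set True, enable continuous evaluation job.')
    print(parser.parse_args(['--enable_ce', 'False']).enable_ce)   # -> False
    print(parser.parse_args(['--enable_ce', 'True']).enable_ce)    # -> True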
@@ -114,6 +119,9 @@ def train(args):
    # build model
    startup = fluid.Program()
    train_prog = fluid.Program()
    if args.enable_ce:
        startup.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup):
        with fluid.unique_name.guard():
            train_model.build_input(not args.no_use_pyreader)
@@ -235,6 +243,7 @@ def train(args):
        valid_interval=args.valid_interval,
        save_dir=args.save_dir,
        save_model_name=args.model_name,
        enable_ce=args.enable_ce,
        test_exe=valid_exe,
        test_pyreader=valid_pyreader,
        test_fetch_list=valid_fetch_list,
......