Commit fcf1677b authored by zhengya01

add ctr ce

Parent 6a69dd8b
#!/bin/bash
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
cudaid=${face_detection:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --is_local 1 --cloud_train 0 --train_data_path data/raw/train.txt --enable_ce | python _ce.py
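The pipe above works because train.py, when started with --enable_ce, prints one tab-separated line per KPI, and _ce.py (below) keeps only the lines whose first field is the literal token kpis. A standalone sketch of that log protocol, with invented sample values:

# Standalone illustration of the CE log protocol carried through the pipe
# above; the values are invented, the real ones are printed by train.py.
sample_log = "\n".join([
    "pass_id: 0, pass_time_cost: 12.3",        # ordinary log line, ignored
    "kpis\teach_pass_duration_card1\t12.3",    # consumed by _ce.py
    "kpis\ttrain_loss_card1\t0.45",            # consumed by _ce.py
])
for line in sample_log.split("\n"):
    fields = line.strip().split("\t")
    if len(fields) == 3 and fields[0] == "kpis":
        print("KPI", fields[1], "=", float(fields[2]))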
# this file is only used for continuous evaluation test!
import os
import sys
#sys.path.append(os.environ['ceroot'])
sys.path.append('./')
from kpi import CostKpi
from kpi import DurationKpi
from kpi import AccKpi
each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
train_loss_card1_kpi = CostKpi('train_loss_card1', 0.08, 0)
train_auc_val_card1_kpi = AccKpi('train_auc_val_card1', 0.08, 0)
train_batch_auc_val_card1_kpi = AccKpi('train_batch_auc_val_card1', 0.08, 0)
tracking_kpis = [
    each_pass_duration_card1_kpi,
    train_loss_card1_kpi,
    train_auc_val_card1_kpi,
    train_batch_auc_val_card1_kpi,
]
def parse_log(log):
    '''
    This method should be implemented by model developers.

    The suggestion: each line in the log should be a tab-separated
    key/value record, for example:
    "
    train_cost\t1.0
    test_cost\t1.0
    train_cost\t1.0
    train_cost\t1.0
    train_acc\t1.2
    "
    This implementation only keeps lines of the form
    "kpis\t<kpi_name>\t<kpi_value>".
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value


def log_to_ce(log):
    kpi_tracker = {}
    for kpi in tracking_kpis:
        kpi_tracker[kpi.name] = kpi

    for (kpi_name, kpi_value) in parse_log(log):
        print(kpi_name, kpi_value)
        kpi_tracker[kpi_name].add_record(kpi_value)
        kpi_tracker[kpi_name].persist()


if __name__ == '__main__':
    log = sys.stdin.read()
    log_to_ce(log)
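The kpi module imported at the top of _ce.py is provided by the continuous-evaluation environment (see the commented-out ceroot path) and is not part of this commit. A hypothetical minimal stub of the interface _ce.py relies on is sketched below; attribute and parameter names beyond name, add_record and persist are assumptions, and the real classes additionally compare new records against stored baselines.

# Hypothetical stub of the external `kpi` module's interface (an assumption,
# not the real implementation shipped with the CE environment).
class _BaseKpi(object):
    def __init__(self, name, diff_threshold, skip_head=0, actived=False):
        self.name = name                      # key matched against log lines
        self.diff_threshold = diff_threshold  # tolerated relative change, e.g. 0.08
        self.actived = actived                # whether the check is enforced
        self.records = []

    def add_record(self, value):
        self.records.append(value)

    def persist(self):
        # The real implementation stores the records and diffs them against a
        # baseline; here we only mirror the call surface used by _ce.py.
        print("persist %s -> %s" % (self.name, self.records))


class DurationKpi(_BaseKpi):  # lower is better (e.g. pass duration)
    pass


class CostKpi(_BaseKpi):      # lower is better (e.g. training loss)
    pass


class AccKpi(_BaseKpi):       # higher is better (e.g. AUC)
    pass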
...@@ -107,12 +107,27 @@ def parse_args():
        type=int,
        default=1,
        help='The num of trainers, (default: 1)')
    parser.add_argument(
        '--enable_ce',
        action='store_true',
        help='If set, run the task with continuous evaluation logs.')
    parser.add_argument(
        '--num_devices',
        type=int,
        default=0,
        help='The num of devices, (default: 0)')
    return parser.parse_args()

def train_loop(args, train_program, py_reader, loss, auc_var, batch_auc_var,
               trainer_num, trainer_id):
    if args.enable_ce:
        SEED = 102
        train_program.random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    dataset = reader.CriteoDataset(args.sparse_feature_dim)
    train_reader = paddle.batch(
        paddle.reader.shuffle(
...@@ -146,6 +161,7 @@ def train_loop(args, train_program, py_reader, loss, auc_var, batch_auc_var,
    exe.run(fluid.default_startup_program())

    total_time = 0
    for pass_id in range(args.num_passes):
        pass_start = time.time()
        batch_id = 0
...@@ -169,10 +185,25 @@ def train_loop(args, train_program, py_reader, loss, auc_var, batch_auc_var,
            py_reader.reset()
        print("pass_id: %d, pass_time_cost: %f" % (pass_id, time.time() - pass_start))
        total_time += time.time() - pass_start

        model_dir = args.model_output_dir + '/pass-' + str(pass_id)
        if args.trainer_id == 0:
            fluid.io.save_inference_model(model_dir, data_name_list, [loss, auc_var], exe)

    # only for ce
    if args.enable_ce:
        gpu_num = get_cards(args)
        epoch_idx = args.num_passes
        print("kpis\teach_pass_duration_card%s\t%s" %
              (gpu_num, total_time / epoch_idx))
        print("kpis\ttrain_loss_card%s\t%s" %
              (gpu_num, loss_val / args.batch_size))
        print("kpis\ttrain_auc_val_card%s\t%s" %
              (gpu_num, auc_val))
        print("kpis\ttrain_batch_auc_val_card%s\t%s" %
              (gpu_num, batch_auc_val))


def train():
    args = parse_args()
...@@ -224,5 +255,14 @@ def train():
    )


def get_cards(args):
    if args.enable_ce:
        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
        num = len(cards.split(","))
        return num
    else:
        return args.num_devices


if __name__ == '__main__':
    train()
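For reference, the card suffix in the KPI names comes from get_cards(). With the run script's default CUDA_VISIBLE_DEVICES=0, one visible device is counted, so the emitted names end in card1 and match the *_card1 KPIs registered in _ce.py. A standalone sketch (the printed duration value is made up):

# Standalone illustration of how the "cardN" suffix is derived; this mirrors
# the get_cards()/print logic in train.py above, with an invented value.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"    # as exported by the run script
cards = os.environ.get("CUDA_VISIBLE_DEVICES")
gpu_num = len(cards.split(","))             # -> 1
print("kpis\teach_pass_duration_card%s\t%s" % (gpu_num, 12.3))
# prints: kpis<TAB>each_pass_duration_card1<TAB>12.3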