Commit 7b268255 authored by root

add ce

Parent cbe656e0
#!/bin/bash
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
cudaid=${language_model:=0} # use card 0 by default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
cudaid=${language_model_m:=0,1,2,3} # use cards 0,1,2,3 by default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.05, 0, actived=True)
train_avg_cost_card1_kpi = CostKpi('train_avg_cost_card1', 0.2, 0)
train_avg_acc_card1_kpi = CostKpi('train_avg_acc_card1', 0.02, 0)
each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.05, 0, actived=True)
train_avg_cost_card4_kpi = CostKpi('train_avg_cost_card4', 0.2, 0)
train_avg_acc_card4_kpi = CostKpi('train_avg_acc_card4', 0.02, 0)
tracking_kpis = [
    each_pass_duration_card1_kpi,
    train_avg_cost_card1_kpi,
    train_avg_acc_card1_kpi,
    each_pass_duration_card4_kpi,
    train_avg_cost_card4_kpi,
    train_avg_acc_card4_kpi,
]
def parse_log(log):
    '''
    This method should be implemented by model developers.
    Suggested format: each line of the log carries a key and a value,
    for example:
    "
    train_cost\t1.0
    test_cost\t1.0
    train_cost\t1.0
    train_cost\t1.0
    train_acc\t1.2
    "
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value
def log_to_ce(log):
    kpi_tracker = {}
    for kpi in tracking_kpis:
        kpi_tracker[kpi.name] = kpi

    for (kpi_name, kpi_value) in parse_log(log):
        print(kpi_name, kpi_value)
        kpi_tracker[kpi_name].add_record(kpi_value)
        kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
    log = sys.stdin.read()
    log_to_ce(log)
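For illustration only (not part of this commit), the sketch below shows the kind of "kpis\t<name>\t<value>" lines that _ce.py expects on its stdin; the KPI names follow tracking_kpis above and the numeric values are invented.

# Illustrative sketch only: a made-up sample of the piped CE log.
sample_log = (
    "kpis\teach_pass_duration_card1\t12.5\n"
    "kpis\ttrain_avg_cost_card1\t0.31\n"
    "kpis\ttrain_avg_acc_card1\t0.87\n"
)
# Apply the same filtering that parse_log() uses: keep lines of the form
# kpis<TAB>name<TAB>value and convert the value to float.
for line in sample_log.split('\n'):
    fields = line.strip().split('\t')
    if len(fields) == 3 and fields[0] == 'kpis':
        print(fields[1], float(fields[2]))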
...@@ -33,9 +33,12 @@ parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--model_name', type=str, default='cdssmNet', help="Which model to train")
parser.add_argument('--config', type=str, default='cdssm_base', help="The global config setting")
parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache/paddle/dataset')
SEED = 102
def evaluate(epoch_id, exe, inference_program, dev_reader, test_reader, fetch_list, feeder, metric_type):
    """
    evaluate on test/dev dataset
...@@ -139,6 +142,12 @@ def train_and_evaluate(train_reader,
    else:
        feeder = fluid.DataFeeder(feed_list=[q1, q2, mask1, mask2, label], place=place)

    # only for ce
    args = parser.parse_args()
    if args.enable_ce:
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED

    # logging param info
    for param in fluid.default_main_program().global_block().all_parameters():
        print("param name: %s; param shape: %s" % (param.name, param.shape))
...@@ -167,8 +176,12 @@ def train_and_evaluate(train_reader,
        metric_type=global_config.metric_type)

    # start training
    total_time = 0.0
    print("[%s] Start Training" % time.asctime(time.localtime(time.time())))
    for epoch_id in range(global_config.epoch_num):

        # only for ce
        epoch_idx = epoch_id + 1

        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
        batch_id = 0
        epoch_begin_time = time.time()
...@@ -191,6 +204,9 @@ def train_and_evaluate(train_reader,
        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count

        epoch_end_time = time.time()
        total_time += epoch_end_time - epoch_begin_time

        print("")
        print("[%s] epoch_id: %d, train_avg_cost: %f, train_avg_acc: %f, epoch_time_cost: %f" % (
...@@ -198,6 +214,19 @@ def train_and_evaluate(train_reader,
            epoch_id, avg_cost, avg_acc,
            time.time() - epoch_begin_time))

        # only for ce
        if epoch_id == global_config.epoch_num - 1 and args.enable_ce:
            # Note: The following logs are special for CE monitoring.
            # Other situations do not need to care about these logs.
            gpu_num = get_cards(args)
            print("kpis\teach_pass_duration_card%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_avg_cost_card%s\t%s" %
                  (gpu_num, avg_cost[0]))
            print("kpis\ttrain_avg_acc_card%s\t%s" %
                  (gpu_num, avg_acc[0]))

        epoch_model = global_config.save_dirname + "/" + "epoch" + str(epoch_id)
        fluid.io.save_inference_model(epoch_model, ["question1", "question2", "label"], acc, exe)
...@@ -267,5 +296,15 @@ def main():
        use_cuda=global_config.use_cuda,
        parallel=False)


def get_cards(args):
    if args.enable_ce:
        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
        num = len(cards.split(","))
        return num
    else:
        return args.num_devices


if __name__ == "__main__":
    main()
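As a reference only (not part of this commit), here is a small standalone sketch of how get_cards() maps CUDA_VISIBLE_DEVICES to the card suffix used in the KPI names; the environment value mirrors the 4-card CE run above and the cost value is invented.

import os

# Illustrative sketch: mirror the 4-card setting from the CE script; not a real run.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
cards = os.environ.get('CUDA_VISIBLE_DEVICES')
gpu_num = len(cards.split(','))  # 4 cards -> KPI names ending in _card4
print("kpis\ttrain_avg_cost_card%s\t%s" % (gpu_num, 0.31))  # cost value is made up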