提交 e07edbcb 编写于 作者: Z zhengya01 提交者: hutuxian

add ce for gnn (#2003)

上级 5a81d8c2
#!/bin/bash
# Continuous-evaluation (CE) launcher for the GNN model.
# Pin math libraries to a single thread so timing KPIs are reproducible.
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
# $gnn selects the GPU card; defaults to card 0 when unset.
cudaid=${gnn:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
# Train for 5 epochs with CE logging enabled, then pipe the log into the
# KPI parser (_ce.py reads the full log from stdin).
FLAGS_benchmark=true python -u train.py --use_cuda 1 --epoch_num 5 --enable_ce | python _ce.py
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
from kpi import AccKpi
# KPI trackers for single-card (card1) CE runs.  The 0.08 argument is
# presumably a relative tolerance against historical records and 0 a
# baseline/flag — TODO confirm against the kpi module's signature.
each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
train_loss_card1_kpi = CostKpi('train_loss_card1', 0.08, 0)
train_acc_card1_kpi = AccKpi('train_acc_card1', 0.08, 0)

# All KPIs that log_to_ce() looks up by name and persists.
tracking_kpis = [
    each_pass_duration_card1_kpi,
    train_loss_card1_kpi,
    train_acc_card1_kpi,
]
def parse_log(log):
    """Yield ``(kpi_name, kpi_value)`` pairs extracted from a training log.

    Only lines of the exact form ``kpis\t<name>\t<value>`` are recognized;
    every inspected line is echoed (split into fields) for debugging.

    Args:
        log: the complete training log as a single string.

    Yields:
        tuple[str, float]: KPI name and its numeric value.
    """
    for raw_line in log.split('\n'):
        fields = raw_line.strip().split('\t')
        print(fields)
        # Skip anything that is not a well-formed KPI record.
        if len(fields) != 3 or fields[0] != 'kpis':
            continue
        yield fields[1], float(fields[2])
def log_to_ce(log):
    """Record every KPI found in *log* and persist it for the CE system.

    Args:
        log: the complete training log as a single string.

    Note: a KPI name emitted by the log but absent from ``tracking_kpis``
    raises ``KeyError`` (original behavior, preserved).
    """
    trackers = {kpi.name: kpi for kpi in tracking_kpis}
    for name, value in parse_log(log):
        print(name, value)
        trackers[name].add_record(value)
        trackers[name].persist()
# Entry point: read the entire training log from stdin (piped in by the
# CE launcher script) and push its KPIs to the CE tracker.
if __name__ == '__main__':
    log = sys.stdin.read()
    log_to_ce(log)
...@@ -16,6 +16,7 @@ import numpy as np ...@@ -16,6 +16,7 @@ import numpy as np
import os import os
from functools import partial from functools import partial
import logging import logging
import time
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import argparse import argparse
...@@ -55,11 +56,19 @@ def parse_args(): ...@@ -55,11 +56,19 @@ def parse_args():
'--use_cuda', type=int, default=0, help='whether to use gpu') '--use_cuda', type=int, default=0, help='whether to use gpu')
parser.add_argument( parser.add_argument(
'--use_parallel', type=int, default=1, help='whether to use parallel executor') '--use_parallel', type=int, default=1, help='whether to use parallel executor')
parser.add_argument(
'--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
return parser.parse_args() return parser.parse_args()
def train(): def train():
args = parse_args() args = parse_args()
if args.enable_ce:
SEED = 102
fluid.default_main_program().random_seed = SEED
fluid.default_startup_program().random_seed = SEED
batch_size = args.batch_size batch_size = args.batch_size
items_num = reader.read_config(args.config_path) items_num = reader.read_config(args.config_path)
loss, acc = network.network(batch_size, items_num, args.hidden_size, loss, acc = network.network(batch_size, items_num, args.hidden_size,
...@@ -102,6 +111,9 @@ def train(): ...@@ -102,6 +111,9 @@ def train():
logger.info("begin train") logger.info("begin train")
total_time = []
ce_info = []
start_time = time.time()
loss_sum = 0.0 loss_sum = 0.0
acc_sum = 0.0 acc_sum = 0.0
global_step = 0 global_step = 0
...@@ -116,16 +128,45 @@ def train(): ...@@ -116,16 +128,45 @@ def train():
epoch_sum.append(res[0]) epoch_sum.append(res[0])
global_step += 1 global_step += 1
if global_step % PRINT_STEP == 0: if global_step % PRINT_STEP == 0:
ce_info.append([loss_sum / PRINT_STEP, acc_sum / PRINT_STEP])
total_time.append(time.time() - start_time)
logger.info("global_step: %d, loss: %.4lf, train_acc: %.4lf" % ( logger.info("global_step: %d, loss: %.4lf, train_acc: %.4lf" % (
global_step, loss_sum / PRINT_STEP, acc_sum / PRINT_STEP)) global_step, loss_sum / PRINT_STEP, acc_sum / PRINT_STEP))
loss_sum = 0.0 loss_sum = 0.0
acc_sum = 0.0 acc_sum = 0.0
start_time = time.time()
logger.info("epoch loss: %.4lf" % (np.mean(epoch_sum))) logger.info("epoch loss: %.4lf" % (np.mean(epoch_sum)))
save_dir = args.model_path + "/epoch_" + str(i) save_dir = args.model_path + "/epoch_" + str(i)
fetch_vars = [loss, acc] fetch_vars = [loss, acc]
fluid.io.save_inference_model(save_dir, feed_list, fetch_vars, exe) fluid.io.save_inference_model(save_dir, feed_list, fetch_vars, exe)
logger.info("model saved in " + save_dir) logger.info("model saved in " + save_dir)
# only for ce
if args.enable_ce:
gpu_num = get_cards(args)
ce_loss = 0
ce_acc = 0
ce_time = 0
try:
ce_loss = ce_info[-1][0]
ce_acc = ce_info[-1][1]
ce_time = total_time[-1]
except:
print("ce info error")
print("kpis\teach_pass_duration_card%s\t%s" %
(gpu_num, ce_time))
print("kpis\ttrain_loss_card%s\t%f" %
(gpu_num, ce_loss))
print("kpis\ttrain_acc_card%s\t%f" %
(gpu_num, ce_acc))
def get_cards(args):
    """Return the number of GPU cards selected via ``CUDA_VISIBLE_DEVICES``.

    Args:
        args: parsed CLI arguments (unused; kept for interface
            compatibility with the call site in train()).

    Returns:
        int: number of comma-separated device ids, or 0 when the variable
        is unset or empty.
    """
    cards = os.environ.get('CUDA_VISIBLE_DEVICES')
    # Guard against an unset or empty variable: the original code raised
    # AttributeError on None.split(',') and counted '' as one card.
    if not cards:
        return 0
    return len(cards.split(','))
if __name__ == "__main__": if __name__ == "__main__":
train() train()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册