提交 4e2ed8ff 编写于 作者: Z zhengya01 提交者: kolinwei

add ce for similarity_net (#2349)

上级 c4701ae0
#!/usr/bin/env bash
# Continuous-evaluation (CE) driver for the similarity_net (simnet) task.
# PaddlePaddle runtime flags -- presumably tuned for CE throughput; confirm
# against the Paddle docs for the installed version.
export FLAGS_enable_parallel_graph=1
export FLAGS_sync_nccl_allreduce=1
export FLAGS_fraction_of_gpu_memory_to_use=0.95
TASK_NAME='simnet'
# Pointwise train/valid/test data; note valid and test share the same file here.
TRAIN_DATA_PATH=./data/train_pointwise_data
VALID_DATA_PATH=./data/test_pointwise_data
TEST_DATA_PATH=./data/test_pointwise_data
INFER_DATA_PATH=./data/infer_data
VOCAB_PATH=./data/term2id.dict
# Output locations for checkpoints and result files.
CKPT_PATH=./model_files
TEST_RESULT_PATH=./test_result
INFER_RESULT_PATH=./infer_result
TASK_MODE='pointwise'
CONFIG_PATH=./config/bow_pointwise.json
# NOTE(review): INIT_CHECKPOINT, TEST_RESULT_PATH and INFER_RESULT_PATH are
# defined but not passed to run_classifier.py below -- verify they are unused.
INIT_CHECKPOINT=./model_files/simnet_bow_pointwise_pretrained_model/
# run_train
# Run one training job (train + valid + test, no infer) with CE logging
# enabled; KPI lines are emitted on stdout for _ce.py to parse.
train() {
python run_classifier.py \
--task_name ${TASK_NAME} \
--use_cuda True \
--do_train True \
--do_valid True \
--do_test True \
--do_infer False \
--batch_size 128 \
--train_data_dir ${TRAIN_DATA_PATH} \
--valid_data_dir ${VALID_DATA_PATH} \
--test_data_dir ${TEST_DATA_PATH} \
--infer_data_dir ${INFER_DATA_PATH} \
--output_dir ${CKPT_PATH} \
--config_path ${CONFIG_PATH} \
--vocab_path ${VOCAB_PATH} \
--epoch 3 \
--save_steps 1000 \
--validation_steps 100 \
--compute_accuracy False \
--lamda 0.958 \
--task_mode ${TASK_MODE} \
--enable_ce
}
# Single-card run: train's stdout (including "kpis\t..." lines) is piped
# into _ce.py, which persists the card1 KPIs.
export CUDA_VISIBLE_DEVICES=0
train | python _ce.py
sleep 20
# Four-card run: same pipeline, persists the card4 KPIs.
export CUDA_VISIBLE_DEVICES=0,1,2,3
train | python _ce.py
# this file is only used for continuous evaluation test!
import os
import sys
# The 'ceroot' env var points at the CE framework checkout providing `kpi`.
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
from kpi import AccKpi
# KPI trackers, one (duration, loss) pair per card count. The numeric
# arguments are presumably the allowed relative diff and a second threshold
# used by the CE framework -- confirm against the kpi module's signature.
each_step_duration_simnet_card1 = DurationKpi('each_step_duration_simnet_card1', 0.03, 0, actived=True)
train_loss_simnet_card1 = CostKpi('train_loss_simnet_card1', 0.01, 0, actived=True)
each_step_duration_simnet_card4 = DurationKpi('each_step_duration_simnet_card4', 0.02, 0, actived=True)
train_loss_simnet_card4 = CostKpi('train_loss_simnet_card4', 0.01, 0, actived=True)
# All KPIs that log_to_ce() is allowed to persist.
tracking_kpis = [
each_step_duration_simnet_card1,
train_loss_simnet_card1,
each_step_duration_simnet_card4,
train_loss_simnet_card4,
]
def parse_log(log):
    """Yield ``(kpi_name, kpi_value)`` pairs found in a training log.

    A KPI record is any line of exactly three tab-separated fields whose
    first field is the literal ``kpis``::

        kpis\t<name>\t<float value>

    Every line's split fields are echoed to stdout (debug aid); lines that
    do not match the record shape are otherwise ignored.
    """
    for raw_line in log.split('\n'):
        fields = raw_line.strip().split('\t')
        print(fields)
        if len(fields) == 3 and fields[0] == 'kpis':
            yield fields[1], float(fields[2])
def log_to_ce(log):
    """Persist every KPI record found in *log* via the CE framework.

    Fix: the original indexed ``kpi_tracker[kpi_name]`` directly, so a single
    stray ``kpis\t...`` line with an unregistered name raised KeyError and
    aborted the whole persist pass. Unknown names are now reported and
    skipped; all registered KPIs still get add_record() + persist().
    """
    # Map each registered KPI object by its name for O(1) lookup.
    kpi_tracker = {kpi.name: kpi for kpi in tracking_kpis}
    for kpi_name, kpi_value in parse_log(log):
        print(kpi_name, kpi_value)
        tracker = kpi_tracker.get(kpi_name)
        if tracker is None:
            # Don't crash the CE run on an unexpected KPI name.
            print('log_to_ce: skipping unknown kpi %r' % kpi_name)
            continue
        tracker.add_record(kpi_value)
        tracker.persist()
if __name__ == '__main__':
    # The shell driver pipes the training output here; consume all of stdin
    # as one string and hand it to the KPI persister.
    log_to_ce(sys.stdin.read())
...@@ -73,6 +73,8 @@ run_type_g.add_arg( ...@@ -73,6 +73,8 @@ run_type_g.add_arg(
"When task_mode is pairwise, lamda is the threshold for calculating the accuracy." "When task_mode is pairwise, lamda is the threshold for calculating the accuracy."
) )
parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
args = parser.parse_args() args = parser.parse_args()
...@@ -80,6 +82,11 @@ def train(conf_dict, args): ...@@ -80,6 +82,11 @@ def train(conf_dict, args):
""" """
train processic train processic
""" """
if args.enable_ce:
SEED = 102
fluid.default_startup_program().random_seed = SEED
fluid.default_main_program().random_seed = SEED
# loading vocabulary # loading vocabulary
vocab = utils.load_vocab(args.vocab_path) vocab = utils.load_vocab(args.vocab_path)
# get vocab size # get vocab size
...@@ -202,6 +209,7 @@ def train(conf_dict, args): ...@@ -202,6 +209,7 @@ def train(conf_dict, args):
logging.info("start train process ...") logging.info("start train process ...")
# set global step # set global step
global_step = 0 global_step = 0
ce_info = []
for epoch_id in range(args.epoch): for epoch_id in range(args.epoch):
losses = [] losses = []
# Get batch data iterator # Get batch data iterator
...@@ -261,6 +269,21 @@ def train(conf_dict, args): ...@@ -261,6 +269,21 @@ def train(conf_dict, args):
end_time = time.time() end_time = time.time()
logging.info("epoch: %d, loss: %f, used time: %d sec" % logging.info("epoch: %d, loss: %f, used time: %d sec" %
(epoch_id, np.mean(losses), end_time - start_time)) (epoch_id, np.mean(losses), end_time - start_time))
ce_info.append([np.mean(losses), end_time - start_time])
if args.enable_ce:
card_num = get_cards()
ce_loss = 0
ce_time = 0
try:
ce_loss = ce_info[-2][0]
ce_time = ce_info[-2][1]
except:
logging.info("ce info err!")
print("kpis\teach_step_duration_%s_card%s\t%s" %
(args.task_name, card_num, ce_time))
print("kpis\ttrain_loss_%s_card%s\t%f" %
(args.task_name, card_num, ce_loss))
if args.do_test: if args.do_test:
if args.task_mode == "pairwise": if args.task_mode == "pairwise":
# Get Feeder and Reader # Get Feeder and Reader
...@@ -406,6 +429,14 @@ def infer(args): ...@@ -406,6 +429,14 @@ def infer(args):
os.path.join(os.getcwd(), args.infer_result_path)) os.path.join(os.getcwd(), args.infer_result_path))
def get_cards():
    """Return the number of GPU cards selected via CUDA_VISIBLE_DEVICES.

    The variable is a comma-separated device list; an unset or empty
    variable yields 0.
    """
    devices = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    return len(devices.split(',')) if devices else 0
def main(conf_dict, args): def main(conf_dict, args):
""" """
main main
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册