File "conf/trainer_config.conf", line 146, in _event_handler: global name 'args' is not defined
Created by: Bella-Zhao
跑MPI任务出现以下错误:
Mon Nov 27 10:39:24 2017[1,17]<stderr>:[INFO 2017-11-27 10:39:24,967 trainer_config.conf:141] Pass 0, Batch 0, Cost 2.750176, {'__auc_evaluator_0__': 0.4077380895614624, 'classification_error_evaluator': 0.984375}, Time 34.3098471165
Mon Nov 27 10:39:24 2017[1,17]<stderr>:Traceback (most recent call last):
Mon Nov 27 10:39:24 2017[1,17]<stderr>: File "conf/trainer_config.conf", line 221, in <module>
Mon Nov 27 10:39:24 2017[1,17]<stderr>: use_gpu=False)
Mon Nov 27 10:39:24 2017[1,17]<stderr>: File "conf/trainer_config.conf", line 171, in train
Mon Nov 27 10:39:24 2017[1,17]<stderr>: num_passes=num_passes)
Mon Nov 27 10:39:24 2017[1,17]<stderr>: File "/home/disk1/normandy/maybach/334836/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/v2/trainer.py", line 171, in train
Mon Nov 27 10:39:24 2017[1,17]<stderr>: evaluator=batch_evaluator))
Mon Nov 27 10:39:24 2017[1,17]<stderr>: File "conf/trainer_config.conf", line 146, in _event_handler
Mon Nov 27 10:39:24 2017[1,17]<stderr>: type=str(args.model_type))
Mon Nov 27 10:39:24 2017[1,17]<stderr>:NameError: global name 'args' is not defined
Mon Nov 27 10:39:24 2017[1,12]<stderr>:[INFO 2017-11-27 10:39:24,978 trainer_config.conf:141] Pass 0, Batch 0, Cost 2.760317, {'__auc_evaluator_0__': 0.45059287548065186, 'classification_error_evaluator': 0.98828125}, Time 37.160326004
Mon Nov 27 10:39:24 2017[1,12]<stderr>:Traceback (most recent call last):
Mon Nov 27 10:39:24 2017[1,12]<stderr>: File "conf/trainer_config.conf", line 221, in <module>
Mon Nov 27 10:39:24 2017[1,12]<stderr>: use_gpu=False)
Mon Nov 27 10:39:24 2017[1,12]<stderr>: File "conf/trainer_config.conf", line 171, in train
Mon Nov 27 10:39:24 2017[1,12]<stderr>: num_passes=num_passes)
Mon Nov 27 10:39:24 2017[1,12]<stderr>: File "/home/disk1/normandy/maybach/334836/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/v2/trainer.py", line 171, in train
Mon Nov 27 10:39:24 2017[1,12]<stderr>: evaluator=batch_evaluator))
Mon Nov 27 10:39:24 2017[1,12]<stderr>: File "conf/trainer_config.conf", line 146, in _event_handler
Mon Nov 27 10:39:24 2017[1,12]<stderr>: type=str(args.model_type))
Mon Nov 27 10:39:24 2017[1,12]<stderr>:NameError: global name 'args' is not defined
Mon Nov 27 10:39:24 2017[1,0]<stderr>:[INFO 2017-11-27 10:39:24,996 trainer_config.conf:141] Pass 0, Batch 0, Cost 2.748219, {'__auc_evaluator_0__': 0.391865074634552, 'classification_error_evaluator': 0.984375}, Time 56.4254510403
Mon Nov 27 10:39:25 2017[1,0]<stderr>:Traceback (most recent call last):
Mon Nov 27 10:39:25 2017[1,0]<stderr>: File "conf/trainer_config.conf", line 221, in <module>
Mon Nov 27 10:39:25 2017[1,0]<stderr>: use_gpu=False)
Mon Nov 27 10:39:25 2017[1,0]<stderr>: File "conf/trainer_config.conf", line 171, in train
Mon Nov 27 10:39:25 2017[1,0]<stderr>: num_passes=num_passes)
Mon Nov 27 10:39:25 2017[1,0]<stderr>: File "/home/disk1/normandy/maybach/334836/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/v2/trainer.py", line 171, in train
Mon Nov 27 10:39:25 2017[1,0]<stderr>: evaluator=batch_evaluator))
Mon Nov 27 10:39:25 2017[1,0]<stderr>: File "conf/trainer_config.conf", line 146, in _event_handler
Mon Nov 27 10:39:25 2017[1,0]<stderr>: type=str(args.model_type))
Mon Nov 27 10:39:25 2017[1,0]<stderr>:NameError: global name 'args' is not defined
看起来是每个结点的batch 1 出现这个错误,奇怪的是本地没有conf/trainer_config.conf这个文件,用的应该是集群上默认的,所以不清楚为什么会有NameError: global name 'args' is not defined
相关运行代码如下:
# Submit the DSSM training job to the MPI cluster.
# NOTE(review): --config=train_mpi.py is the local script; the scheduler
# appears to copy it onto each node as conf/trainer_config.conf, which is
# why the tracebacks cite that filename — confirm with the cluster docs.
# Sensitive values (fs_name, fs_ugi, data/output paths, --where) are
# redacted as "......" in this post.
paddle cluster_train \
--config=train_mpi.py \
--time_limit=72:00:00 \
--submitter=zhaoyijin \
--num_nodes=20 \
--job_priority=normal \
--fs_name=...... \
--fs_ugi=...... \
--num_passes=20 \
--train_data_path=...... \
--test_data_path=...... \
--output_path=...... \
--thirdparty=./my_thirdparty \
--where=...... \
--job_name=paddle_dssm_zhaoyijin \
--ports_num_for_sparse=1 \
--use_remote_sparse=1
# Initialize the Paddle runtime from the environment variables the cluster
# scheduler exports on each node; every value falls back to a sensible
# single-machine default so the same config also runs locally.
_init_kwargs = {
    "use_gpu": False,
    "trainer_count": int(os.getenv("PADDLE_TRAINER_COUNT", "1")),
    "port": int(os.getenv("PADDLE_PORT", "7164")),
    "ports_num": int(os.getenv("PADDLE_PORTS_NUM", "1")),
    "num_gradient_servers": int(os.getenv("PADDLE_NUM_GRADIENT_SERVERS", "1")),
    "trainer_id": int(os.getenv("PADDLE_TRAINER_ID", "0")),
    "pservers": os.getenv("PADDLE_PSERVERS", "127.0.0.1"),
    "ports_num_for_sparse": int(os.getenv('PADDLE_PORTS_NUM_FOR_SPARSE', "1")),
}
paddle.init(**_init_kwargs)
# Launch training with the fixed cluster hyper-parameters.
_train_settings = dict(
    train_data_path='./train_data_dir/train',
    test_data_path='./test_data_dir/test',
    dic_path=dict_file,
    model_type=ModelType.create_classification(),
    batch_size=256,
    num_passes=1,
    share_semantic_generator=False,
    share_embed=False,
    class_num=2,
    num_workers=5,
    use_gpu=False,
)
train(**_train_settings)
def train(train_data_path=None,
          test_data_path=None,
          dic_path=None,
          model_type=None,
          batch_size=10,
          num_passes=10,
          share_semantic_generator=False,
          share_embed=False,
          class_num=None,
          num_workers=1,
          use_gpu=False):
    """Build and train a DSSM model with the PaddlePaddle v2 trainer.

    Args:
        train_data_path: path to the training data; required.
        test_data_path: path to the test data; required.
        dic_path: feature dictionary path, used by the reader and to look
            up per-feature vocabulary sizes for the network.
        model_type: a ModelType instance (e.g. classification vs rank);
            selects evaluators and is embedded in saved model filenames.
        batch_size: mini-batch size for both the train and test readers.
        num_passes: number of passes over the training data.
        share_semantic_generator: whether the two towers share weights.
        share_embed: whether the two towers share embedding tables.
        class_num: number of classes for the classification head.
        num_workers: accepted for caller compatibility; not used here.
        use_gpu: accepted for caller compatibility; not used here
            (device selection happens in paddle.init).

    Raises:
        SystemExit: if either data path is missing.
    """
    if not train_data_path or not test_data_path:
        logger.error("No input data")
        exit(1)

    t = time.time()
    dataset = reader.Dataset(
        train_path=train_data_path,
        test_path=test_data_path,
        dic_path=dic_path,
        model_type=model_type, )
    logger.info("reader.DataSet cost time %s" % (time.time() - t))

    t = time.time()
    train_reader = paddle.batch(
        paddle.reader.shuffle(dataset.train_mpi, buf_size=1000),
        batch_size=batch_size)
    logger.info("train_reader batch cost time %s" % (time.time() - t))

    t = time.time()
    test_reader = paddle.batch(
        paddle.reader.shuffle(dataset.test_mpi, buf_size=1000),
        batch_size=batch_size)
    logger.info("test_reader batch cost time %s" % (time.time() - t))

    t = time.time()
    cost, prediction, label = DSSM(
        dnn_dims=layer_dims,
        feature_size_dict=dataset.load_feature_size_dic(dic_path),
        model_type=model_type,
        share_semantic_generator=share_semantic_generator,
        class_num=class_num,
        share_embed=share_embed)()
    logger.info("DSSM cost time %s" % (time.time() - t))

    t = time.time()
    parameters = paddle.parameters.create(cost)
    adam_optimizer = paddle.optimizer.AdaGrad(
        learning_rate=1e-2,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3),
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))
    trainer = paddle.trainer.SGD(
        cost=cost,
        # AUC only makes sense for classification; rank models skip it.
        extra_layers=paddle.evaluator.auc(input=prediction, label=label)
        if not model_type.is_rank() else None,
        parameters=parameters,
        update_equation=adam_optimizer,
        is_local=False)
    logger.info("trainer cost time %s" % (time.time() - t))

    t = time.time()

    def _event_handler(event):
        """Log per-batch progress, checkpoint periodically, test per pass."""
        # NOTE(review): `t` is assigned in train()'s local scope above, yet
        # declared `global` here (Python 2 has no `nonlocal`). This only
        # works if a module-level `t` also exists in the config's exec
        # namespace — the batch-0 log line in the MPI output suggests it
        # does, but confirm before relying on the reported batch times.
        global t
        if isinstance(event, paddle.event.EndIteration):
            # output train log
            if event.batch_id % 1 == 0:
                logger.info("Pass %d, Batch %d, Cost %f, %s, Time %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    time.time() - t))
                t = time.time()
            if event.batch_id % 100 == 0:
                # save model
                # FIX: the original read `args.model_type`, but no `args`
                # exists in the cluster config's namespace (argparse only
                # runs in the local entry script), raising
                # "NameError: global name 'args' is not defined" on every
                # node. Use the `model_type` argument instead, matching
                # its use throughout this function.
                model_desc = "{type}".format(type=str(model_type))
                with open("%sdssm_%s_pass_%05d.tar" %
                          ('./output/', model_desc, event.pass_id), "w") as f:
                    to_tar(parameters, f)
        if isinstance(event, paddle.event.EndPass):
            # test model
            if test_reader is not None:
                if model_type.is_classification():
                    # NOTE(review): `feeding` is a free name defined
                    # elsewhere in this config — verify it is in scope.
                    result = trainer.test(
                        reader=test_reader, feeding=feeding)
                    logger.info("Test at Pass %d, %s" % (event.pass_id,
                                                         result.metrics))
                else:
                    result = None
            # save model
            model_desc = "classification"
            with open("%sdssm_%s_pass_%05d.tar" %
                      ('./output/', model_desc, event.pass_id), "w") as f:
                to_tar(parameters, f)

    trainer.train(
        reader=train_reader,
        event_handler=_event_handler,
        feeding=feeding,
        num_passes=num_passes)
    logger.info("Training has finished.")