Core dump during training: Check failed: ownerThreadId_ == getTID() (56760 vs. 74752) this sync thread pool should be used in one thread
Created by: hweidream
I first start one thread to run training, and after that thread ends I start another thread to train again. I'm not sure whether the failure is caused by the previous thread not having fully exited.
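To rule out the "previous thread not finished" hypothesis, one option is to have the decorator return the `Thread` so the caller can `join()` it before launching the next run. A minimal sketch; returning `t` is my change to the `async_call` shown below, not part of the original code:

```python
# Sketch: let async_call hand back the Thread so the caller can wait for
# the previous run to finish. Returning `t` is my addition; the original
# wrapper returns nothing.
from threading import Thread

def async_call(fn):
    def wrapper(*args, **kwargs):
        t = Thread(target=fn, args=args, kwargs=kwargs)
        t.start()
        return t
    return wrapper

# Hypothetical usage:
#   t = self._start_train(...)   # first training run
#   t.join()                     # block until that thread has exited
#   self._start_train(...)       # second training run
```

Note, though, that even after a `join()` the next run still executes on a brand-new thread with a different TID, which turns out to matter for the check in the error below.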
Code:
def async_call(fn):
    def wrapper(*args, **kwargs):
        Thread(target=fn, args=args, kwargs=kwargs).start()
    return wrapper
### multi-threaded launch part
def feed_data2train(self, input_train_info):
    paddle.init(trainer_count=self.param_conf_dict['trainer_count'])
    postag_input_size = self.data_reader.get_feature_size('postag')
    ner_input_size = self.data_reader.get_feature_size('ner')
    position_input_size = self.data_reader.get_feature_size('position')
    mark_input_size = self.data_reader.get_feature_size('mark')
    label_size = self.data_reader.get_feature_size('label')
    # define network topology; db_lstm is a network structure we defined ourselves
    feature_out, assist_info = db_lstm(postag_input_size, ner_input_size,
                                       position_input_size, mark_input_size,
                                       label_size, self.param_conf_dict)
    target = paddle.layer.data(name='o_label',
                               type=paddle.data_type.integer_value_sequence(label_size))
    self.cost = paddle.layer.classification_cost(input=feature_out, label=target)
    self._start_train(train_data_path, eval_data_path, model_output_dir,
                      model_file_name, cur_jobid)
    return ret_info
@async_call
def _start_train(self, train_data_path, eval_data_path, model_output_dir,
                 model_file_name, cur_jobid):
    train_batch_reader = paddle.batch(
        paddle.reader.shuffle(
            self.data_reader.get_train_reader(train_data_path),
            buf_size=10000),
        batch_size=self.param_conf_dict['train_batch_size'])
    test_batch_reader = paddle.batch(
        paddle.reader.shuffle(
            self.data_reader.get_test_reader(eval_data_path),
            buf_size=10000),
        batch_size=self.param_conf_dict['test_batch_size'])
    feeding = self.data_reader.data_name_feeding()
    parameters = paddle.parameters.create(self.cost)
    # create optimizer
    optimizer = paddle.optimizer.Adam(
        learning_rate=2e-3,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
        gradient_clipping_threshold=10.0,
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))
    trainer = paddle.trainer.SGD(
        cost=self.cost,
        parameters=parameters,
        update_equation=optimizer)
    def event_handler(event):
        global start_time, cost_sum, cost_counter, smallest_cost
        if isinstance(event, paddle.event.EndIteration):
            cost_sum += event.cost
            cost_counter += 1
            cur_time = time.time()
            if (cur_time - start_time) >= self.end_condition_time:
                if self.best_model_name != '':
                    ## time limit reached
                    return
            if event.batch_id % 500 == 0 and event.batch_id > 0:
                log_info = "Pass %d, Batch %d, Cost %f, time %d s" % (
                    event.pass_id, event.batch_id, cost_sum / cost_counter,
                    time.time() - start_time)
                #print log_info
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        if isinstance(event, paddle.event.BeginPass):
            start_time = time.time()
            smallest_cost = sys.maxint * 1.0
            cost_sum, cost_counter = 0.0, 0
            self.cur_pass_id = 0
        if isinstance(event, paddle.event.EndPass):
            # save parameters
            output_file_path = '%s/params_pass_%d.tar.gz' \
                % (model_output_dir, event.pass_id)
            with open(output_file_path, 'w') as f:
                parameters.to_tar(f)
            cost_avg = cost_sum / cost_counter if cost_counter > 0 else 0.0
            try:
                result = trainer.test(reader=test_batch_reader, feeding=feeding)
                if smallest_cost > result.cost:
                    smallest_cost = result.cost
                    self.best_model_name = 'params_pass_%d.tar.gz' % (event.pass_id)
                    self.cur_pass_id = event.pass_id
                log_info = "END: Time: %d sec, Pass: %d, Train_Cost: %s, Dev_Cost: %s" % (
                    time.time() - start_time, event.pass_id, cost_avg, result.cost)
            except:
                log_info = "END: Time: %d sec, Pass: %d, Train_Cost: %s" % (
                    time.time() - start_time, event.pass_id, cost_avg)
            #print '\n' + log_info
    trainer.train(
        reader=train_batch_reader,
        event_handler=event_handler,
        num_passes=self.param_conf_dict['num_passes'],
        feeding=feeding)
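Given the message "this sync thread pool should be used in one thread", and since every `@async_call` spawns a fresh thread with a new TID, one possible workaround is to serialize all training jobs onto a single long-lived worker thread. A minimal sketch in plain Python 2 (matching the `sys.maxint` usage above); `submit_training` and `_train_worker` are hypothetical names of mine, not Paddle API:

```python
import Queue
from threading import Thread

_jobs = Queue.Queue()

def _train_worker():
    # Every training job runs here, so the trainer (and its sync thread
    # pool) is only ever touched from this one thread for the lifetime
    # of the process.
    while True:
        fn, args, kwargs = _jobs.get()
        try:
            fn(*args, **kwargs)
        finally:
            _jobs.task_done()

_worker = Thread(target=_train_worker)
_worker.setDaemon(True)  # don't keep the process alive on exit
_worker.start()

def submit_training(fn, *args, **kwargs):
    # Drop-in replacement for @async_call: still non-blocking for the
    # caller, but jobs are serialized onto the same thread instead of
    # each getting a fresh one.
    _jobs.put((fn, args, kwargs))
```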
The error output is as follows:
.....F1211 20:38:21.947131 74752 Thread.h:200] Check failed: ownerThreadId_ == getTID() (56760 vs. 74752) this sync thread pool should be used in one thread
*** Check failure stack trace: ***
    @ 0x7f4645ac9dfd google::LogMessage::Fail()
    @ 0x7f4645acd8ac google::LogMessage::SendToLog()
    @ 0x7f4645ac9923 google::LogMessage::Flush()
    @ 0x7f4645acedbe google::LogMessageFatal::~LogMessageFatal()
    @ 0x7f46471c2985 paddle::SgdThreadUpdater::finishBatch()
    @ 0x7f4645724e60 _wrap_ParameterUpdater_finishBatch
    @ 0x4b4cb9 PyEval_EvalFrameEx
    @ 0x4b6b28 PyEval_EvalCodeEx
    @ 0x4b5d10 PyEval_EvalFrameEx
    @ 0x4b6b28 PyEval_EvalCodeEx
    @ 0x4b5d10 PyEval_EvalFrameEx
    @ 0x4b6b28 PyEval_EvalCodeEx
    @ 0x52940f function_call
    @ 0x422cba PyObject_Call
    @ 0x4b1bd0 PyEval_EvalFrameEx
    @ 0x4b5fb8 PyEval_EvalFrameEx
    @ 0x4b5fb8 PyEval_EvalFrameEx
    @ 0x4b6b28 PyEval_EvalCodeEx
    @ 0x529340 function_call
    @ 0x422cba PyObject_Call
    @ 0x4271ad instancemethod_call
    @ 0x422cba PyObject_Call
    @ 0x4b0427 PyEval_CallObjectWithKeywords
    @ 0x4fa752 t_bootstrap
    @ 0x7f46deec61c3 start_thread
    @ 0x7f46de4ee12d __clone
    @ (nil) (unknown)
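For what it's worth, the trace shows paddle::SgdThreadUpdater::finishBatch() hitting the check in Thread.h: the sync thread pool appears to record the TID of the thread that first used it (56760) and aborts when a different thread (74752) calls in. A toy pure-Python analogue of that check (my reading of the log only, not Paddle's actual implementation):

```python
# Toy analogue: an object that binds to the first thread that uses it
# and refuses every other thread, mirroring ownerThreadId_ == getTID().
import threading

class OneThreadOnly(object):
    def __init__(self):
        self._owner = None

    def use(self):
        tid = threading.current_thread().ident
        if self._owner is None:
            self._owner = tid  # first caller becomes the owner
        assert self._owner == tid, \
            "this sync thread pool should be used in one thread"

pool = OneThreadOnly()
pool.use()  # main thread becomes the owner (like the first training run)
t = threading.Thread(target=pool.use)  # any later thread trips the check
t.start()
t.join()
```

If that reading is right, joining the first thread would not help (the second @async_call thread still gets a different TID), whereas keeping all training on one long-lived worker thread, as sketched after the code above, would avoid the check.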