Using a CRF layer under multi-threaded GPU execution fails when batch_size is not 1
Created by: jshower
The problem: when running code that contains a CRF layer with multi-threaded execution (ParallelDo) on a single GPU, any batch_size other than 1 (e.g. 10) produces the error pasted at the end of this report. Notably, it is the same error as in #9234 (closed). The code used is:
import sys
import math
import numpy as np
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.fluid as fluid
import contextlib
import time
import unittest
from five_corss_val import filereader
sys.stdout.flush()
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_len = len(verb_dict)
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 10
IS_SPARSE = True
PASS_NUM = 1
BATCH_SIZE = 1
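# NOTE: per the report above, the error appears once BATCH_SIZE is set to
# anything other than 1 (e.g. 10); BATCH_SIZE = 1 runs fine.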
embedding_name = 'emb'
default_std = 1 / math.sqrt(hidden_dim) / 3.0
def load_parameter(file_name, h, w):
    with open(file_name, 'rb') as f:
        f.read(16)  # skip header.
        return np.fromfile(f, dtype=np.float32).reshape(h, w)
def to_lodtensor(data, place):
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
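# For reference (hypothetical data): to_lodtensor([[1, 2, 3], [4, 5]], place)
# concatenates the sequences into a (5, 1) int64 tensor and sets its LoD to
# [[0, 3, 5]], i.e. sequence boundaries at offsets 0, 3 and 5.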
word = fluid.layers.data(
    name='word_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.layers.data(
    name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data(
    name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
    name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
    name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
    name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
    name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
    name='mark_data', shape=[1], dtype='int64', lod_level=1)
target = fluid.layers.data(
    name='target', shape=[1], dtype='int64', lod_level=1)
places = fluid.layers.get_places(device_count=0)
pd = fluid.layers.ParallelDo(places)
with pd.do():
    word_ = pd.read_input(word)
    predicate_ = pd.read_input(predicate)
    ctx_n2_ = pd.read_input(ctx_n2)
    ctx_n1_ = pd.read_input(ctx_n1)
    ctx_0_ = pd.read_input(ctx_0)
    ctx_p1_ = pd.read_input(ctx_p1)
    ctx_p2_ = pd.read_input(ctx_p2)
    mark_ = pd.read_input(mark)
    target_ = pd.read_input(target)
    predicate_embedding = fluid.layers.embedding(
        input=predicate_,
        size=[pred_len, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr=fluid.ParamAttr(name='vemb', learning_rate=5))
    mark_embedding = fluid.layers.embedding(
        input=mark_,
        size=[mark_dict_len, mark_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr=fluid.ParamAttr(name='mark_emb', learning_rate=5))
    word_input = [word_, ctx_n2_, ctx_n1_, ctx_0_, ctx_p1_, ctx_p2_]
    emb_layers = [
        fluid.layers.embedding(
            size=[word_dict_len, word_dim],
            input=x,
            param_attr=fluid.ParamAttr(
                name=embedding_name, trainable=False)) for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(mark_embedding)
    hidden_0_layers = [
        fluid.layers.fc(input=emb, size=hidden_dim, act="tanh")
        for emb in emb_layers
    ]
    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
    lstm_0 = fluid.layers.dynamic_lstm(
        input=hidden_0,
        size=hidden_dim,
        candidate_activation='relu',
        gate_activation='sigmoid',
        cell_activation='sigmoid')
    # stack L-LSTM and R-LSTM with direct edges
    input_tmp = [hidden_0, lstm_0]
    for i in range(1, depth):
        mix_hidden = fluid.layers.sums(input=[
            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act="tanh"),
            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act="tanh")
        ])
        lstm = fluid.layers.dynamic_lstm(
            input=mix_hidden,
            size=hidden_dim,
            candidate_activation='relu',
            gate_activation='sigmoid',
            cell_activation='sigmoid',
            is_reverse=((i % 2) == 1))
        input_tmp = [mix_hidden, lstm]
    feature_out = fluid.layers.sums(input=[
        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act="tanh"),
        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act="tanh")
    ])
    crf_cost = fluid.layers.linear_chain_crf(
        input=feature_out,
        label=target_,  # the per-device slice of `target` read above
        param_attr=fluid.ParamAttr(
            name='crfw', learning_rate=mix_hidden_lr))
    avg_cost1 = fluid.layers.mean(x=crf_cost)
    pd.write_output(avg_cost1)
    pd.write_output(feature_out)
avg_cost_on_each_devs, feature_out_on_each_devs = pd()
feature_out_on_each_devs.stop_gradient = True
avg_cost = fluid.layers.mean(x=avg_cost_on_each_devs)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
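# minimize() appends the backward pass; with ParallelDo this adds a
# ParallelDoGradOp whose AccumulateGrad step over crfw@GRAD is where the
# traceback pasted below is raised.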
crf_decode = fluid.layers.crf_decoding(
    input=feature_out_on_each_devs, param_attr=fluid.ParamAttr(name='crfw'))
(precision, recall, f1_score, num_infer_chunks, num_label_chunks,
 num_correct_chunks) = fluid.layers.chunk_eval(
     input=crf_decode,
     label=target,
     chunk_scheme="IOB",
     num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
    inference_program = fluid.io.get_inference_program([])
train_data = paddle.batch(
    paddle.reader.shuffle(
        filereader.file_reader("five_corss_val/" + str(sys.argv[1])),
        buf_size=8192),
    batch_size=BATCH_SIZE)
test_data = paddle.batch(
    paddle.reader.shuffle(
        filereader.file_reader("five_corss_val/" + str(sys.argv[2])),
        buf_size=8192),
    batch_size=BATCH_SIZE)
place = fluid.CUDAPlace(0)
feeder = fluid.DataFeeder(
    feed_list=[
        word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
    ],
    place=place)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
embedding_param.set(
    load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)
start_time = time.time()
#batch_id = 0
for pass_id in xrange(PASS_NUM):
    #chunk_evaluator.reset(exe)
    time_start = time.time()
    sum_infer = 0
    sum_label = 0
    sum_correct = 0
    for data in train_data():
        start_time = time.time()
        cost, num_infer, num_label, num_correct = exe.run(
            fluid.default_main_program(),
            feed=feeder.feed(data),
            fetch_list=[
                avg_cost, num_infer_chunks, num_label_chunks,
                num_correct_chunks
            ])
        sum_infer += num_infer
        sum_label += num_label
        sum_correct += num_correct
        print("cost:" + str(cost[0]) + ", num_infer:" + str(num_infer[0]) +
              ", num_label:" + str(num_label[0]) + ", num_correct:" +
              str(num_correct[0]))
    precision = 0
    recall = 0
    f1_score = 0
    if sum_infer != 0:
        precision = sum_correct * 1.0 / sum_infer
    if sum_label != 0:
        recall = sum_correct * 1.0 / sum_label
    if precision != 0 or recall != 0:
        f1_score = precision * recall * 2.0 / (precision + recall)
    print("pass_id:" + str(pass_id) + ", precision:" + str(precision) +
          ", recall:" + str(recall) + ", f1_score:" + str(f1_score))
    time_end = time.time()
    print("pass_id:" + str(pass_id) + ", cost_time:" + str(time_end - time_start))
    save_dirname = sys.argv[2] + ".save_model_multi_thread." + str(pass_id)
    fluid.io.save_inference_model(save_dirname, [
        'word_data', 'verb_data', 'ctx_n2_data',
        'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
        'ctx_p2_data', 'mark_data', 'target'
    ], [num_infer_chunks, num_label_chunks, num_correct_chunks], exe)
This is an attempt at a multi-threaded rewrite of Paddle/python/paddle/fluid/tests/book/test_label_semantic_roles.py. The problem can be reproduced by converting test_label_semantic_roles.py into this multi-threaded form; you can also contact me and I will provide the environment.
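For context, here is a minimal sketch of the ParallelDo wrapping pattern the rewrite follows (a toy fc regression network with hypothetical names, not the actual test):

```python
import paddle.fluid as fluid

# Toy network wrapped in ParallelDo, mirroring the structure of the
# reproducer above (a sketch under assumed names, not the real test).
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')

places = fluid.layers.get_places(device_count=0)  # all visible devices
pd = fluid.layers.ParallelDo(places)
with pd.do():
    x_ = fluid.layers.cast(pd.read_input(x), 'float32')  # per-device slice
    y_ = pd.read_input(y)
    y_pred = fluid.layers.fc(input=x_, size=1)
    cost = fluid.layers.square_error_cost(input=y_pred, label=y_)
    pd.write_output(fluid.layers.mean(x=cost))

avg_cost = fluid.layers.mean(x=pd())  # average the per-device costs
fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)
```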
Traceback (most recent call last):
File "new_srl_base_tanh_MultiThread.py", line 289, in <module>
fetch_list=[avg_cost, num_infer_chunks, num_label_chunks, num_correct_chunks])
File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/executor.py", line 349, in run
self.executor.run(program_cache.desc, scope, 0, True, True)
paddle.fluid.core.EnforceNotMet: var crfw@GRAD is both input and output, does not support transform at [/paddle_gpu/Paddle/paddle/fluid/framework/operator.cc:535]
PaddlePaddle Call Stacks:
0 0x7f71c45bd48cp paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 572
1 0x7f71c51b818fp paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 2783
2 0x7f71c4f96af2p paddle::operators::ParallelDoGradOp::AccumulateGrad(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&, std::vector<paddle::framework::Scope*, std::allocator<paddle::framework::Scope*> > const&, std::vector<boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_>, std::allocator<boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> > > const&) const + 2482
3 0x7f71c4f9b3bcp paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 2220
4 0x7f71c466e4a5p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool) + 1781
5 0x7f71c466fa5fp paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 63
6 0x7f71c45d9fc3p void pybind11::cpp_function::initialize<pybind11::cpp_function::initialize<void, paddle::framework::Executor, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::Executor::*)(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool)#1}, void, paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(pybind11::cpp_function::initialize<void, paddle::framework::Executor, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::Executor::*)(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool)#1}&&, void (*)(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(pybind11::detail::function_call&)#3}::_FUN(pybind11::detail::function_call) + 579
7 0x7f71c45d7d04p pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 1236
8 0x4c37edp PyEval_EvalFrameEx + 31165
9 0x4b9ab6p PyEval_EvalCodeEx + 774
10 0x4c16e7p PyEval_EvalFrameEx + 22711
11 0x4b9ab6p PyEval_EvalCodeEx + 774
12 0x4eb30fp
13 0x4e5422p PyRun_FileExFlags + 130
14 0x4e3cd6p PyRun_SimpleFileExFlags + 390
15 0x493ae2p Py_Main + 1554
16 0x7f721003b830p __libc_start_main + 240
17 0x4933e9p _start + 41