AI studio训练LSTM报错ExternalError: Cudnn error, CUDNN_STATUS_EXECUTION_FAILED
Created by: FredMushZhao
可以单独运行网络结构部分代码,但是开始训练后就报错,代码如下 网络结构
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
data = fluid.data(name='poetry', shape=[BATCH_SIZE, SEQ_SIZE], dtype='int64')
label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
hid1 = fluid.data(name='H2L', shape=[BATCH_SIZE, NUM_LAYERS, HIDDEN_DIM], dtype='float32')
hid2 = fluid.data(name='C2L', shape=[BATCH_SIZE, NUM_LAYERS, HIDDEN_DIM], dtype='float32')
hid1 = fluid.layers.reshape(x=hid1, shape=[NUM_LAYERS, BATCH_SIZE, HIDDEN_DIM])
hid2 = fluid.layers.reshape(x=hid2, shape=[NUM_LAYERS, BATCH_SIZE, HIDDEN_DIM])
#LSTM
data = fluid.embedding(data, size=[DICT_SIZE, EMBD_SIZE], is_sparse=True)
output, h, c = fluid.layers.lstm(data, hid1, hid2, max_len=64, hidden_size=HIDDEN_DIM, num_layers=NUM_LAYERS)
output = fluid.layers.reshape(output, [BATCH_SIZE*SEQ_SIZE, -1])
prediction = fluid.layers.fc(output, DICT_SIZE, act='softmax')
loss = fluid.layers.cross_entropy(prediction, label)
avg_loss = fluid.layers.mean(loss)
opt = fluid.optimizer.Adam(learning_rate=0.002)
re = opt.minimize(avg_loss)
训练部分
feed_order = ['poetry', 'label', 'H2L', 'C2L']
pass_num = 20
def train(main_program):
exe.run(fluid.default_startup_program())
feeder = fluid.DataFeeder(feed_order, place=place)
poetry, ix2word, word2ix = loadData()
train_reader = readData(poetry, ix2word, word2ix)
# blank = np.zeros([BATCH_SIZE, NUM_LAYERS, HIDDEN_DIM], dtype='float32')
for epoch_id in range(pass_num):
for step_id, data in enumerate(train_reader()):
metrics = exe.run(main_program, feed=feeder.feed(data), fetch_list=[avg_loss])
if step_id % 90 == 0:
print("Step {0}, Epoch {1} Metrics {2}".format(step_id, epoch_id, list(map(np.array, metrics))))
break
break
fluid.io.save_inference_model(params_dirname, ['poetry', 'H2L', 'C2L'], [prediction, h, c], exe)
报错如下
---------------------------------------------------------------------------EnforceNotMet Traceback (most recent call last)<ipython-input-7-1e1a4d6df723> in <module>
1 main_program = fluid.default_main_program()
----> 2 train(main_program)
<ipython-input-5-01d96f492be1> in train(main_program)
11 for epoch_id in range(pass_num):
12 for step_id, data in enumerate(train_reader()):
---> 13 metrics = exe.run(main_program, feed=feeder.feed(data), fetch_list=[avg_loss])
14
15 if step_id % 90 == 0:
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in run(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1069 warnings.warn(
1070 "The following exception is not an EOF exception.")
-> 1071 six.reraise(*sys.exc_info())
1072
1073 def _run_impl(self, program, feed, fetch_list, feed_var_name,
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/six.py in reraise(tp, value, tb)
701 if value.__traceback__ is not tb:
702 raise value.with_traceback(tb)
--> 703 raise value
704 finally:
705 value = None
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in run(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1064 use_program_cache=use_program_cache,
1065 use_prune=use_prune,
-> 1066 return_merged=return_merged)
1067 except Exception as e:
1068 if not isinstance(e, core.EOFException):
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in _run_impl(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache, return_merged, use_prune)
1152 scope=scope,
1153 return_numpy=return_numpy,
-> 1154 use_program_cache=use_program_cache)
1155
1156 program._compile(scope, self.place)
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py in _run_program(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache)
1227 if not use_program_cache:
1228 self._default_executor.run(program.desc, scope, 0, True, True,
-> 1229 fetch_var_name)
1230 else:
1231 self._default_executor.run_prepared_ctx(ctx, scope, False, False,
EnforceNotMet:
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::string paddle::platform::GetTraceBackString<char const*>(char const*&&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int)
2 paddle::operators::CudnnLSTMGPUKernel<float>::Compute(paddle::framework::ExecutionContext const&) const
3 std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CUDAPlace, false, 0ul, paddle::operators::CudnnLSTMGPUKernel<float> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&)
4 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&, paddle::framework::RuntimeContext*) const
5 paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&) const
6 paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, paddle::platform::Place const&)
7 paddle::framework::Executor::RunPartialPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, long, long, bool, bool, bool)
8 paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool)
9 paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector<std::string, std::allocator<std::string> > const&, bool, bool)
------------------------------------------
Python Call Stacks (More useful to users):
------------------------------------------
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/framework.py", line 2610, in append_op
attrs=kwargs.get("attrs", None))
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/layer_helper.py", line 43, in append_op
return self.main_program.current_block().append_op(*args, **kwargs)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/layers/rnn.py", line 2188, in lstm
'seed': seed,
File "<ipython-input-4-a14aeaabfd52>", line 12, in <module>
output, h, c = fluid.layers.lstm(data, hid1, hid2, max_len=64, hidden_size=HIDDEN_DIM, num_layers=NUM_LAYERS)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3265, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3183, in run_ast_nodes
if (yield from self.run_code(code, result)):
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3018, in run_cell_async
interactivity=interactivity, compiler=compiler, result=result)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
coro.send(None)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2843, in _run_cell
return runner(coro)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2817, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/tornado/gen.py", line 326, in wrapper
yielded = next(result)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 534, in execute_request
user_expressions, allow_stdin,
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/tornado/gen.py", line 326, in wrapper
yielded = next(result)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
yield gen.maybe_future(handler(stream, idents, msg))
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/tornado/gen.py", line 326, in wrapper
yielded = next(result)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 357, in process_one
yield gen.maybe_future(dispatch(*args))
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/tornado/gen.py", line 1147, in run
yielded = self.gen.send(value)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/tornado/gen.py", line 1233, in inner
self.run()
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/tornado/stack_context.py", line 300, in null_wrapper
return fn(*args, **kwargs)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/tornado/ioloop.py", line 758, in _run_callback
ret = callback()
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/asyncio/events.py", line 88, in _run
self._context.run(self._callback, *self._args)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/asyncio/base_events.py", line 1771, in _run_once
handle._run()
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/asyncio/base_events.py", line 534, in run_forever
self._run_once()
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 132, in start
self.asyncio_loop.run_forever()
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 505, in start
self.io_loop.start()
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/traitlets/config/application.py", line 664, in launch_instance
app.start()
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
----------------------
**Error Message Summary:
----------------------
ExternalError: Cudnn error, CUDNN_STATUS_EXECUTION_FAILED at (/paddle/paddle/fluid/operators/cudnn_lstm_op.cu.cc:113)
[operator < cudnn_lstm > error]**