operator matmul error
Created by: OOMMYY
我根据 https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/language_representations_kit/BERT 中介绍的方法下载 XNLI dev/test set 和 XNLI machine-translated training set,然后解压到同一个目录。 启动 Fine-tuning , 启动脚本如下:
# Launch script for BERT XNLI fine-tuning (run_classifier.py).
# Usage: ./run.sh <gpu_ids>   e.g. ./run.sh 0
export FLAGS_sync_nccl_allreduce=0
export FLAGS_eager_delete_tensor_gb=1
# First positional argument selects the visible GPU(s).
export CUDA_VISIBLE_DEVICES=${1}
BERT_BASE_PATH=hqa/uncased_L-12_H-768_A-12
TASK_NAME='XNLI'
DATA_PATH=hqa/ace/sent_v1/xnli
CKPT_PATH=hqa/checkpoints-sent
python -u run_classifier.py \
    --task_name ${TASK_NAME} \
    --use_cuda true \
    --do_train true \
    --do_val true \
    --do_test true \
    --batch_size 32 \
    --in_tokens false \
    --init_pretraining_params ${BERT_BASE_PATH}/params \
    --data_dir ${DATA_PATH} \
    --vocab_path ${BERT_BASE_PATH}/vocab.txt \
    --checkpoints ${CKPT_PATH} \
    --save_steps 1000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --validation_steps 100 \
    --epoch 1 \
    --max_seq_len 60 \
    --bert_config_path ${BERT_BASE_PATH}/bert_config.json \
    --learning_rate 1e-5 \
    --skip_steps 10 \
    --num_iteration_per_drop_scope 10 \
    --use_fp16 false \
    --verbose true
# FIX: --use_fp16 changed true -> false. The reported failure
# ("CUBLAS: arch mismatch" raised from MatMulKernel<float16>) means the
# GPU's compute capability is below 5.3, so cuBLAS has no fp16 GEMM path
# and every fp16 matmul aborts. fp16 training also explains the
# "ave loss: nan" seen on step 0. Re-enable fp16 only on Volta/Pascal-P100
# or newer hardware (compute capability >= 5.3 for fp16 arithmetic).
错误信息如下:
share_vars_from is set, scope is ignored.
I0813 17:18:54.545492 23654 parallel_executor.cc:329] The number of CUDAPlace, which is used in ParallelExecutor, is 1. And the Program will be copied 1 copies
I0813 17:18:54.568922 23654 build_strategy.cc:340] SeqOnlyAllReduceOps:0, num_trainers:1
train pyreader queue size: 50, learning rate: 0.000000
epoch: 0, progress: 1697/392702, step: 0, ave loss: nan, ave acc: 0.34375
Traceback (most recent call last):
File "run_classifier.py", line 444, in <module>
main(args)
File "run_classifier.py", line 321, in main
outputs = train_exe.run(fetch_list=fetch_list)
File "/home/work/anaconda2/envs/yuanzhen3/lib/python3.5/site-packages/paddle/fluid/parallel_executor.py", line 280, in run
return_numpy=return_numpy)
File "/home/work/anaconda2/envs/yuanzhen3/lib/python3.5/site-packages/paddle/fluid/executor.py", line 666, in run
return_numpy=return_numpy)
File "/home/work/anaconda2/envs/yuanzhen3/lib/python3.5/site-packages/paddle/fluid/executor.py", line 528, in _run_parallel
exe.run(fetch_var_names, fetch_var_name)
paddle.fluid.core_avx.EnforceNotMet: Invoke operator matmul error.
Python Callstacks:
File "/home/work/anaconda2/envs/yuanzhen3/lib/python3.5/site-packages/paddle/fluid/framework.py", line 1771, in append_op
attrs=kwargs.get("attrs", None))
File "/home/work/anaconda2/envs/yuanzhen3/lib/python3.5/site-packages/paddle/fluid/layer_helper.py", line 43, in append_op
return self.main_program.current_block().append_op(*args, **kwargs)
File "/home/work/anaconda2/envs/yuanzhen3/lib/python3.5/site-packages/paddle/fluid/layers/nn.py", line 5255, in matmul
'alpha': float(alpha),
File "/home/work/liuyuanzhen/baidu/event-graph/ERNIE/BERT/model/bert.py", line 116, in _build_model
x=input_mask, y=input_mask, transpose_y=True)
File "/home/work/liuyuanzhen/baidu/event-graph/ERNIE/BERT/model/bert.py", line 81, in __init__
self._build_model(src_ids, position_ids, sentence_ids, input_mask)
File "/home/work/liuyuanzhen/baidu/event-graph/ERNIE/BERT/model/classifier.py", line 48, in create_model
use_fp16=args.use_fp16)
File "run_classifier.py", line 199, in main
num_labels=num_labels)
File "run_classifier.py", line 444, in <module>
main(args)
C++ Callstacks:
CUBLAS: arch mismatch, at [/paddle/paddle/fluid/operators/math/blas_impl.cu.h:149]
PaddlePaddle Call Stacks:
0 0x7f3f380cdf00p void paddle::platform::EnforceNotMet::Init<char const*>(char const*, char const*, int) + 352
1 0x7f3f380ce279p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 137
2 0x7f3f3884a4e7p void paddle::operators::math::Blas<paddle::platform::CUDADeviceContext>::MatMul<paddle::platform::float16>(paddle::framework::Tensor const&, paddle::operators::math::MatDescriptor const&, paddle::framework::Tensor const&, paddle::operators::math::MatDescriptor const&, paddle::platform::float16, paddle::framework::Tensor*, paddle::platform::float16) const + 1415
3 0x7f3f3884aa60p paddle::operators::MatMulKernel<paddle::platform::CUDADeviceContext, paddle::platform::float16>::Compute(paddle::framework::ExecutionContext const&) const + 784
4 0x7f3f3884ab73p std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CUDAPlace, false, 2ul, paddle::operators::MatMulKernel<paddle::platform::CUDADeviceContext, float>, paddle::operators::MatMulKernel<paddle::platform::CUDADeviceContext, double>, paddle::operators::MatMulKernel<paddle::platform::CUDADeviceContext, paddle::platform::float16> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&) + 35
5 0x7f3f3a14d157p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&, paddle::framework::RuntimeContext*) const + 375
6 0x7f3f3a14d531p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 529
7 0x7f3f3a14ab2cp paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 332
8 0x7f3f39f46bcap paddle::framework::details::ComputationOpHandle::RunImpl() + 250
9 0x7f3f39f39570p paddle::framework::details::OpHandleBase::Run(bool) + 160
10 0x7f3f39e9285cp paddle::framework::details::ThreadedSSAGraphExecutor::RunOpSync(paddle::framework::details::OpHandleBase*) + 316
11 0x7f3f39e8d767p paddle::framework::details::ThreadedSSAGraphExecutor::RunTracedOps(std::vector<paddle::framework::details::OpHandleBase*, std::allocator<paddle::framework::details::OpHandleBase*> > const&) + 71
12 0x7f3f39e95a7bp paddle::framework::details::ThreadedSSAGraphExecutor::RunImpl(std::vector<std::string, std::allocator<std::string> > const&) + 3547
13 0x7f3f39e92282p paddle::framework::details::ThreadedSSAGraphExecutor::Run(std::vector<std::string, std::allocator<std::string> > const&) + 482
14 0x7f3f39e7e69cp paddle::framework::details::ScopeBufferedSSAGraphExecutor::Run(std::vector<std::string, std::allocator<std::string> > const&) + 124
15 0x7f3f382a4df1p paddle::framework::ParallelExecutor::Run(std::vector<std::string, std::allocator<std::string> > const&, std::string const&) + 305
16 0x7f3f380bf25ep
17 0x7f3f38100816p
18 0x7f3f897d62d8p PyCFunction_Call + 120
19 0x7f3f89831dd6p PyEval_EvalFrameEx + 23686
20 0x7f3f89831aa4p PyEval_EvalFrameEx + 22868
21 0x7f3f89831351p PyEval_EvalFrameEx + 20993
22 0x7f3f89831351p PyEval_EvalFrameEx + 20993
23 0x7f3f8982cc20p PyEval_EvalFrameEx + 2768
24 0x7f3f898372adp PyEval_EvalCodeEx + 525
25 0x7f3f898381fcp PyEval_EvalCode + 28
26 0x7f3f898958d4p
27 0x7f3f89896f41p PyRun_FileExFlags + 161
28 0x7f3f8989715ep PyRun_SimpleFileExFlags + 478
29 0x7f3f8989780dp Py_Main + 1485
30 0x7f3f89761571p main + 225
31 0x7f3f88ea7b45p __libc_start_main + 245
32 0x7f3f89839f38p