PaddlePaddle / Paddle · Issue #9200
Opened March 19, 2018 by saxon_zh (Guest)

Multi-threaded training crashes when the same input is fed through two embeddings

Created by: jshower

When training with the following network structure in multi-threaded mode, training crashes (single-threaded training does not). The error log is pasted at the end.

import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def ner_net(word_dict_len, label_dict_len, parallel):
    IS_SPARSE = True
    #embedding_name = 'emb'
    #word_dict_len = 1942562
    word_dim = 32
    mention_dict_len = 57
    mention_dim = 20
    grnn_hidden = 36
    #label_dict_len = 49
    
    def _net_conf(word, mention, target):
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))

        mention_embedding = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))

        word_embedding_r = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))

        mention_embedding_r = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))

        word_mention_vector = fluid.layers.concat(
            input=[word_embedding, mention_embedding], axis=1)

        word_mention_vector_r = fluid.layers.concat(
            input=[word_embedding_r, mention_embedding_r], axis=1)
    
        pre_gru = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        pre_gru_r = fluid.layers.fc(
            input=word_mention_vector_r,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        gru_merged = fluid.layers.concat(input=[gru, gru_r], axis=1)

        emission = fluid.layers.fc(
            input=gru_merged,
            size=label_dict_len,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(
                name='crfw',
                learning_rate=0.2,
                #regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)
        ))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, emission

    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name="target", shape=[1], dtype='int64', lod_level=1)

    if parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            word_ = pd.read_input(word)
            mention_ = pd.read_input(mention)
            target_ = pd.read_input(target)
            avg_cost, emission_base = _net_conf(word_, mention_, target_)
            pd.write_output(avg_cost)
            pd.write_output(emission_base)
        avg_cost_list, emission = pd()
        avg_cost = fluid.layers.mean(x=avg_cost_list)
        emission.stop_gradient = True
    else:
        avg_cost, emission = _net_conf(word, mention, target)

    return avg_cost, emission, word, mention, target

If the program above is adjusted slightly by removing the two extra embeddings, the error no longer occurs. I would like to ask what the cause is. This feels user-unfriendly, because applying several different embeddings to the same input does happen in practice.
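Stripped of the GRU/CRF plumbing, the pattern in question is just two independent embedding tables looking up the same input variable. A minimal sketch of that pattern (the vocabulary size of 1000 is chosen only for illustration; the fluid calls are the same ones used in the listings here):

import paddle.fluid as fluid

# Two independent embedding tables over the same int64 input.
word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)

emb_a = fluid.layers.embedding(
    input=word, size=[1000, 32], dtype='float32', is_sparse=True)
emb_b = fluid.layers.embedding(  # second, separate table, same input
    input=word, size=[1000, 32], dtype='float32', is_sparse=True)

merged = fluid.layers.concat(input=[emb_a, emb_b], axis=1)

Single-threaded this builds and trains fine; under ParallelDo the second lookup over the same input appears to be what triggers the crash reported above. The full adjusted program, with the duplicate embeddings commented out, follows.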

import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def ner_net(word_dict_len, label_dict_len, parallel):
    IS_SPARSE = True
    #embedding_name = 'emb'
    #word_dict_len = 1942562
    word_dim = 32
    mention_dict_len = 57
    mention_dim = 20
    grnn_hidden = 36
    #label_dict_len = 49
    
    def _net_conf(word, mention, target):
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="word_emb_f", learning_rate=5))

        mention_embedding = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="men_emb_r", learning_rate=5))
        '''        
        word_embedding_r = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="word_emb_f", learning_rate=5))

        mention_embedding_r = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="men_emb_r", learning_rate=5))
        '''
        word_mention_vector = fluid.layers.concat(
            input=[word_embedding, mention_embedding], axis=1)
        '''
        word_mention_vector_r = fluid.layers.concat(
            input=[word_embedding_r, mention_embedding_r], axis=1)
        '''    
        pre_gru = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        pre_gru_r = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        gru_merged = fluid.layers.concat(input=[gru, gru_r], axis=1)

        emission = fluid.layers.fc(
            input=gru_merged,
            size=label_dict_len,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(
                name='crfw',
                learning_rate=0.2,
                #regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)
        ))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, emission

    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name="target", shape=[1], dtype='int64', lod_level=1)

    if parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            word_ = pd.read_input(word)
            mention_ = pd.read_input(mention)
            target_ = pd.read_input(target)
            avg_cost, emission_base = _net_conf(word_, mention_, target_)
            pd.write_output(avg_cost)
            pd.write_output(emission_base)
        avg_cost_list, emission = pd()
        avg_cost = fluid.layers.mean(x=avg_cost_list)
        emission.stop_gradient = True
    else:
        avg_cost, emission = _net_conf(word, mention, target)

    return avg_cost, emission, word, mention, target
Error log:

F0319 08:04:48.310140  3321 threadpool.h:96] The exception is thrown inside the thread pool. You should use RunAndGetException to handle the exception.
The default exception handler is LOG(FATAL).enforce dtype != -1 failed, -1 == -1
Sum operator should have at least one tensor at [/paddle_gpu/Paddle/paddle/fluid/operators/sum_op.cc:73]
PaddlePaddle Call Stacks: 
0       0x7f2c50a1c48cp paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 572
1       0x7f2c514f6138p paddle::operators::SumOp::GetExpectedKernelType(paddle::framework::ExecutionContext const&) const + 1912
2       0x7f2c5161682dp paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 381
3       0x7f2c50acd4a5p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool) + 1781
4       0x7f2c50acea5fp paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 63
5       0x7f2c513f38b3p std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > >, std::__future_base::_Result_base::_Deleter>, std::_Bind_simple<std::reference_wrapper<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}> ()>, std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > >::_M_invoke(std::_Any_data const&) + 99
6       0x7f2c513f058ep std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) + 46
7       0x7f2c8a954a99p
8       0x7f2c513f0bd2p std::__future_base::_State_baseV2::_M_set_result(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>, bool) + 146
9       0x7f2c513f0d46p std::__future_base::_Task_state<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<std::unique_ptr> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}, std::allocator<int>, std::default_delete<std::unique_ptr> ()>::_M_run() + 86
10      0x7f2c516315d4p paddle::framework::ThreadPool::TaskLoop() + 1012
11      0x7f2c7eab8c80p
12      0x7f2c8a94d6bap
13      0x7f2c8a68341dp clone + 109
*** Check failure stack trace: ***
    @     0x7f2c517decad  google::LogMessage::Fail()
    @     0x7f2c517e0ff8  google::LogMessage::SendToLog()
    @     0x7f2c517de7bb  google::LogMessage::Flush()
    @     0x7f2c517e1ece  google::LogMessageFatal::~LogMessageFatal()
    @     0x7f2c513f1847  std::_Function_handler<>::_M_invoke()
    @     0x7f2c513f058e  std::__future_base::_State_baseV2::_M_do_set()
    @     0x7f2c8a954a99  __pthread_once_slow
    @     0x7f2c513f0bd2  std::__future_base::_State_baseV2::_M_set_result()
    @     0x7f2c513f0c91  std::__future_base::_Deferred_state<>::_M_complete_async()
    @     0x7f2c513fa32a  paddle::operators::ParallelDoGradOp::RunImpl()
    @     0x7f2c50acd4a5  paddle::framework::Executor::RunPreparedContext()
    @     0x7f2c50acea5f  paddle::framework::Executor::Run()
    @     0x7f2c50a38fc3  _ZZN8pybind1112cpp_function10initializeIZNS0_C4IvN6paddle9framework8ExecutorEIRKNS4_11ProgramDescEPNS4_5ScopeEibbEINS_4nameENS_9is_methodENS_7siblingEEEEMT0_FT_DpT1_EDpRKT2_EUlPS5_S8_SA_ibbE_vISO_S8_SA_ibbEISB_SC_SD_EEEvOSF_PFSE_SH_ESN_ENUlRNS_6detail13function_callEE1_4_FUNESV_
    @     0x7f2c50a36d04  pybind11::cpp_function::dispatcher()
    @           0x4c37ed  PyEval_EvalFrameEx
    @           0x4b9ab6  PyEval_EvalCodeEx
    @           0x4c16e7  PyEval_EvalFrameEx
    @           0x4b9ab6  PyEval_EvalCodeEx
    @           0x4c1e6f  PyEval_EvalFrameEx
    @           0x4b9ab6  PyEval_EvalCodeEx
    @           0x4eb30f  (unknown)
    @           0x4e5422  PyRun_FileExFlags
    @           0x4e3cd6  PyRun_SimpleFileExFlags
    @           0x493ae2  Py_Main
    @     0x7f2c8a59c830  __libc_start_main
    @           0x4933e9  _start
    @              (nil)  (unknown)
Aborted
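For completeness, a hypothetical driver along these lines would exercise the network (this is an assumption, not part of the original report: the dictionary sizes come from the constants commented out in the listings above, and the optimizer and learning rate are placeholders; the stack trace only shows the crash surfacing inside Executor.run during the ParallelDo backward pass):

import paddle.fluid as fluid

# Hypothetical driver (assumed, not from the report): builds the parallel
# network and runs the startup program. Dict sizes are taken from the
# commented-out constants in the listings above.
avg_cost, emission, word, mention, target = ner_net(
    word_dict_len=1942562, label_dict_len=49, parallel=True)

sgd = fluid.optimizer.SGD(learning_rate=0.01)  # optimizer choice is assumed
sgd.minimize(avg_cost)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# Feeding real LoD tensors for word/mention/target and running the main
# program would then hit the fatal check shown in the log above.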

If you need the complete environment to reproduce the problem, you can contact me on Hi (jiaozhenyu).
