Skip to content

  • 体验新版
    • 正在加载...
  • 登录
  • PaddlePaddle
  • Paddle
  • Issue
  • #15979

P
Paddle
  • 项目概览

PaddlePaddle / Paddle
大约 2 年 前同步成功

通知 2325
Star 20933
Fork 5424
  • 代码
    • 文件
    • 提交
    • 分支
    • Tags
    • 贡献者
    • 分支图
    • Diff
  • Issue 1423
    • 列表
    • 看板
    • 标记
    • 里程碑
  • 合并请求 543
  • Wiki 0
    • Wiki
  • 分析
    • 仓库
    • DevOps
  • 项目成员
  • Pages
P
Paddle
  • 项目概览
    • 项目概览
    • 详情
    • 发布
  • 仓库
    • 仓库
    • 文件
    • 提交
    • 分支
    • 标签
    • 贡献者
    • 分支图
    • 比较
  • Issue 1,423
    • Issue 1,423
    • 列表
    • 看板
    • 标记
    • 里程碑
  • 合并请求 543
    • 合并请求 543
  • Pages
  • 分析
    • 分析
    • 仓库分析
    • DevOps
  • Wiki 0
    • Wiki
  • 成员
    • 成员
  • 收起侧边栏
  • 动态
  • 分支图
  • 创建新Issue
  • 提交
  • Issue看板
已关闭
开放中
Opened 2月 28, 2019 by saxon_zh@saxon_zhGuest

Multitask分类任务训练loss为nan

Created by: yuanyc06

目前我这边遇到了一个loss变为nan的问题,具体背景是这样:我打算将一个现有简单分类网络变为multitask,即将最后的单个fc层变为2个分支(之前的主干不变,只是最后接出两个并列的fc层),每个fc对应一个独立的分类任务。目前发现训练不久后loss变为极大,之后变为nan。试过了减小lr和gradient clipping,都不管用。试了用caffe代码实现同样的网络,训练时loss正常;也试了单独各自训练两个task(每次只有对应的1个fc),loss也都正常。感觉是我paddle实现的代码哪里有问题,train.py里几个主要函数net_config, build_program, train相关代码如下,对应args.multitask=True的分支:

def net_config(image, label, model, args):
    """Build the forward graph plus loss/accuracy metrics for one program.

    Args:
        image: input image variable (float tensor produced by the py_reader).
        label: a single int64 label variable, or — in the multitask branch —
            a LIST of two int64 label variables (one per fc head).
        model: model object exposing ``.net(...)`` and ``.params``.
        args: parsed command-line arguments.

    Returns:
        ``(avg_cost, acc_top1, acc_top5)`` for single-task models, or
        ``(avg_cost, acc_top1_0, acc_top5_0, acc_top1_1, acc_top5_1)`` for
        the multitask branch.
    """
    model_list = [m for m in dir(models) if "__" not in m]
    # Message kept consistent with the identical assertion in build_program().
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    class_dim = args.class_dim
    # Per-task class counts for the multitask heads, parsed from a JSON
    # string such as "[10, 20]".
    class_dim_mt = json.loads(args.class_dim_mt)
    model_name = args.model

    if args.enable_ce:
        # Continuous-evaluation mode pins the model and dropout seed so runs
        # are reproducible; the CE flowers dataset has 102 classes.
        assert model_name == "SE_ResNeXt50_32x4d"
        model.params["dropout_seed"] = 100
        class_dim = 102

    if model_name == "GoogleNet":
        # GoogLeNet emits two auxiliary classifier outputs; their losses are
        # down-weighted by 0.3 each.
        out0, out1, out2 = model.net(input=image, class_dim=class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
        cost1 = fluid.layers.cross_entropy(input=out1, label=label)
        cost2 = fluid.layers.cross_entropy(input=out2, label=label)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost2 = fluid.layers.mean(x=cost2)

        avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2
        acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1)
        #acc_top5 = fluid.layers.accuracy(input=out0, label=label, k=5)
        acc_top5 = acc_top1
    elif model_name == 'SE_ResNeXt50_32x4d' and args.multitask:
        # Multitask: the backbone ends in two parallel fc heads; `label`
        # is a list of two variables. ignore_index=-1 skips unlabeled
        # samples for a given task.
        out0, out1 = model.net(input=image, class_dim=class_dim_mt, multitask=True)
        cost0, pred0 = fluid.layers.softmax_with_cross_entropy(out0, label[0], return_softmax=True, ignore_index=-1)
        cost1, pred1 = fluid.layers.softmax_with_cross_entropy(out1, label[1], return_softmax=True, ignore_index=-1)
        if args.gradient_clipping:
            # NOTE(review): this clips the per-sample LOSS values, not the
            # parameter gradients — fluid.layers.clip back-propagates zero
            # gradient for entries outside [-gc, gc]. If true gradient
            # clipping was intended, use fluid's gradient-clip API instead.
            gc = float(args.gradient_clipping_th)
            cost0 = fluid.layers.clip(cost0, -gc, gc)
            cost1 = fluid.layers.clip(cost1, -gc, gc)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost = avg_cost0 + avg_cost1
        if args.scale_loss > 1:
            # Loss scaling (paired with fp16 training elsewhere).
            avg_cost = avg_cost * float(args.scale_loss)
        acc_top1_0 = fluid.layers.accuracy(input=pred0, label=label[0], k=1)
        acc_top5_0 = acc_top1_0
        acc_top1_1 = fluid.layers.accuracy(input=pred1, label=label[1], k=1)
        acc_top5_1 = acc_top1_1
        return avg_cost, acc_top1_0, acc_top5_0, acc_top1_1, acc_top5_1
    else:
        # Generic single-task path.
        out = model.net(input=image, class_dim=class_dim)
        cost, pred = fluid.layers.softmax_with_cross_entropy(out, label, return_softmax=True)
        if args.scale_loss > 1:
            avg_cost = fluid.layers.mean(x=cost) * float(args.scale_loss)
        else:
            avg_cost = fluid.layers.mean(x=cost)

        acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
        #acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5)
        acc_top5 = acc_top1

    return avg_cost, acc_top1, acc_top5

def build_program(is_train, main_prog, startup_prog, args):
    """Construct a train or test program and return its reader and metrics.

    Returns ``(py_reader, avg_cost, acc_top1, acc_top5)`` in single-task
    mode, or ``(py_reader, avg_cost, acc_top1_0, acc_top5_0, acc_top1_1,
    acc_top5_1)`` in multitask mode. An optimizer is attached to
    ``main_prog`` only when ``is_train`` is True.
    """
    image_shape = [int(dim) for dim in args.image_shape.split(",")]
    model_name = args.model
    available = [m for m in dir(models) if "__" not in m]
    assert model_name in available, "{} is not in lists: {}".format(args.model,
                                                                    available)
    model = models.__dict__[model_name]()

    # The multitask reader carries one extra int64 label slot.
    if args.multitask:
        reader_shapes = [[-1] + image_shape, [-1, 1], [-1, 1]]
        reader_lods = [0, 0, 0]
        reader_dtypes = ["float32", "int64", "int64"]
    else:
        reader_shapes = [[-1] + image_shape, [-1, 1]]
        reader_lods = [0, 0]
        reader_dtypes = ["float32", "int64"]

    with fluid.program_guard(main_prog, startup_prog):
        py_reader = fluid.layers.py_reader(
            capacity=16,
            shapes=reader_shapes,
            lod_levels=reader_lods,
            dtypes=reader_dtypes,
            use_double_buffer=True)
        with fluid.unique_name.guard():
            if args.multitask:
                image, label0, label1 = fluid.layers.read_file(py_reader)
                label = [label0, label1]
            else:
                image, label = fluid.layers.read_file(py_reader)
            if args.fp16:
                image = fluid.layers.cast(image, "float16")

            # net_config yields (avg_cost, *accuracy_metrics); mark every
            # fetched variable persistable so it survives for fetching.
            outputs = net_config(image, label, model, args)
            for metric in outputs:
                metric.persistable = True
            avg_cost = outputs[0]

            if is_train:
                params = model.params
                params["total_images"] = args.total_images
                params["lr"] = args.lr
                params["num_epochs"] = args.num_epochs
                params["learning_strategy"]["batch_size"] = args.batch_size
                params["learning_strategy"]["name"] = args.lr_strategy

                optimizer = optimizer_setting(params)

                if args.fp16:
                    # fp16 path: keep fp32 master weights and apply the
                    # scaled gradients to them.
                    params_grads = optimizer.backward(avg_cost)
                    master_params_grads = create_master_params_grads(
                        params_grads, main_prog, startup_prog, args.scale_loss)
                    optimizer.apply_gradients(master_params_grads)
                    master_param_to_train_param(master_params_grads, params_grads, main_prog)
                else:
                    optimizer.minimize(avg_cost)

    return (py_reader,) + tuple(outputs)


def train(args):
    """Top-level training driver.

    Builds train/test programs, optionally restores a checkpoint or
    pretrained weights, then for each epoch runs the training loop followed
    by evaluation on the test program, saving persistables after every pass.
    Behavior forks on ``args.multitask``: the multitask branch fetches
    per-task top-1 accuracies (the top-5 slots mirror top-1 upstream).
    """
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir

    # Separate programs for startup, training, and testing.
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        # Fixed seeds for reproducible continuous-evaluation runs.
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    print('start build_program')
    if not args.multitask:
        print('single task')
        train_py_reader, train_cost, train_acc1, train_acc5 = build_program(
            is_train=True,
            main_prog=train_prog,
            startup_prog=startup_prog,
            args=args)
        test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
            is_train=False,
            main_prog=test_prog,
            startup_prog=startup_prog,
            args=args)
    else:
        print('multi task')
        train_py_reader, train_cost, train_acc1_0, train_acc5_0, train_acc1_1, train_acc5_1 = build_program(
            is_train=True,
            main_prog=train_prog,
            startup_prog=startup_prog,
            args=args)
        test_py_reader, test_cost, test_acc1_0, test_acc5_0, test_acc1_1, test_acc5_1 = build_program(
            is_train=False,
            main_prog=test_prog,
            startup_prog=startup_prog,
            args=args)
    print('finished build_program')
    # Freeze the test program (strips backward ops, fixes dropout/BN to
    # inference behavior).
    test_prog = test_prog.clone(for_test=True)

    if with_memory_optimization:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # Resume from a full checkpoint (optimizer state included) if given.
    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    # Otherwise, warm-start only the variables that exist in the
    # pretrained-model directory (allows partial loading, e.g. new fc heads).
    if pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    # Determine GPU count from CUDA_VISIBLE_DEVICES, falling back to
    # counting devices reported by `nvidia-smi -L`.
    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
    if visible_device:
        device_num = len(visible_device.split(','))
    else:
        device_num = subprocess.check_output(['nvidia-smi', '-L']).decode().count('\n')

    # NOTE(review): true division — under Python 3 this makes the per-device
    # batch size a float; use `//` if this script targets Python 3.
    train_batch_size = args.batch_size / device_num
    test_batch_size = 32
    if not args.enable_ce:
        train_reader = paddle.batch(
            reader.train(file_list=args.train_list, data_dir=args.data_dir), batch_size=train_batch_size, drop_last=True)
        test_reader = paddle.batch(reader.val(file_list=args.test_list, data_dir=args.data_dir), batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder data
        # but it is time consuming. For faster speed, need another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    # Feed the Python-side readers into the graph-side py_readers.
    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)
    # Multi-device training executor; evaluation below uses the plain `exe`.
    train_exe = fluid.ParallelExecutor(
        main_program=train_prog,
        use_cuda=bool(args.use_gpu),
        loss_name=train_cost.name)

    print('start training')
    if not args.multitask:
        print('single training')
        train_fetch_list = [train_cost.name, train_acc1.name, train_acc5.name]
        test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]

        params = models.__dict__[args.model]().params

        for pass_id in range(params["num_epochs"]):

            train_py_reader.start()

            # Slots: [loss, acc1, acc5] accumulated over the epoch.
            train_info = [[], [], []]
            test_info = [[], [], []]
            train_time = []
            batch_id = 0
            try:
                # The py_reader signals epoch end by raising EOFException.
                while True:
                    t1 = time.time()
                    loss, acc1, acc5 = train_exe.run(fetch_list=train_fetch_list)
                    t2 = time.time()
                    period = t2 - t1
                    # ParallelExecutor returns per-device values; average them.
                    loss = np.mean(np.array(loss))
                    acc1 = np.mean(np.array(acc1))
                    acc5 = np.mean(np.array(acc5))
                    train_info[0].append(loss)
                    train_info[1].append(acc1)
                    train_info[2].append(acc5)
                    train_time.append(period)
                    if batch_id % 100 == 0:
                        print("Pass {0}, trainbatch {1}, loss {2}, \
                            acc1 {3}, acc5 {4} time {5}"
                              .format(pass_id, batch_id, loss, acc1, acc5,
                                      "%2.2f sec" % period))
                        sys.stdout.flush()
                    batch_id += 1
            except fluid.core.EOFException:
                train_py_reader.reset()

            train_loss = np.array(train_info[0]).mean()
            train_acc1 = np.array(train_info[1]).mean()
            train_acc5 = np.array(train_info[2]).mean()
            # Mean seconds per sample across the whole pass.
            train_speed = np.array(train_time).mean() / (train_batch_size * device_num)

            # Evaluation pass on the frozen test program.
            test_py_reader.start()

            test_batch_id = 0
            try:
                while True:
                    t1 = time.time()
                    loss, acc1, acc5 = exe.run(program=test_prog,
                                               fetch_list=test_fetch_list)
                    t2 = time.time()
                    period = t2 - t1
                    loss = np.mean(loss)
                    acc1 = np.mean(acc1)
                    acc5 = np.mean(acc5)
                    test_info[0].append(loss)
                    test_info[1].append(acc1)
                    test_info[2].append(acc5)
                    if test_batch_id % 100 == 0:
                        print("Pass {0},testbatch {1},loss {2}, \
                            acc1 {3},acc5 {4},time {5}"
                              .format(pass_id, test_batch_id, loss, acc1, acc5,
                                      "%2.2f sec" % period))
                        sys.stdout.flush()
                    test_batch_id += 1
            except fluid.core.EOFException:
                test_py_reader.reset()

            test_loss = np.array(test_info[0]).mean()
            test_acc1 = np.array(test_info[1]).mean()
            test_acc5 = np.array(test_info[2]).mean()

            print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
                  "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                      pass_id, train_loss, train_acc1, train_acc5, test_loss,
                      test_acc1, test_acc5))
            sys.stdout.flush()

            # Save per-epoch persistables under <model_save_dir>/<model>/<pass_id>.
            model_path = os.path.join(model_save_dir + '/' + model_name,
                                      str(pass_id))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(exe, model_path, main_program=train_prog)

            # This is for continuous evaluation only
            if args.enable_ce and pass_id == args.num_epochs - 1:
                if device_num == 1:
                    # Use the mean cost/acc for training
                    print("kpis	train_cost	%s" % train_loss)
                    print("kpis	train_acc_top1	%s" % train_acc1)
                    print("kpis	train_acc_top5	%s" % train_acc5)
                    # Use the mean cost/acc for testing
                    print("kpis	test_cost	%s" % test_loss)
                    print("kpis	test_acc_top1	%s" % test_acc1)
                    print("kpis	test_acc_top5	%s" % test_acc5)
                    print("kpis	train_speed	%s" % train_speed)
                else:
                    # Use the mean cost/acc for training
                    print("kpis	train_cost_card%s	%s" % (device_num, train_loss))
                    print("kpis	train_acc_top1_card%s	%s" %
                          (device_num, train_acc1))
                    print("kpis	train_acc_top5_card%s	%s" %
                          (device_num, train_acc5))
                    # Use the mean cost/acc for testing
                    print("kpis	test_cost_card%s	%s" % (device_num, test_loss))
                    print("kpis	test_acc_top1_card%s	%s" % (device_num, test_acc1))
                    print("kpis	test_acc_top5_card%s	%s" % (device_num, test_acc5))
                    print("kpis	train_speed_card%s	%s" % (device_num, train_speed))
    else:
        # Multitask branch: same loop structure, but fetches five metrics
        # (shared loss plus top-1/top-5 per task; top-5 mirrors top-1).
        print('multi training')
        train_fetch_list = [train_cost.name, train_acc1_0.name, train_acc5_0.name, train_acc1_1.name, train_acc5_1.name]
        test_fetch_list = [test_cost.name, test_acc1_0.name, test_acc5_0.name, test_acc1_1.name, test_acc5_1.name]

        params = models.__dict__[args.model]().params

        for pass_id in range(params["num_epochs"]):

            train_py_reader.start()

            # Slots: [loss, acc1_0, acc5_0, acc1_1, acc5_1].
            train_info = [[], [], [], [], []]
            test_info = [[], [], [], [], []]
            train_time = []
            batch_id = 0
            try:
                while True:
                    t1 = time.time()
                    loss, acc1_0, acc5_0, acc1_1, acc5_1 = train_exe.run(fetch_list=train_fetch_list)
                    t2 = time.time()
                    period = t2 - t1
                    loss = np.mean(np.array(loss))
                    acc1_0 = np.mean(np.array(acc1_0))
                    acc5_0 = np.mean(np.array(acc5_0))
                    acc1_1 = np.mean(np.array(acc1_1))
                    acc5_1 = np.mean(np.array(acc5_1))
                    train_info[0].append(loss)
                    train_info[1].append(acc1_0)
                    train_info[2].append(acc5_0)
                    train_info[3].append(acc1_1)
                    train_info[4].append(acc5_1)
                    train_time.append(period)
                    if batch_id % 10 == 0:
                        #print("Pass {0}, trainbatch {1}, loss {2}, \
                        #    acc1_0 {3}, acc5_0 {4}, acc1_1 {5}, acc5_1 {6}, time {7}"
                        #      .format(pass_id, batch_id, loss, acc1_0, acc5_0, acc1_1, acc5_1,
                        #              "%2.2f sec" % period))
                        print("Pass {0}, trainbatch {1}, loss {2}, \
                            acc1_0 {3}, acc1_1 {4}, time {5}"
                              .format(pass_id, batch_id, loss, acc1_0, acc1_1, "%2.2f sec" % period))
                        sys.stdout.flush()
                    batch_id += 1
            except fluid.core.EOFException:
                train_py_reader.reset()

            train_loss = np.array(train_info[0]).mean()
            train_acc1_0 = np.array(train_info[1]).mean()
            train_acc5_0 = np.array(train_info[2]).mean()
            train_acc1_1 = np.array(train_info[3]).mean()
            train_acc5_1 = np.array(train_info[4]).mean()
            train_speed = np.array(train_time).mean() / (train_batch_size * device_num)

            test_py_reader.start()

            test_batch_id = 0
            try:
                while True:
                    t1 = time.time()
                    loss, acc1_0, acc5_0, acc1_1, acc5_1 = exe.run(program=test_prog,
                                               fetch_list=test_fetch_list)
                    t2 = time.time()
                    period = t2 - t1
                    loss = np.mean(loss)
                    acc1_0 = np.mean(acc1_0)
                    acc5_0 = np.mean(acc5_0)
                    acc1_1 = np.mean(acc1_1)
                    acc5_1 = np.mean(acc5_1)
                    test_info[0].append(loss)
                    test_info[1].append(acc1_0)
                    test_info[2].append(acc5_0)
                    test_info[3].append(acc1_1)
                    test_info[4].append(acc5_1)
                    if test_batch_id % 10 == 0:
                        #print("Pass {0},testbatch {1},loss {2}, \
                        #    acc1_0 {3},acc5_0 {4},acc1_1 {5},acc5_1 {6},time {7}"
                        #      .format(pass_id, test_batch_id, loss, acc1_0, acc5_0, acc1_1, acc5_1,
                        #              "%2.2f sec" % period))
                        print("Pass {0},testbatch {1},loss {2}, \
                            acc1_0 {3}, acc1_1 {4}, time {5}"
                              .format(pass_id, test_batch_id, loss, acc1_0, acc1_1, "%2.2f sec" % period))
                        sys.stdout.flush()
                    test_batch_id += 1
            except fluid.core.EOFException:
                test_py_reader.reset()

            test_loss = np.array(test_info[0]).mean()
            test_acc1_0 = np.array(test_info[1]).mean()
            test_acc5_0 = np.array(test_info[2]).mean()
            test_acc1_1 = np.array(test_info[3]).mean()
            test_acc5_1 = np.array(test_info[4]).mean()

            #print("End pass {0}, train_loss {1}, train_acc1_0 {2}, train_acc5_0 {3}, train_acc1_1 {4}, train_acc5_1 {5}, "
            #      "test_loss {6}, test_acc1_0 {7}, test_acc5_0 {8}, test_acc1_1 {9}, test_acc5_1 {10}".format(
            #          pass_id, train_loss, train_acc1_0, train_acc5_0, train_acc1_1, train_acc5_1, test_loss,
            #          test_acc1_0, test_acc5_0, test_acc1_1, test_acc5_1))
            print("End pass {0}, train_loss {1}, train_acc1_0 {2}, train_acc1_1 {3}, "
                  "test_loss {4}, test_acc1_0 {5}, test_acc1_1 {6}".format(
                      pass_id, train_loss, train_acc1_0, train_acc1_1, test_loss,
                      test_acc1_0, test_acc1_1))
            sys.stdout.flush()

            model_path = os.path.join(model_save_dir + '/' + model_name,
                                      str(pass_id))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(exe, model_path, main_program=train_prog)
指派人
分配到
无
里程碑
无
分配里程碑
工时统计
无
截止日期
无
标识: paddlepaddle/Paddle#15979
渝ICP备2023009037号

京公网安备11010502055752号

网络110报警服务 Powered by GitLab CE v13.7
开源知识
Git 入门 Pro Git 电子书 在线学 Git
Markdown 基础入门 IT 技术知识开源图谱
帮助
使用手册 反馈建议 博客
《GitCode 隐私声明》 《GitCode 服务条款》 关于GitCode
Powered by GitLab CE v13.7