PaddlePaddle / Paddle · Issue #14290
Opened November 07, 2018 by saxon_zh (Guest)

Loss increases after fluid.io.load_params

Created by: tcwm1

I am training the ICNet model from the models repository. After 3 epochs the job was killed, so I started a new job and used fluid.io.load_params to load the previously saved parameters and continue training. However, in the first few iterations after loading, the loss is very large, the same as retraining from scratch without loading, so it seems the load did not take effect. The code below is from the first training run. For the second run I passed the init_model argument and changed global_step = _decay_step_counter() to global_step = _decay_step_counter(19000), where 19000 is the number of iterations completed in the first run (a sketch of this second-run change follows the code listing).

"""Trainer for ICNet model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from icnet import icnet
import cityscape
import argparse
import functools
import sys
import os
import time
import paddle.fluid as fluid
import numpy as np
from utils import add_arguments, print_arguments, get_feeder_data
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu

if 'ce_mode' in os.environ:
    np.random.seed(10)
    fluid.default_startup_program().random_seed = 90

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size',        int,   16,         "Minibatch size.")
add_arg('checkpoint_path',   str,   None,       "Checkpoint save path.")
add_arg('init_model',        str,   None,       "Pretrain model path.")
add_arg('use_gpu',           bool,  True,       "Whether use GPU to train.")
add_arg('random_mirror',     bool,  True,       "Whether prepare by random mirror.")
add_arg('random_scaling',    bool,  True,       "Whether prepare by random scaling.")
# yapf: enable

LAMBDA1 = 0.16
LAMBDA2 = 0.4
LAMBDA3 = 1.0
LEARNING_RATE = 0.01
#LEARNING_RATE = 0.003
#LEARNING_RATE = 0.0001
POWER = 0.9
LOG_PERIOD = 1
CHECKPOINT_PERIOD = 100
SINGLE_STEP = 6000
TOTAL_EPOCH = 12
TOTAL_STEP = SINGLE_STEP * TOTAL_EPOCH
no_grad_set = []


def create_loss(predict, label, mask, num_classes):
    predict = fluid.layers.transpose(predict, perm=[0, 2, 3, 1])
    predict = fluid.layers.reshape(predict, shape=[-1, num_classes])
    label = fluid.layers.reshape(label, shape=[-1, 1])
    predict = fluid.layers.gather(predict, mask)
    label = fluid.layers.gather(label, mask)
    label = fluid.layers.cast(label, dtype="int64")
    loss = fluid.layers.softmax_with_cross_entropy(predict, label)
    no_grad_set.append(label.name)
    return fluid.layers.reduce_mean(loss)


def poly_decay():
    global_step = _decay_step_counter()
    with init_on_cpu():
        decayed_lr = LEARNING_RATE * (fluid.layers.pow(
            (1 - global_step / TOTAL_STEP), POWER))
    return decayed_lr

# default parameters
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
port = os.getenv("PADDLE_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVERS")
current_endpoint = os.getenv("POD_IP") + ":" + port


def train(args):
    data_shape = cityscape.train_data_shape()
    num_classes = cityscape.num_classes()
    # define network
    images = fluid.layers.data(name='image', shape=data_shape, dtype='float32')
    label_sub1 = fluid.layers.data(name='label_sub1', shape=[1], dtype='int32')
    label_sub2 = fluid.layers.data(name='label_sub2', shape=[1], dtype='int32')
    label_sub4 = fluid.layers.data(name='label_sub4', shape=[1], dtype='int32')
    mask_sub1 = fluid.layers.data(name='mask_sub1', shape=[-1], dtype='int32')
    mask_sub2 = fluid.layers.data(name='mask_sub2', shape=[-1], dtype='int32')
    mask_sub4 = fluid.layers.data(name='mask_sub4', shape=[-1], dtype='int32')

    sub4_out, sub24_out, sub124_out = icnet(
        images, num_classes, np.array(data_shape[1:]).astype("float32"))
    loss_sub4 = create_loss(sub4_out, label_sub4, mask_sub4, num_classes)
    loss_sub24 = create_loss(sub24_out, label_sub2, mask_sub2, num_classes)
    loss_sub124 = create_loss(sub124_out, label_sub1, mask_sub1, num_classes)
    reduced_loss = LAMBDA1 * loss_sub4 + LAMBDA2 * loss_sub24 + LAMBDA3 * loss_sub124

    regularizer = fluid.regularizer.L2Decay(0.0001)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=poly_decay(), momentum=0.9, regularization=regularizer)
    _, params_grads = optimizer.minimize(reduced_loss, no_grad_set=no_grad_set)

    # prepare environment
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)

    if training_role == "PSERVER":
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)

    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
    print(pserver_endpoints)
    print(trainers)
    print(trainer_id)
    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers)
    if training_role == "PSERVER":
        print("pserver")
        pserver_prog = t.get_pserver_program(current_endpoint)
        pserver_startup = t.get_startup_program(current_endpoint,
                                                pserver_prog)
        exe.run(pserver_startup)
        exe.run(pserver_prog)
        return

    print("trainer")
    exe.run(fluid.default_startup_program())

    if args.init_model is not None:
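        # Resume path: fluid.io.load_params restores the parameter variables of
        # the default main program from the args.init_model directory.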
        print("load model from: %s" % args.init_model)
        sys.stdout.flush()
        fluid.io.load_params(exe, args.init_model)

    iter_id = 0
    t_loss = 0.
    sub4_loss = 0.
    sub24_loss = 0.
    sub124_loss = 0.
    train_reader = cityscape.train(
        args.batch_size, flip=args.random_mirror, scaling=args.random_scaling,
        trainer_id=trainer_id, trainer_count=trainers)
    start_time = time.time()
    pass_id = 0
    while True:
        print("pass %d" % (pass_id))
        if pass_id > TOTAL_EPOCH:
            end_time = time.time()
            print("kpis    train_duration    %f" % (end_time - start_time))
            return

        pass_id += 1
        # train a pass
        iter_id = 0
        for data in train_reader():
            print("get data")
            iter_id += 1
            results = exe.run(
                feed=get_feeder_data(data, place),
                fetch_list=[reduced_loss, loss_sub4, loss_sub24, loss_sub124])
            t_loss += results[0]
            sub4_loss += results[1]
            sub24_loss += results[2]
            sub124_loss += results[3]
            # training log
            if iter_id % LOG_PERIOD == 0:
                print(
                    "Iter[%d]; train loss: %.3f; sub4_loss: %.3f; sub24_loss: %.3f; sub124_loss: %.3f"
                    % (iter_id, t_loss / LOG_PERIOD, sub4_loss / LOG_PERIOD,
                       sub24_loss / LOG_PERIOD, sub124_loss / LOG_PERIOD))
                print("kpis    train_cost    %f" % (t_loss / LOG_PERIOD))

                t_loss = 0.
                sub4_loss = 0.
                sub24_loss = 0.
                sub124_loss = 0.
                sys.stdout.flush()

            if trainer_id == 0 and iter_id % CHECKPOINT_PERIOD == 0 and args.checkpoint_path is not None:
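                # Checkpoint: save_persistables writes all persistable variables
                # (parameters plus optimizer and learning-rate state) to dir_name.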
                dir_name = args.checkpoint_path + "/" + str(pass_id) + "_" + str(iter_id)
                fluid.io.save_persistables(exe, dirname=dir_name)
                print("Saved checkpoint: %s" % (dir_name))


def main():
    args = parser.parse_args()
    print_arguments(args)
    train(args)


if __name__ == "__main__":
    main()
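For reference, a minimal sketch of the second-run change described in the report, assuming 19000 is the iteration count reached in the first run and that --init_model points at one of the directories written by fluid.io.save_persistables in the loop above (the path shown is only an illustrative placeholder; the actual name is <pass_id>_<iter_id> of whichever checkpoint the first run wrote last):

# Second-run variant of poly_decay(): start the step counter at 19000, the
# number of iterations already completed in the first run, so the polynomial
# learning-rate decay resumes where it left off instead of restarting at step 0.
def poly_decay():
    global_step = _decay_step_counter(19000)
    with init_on_cpu():
        decayed_lr = LEARNING_RATE * (fluid.layers.pow(
            (1 - global_step / TOTAL_STEP), POWER))
    return decayed_lr

The second run is then launched with the extra flag, e.g. python train.py --init_model <checkpoint_path>/3_1000 ..., which makes train() call fluid.io.load_params(exe, args.init_model) right after running the startup program.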