fluid.io.load_params后loss升高
Created by: tcwm1
基于 models 下的 icnet 进行训练,训练 3 个 epoch 后任务被杀掉。新起任务通过 fluid.io.load_params 将之前保存的参数加载后继续训练,但加载后重训头几轮 loss 很大,与不加载直接重训的效果一样,感觉 load 没有生效。以下是第一次训练的代码;第二次训练传入了 init_model 参数,同时将 global_step = _decay_step_counter() 调整为 global_step = _decay_step_counter(19000),其中 19000 为第一次训练的迭代数。
"""Trainer for ICNet model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from icnet import icnet
import cityscape
import argparse
import functools
import sys
import os
import time
import paddle.fluid as fluid
import numpy as np
from utils import add_arguments, print_arguments, get_feeder_data
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu
# CE (continuous-evaluation) mode: pin random seeds so runs are reproducible.
if os.environ.get('ce_mode') is not None:
    np.random.seed(10)
    fluid.default_startup_program().random_seed = 90
# Command-line arguments.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 16, "Minibatch size.")
add_arg('checkpoint_path', str, None, "Checkpoint save path.")
add_arg('init_model', str, None, "Pretrain model path.")
add_arg('use_gpu', bool, True, "Whether use GPU to train.")
add_arg('random_mirror', bool, True, "Whether prepare by random mirror.")
add_arg('random_scaling', bool, True, "Whether prepare by random scaling.")
# yapf: enable

# Loss weights for the three ICNet branches (sub4 / sub24 / sub124).
LAMBDA1 = 0.16
LAMBDA2 = 0.4
LAMBDA3 = 1.0
LEARNING_RATE = 0.01
POWER = 0.9               # exponent of the polynomial learning-rate decay
LOG_PERIOD = 1            # iterations between log lines
CHECKPOINT_PERIOD = 100   # iterations between checkpoint saves
SINGLE_STEP = 6000        # iterations per epoch
TOTAL_EPOCH = 12
# BUG FIX: was `SINGLE_STEP * TOTAL_TOTAL_EPOCH` — `TOTAL_TOTAL_EPOCH` is
# undefined, so evaluating this line raised NameError at import time.
TOTAL_STEP = SINGLE_STEP * TOTAL_EPOCH

# Variable names that must not receive gradients (filled by create_loss).
no_grad_set = []
def create_loss(predict, label, mask, num_classes):
    """Build the masked softmax cross-entropy loss for one ICNet branch."""
    # Flatten logits to (N, num_classes) and labels to (N, 1).
    logits = fluid.layers.transpose(predict, perm=[0, 2, 3, 1])
    logits = fluid.layers.reshape(logits, shape=[-1, num_classes])
    gt = fluid.layers.reshape(label, shape=[-1, 1])
    # Keep only the valid (unmasked) positions.
    logits = fluid.layers.gather(logits, mask)
    gt = fluid.layers.gather(gt, mask)
    gt = fluid.layers.cast(gt, dtype="int64")
    loss = fluid.layers.softmax_with_cross_entropy(logits, gt)
    # Labels must never receive gradients.
    no_grad_set.append(gt.name)
    return fluid.layers.reduce_mean(loss)
def poly_decay():
    """Polynomial decay: LEARNING_RATE * (1 - step / TOTAL_STEP) ** POWER."""
    step = _decay_step_counter()
    with init_on_cpu():
        remaining = 1 - step / TOTAL_STEP
        lr = LEARNING_RATE * fluid.layers.pow(remaining, POWER)
    return lr
# Distributed-training environment (defaults suit a single local trainer).
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))   # index of this trainer
trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))   # total trainer count
training_role = os.getenv("TRAINING_ROLE", "TRAINER")   # TRAINER or PSERVER
port = os.getenv("PADDLE_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVERS")              # comma-separated pserver IPs
# BUG FIX: os.getenv("POD_IP") returns None when the variable is unset, so
# the original `None + ":" + port` concatenation raised TypeError at import.
# Default to localhost for non-distributed runs.
current_endpoint = os.getenv("POD_IP", "127.0.0.1") + ":" + port
def train(args):
    """Build the ICNet training graph and run the training loop.

    Supports parameter-server distributed training: the program is
    transpiled with DistributeTranspiler and this process acts as either
    a PSERVER or a TRAINER depending on the TRAINING_ROLE env variable.

    Args:
        args: parsed command-line arguments (batch_size, checkpoint_path,
            init_model, use_gpu, random_mirror, random_scaling).
    """
    data_shape = cityscape.train_data_shape()
    num_classes = cityscape.num_classes()
    # define network
    images = fluid.layers.data(name='image', shape=data_shape, dtype='float32')
    # Ground-truth labels for the three ICNet output branches.
    label_sub1 = fluid.layers.data(name='label_sub1', shape=[1], dtype='int32')
    label_sub2 = fluid.layers.data(name='label_sub2', shape=[1], dtype='int32')
    label_sub4 = fluid.layers.data(name='label_sub4', shape=[1], dtype='int32')
    # Index masks consumed by create_loss to gather only valid positions.
    mask_sub1 = fluid.layers.data(name='mask_sub1', shape=[-1], dtype='int32')
    mask_sub2 = fluid.layers.data(name='mask_sub2', shape=[-1], dtype='int32')
    mask_sub4 = fluid.layers.data(name='mask_sub4', shape=[-1], dtype='int32')
    sub4_out, sub24_out, sub124_out = icnet(
        images, num_classes, np.array(data_shape[1:]).astype("float32"))
    loss_sub4 = create_loss(sub4_out, label_sub4, mask_sub4, num_classes)
    loss_sub24 = create_loss(sub24_out, label_sub2, mask_sub2, num_classes)
    loss_sub124 = create_loss(sub124_out, label_sub1, mask_sub1, num_classes)
    # Weighted sum of the three branch losses.
    reduced_loss = LAMBDA1 * loss_sub4 + LAMBDA2 * loss_sub24 + LAMBDA3 * loss_sub124
    regularizer = fluid.regularizer.L2Decay(0.0001)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=poly_decay(), momentum=0.9, regularization=regularizer)
    _, params_grads = optimizer.minimize(reduced_loss, no_grad_set=no_grad_set)
    # prepare environment
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    if training_role == "PSERVER":
        # Parameter servers always run on CPU.
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    # Build the "ip:port,ip:port,..." endpoint list for the transpiler.
    # NOTE(review): pserver_ips is None when PADDLE_PSERVERS is unset, so
    # split() would raise here — confirm the variable is always exported
    # (even for single-node runs, since this code path is unconditional).
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
    print(pserver_endpoints)
    print(trainers)
    print(trainer_id)
    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers)
    if training_role == "PSERVER":
        print("pserver")
        pserver_prog = t.get_pserver_program(current_endpoint)
        pserver_startup = t.get_startup_program(current_endpoint,
                                                pserver_prog)
        exe.run(pserver_startup)
        # Serves parameters until the process is killed; never falls through.
        exe.run(pserver_prog)
        return
    print("trainer")
    exe.run(fluid.default_startup_program())
    if args.init_model is not None:
        print("load model from: %s" % args.init_model)
        sys.stdout.flush()
        # NOTE(review): load_params restores parameters into the default main
        # program only; the learning-rate step counter is not a parameter and
        # is NOT restored — confirm the saved directory's variable names match
        # this (transpiled) program, otherwise the load silently no-ops.
        fluid.io.load_params(exe, args.init_model)
    iter_id = 0
    # Running sums of the losses over the current LOG_PERIOD window.
    t_loss = 0.
    sub4_loss = 0.
    sub24_loss = 0.
    sub124_loss = 0.
    train_reader = cityscape.train(
        args.batch_size, flip=args.random_mirror, scaling=args.random_scaling, trainer_id = trainer_id, trainer_count = trainers)
    start_time = time.time()
    pass_id = 0
    while True:
        print("pass %d" % (pass_id))
        if pass_id > TOTAL_EPOCH:
            end_time = time.time()
            print("kpis train_duration %f" % (end_time - start_time))
            return
        pass_id += 1
        # train a pass
        iter_id = 0
        for data in train_reader():
            print("get data")
            iter_id += 1
            results = exe.run(
                feed=get_feeder_data(data, place),
                fetch_list=[reduced_loss, loss_sub4, loss_sub24, loss_sub124])
            t_loss += results[0]
            sub4_loss += results[1]
            sub24_loss += results[2]
            sub124_loss += results[3]
            # training log
            if iter_id % LOG_PERIOD == 0:
                print(
                    "Iter[%d]; train loss: %.3f; sub4_loss: %.3f; sub24_loss: %.3f; sub124_loss: %.3f"
                    % (iter_id, t_loss / LOG_PERIOD, sub4_loss / LOG_PERIOD,
                       sub24_loss / LOG_PERIOD, sub124_loss / LOG_PERIOD))
                print("kpis train_cost %f" % (t_loss / LOG_PERIOD))
                # Reset the window accumulators for the next LOG_PERIOD.
                t_loss = 0.
                sub4_loss = 0.
                sub24_loss = 0.
                sub124_loss = 0.
                sys.stdout.flush()
            # Only trainer 0 saves, to avoid concurrent writes to one path.
            if trainer_id == 0 and iter_id % CHECKPOINT_PERIOD == 0 and args.checkpoint_path is not None:
                dir_name = args.checkpoint_path + "/" + str(pass_id) + "_" + str(iter_id)
                fluid.io.save_persistables(exe, dirname=dir_name)
                print("Saved checkpoint: %s" % (dir_name))
def main():
    """Parse command-line arguments and launch training."""
    # Removed redundant function-local `import os` / `import sys`:
    # both modules are already imported at module level.
    args = parser.parse_args()
    print_arguments(args)
    train(args)


if __name__ == "__main__":
    main()