修改 object_detection 模型后,在多机环境下运行时报错
Created by: ccmeteorljh
Paddle 版本:0.14;模型:MobileNet-SSD;代码库地址:https://github.com/PaddlePaddle/models/tree/develop/fluid/object_detection ;作业运行配置:2 个 pserver、4 个 trainer,每个 trainer 单卡。启动脚本:
FLAGS_rpc_deadline=3000000 python -u thirdparty/train_dist.py --batch_size=64 --dataset='pascalvoc' --data_dir='./thirdparty/data/pascalvoc' --pretrained_model='./thirdparty/pretrained/ssd_mobilenet_v1_coco/' --num_passes=1 --update_method=pserver --async_mode=False --for_model_ce=True --iterations=20
对 train.py 文件的修改如下:
# Command-line flags added for distributed training.
# train() compares `update_method` against "pserver" and "nccl2"; any other
# value falls through to the single-process default programs.
add_arg(
    "update_method",
    str,
    "local",
    "Choose parameter update method")
# dist_transpile() passes `not args.async_mode` as the transpiler's sync_mode.
# NOTE(review): if add_arg forwards `bool` straight to argparse's `type=`,
# "--async_mode=False" would parse as True -- confirm add_arg converts
# boolean strings explicitly.
add_arg(
    "async_mode",
    bool,
    False,
    "Whether start pserver in async mode to support ASGD")
def dist_transpile(trainer_id, args):
    """Build the parameter-server programs for this process's role.

    The cluster layout is read from environment variables:

    * ``PADDLE_PORT``     - port shared by all pservers (default "6174").
    * ``PADDLE_PSERVERS`` - comma separated pserver IPs.
    * ``TRAINERS``        - total number of trainers in the job.
    * ``POD_IP``          - IP of the local machine (used by pservers only).
    * ``TRAINING_ROLE``   - either "PSERVER" or "TRAINER".

    Args:
        trainer_id: Index of this trainer. A negative value means "not a
            distributed run" and short-circuits to ``(None, None)``.
        args: Parsed command-line arguments; only ``args.async_mode`` is
            read, and its negation is passed as the transpiler's
            ``sync_mode``.

    Returns:
        ``(main_program, startup_program)`` for the current role, or
        ``(None, None)`` when ``trainer_id`` is negative.

    Raises:
        ValueError: If ``TRAINING_ROLE`` is not "PSERVER"/"TRAINER", if
            ``PADDLE_PSERVERS`` or ``TRAINERS`` is missing or malformed, or
            if ``POD_IP`` is missing for a pserver.
    """
    if trainer_id < 0:
        return None, None

    # Validate the role first so a misconfigured process fails before the
    # (comparatively expensive) transpile step instead of after it.
    training_role = os.getenv("TRAINING_ROLE")
    if training_role not in ("PSERVER", "TRAINER"):
        raise ValueError(
            'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
        )

    # The port of all pservers, needed by both trainer and pserver.
    port = os.getenv("PADDLE_PORT", "6174")

    # Comma separated IPs of all pservers, needed by trainer and pserver.
    # Empty entries are dropped so an unset variable cannot silently become
    # the bogus endpoint ":<port>".
    pserver_ips = [
        ip.strip()
        for ip in os.getenv("PADDLE_PSERVERS", "").split(",")
        if ip.strip()
    ]
    if not pserver_ips:
        raise ValueError(
            "PADDLE_PSERVERS environment variable must list at least one "
            "pserver IP")
    pserver_endpoints = ",".join(
        ":".join([ip, port]) for ip in pserver_ips)

    # Total number of workers/trainers in the job, needed by trainer and
    # pserver. Checked explicitly: int(None) would raise an opaque TypeError.
    trainers_env = os.getenv("TRAINERS")
    if trainers_env is None:
        raise ValueError(
            "TRAINERS environment variable must be set to the number of "
            "trainers")
    trainers = int(trainers_env)

    # The IP of the local machine is only needed to pick the pserver's own
    # endpoint, so it is only required for that role.
    pod_ip = os.getenv("POD_IP", "")
    if training_role == "PSERVER" and not pod_ip:
        raise ValueError(
            "POD_IP environment variable must be set for a PSERVER")
    current_endpoint = pod_ip + ":" + port

    t = distribute_transpiler.DistributeTranspiler()
    t.transpile(
        trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=not args.async_mode)

    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(current_endpoint)
        pserver_startup_program = t.get_startup_program(current_endpoint,
                                                        pserver_program)
        return pserver_program, pserver_startup_program

    # Only TRAINER is left: the role was validated above.
    train_program = t.get_trainer_program()
    return train_program, fluid.default_startup_program()
def train(args,
train_file_list,
val_file_list,
data_args,
learning_rate,
batch_size,
num_passes,
model_save_dir,
pretrained_model=None):
image_shape = [3, data_args.resize_h, data_args.resize_w]
if 'coco' in data_args.dataset:
num_classes = 91
elif 'pascalvoc' in data_args.dataset:
num_classes = 21
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
if training_role == "PSERVER" or (not args.use_gpu):
place = fluid.CPUPlace()
devices_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
else:
place = fluid.CUDAPlace(0)
devices_num = fluid.core.get_cuda_device_count()
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
gt_box = fluid.layers.data(
name='gt_box', shape=[4], dtype='float32', lod_level=1)
gt_label = fluid.layers.data(
name='gt_label', shape=[1], dtype='int32', lod_level=1)
difficult = fluid.layers.data(
name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
nmsed_out = fluid.layers.detection_output(
locs, confs, box, box_var, nms_threshold=args.nms_threshold)
loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box,
box_var)
loss = fluid.layers.reduce_sum(loss)
test_program = fluid.default_main_program().clone(for_test=True)
with fluid.program_guard(test_program):
map_eval = fluid.evaluator.DetectionMAP(
nmsed_out,
gt_label,
gt_box,
difficult,
num_classes,
overlap_threshold=0.5,
evaluate_difficult=False,
ap_version=args.ap_version)
if 'coco' in data_args.dataset:
# learning rate decay in 12, 19 pass, respectively
if '2014' in train_file_list:
epocs = 82783 / batch_size
boundaries = [epocs * 12, epocs * 19]
elif '2017' in train_file_list:
epocs = 118287 / batch_size
boundaries = [epocs * 12, epocs * 19]
values = [
learning_rate, learning_rate * 0.5, learning_rate * 0.25
]
elif 'pascalvoc' in data_args.dataset:
epocs = 19200 / batch_size
boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
values = [
learning_rate, learning_rate * 0.5, learning_rate * 0.25,
learning_rate * 0.1, learning_rate * 0.01
]
optimizer = fluid.optimizer.RMSProp(
learning_rate=fluid.layers.piecewise_decay(boundaries, values),
regularization=fluid.regularizer.L2Decay(0.00005), )
optimizer.minimize(loss)
nccl_id_var, num_trainers, trainer_id = (
None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
if args.update_method == "pserver":
train_prog, startup_prog = dist_transpile(trainer_id, args)
if not train_prog:
raise Exception(
"Must configure correct environments to run dist train.")
if os.getenv("TRAINING_ROLE") == "PSERVER":
place = core.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
exe.run(train_prog)
return
elif args.update_method == "nccl2":
train_prog = fluid.default_main_program()
startup_prog = fluid.default_startup_program()
nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
else:
train_prog = fluid.default_main_program()
startup_prog = fluid.default_startup_program()
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
strategy.use_cuda = args.use_gpu
...(此处省略 train() 函数的其余部分)...
if __name__ == '__main__':
    # Script entry point: parse the flags, work out which annotation/list
    # files belong to the chosen dataset, then hand everything to train().
    args = parser.parse_args()
    print_arguments(args)

    data_dir = args.data_dir
    label_file = 'label_list'
    model_save_dir = args.model_save_dir

    # Defaults, kept unless the dataset name selects a COCO release below.
    train_file_list, val_file_list = 'trainval.txt', 'test.txt'

    if 'coco' in args.dataset:
        # For COCO the data directory is fixed; the value taken from
        # args.data_dir above is replaced.
        data_dir = 'data/coco'
        # (year marker, training annotations, validation annotations);
        # the first marker found in the dataset name wins.
        coco_annotations = (
            ('2014',
             'annotations/instances_train2014.json',
             'annotations/instances_val2014.json'),
            ('2017',
             'annotations/instances_train2017.json',
             'annotations/instances_val2017.json'),
        )
        for year, train_json, val_json in coco_annotations:
            if year in args.dataset:
                train_file_list, val_file_list = train_json, val_json
                break

    # Per-channel mean, passed in B, G, R order.
    mean_value = [args.mean_value_B, args.mean_value_G, args.mean_value_R]
    data_args = reader.Settings(
        dataset=args.dataset,
        data_dir=data_dir,
        label_file=label_file,
        resize_h=args.resize_h,
        resize_w=args.resize_w,
        mean_value=mean_value,
        apply_distort=args.apply_distort,
        apply_expand=args.apply_expand,
        ap_version=args.ap_version,
        toy=args.is_toy)

    train(
        args,
        train_file_list=train_file_list,
        val_file_list=val_file_list,
        data_args=data_args,
        learning_rate=args.learning_rate,
        batch_size=args.batch_size,
        num_passes=args.num_passes,
        model_save_dir=model_save_dir,
        pretrained_model=args.pretrained_model)