Running fl_server fails with "PaddleCheckError: Expected optimize_blocks.size() >= 1, but received optimize_blocks.size():0 < 1:1."
Created by: XDUXK
I am trying to rewrite the training process of an existing object detection model, SSD (https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/ssd), as federated learning. Running fl_master raises no errors, but when I then run fl_server it fails with "PaddleCheckError: Expected optimize_blocks.size() >= 1, but received optimize_blocks.size():0 < 1:1." I cannot figure out where the problem is. The relevant code is pasted below.
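For reference, the failing check concerns the server-side program that fl_master generated: the message says it contains no optimize blocks at all. One way to narrow this down is to load the generated program desc from fl_job_config and list its ops, to see whether any optimizer ops (e.g. rmsprop) were placed on the server side. This is only a sketch; the program-desc filename below is an assumption about how generate_fl_job lays out its output, so substitute whatever actually appears under fl_job_config/server0/:

import paddle.fluid as fluid

# Inspection sketch: the filename is assumed; check fl_job_config/server0/
# for the file that actually holds the server main program desc.
with open("fl_job_config/server0/server.main.program", "rb") as f:
    server_main = fluid.Program.parse_from_string(f.read())

# List every op in the server program; an optimizer-free list here would
# be consistent with the optimize_blocks.size() error.
for op in server_main.global_block().ops:
    print(op.type)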
1. fl_master.py
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
import paddle_fl as fl
from paddle_fl.core.master.job_generator import JobGenerator
from paddle_fl.core.strategy.fl_strategy_base import FLStrategyFactory
batch_size = 2 # 64
train_images = 10 # 16551
lr = 0.001
lr_epochs = [40, 60, 80, 100]
lr_decay = [1, 0.5, 0.25, 0.1, 0.01]
image_shape = [3, 300, 300]
class_num = 6 # 21
ap_version = '11point'
class NetSSD:
    def __init__(self, image, class_num, image_shape):
        self.img = image
        self.num_classes = class_num
        self.img_shape = image_shape

    def ssd_net(self, scale=1.0):
        # 300x300
        tmp = self.conv_bn(self.img, 3, int(32 * scale), 2, 1)
        # 150x150
        tmp = self.depthwise_separable(tmp, 32, 64, 32, 1, scale)
        tmp = self.depthwise_separable(tmp, 64, 128, 64, 2, scale)
        # 75x75
        tmp = self.depthwise_separable(tmp, 128, 128, 128, 1, scale)
        tmp = self.depthwise_separable(tmp, 128, 256, 128, 2, scale)
        # 38x38
        tmp = self.depthwise_separable(tmp, 256, 256, 256, 1, scale)
        tmp = self.depthwise_separable(tmp, 256, 512, 256, 2, scale)
        # 19x19
        for i in range(5):
            tmp = self.depthwise_separable(tmp, 512, 512, 512, 1, scale)
        module11 = tmp
        tmp = self.depthwise_separable(tmp, 512, 1024, 512, 2, scale)
        # 10x10
        module13 = self.depthwise_separable(tmp, 1024, 1024, 1024, 1, scale)
        module14 = self.extra_block(module13, 256, 512, 1, 2)
        # 5x5
        module15 = self.extra_block(module14, 128, 256, 1, 2)
        # 3x3
        module16 = self.extra_block(module15, 128, 256, 1, 2)
        # 2x2
        module17 = self.extra_block(module16, 64, 128, 1, 2)
        mbox_locs, mbox_confs, box, box_var = fluid.layers.multi_box_head(
            inputs=[
                module11, module13, module14, module15, module16, module17
            ],
            image=self.img,
            num_classes=self.num_classes,
            min_ratio=20,
            max_ratio=90,
            min_sizes=[60.0, 105.0, 150.0, 195.0, 240.0, 285.0],
            max_sizes=[[], 150.0, 195.0, 240.0, 285.0, 300.0],
            aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.],
                           [2., 3.]],
            base_size=self.img_shape[2],
            offset=0.5,
            flip=True)
        # gt_box and gt_label are the module-level outputs of read_file
        # below; they resolve as globals when ssd_net() is called.
        self.loss = fluid.layers.ssd_loss(mbox_locs, mbox_confs, gt_box,
                                          gt_label, box, box_var)
        self.loss = fluid.layers.reduce_sum(self.loss)
        self.startup_program = fluid.default_startup_program()

    def conv_bn(self,
                input,
                filter_size,
                num_filters,
                stride,
                padding,
                num_groups=1,
                act='relu',
                use_cudnn=True):
        parameter_attr = ParamAttr(learning_rate=0.1, initializer=MSRA())
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            act=None,
            use_cudnn=use_cudnn,
            param_attr=parameter_attr,
            bias_attr=False)
        return fluid.layers.batch_norm(input=conv, act=act)

    def depthwise_separable(self, input, num_filters1, num_filters2,
                            num_groups, stride, scale):
        depthwise_conv = self.conv_bn(
            input=input,
            filter_size=3,
            num_filters=int(num_filters1 * scale),
            stride=stride,
            padding=1,
            num_groups=int(num_groups * scale),
            use_cudnn=False)
        pointwise_conv = self.conv_bn(
            input=depthwise_conv,
            filter_size=1,
            num_filters=int(num_filters2 * scale),
            stride=1,
            padding=0)
        return pointwise_conv

    def extra_block(self, input, num_filters1, num_filters2, num_groups,
                    stride):
        # 1x1 conv
        pointwise_conv = self.conv_bn(
            input=input,
            filter_size=1,
            num_filters=int(num_filters1),
            stride=1,
            num_groups=int(num_groups),
            padding=0)
        # 3x3 conv
        normal_conv = self.conv_bn(
            input=pointwise_conv,
            filter_size=3,
            num_filters=int(num_filters2),
            stride=2,
            num_groups=int(num_groups),
            padding=1)
        return normal_conv
def optimizer_setting():
    iters = train_images // batch_size
    boundaries = [i * iters for i in lr_epochs]
    values = [i * lr for i in lr_decay]
    optimizer = fluid.optimizer.RMSProp(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005))
    return optimizer
py_reader = fluid.layers.py_reader(
    capacity=64,
    shapes=[[-1] + image_shape, [-1, 4], [-1, 1], [-1, 1]],
    lod_levels=[0, 1, 1, 1],
    dtypes=["float32", "float32", "int32", "int32"],
    use_double_buffer=True)
image, gt_box, gt_label, difficult = fluid.layers.read_file(py_reader)
ssd_model = NetSSD(image, class_num, image_shape)
ssd_model.ssd_net()
job_generator = JobGenerator()
optimizer = optimizer_setting()
job_generator.set_optimizer(optimizer)
job_generator.set_losses([ssd_model.loss])
job_generator.set_startup_program(ssd_model.startup_program)
job_generator.set_infer_feed_and_target_names(
    [ssd_model.img.name], [ssd_model.loss.name])
build_strategy = FLStrategyFactory()
build_strategy.fed_avg = True
build_strategy.inner_step = 1
strategy = build_strategy.create_fl_strategy()
# endpoints will be collected through the cluster
# in this example, we suppose endpoints have been collected
endpoints = ["127.0.0.1:8181"]
output = "fl_job_config"
job_generator.generate_fl_job(
    strategy, server_endpoints=endpoints, worker_num=2, output=output)
# fl_job_config will be dispatched to workers
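As a sanity check before digging into the SSD graph, the same master flow can be regenerated around a trivial stand-in model, reusing only the APIs already used above (the tiny fc model and the fl_job_config_minimal output directory are made up for the test). If fl_server starts cleanly from that job config, the problem is specific to how the SSD program is being split:

import paddle.fluid as fluid
from paddle_fl.core.master.job_generator import JobGenerator
from paddle_fl.core.strategy.fl_strategy_base import FLStrategyFactory

# Trivial stand-in model: one fc layer and a mean squared-error loss.
x = fluid.layers.data(name="x", shape=[13], dtype="float32")
y = fluid.layers.data(name="y", shape=[1], dtype="float32")
pred = fluid.layers.fc(input=x, size=1, act=None)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

job_generator = JobGenerator()
job_generator.set_optimizer(fluid.optimizer.SGD(learning_rate=0.01))
job_generator.set_losses([loss])
job_generator.set_startup_program(fluid.default_startup_program())
job_generator.set_infer_feed_and_target_names([x.name, y.name], [loss.name])

build_strategy = FLStrategyFactory()
build_strategy.fed_avg = True
build_strategy.inner_step = 1
strategy = build_strategy.create_fl_strategy()

# Same endpoints and worker count as the SSD job, separate output dir.
job_generator.generate_fl_job(
    strategy, server_endpoints=["127.0.0.1:8181"], worker_num=2,
    output="fl_job_config_minimal")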
2. fl_server.py
import paddle_fl as fl
import paddle.fluid as fluid
from paddle_fl.core.server.fl_server import FLServer
from paddle_fl.core.master.fl_job import FLRunTimeJob
server = FLServer()
server_id = 0
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_server_job(job_path, server_id)
server.set_server_job(job)
server.start()
3. fl_trainer.py
from paddle_fl.core.trainer.fl_trainer import FLTrainerFactory
from paddle_fl.core.master.fl_job import FLRunTimeJob
from paddle_fl.reader.gru4rec_reader import Gru4rec_Reader
import paddle.fluid as fluid
import numpy as np
import sys
import os
import logging
import ssd_reader
logging.basicConfig(filename="test.log", filemode="w", format="%(asctime)s %(name)s:%(levelname)s:%(message)s", datefmt="%d-%m-%Y %H:%M:%S", level=logging.DEBUG)
trainer_id = int(sys.argv[1]) # trainer id for each guest
use_gpu = False
batch_size = 64
epoch_num = 120
dataset = 'mydata'
model_save_dir = 'model/model_trainer_%d' % trainer_id
pretrained_model = 'pretrained/ssd_mobilenet_v1_coco/'
ap_version = '11point'
image_shape = [3, 300, 300]
mean_BGR = [127.5, 127.5, 127.5]
data_dir = 'data/data_%d/%s' % (trainer_id, dataset)
label_file = 'label_list.txt'
train_file_list = 'all.txt'
data_args = ssd_reader.Settings(
    dataset=dataset,
    data_dir=data_dir,
    label_file=label_file,
    resize_h=image_shape[1],
    resize_w=image_shape[2],
    mean_value=mean_BGR,
    apply_distort=True,
    apply_expand=True,
    ap_version=ap_version)
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_trainer_job(job_path, trainer_id)
trainer = FLTrainerFactory().create_fl_trainer(job)
trainer.start()
train_reader = ssd_reader.train(data_args,
                                train_file_list,
                                batch_size,
                                shuffle=True,
                                use_multiprocess=False,
                                num_workers=1)
step_i = 0
while not trainer.stop():
    step_i += 1
    print("batch %d start train" % (step_i))
    for data in train_reader():
        # print(np.array(data['src_wordseq']))
        ret_avg_cost = trainer.run(feed=data, fetch=[])
        # ppl computation is carried over from the gru4rec example
        avg_ppl = np.exp(ret_avg_cost[0])
        newest_ppl = np.mean(avg_ppl)
        print("ppl:%.3f" % (newest_ppl))
    save_dir = (model_save_dir + "/epoch_%d") % step_i
    if trainer_id == 0:
        print("start save")
        trainer.save_inference_program(save_dir)
    if step_i >= 40:
        break
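One side note for after the server issue is resolved: trainer.run is called with fetch=[], so ret_avg_cost[0] should raise an IndexError on the first completed step, and the exp/ppl lines come from the gru4rec example rather than anything meaningful for an SSD loss. A sketch of the inner loop with the loss fetched by name follows; the variable name is a guess at what fluid assigned to the reduce_sum output, so look it up in the generated trainer program first:

# Assumed name of the summed SSD loss; fluid usually names the output of
# the first reduce_sum like this, but verify against the actual program.
loss_name = "reduce_sum_0.tmp_0"

for data in train_reader():
    ret = trainer.run(feed=data, fetch=[loss_name])
    print("loss: %.3f" % np.mean(ret[0]))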