Distributed training fails with "var not in this block"
Created by: XDUXK
I adapted my own training job from a distributed training sample on Paddle (https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/cluster_quick_start.html). The code for one of the trainers is shown below. At runtime it raises "ValueError: var image not in this block". I don't know what is causing this. How can I fix it? Thanks a lot!
import os
import time
import matplotlib
import numpy as np
import paddle
import paddle.fluid as fluid
from reader_creator import reader_creator
matplotlib.use('agg')
y_dim = 40
z_dim = 100
img_dir = './att_data'
list_file = './data/att_data/trainer.list'
output_dir = './out'
img_size = 64
crop_type = 'Centor'
crop_size = 64
batch_size = 4
attack_target = 1
fake_target = 3
EPOCH_NUM = 200
USE_GPU = False
def get_params(program, prefix):
    all_params = program.global_block().all_parameters()
    return [t.name for t in all_params if t.name.startswith(prefix)]
def Discriminator(images, name='D'):
    with fluid.unique_name.guard(name + '/'):
        y = fluid.layers.reshape(x=images, shape=[-1, 1, img_size, img_size])
        # first conv-pool layer
        y = fluid.nets.simple_img_conv_pool(input=y, filter_size=(5, 5), num_filters=32, pool_size=(3, 3),
                                            pool_stride=(3, 3), act='tanh')
        # second conv-pool layer
        y = fluid.nets.simple_img_conv_pool(input=y, filter_size=(5, 5), num_filters=64, pool_size=(2, 2),
                                            pool_stride=(2, 2), act='tanh')
        # third conv-pool layer
        y = fluid.nets.simple_img_conv_pool(input=y, filter_size=(5, 5), num_filters=128, pool_size=(2, 2),
                                            pool_stride=(2, 2), act='tanh')
        # Densely Connected Layer
        y = fluid.layers.reshape(x=y, shape=[-1, 128 * 4])
        y = fluid.layers.fc(input=y, size=400, act='tanh')
        # Readout Layer
        y = fluid.layers.fc(input=y, size=40, act='softmax')
    return y
train_d_real = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(train_d_real, startup):
    real_image = fluid.layers.data('image', shape=[1, img_size, img_size])
    label = fluid.layers.data(
        name='label', shape=[-1, y_dim], dtype='float32')
    p_real = Discriminator(real_image)
    real_cost = fluid.layers.sigmoid_cross_entropy_with_logits(p_real, label)
    real_avg_cost = fluid.layers.mean(real_cost)
    d_params = get_params(train_d_real, 'D')
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=2e-4)
    optimizer.minimize(real_avg_cost, parameter_list=d_params)
reader_cre = reader_creator(image_dir=img_dir, list_filename=list_file)
reader = reader_creator.make_reader(reader_cre, image_size=img_size,
                                    crop_type=crop_type, crop_size=crop_size, return_label=True)
face_generator = paddle.batch(paddle.reader.shuffle(reader, 30000), batch_size=2 * batch_size)
def train():
    place = fluid.CPUPlace()
    if USE_GPU:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    # fetch the distributed training environment settings
    training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
    port = os.getenv("PADDLE_PSERVER_PORT", "6666")
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "127.0.0.1")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    trainers = int(os.getenv("PADDLE_TRAINERS", "2"))
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "127.0.0.1") + ":" + port
    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id=trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=True,
        startup_program=startup
    )
    if training_role == "PSERVER":
        pserver_prog = t.get_pserver_program(current_endpoint)
        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
        exe.run(startup_prog)
        exe.run(pserver_prog)
    elif training_role == "TRAINER":
        trainer_prog = t.get_trainer_program()
        exe.run(fluid.default_startup_program())
        for pass_id in range(EPOCH_NUM):
            start_time = time.time()
            for i, batch_data in enumerate(face_generator()):
                if len(batch_data) != 2 * batch_size:
                    print('len(batch_data)=%d, batch_size=%d' % (len(batch_data), batch_size))
                    continue
                # keep only the samples assigned to this trainer
                data = []
                for index, ele in enumerate(batch_data):
                    if index % trainers == trainer_id:
                        data.append(ele)
                real_image = np.array(list(map(lambda x: x[0], data))).reshape(
                    -1, 64 * 64).astype('float32')
                real_batch_labels = np.array(list(map(lambda x: x[1], data))).reshape(-1, 1).astype('float32')
                real_batch_labels_size = np.size(real_batch_labels, axis=0)  # number of labels in this batch
                assert real_batch_labels_size == batch_size, 'real_batch_labels_size != batch_size'
                # one-hot encode the labels
                real_labels = np.zeros(shape=[real_batch_labels_size, y_dim], dtype='float32')
                for i in range(real_batch_labels_size):
                    real_labels[i][int(real_batch_labels[i][0])] = 1.0
                r_real = exe.run(program=trainer_prog, fetch_list=[real_avg_cost],
                                 feed={'image': np.array(real_image), 'label': real_labels})
                print("Pass:%d, real_avg_cost:%f" % (pass_id, r_real[0][0]))
            end_time = time.time()
            one_pass_time = end_time - start_time
            print("This pass took %fs" % one_pass_time)
        # destroy this trainer node's resources on the pserver node
        exe.close()
    else:
        raise AssertionError("PADDLE_TRAINING_ROLE should be one of [TRAINER, PSERVER]")

train()
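
Update: my current guess is that the problem is in the transpile step. I never tell t.transpile(...) which program to transpile, and as far as I can tell DistributeTranspiler.transpile defaults to fluid.default_main_program(); my network, however, is built inside train_d_real under fluid.program_guard, so the transpiled trainer program would not contain the image variable at all. If that is right, the fix would look roughly like this (just a sketch of what I mean, not yet tested):

t = fluid.DistributeTranspiler()
t.transpile(
    trainer_id=trainer_id,
    program=train_d_real,  # transpile the program that actually defines 'image' and 'label'
    pservers=pserver_endpoints,
    trainers=trainers,
    sync_mode=True,
    startup_program=startup
)
...
elif training_role == "TRAINER":
    trainer_prog = t.get_trainer_program()
    exe.run(startup)  # run the matching startup program, not fluid.default_startup_program()

Is this the right direction, or is something else going on?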