Distributed training fails with "var not in this block"
Created by: XDUXK
I adapted my own training job from a distributed training sample on Paddle (https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/cluster_quick_start.html). The code for one of the trainers is shown below. At runtime it raises "ValueError: var image not in this block". I don't know what is causing this. How can I fix it? Thanks a lot!
import os
import time
import matplotlib
import numpy as np
import paddle
import paddle.fluid as fluid
from reader_creator import reader_creator
matplotlib.use('agg')
y_dim = 40
z_dim = 100
img_dir = './att_data'
list_file = './data/att_data/trainer.list'
output_dir = './out'
img_size = 64
crop_type = 'Centor'
crop_size = 64
batch_size = 4
attack_target = 1
fake_target = 3
EPOCH_NUM = 200
USE_GPU = False
def get_params(program, prefix):
    all_params = program.global_block().all_parameters()
    return [t.name for t in all_params if t.name.startswith(prefix)]
def Discriminator(images, name='D'):
    with fluid.unique_name.guard(name + '/'):
        y = fluid.layers.reshape(x=images, shape=[-1, 1, img_size, img_size])
        # first conv-pool layer
        y = fluid.nets.simple_img_conv_pool(input=y, filter_size=(5, 5), num_filters=32, pool_size=(3, 3),
                                            pool_stride=(3, 3), act='tanh')
        # second conv-pool layer
        y = fluid.nets.simple_img_conv_pool(input=y, filter_size=(5, 5), num_filters=64, pool_size=(2, 2),
                                            pool_stride=(2, 2), act='tanh')
        # third conv-pool layer
        y = fluid.nets.simple_img_conv_pool(input=y, filter_size=(5, 5), num_filters=128, pool_size=(2, 2),
                                            pool_stride=(2, 2), act='tanh')
        # Densely Connected Layer
        y = fluid.layers.reshape(x=y, shape=[-1, 128 * 4])
        y = fluid.layers.fc(input=y, size=400, act='tanh')
        # Readout Layer
        y = fluid.layers.fc(input=y, size=40, act='softmax')
    return y
train_d_real = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(train_d_real, startup):
    real_image = fluid.layers.data('image', shape=[1, img_size, img_size])
    label = fluid.layers.data(
        name='label', shape=[-1, y_dim], dtype='float32')
    p_real = Discriminator(real_image)
    real_cost = fluid.layers.sigmoid_cross_entropy_with_logits(p_real, label)
    real_avg_cost = fluid.layers.mean(real_cost)
    d_params = get_params(train_d_real, 'D')
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=2e-4)
    optimizer.minimize(real_avg_cost, parameter_list=d_params)
reader_cre = reader_creator(image_dir=img_dir, list_filename=list_file)
reader = reader_creator.make_reader(reader_cre, image_size=img_size,
                                    crop_type=crop_type, crop_size=crop_size, return_label=True)
face_generator = paddle.batch(paddle.reader.shuffle(reader, 30000), batch_size=2 * batch_size)
def train():
    place = fluid.CPUPlace()
    if USE_GPU:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    # fetch the distributed training environment settings
    training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
    port = os.getenv("PADDLE_PSERVER_PORT", "6666")
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "127.0.0.1")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    trainers = int(os.getenv("PADDLE_TRAINERS", "2"))
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "127.0.0.1") + ":" + port
    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id=trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=True,
        startup_program=startup
    )
    if training_role == "PSERVER":
        pserver_prog = t.get_pserver_program(current_endpoint)
        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
        exe.run(startup_prog)
        exe.run(pserver_prog)
    elif training_role == "TRAINER":
        trainer_prog = t.get_trainer_program()
        exe.run(fluid.default_startup_program())
        for pass_id in range(EPOCH_NUM):
            start_time = time.time()
            for i, batch_data in enumerate(face_generator()):
                if len(batch_data) != 2 * batch_size:
                    print('len(batch_data)=%d, batch_size=%d' % (len(batch_data), batch_size))
                    continue
                # keep only the samples assigned to this trainer
                data = []
                for index, ele in enumerate(batch_data):
                    if index % trainers == trainer_id:
                        data.append(ele)
                real_image = np.array(list(map(lambda x: x[0], data))).reshape(
                    -1, 64 * 64).astype('float32')
                real_batch_labels = np.array(list(map(lambda x: x[1], data))).reshape(-1, 1).astype('float32')
                real_batch_labels_size = np.size(real_batch_labels, axis=0)  # number of labels in this batch
                assert real_batch_labels_size == batch_size, 'real_batch_labels_size != batch_size'
                # one-hot encode the labels
                real_labels = np.zeros(shape=[real_batch_labels_size, y_dim], dtype='float32')
                for i in range(real_batch_labels_size):
                    real_labels[i][int(real_batch_labels[i][0])] = 1.0
                r_real = exe.run(program=trainer_prog, fetch_list=[real_avg_cost],
                                 feed={'image': np.array(real_image), 'label': real_labels})
                print("Pass:%d, real_avg_cost:%f" % (pass_id, r_real[0][0]))
            end_time = time.time()
            one_pass_time = end_time - start_time
            print("This pass took %fs" % one_pass_time)
        # destroy this trainer node's resources on the pserver node
        exe.close()
    else:
        raise AssertionError("PADDLE_TRAINING_ROLE should be one of [TRAINER, PSERVER]")

train()
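
Update: my current guess is that the problem is in the transpile step. I never tell t.transpile(...) which program to transpile, and as far as I can tell DistributeTranspiler.transpile defaults to fluid.default_main_program(); my network, however, is built inside train_d_real under fluid.program_guard, so the transpiled trainer program would not contain the image variable at all. If that is right, the fix would look roughly like this (just a sketch of what I mean, not yet tested):

t = fluid.DistributeTranspiler()
t.transpile(
    trainer_id=trainer_id,
    program=train_d_real,  # transpile the program that actually defines 'image' and 'label'
    pservers=pserver_endpoints,
    trainers=trainers,
    sync_mode=True,
    startup_program=startup
)
...
elif training_role == "TRAINER":
    trainer_prog = t.get_trainer_program()
    exe.run(startup)  # run the matching startup program, not fluid.default_startup_program()

Is this the right direction, or is something else going on?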