Gradient backpropagation problem inside while_op
Created by: Ashleychen
# Excerpt from train_omniglot.py; assumes "import numpy as np" and
# "import paddle.fluid as fluid" at module level. train_program is a
# method of the Ntm class.
def train_program(self):
    # Three LoD inputs: images, labels, and the previous step's labels.
    image_lod = fluid.layers.data(name='image_lod', dtype='float32',
                                  shape=[784], lod_level=1)
    label_lod = fluid.layers.data(name='label_lod', dtype='int64',
                                  shape=[1], lod_level=1)
    last_label_lod = fluid.layers.data(name='last_label_lod', dtype='float32',
                                       shape=[1], lod_level=1)

    # Build rank tables and convert each LoD tensor into a tensor array
    # so the While block can index one time step at a time.
    image_lod_rank_table = fluid.layers.control_flow.lod_rank_table(image_lod)
    label_lod_rank_table = fluid.layers.control_flow.lod_rank_table(label_lod)
    last_label_lod_rank_table = fluid.layers.control_flow.lod_rank_table(last_label_lod)
    image_array = fluid.layers.lod_tensor_to_array(x=image_lod, table=image_lod_rank_table)
    label_array = fluid.layers.lod_tensor_to_array(x=label_lod, table=label_lod_rank_table)
    last_label_array = fluid.layers.lod_tensor_to_array(
        x=last_label_lod, table=last_label_lod_rank_table)

    loss_array = fluid.layers.create_array('float32')
    array_len = fluid.layers.fill_constant(
        shape=[1], dtype='int64', value=self.list_size)
    counter = fluid.layers.zeros(shape=[1], dtype='int64')
    cond = fluid.layers.less_than(x=counter, y=array_len)
    while_op = fluid.layers.While(cond=cond)
    with while_op.block():
        # Read the current time step out of each array.
        current_image = fluid.layers.array_read(array=image_array, i=counter)
        current_label = fluid.layers.array_read(array=label_array, i=counter)
        current_label_reshape = fluid.layers.reshape(x=current_label, shape=[-1, 1])
        current_last_label = fluid.layers.array_read(array=last_label_array, i=counter)
        current_last_label_reshape = fluid.layers.reshape(
            x=current_last_label, shape=[-1, 1])

        # Model layers for this step.
        image_fc = fluid.layers.fc(input=current_image, size=20)
        last_label_fc = fluid.layers.fc(input=current_last_label_reshape, size=20)
        loss_input = fluid.layers.elementwise_add(x=image_fc, y=last_label_fc)
        current_loss = fluid.layers.softmax_with_cross_entropy(
            logits=loss_input, label=current_label_reshape)
        current_loss_val = fluid.layers.reduce_sum(current_loss, dim=0)

        # Write this step's loss into the array.
        # loss_array = fluid.layers.array_write(current_loss_val, i=counter)
        fluid.layers.array_write(current_loss_val, array=loss_array, i=counter)
        fluid.layers.increment(x=counter, value=1, in_place=True)
        fluid.layers.less_than(x=counter, y=array_len, cond=cond)

    # Back outside the loop: convert the loss array to a LoD tensor
    # and sum the losses of all time steps.
    loss_lod = fluid.layers.array_to_lod_tensor(x=loss_array, table=image_lod_rank_table)
    print('%s' % loss_lod)
    loss = fluid.layers.reduce_sum(loss_lod, dim=0)
    print('%s' % loss)
    return loss
def ntm_main():
    # The model runs on a single CPU.
    use_cuda = False  # set to True to train on GPU
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    batch_size = 128
    list_size = 50

    train_generator = OmniglotGenerator(
        data_file='data/train.npz',
        nb_classes=5,
        nb_samples_per_class=10,
        batchsize=batch_size,
        max_iter=None,
        xp=np)
    test_generator = OmniglotGenerator(
        data_file='data/test.npz',
        nb_classes=5,
        nb_samples_per_class=10,
        batchsize=batch_size,
        max_iter=10,
        xp=np)

    ntm = Ntm(nb_class=5, nb_reads=4, input_size=28 * 28, cell_size=200,
              memory_shape=(128, 40), gamma=0.95, batch_size=128, list_size=50)
    loss = ntm.train_program()
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(loss)

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    main_program = fluid.default_main_program()
    test_program = fluid.default_main_program().clone(for_test=True)

    # Create the LoD tensors to feed for each batch.
    for i, (images, labels) in train_generator:
        images_list = []
        last_label_list = []
        label_list = []
        reshape_image_list = []
        reshape_last_label_list = []
        reshape_label_list = []
        for sample_idx in xrange(len(images)):
            # Get a list of arrays of shape 128 x 784.
            images_items = np.split(images[sample_idx], batch_size)
            images_list.append(images_items)
            label_list.append(list(labels[sample_idx]))
        # The "last label" at step 0 is all zeros; afterwards it is the
        # label sequence shifted by one step.
        last_label_list.append([0.0] * batch_size)
        for sample_idx in xrange(len(images) - 1):
            last_label_list.append(list(labels[sample_idx]))
        for batch_idx in xrange(batch_size):
            for sample_idx in xrange(len(images)):
                reshape_image_list.append(list(images_list[sample_idx][batch_idx]))
                reshape_last_label_list.append(
                    last_label_list[sample_idx][batch_idx])
                reshape_label_list.append(
                    label_list[sample_idx][batch_idx])

        image_lod = fluid.create_lod_tensor(
            np.array(reshape_image_list),
            [[50] * 128], place)
        label_lod = fluid.create_lod_tensor(
            np.array(reshape_label_list, dtype='int64'),
            [[50] * 128], place)
        last_label_lod = fluid.create_lod_tensor(
            np.array(reshape_last_label_list, dtype='float32'),
            [[50] * 128], place)

        exe.run(
            main_program,
            feed={
                'image_lod': image_lod,
                'label_lod': label_lod,
                'last_label_lod': last_label_lod},
            fetch_list=[loss])
The two blocks above are my training program. I feed in three LoD tensors; I first convert them to tensor arrays, then inside the while_op I iterate over the data of each time step and build the model layers, obtaining a loss per time step, which I write into an array with array_write. After leaving the while loop, I convert the loss array back into a LoD tensor and use reduce_sum to compute the total loss over all time steps, which is returned to the optimizer.
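To show just the skeleton the backward pass has to differentiate through, here is a stripped-down sketch of the same pattern (toy shapes chosen for illustration; it assumes the same Fluid 1.x layers API as above and is not my actual model):

import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[4], dtype='float32', lod_level=1)
table = fluid.layers.lod_rank_table(x)
x_array = fluid.layers.lod_tensor_to_array(x=x, table=table)
loss_array = fluid.layers.create_array('float32')

seq_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=3)
counter = fluid.layers.zeros(shape=[1], dtype='int64')
cond = fluid.layers.less_than(x=counter, y=seq_len)
while_op = fluid.layers.While(cond=cond)
with while_op.block():
    step_in = fluid.layers.array_read(array=x_array, i=counter)   # one time step
    step_out = fluid.layers.fc(input=step_in, size=4)             # trainable layer
    step_loss = fluid.layers.reduce_sum(step_out, dim=0)          # per-step loss
    fluid.layers.array_write(step_loss, i=counter, array=loss_array)
    fluid.layers.increment(x=counter, value=1, in_place=True)
    fluid.layers.less_than(x=counter, y=seq_len, cond=cond)       # refresh loop condition

loss_lod = fluid.layers.array_to_lod_tensor(x=loss_array, table=table)
loss = fluid.layers.reduce_sum(loss_lod)
fluid.optimizer.Adam(learning_rate=0.001).minimize(loss)          # backward is appended here

With the real model, running train_omniglot.py fails at optimizer.minimize(loss) with the following error: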
Traceback (most recent call last):
  File "train_omniglot.py", line 213, in <module>
    ntm_main()
  File "train_omniglot.py", line 40, in ntm_main
    optimizer.minimize(loss)
  File "/home/ol/anaconda2/lib/python2.7/site-packages/paddle/fluid/optimizer.py", line 259, in minimize
    [error_clip_callback])
  File "/home/ol/anaconda2/lib/python2.7/site-packages/paddle/fluid/backward.py", line 590, in append_backward
    _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
  File "/home/ol/anaconda2/lib/python2.7/site-packages/paddle/fluid/backward.py", line 412, in _append_backward_vars_
    _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
  File "/home/ol/anaconda2/lib/python2.7/site-packages/paddle/fluid/backward.py", line 426, in _append_backward_vars_
    op_desc.infer_shape(block.desc)
paddle.fluid.core.EnforceNotMet: Input(Out@GRAD) shouldn't be null. at [/paddle/paddle/fluid/operators/reshape_op.cc:314]
PaddlePaddle Call Stacks:
0 0x7f43d73dee36p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486
1 0x7f43d8370e41p paddle::operators::Reshape2GradOp::InferShape(paddle::framework::InferShapeContext*) const + 913
2 0x7f43d7494f86p paddle::framework::OpDesc::InferShape(paddle::framework::BlockDesc const&) const + 886
3 0x7f43d7441275p void pybind11::cpp_function::initialize<pybind11::cpp_function::initialize<void, paddle::framework::OpDesc, paddle::framework::BlockDesc const&, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::OpDesc::*)(paddle::framework::BlockDesc const&) const, pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::OpDesc const*, paddle::framework::BlockDesc const&)#1}, void, paddle::framework::OpDesc const*, paddle::framework::BlockDesc const&, pybind11::name, pybind11::is_method, pybind11::sibling>(pybind11::cpp_function::initialize<void, paddle::framework::OpDesc, paddle::framework::BlockDesc const&, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::OpDesc::*)(paddle::framework::BlockDesc const&) const, pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::OpDesc const*, paddle::framework::BlockDesc const&)#1}&&, void (*)(paddle::framework::OpDesc const*, paddle::framework::BlockDesc const&), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(pybind11::detail::function_call&)#3}::_FUN(pybind11::detail::function_call) + 213
4 0x7f43d73f5544p pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 2596
5 0x7f449e40feecp PyEval_EvalFrameEx + 33468
6 0x7f449e4114e9p PyEval_EvalCodeEx + 2025
7 0x7f449e40e482p PyEval_EvalFrameEx + 26706
8 0x7f449e4114e9p PyEval_EvalCodeEx + 2025
9 0x7f449e40e482p PyEval_EvalFrameEx + 26706
10 0x7f449e4114e9p PyEval_EvalCodeEx + 2025
11 0x7f449e40e482p PyEval_EvalFrameEx + 26706
12 0x7f449e4114e9p PyEval_EvalCodeEx + 2025
13 0x7f449e40e482p PyEval_EvalFrameEx + 26706
14 0x7f449e40fdacp PyEval_EvalFrameEx + 33148
15 0x7f449e4114e9p PyEval_EvalCodeEx + 2025
16 0x7f449e41170ap PyEval_EvalCode + 26
17 0x7f449e42a93dp
18 0x7f449e42bab8p PyRun_FileExFlags + 120
19 0x7f449e42ccd8p PyRun_SimpleFileExFlags + 232
20 0x7f449e43ed3cp Py_Main + 2988
21 0x7f449d677bd5p __libc_start_main + 245
22 0x7f449e70b87fp
I don't know what is causing this. Is it saying that the loss I pass back is null?
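From the traceback, the error is raised inside append_backward while it runs InferShape for a reshape2_grad op in the while sub-block, i.e. during backward-graph construction, before any data is fed. A sketch of a check that might localize it (paddle.fluid.backward.append_backward is the internal call that minimize() makes, per the traceback above; this is only a debugging idea, not a verified fix):

from paddle.fluid.backward import append_backward

loss = ntm.train_program()
# Append the backward ops by hand instead of calling optimizer.minimize(loss).
# If the reshape2_grad InferShape error reproduces here, the problem is in
# backward construction for the While sub-block, not in the fetched loss value.
params_grads = append_backward(loss)
print(fluid.default_main_program())  # dump all blocks and inspect the grad ops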