Errors: text classification program with multi-head-self-attention (#8999) · Issue · PaddlePaddle / Paddle

Errors: text classification program with multi-head-self-attention

Created by: gmcather

I am trying to write text classification with multi-head-self-attention. But the program threw out a bug, which is hard to detect. Please help, thank you!

Traceback (most recent call last):
  File "scripts/fluid_self_attention_sthread.py", line 94, in <module>
    attention_type='dot-product attention')
  File "/usr/local/lib/python2.7/dist-packages/paddle/trainer_config_helpers/default_decorators.py", line 53, in __wrapper__
    return func(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/paddle/trainer_config_helpers/networks.py", line 1639, in multi_head_attention
    query_proj += full_matrix_projection(query)
  File "/usr/local/lib/python2.7/dist-packages/paddle/trainer_config_helpers/layers.py", line 856, in __exit__
    **ExtraLayerAttribute.to_kwargs(self.layer_attr))
  File "/usr/local/lib/python2.7/dist-packages/paddle/trainer/config_parser.py", line 3452, in __init__
    name, 'mixed', size, inputs=inputs, **xargs)
  File "/usr/local/lib/python2.7/dist-packages/paddle/trainer/config_parser.py", line 1557, in __init__
    self.inputs = copy.deepcopy(inputs)
  File "/usr/lib/python2.7/copy.py", line 163, in deepcopy
    y = copier(x, memo)
  File "/usr/lib/python2.7/copy.py", line 230, in _deepcopy_list
    y.append(deepcopy(a, memo))
  File "/usr/lib/python2.7/copy.py", line 190, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/usr/lib/python2.7/copy.py", line 334, in _reconstruct
    state = deepcopy(state, memo)
  File "/usr/lib/python2.7/copy.py", line 163, in deepcopy
    y = copier(x, memo)
  File "/usr/lib/python2.7/copy.py", line 257, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/usr/lib/python2.7/copy.py", line 190, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/usr/lib/python2.7/copy.py", line 334, in _reconstruct
    state = deepcopy(state, memo)
  File "/usr/lib/python2.7/copy.py", line 163, in deepcopy
    y = copier(x, memo)
  File "/usr/lib/python2.7/copy.py", line 257, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/usr/lib/python2.7/copy.py", line 190, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/usr/lib/python2.7/copy.py", line 334, in _reconstruct
    state = deepcopy(state, memo)
  File "/usr/lib/python2.7/copy.py", line 163, in deepcopy
    y = copier(x, memo)
  File "/usr/lib/python2.7/copy.py", line 257, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/usr/lib/python2.7/copy.py", line 190, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/usr/lib/python2.7/copy.py", line 334, in _reconstruct
    state = deepcopy(state, memo)
  File "/usr/lib/python2.7/copy.py", line 163, in deepcopy
    y = copier(x, memo)
  File "/usr/lib/python2.7/copy.py", line 257, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/usr/lib/python2.7/copy.py", line 190, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/usr/lib/python2.7/copy.py", line 334, in _reconstruct
    state = deepcopy(state, memo)
  File "/usr/lib/python2.7/copy.py", line 163, in deepcopy
    y = copier(x, memo)
  File "/usr/lib/python2.7/copy.py", line 257, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/usr/lib/python2.7/copy.py", line 190, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/usr/lib/python2.7/copy.py", line 329, in _reconstruct
    y = callable(*args)
  File "/usr/lib/python2.7/copy_reg.py", line 93, in __newobj__
    return cls.__new__(cls, *args)
TypeError: pybind11_object_48.__new__(paddle.fluid.core.ProgramDesc) is not safe, use object.__new__()

Here is the code:

from __future__ import print_function
import numpy as np
import paddle.trainer_config_helpers.networks as paddle_network
import paddle.v2 as paddle
import paddle.fluid as fluid
import sys
import time

start_time = time.time()

def to_lodtensor(data, place):
    """
    load LODtensor
    """
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res


def load_vocab(filename):
    """
    load vocabulary
    """
    vocab = {}
    with open(filename) as f:
        wid = 0
        for line in f:
            vocab[line.strip()] = wid
            wid += 1
    return vocab

# load word dict with paddle inner function
word_dict = load_vocab(sys.argv[1])
word_dict["<unk>"] = len(word_dict)

dict_dim = len(word_dict)

# embedding dim
emb_dim = 128

# hidden dim
hid_dim = 128

# hidden dim2
hid_dim2 = 96

# class num
class_dim = 2

batch_size = 256

max_len = 4096

data = fluid.layers.data(name="words",
                        shape=[batch_size * max_len, 1],
                        dtype="int64",
                        append_batch_size=False)

label = fluid.layers.data(name="label", shape=[batch_size, 1], dtype="int64")

# embedding
emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim], padding_idx=dict_dim)
enc_input = fluid.layers.reshape(x=emb, shape=[batch_size, -1, emb_dim])
print(enc_input)

queries = fluid.layers.fc(input=enc_input, size=hid_dim)
keys = fluid.layers.fc(input=enc_input, size=hid_dim)
print(queries)
print(keys)

#keys = fluid.layers.reshape(emb, shape = [256, -1, emb_dim])
"""
attention_out = fluid.nets.scaled_dot_product_attention(enc_input,
                                                        enc_input,
                                                        enc_input,
                                                        num_heads=1,
                                                        dropout_rate=0.0)
"""
attention_out = paddle_network.multi_head_attention(query=queries,
                                                    key=keys,
                                                    value=keys,
                                                    key_proj_size=64,
                                                    value_proj_size=64,
                                                    head_num=8,
                                                    attention_type='dot-product attention')

fc_1 = fluid.layers.fc(input=attention_out,
                        size=hid_dim2)

# probability of each class
prediction = fluid.layers.fc(input=[fc_1],
                             size=class_dim,
                             act="softmax")

# cross entropy loss
cost = fluid.layers.cross_entropy(input=prediction, label=label)

# mean loss
avg_cost = fluid.layers.mean(x=cost)

# SGD optimizer
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)

# accuracy metric
accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
    test_target = accuracy.metrics + accuracy.states
    inference_program = fluid.io.get_inference_program(test_target)

# train data set
BATCH_SIZE = 256
train_reader = paddle.batch(
    paddle.reader.shuffle(
        paddle.dataset.imdb.train(word_dict), buf_size=50000),
    batch_size=BATCH_SIZE)
test_reader = paddle.batch(
    paddle.reader.shuffle(
        paddle.dataset.imdb.test(word_dict), buf_size=50000),
    batch_size=BATCH_SIZE)

# train in cpu
place = fluid.CPUPlace()
def test(exe):
    accuracy.reset(exe)
    for batch_id, data in enumerate(test_reader()):
        input_seq = to_lodtensor(map(lambda x:x[0], data), place)
        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
        y_data = y_data.reshape([-1, 1])
        acc = exe.run(inference_program,
                      feed={"words": input_seq,
                            "label": y_data})
    return accuracy.eval(exe)

# just like session in tensorflow
exe = fluid.Executor(place)

# like placeholder
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)

# not sure what's going on here
exe.run(fluid.default_startup_program())

# loop for 30 epochs, print acc ever 1000 batch
PASS_NUM = 30
for pass_id in xrange(PASS_NUM):
    accuracy.reset(exe)
    for data in train_reader():
        cost_val, acc_val = exe.run(fluid.default_main_program(),
                                    feed=feeder.feed(data),
                                    fetch_list=[avg_cost, accuracy.metrics[0]])
        pass_acc = accuracy.eval(exe)

    pass_test_acc = test(exe)
    print("test_acc: %f" % pass_test_acc)

end_time = time.time()
print(end_time - start_time)

PaddlePaddle / Paddle 大约 1 年 前同步成功

Errors: text classification program with multi-head-self-attention

PaddlePaddle / Paddle
大约 1 年前同步成功