Commit f91528a9 authored by G guosheng

Add seq2seq

Parent 7d1ea67d
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import distutils.util
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--train_data_prefix", type=str, help="file prefix for train data")
parser.add_argument(
"--eval_data_prefix", type=str, help="file prefix for eval data")
parser.add_argument(
"--test_data_prefix", type=str, help="file prefix for test data")
parser.add_argument(
"--vocab_prefix", type=str, help="file prefix for vocab")
parser.add_argument("--src_lang", type=str, help="source language suffix")
parser.add_argument("--tar_lang", type=str, help="target language suffix")
parser.add_argument(
"--attention",
type=eval,
default=False,
help="Whether use attention model")
parser.add_argument(
"--optimizer",
type=str,
default='adam',
help="optimizer to use, only supprt[sgd|adam]")
parser.add_argument(
"--learning_rate",
type=float,
default=0.001,
help="learning rate for optimizer")
parser.add_argument(
"--num_layers",
type=int,
default=1,
help="layers number of encoder and decoder")
parser.add_argument(
"--hidden_size",
type=int,
default=100,
help="hidden size of encoder and decoder")
parser.add_argument("--src_vocab_size", type=int, help="source vocab size")
parser.add_argument("--tar_vocab_size", type=int, help="target vocab size")
parser.add_argument(
"--batch_size", type=int, help="batch size of each step")
parser.add_argument(
"--max_epoch", type=int, default=12, help="max epoch for the training")
parser.add_argument(
"--max_len",
type=int,
default=50,
help="max length for source and target sentence")
parser.add_argument(
"--dropout", type=float, default=0.0, help="drop probability")
parser.add_argument(
"--init_scale",
type=float,
default=0.0,
help="init scale for parameter")
parser.add_argument(
"--max_grad_norm",
type=float,
default=5.0,
help="max grad norm for global norm clip")
parser.add_argument(
"--model_path",
type=str,
default='model',
help="model path for model to save")
parser.add_argument(
"--reload_model", type=str, help="reload model to inference")
parser.add_argument(
"--infer_file", type=str, help="file name for inference")
parser.add_argument(
"--infer_output_file",
type=str,
default='infer_output',
help="file name for inference output")
parser.add_argument(
"--beam_size", type=int, default=10, help="file name for inference")
parser.add_argument(
'--use_gpu',
type=eval,
default=False,
        help='Whether to use gpu [True|False]')
parser.add_argument(
'--eager_run', type=eval, default=False, help='Whether to use dygraph')
parser.add_argument(
"--enable_ce",
action='store_true',
help="The flag indicating whether to run the task "
"for continuous evaluation.")
parser.add_argument(
"--profile", action='store_true', help="Whether enable the profile.")
# NOTE: profiler args, used for benchmark
parser.add_argument(
"--profiler_path",
type=str,
default='./seq2seq.profile',
help="the profiler output file path. (used for benchmark)")
args = parser.parse_args()
return args
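As a quick sanity check, the parser above can be driven programmatically. The following minimal sketch assumes the module is saved as args.py (the name train.py imports below); it only uses flags defined above, and the concrete values are illustrative:

# Minimal sketch: exercise parse_args() with an illustrative command line.
# Assumes the module above is importable as args.py; the values are examples only.
import sys
from args import parse_args

if __name__ == "__main__":
    sys.argv = [
        "check_args.py",
        "--src_lang", "en", "--tar_lang", "vi",
        "--attention", "True",
        "--num_layers", "2",
        "--hidden_size", "512",
        "--batch_size", "128",
    ]
    args = parse_args()
    print(args.attention, args.num_layers, args.hidden_size, args.batch_size)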
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Script for downloading training data.
'''
import os
import urllib
import sys
if sys.version_info >= (3, 0):
    import urllib.request
    URLLIB = urllib.request
else:
    URLLIB = urllib
remote_path = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi'
base_path = 'data'
tar_path = os.path.join(base_path, 'en-vi')
filenames = [
'train.en', 'train.vi', 'tst2012.en', 'tst2012.vi', 'tst2013.en',
'tst2013.vi', 'vocab.en', 'vocab.vi'
]
def main(arguments):
print("Downloading data......")
if not os.path.exists(tar_path):
if not os.path.exists(base_path):
os.mkdir(base_path)
os.mkdir(tar_path)
for filename in filenames:
url = remote_path + '/' + filename
tar_file = os.path.join(tar_path, filename)
URLLIB.urlretrieve(url, tar_file)
print("Downloaded sucess......")
if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))
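A small follow-up sketch (a hypothetical helper, not part of the original script) to confirm that every file listed in filenames actually landed under data/en-vi after running the download:

# Sketch: check that all files from `filenames` exist under data/en-vi
# after the download script above has been run.
import os

expected = ['train.en', 'train.vi', 'tst2012.en', 'tst2012.vi', 'tst2013.en',
            'tst2013.vi', 'vocab.en', 'vocab.vi']
missing = [name for name in expected
           if not os.path.exists(os.path.join('data', 'en-vi', name))]
print("missing files:", missing if missing else "none")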
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import io
import sys
import numpy as np
Py3 = sys.version_info[0] == 3
UNK_ID = 0
def _read_words(filename):
data = []
with io.open(filename, "r", encoding='utf-8') as f:
if Py3:
return f.read().replace("\n", "<eos>").split()
else:
return f.read().decode("utf-8").replace(u"\n", u"<eos>").split()
def read_all_line(filename):
    data = []
    with io.open(filename, "r", encoding='utf-8') as f:
        for line in f.readlines():
            data.append(line.strip())
    return data
def _build_vocab(filename):
vocab_dict = {}
ids = 0
with io.open(filename, "r", encoding='utf-8') as f:
for line in f.readlines():
vocab_dict[line.strip()] = ids
ids += 1
print("vocab word num", ids)
return vocab_dict
def _para_file_to_ids(src_file, tar_file, src_vocab, tar_vocab):
src_data = []
with io.open(src_file, "r", encoding='utf-8') as f_src:
        for line in f_src.readlines():
            arr = line.strip().split()
            ids = [src_vocab[w] if w in src_vocab else UNK_ID for w in arr]
            src_data.append(ids)
tar_data = []
with io.open(tar_file, "r", encoding='utf-8') as f_tar:
        for line in f_tar.readlines():
            arr = line.strip().split()
            ids = [tar_vocab[w] if w in tar_vocab else UNK_ID for w in arr]
ids = [1] + ids + [2]
tar_data.append(ids)
return src_data, tar_data
def filter_len(src, tar, max_sequence_len=50):
new_src = []
new_tar = []
for id1, id2 in zip(src, tar):
if len(id1) > max_sequence_len:
id1 = id1[:max_sequence_len]
if len(id2) > max_sequence_len + 2:
id2 = id2[:max_sequence_len + 2]
new_src.append(id1)
new_tar.append(id2)
return new_src, new_tar
def raw_data(src_lang,
tar_lang,
vocab_prefix,
train_prefix,
eval_prefix,
test_prefix,
max_sequence_len=50):
src_vocab_file = vocab_prefix + "." + src_lang
tar_vocab_file = vocab_prefix + "." + tar_lang
src_train_file = train_prefix + "." + src_lang
tar_train_file = train_prefix + "." + tar_lang
src_eval_file = eval_prefix + "." + src_lang
tar_eval_file = eval_prefix + "." + tar_lang
src_test_file = test_prefix + "." + src_lang
tar_test_file = test_prefix + "." + tar_lang
src_vocab = _build_vocab(src_vocab_file)
tar_vocab = _build_vocab(tar_vocab_file)
    train_src, train_tar = _para_file_to_ids(src_train_file, tar_train_file,
                                             src_vocab, tar_vocab)
train_src, train_tar = filter_len(
train_src, train_tar, max_sequence_len=max_sequence_len)
    eval_src, eval_tar = _para_file_to_ids(src_eval_file, tar_eval_file,
                                           src_vocab, tar_vocab)
    test_src, test_tar = _para_file_to_ids(src_test_file, tar_test_file,
                                           src_vocab, tar_vocab)
    return (train_src, train_tar), (eval_src, eval_tar), (test_src, test_tar), \
        (src_vocab, tar_vocab)
def raw_mono_data(vocab_file, file_path):
src_vocab = _build_vocab(vocab_file)
    test_src, test_tar = _para_file_to_ids(file_path, file_path,
                                           src_vocab, src_vocab)
return (test_src, test_tar)
def get_data_iter(raw_data,
batch_size,
mode='train',
enable_ce=False,
cache_num=20):
src_data, tar_data = raw_data
data_len = len(src_data)
index = np.arange(data_len)
if mode == "train" and not enable_ce:
np.random.shuffle(index)
def to_pad_np(data, source=False):
max_len = 0
bs = min(batch_size, len(data))
for ele in data:
if len(ele) > max_len:
max_len = len(ele)
ids = np.ones((bs, max_len), dtype='int64') * 2
mask = np.zeros((bs), dtype='int32')
for i, ele in enumerate(data):
ids[i, :len(ele)] = ele
if not source:
mask[i] = len(ele) - 1
else:
mask[i] = len(ele)
return ids, mask
b_src = []
if mode != "train":
cache_num = 1
for j in range(data_len):
if len(b_src) == batch_size * cache_num:
            # the cached pool is full: sort it by source length (keep the
            # original order in infer mode) and split it into batches
if mode == 'infer':
new_cache = b_src
else:
new_cache = sorted(b_src, key=lambda k: len(k[0]))
for i in range(cache_num):
batch_data = new_cache[i * batch_size:(i + 1) * batch_size]
src_cache = [w[0] for w in batch_data]
tar_cache = [w[1] for w in batch_data]
src_ids, src_mask = to_pad_np(src_cache, source=True)
tar_ids, tar_mask = to_pad_np(tar_cache)
yield (src_ids, src_mask, tar_ids, tar_mask)
b_src = []
b_src.append((src_data[index[j]], tar_data[index[j]]))
if len(b_src) == batch_size * cache_num or mode == 'infer':
if mode == 'infer':
new_cache = b_src
else:
new_cache = sorted(b_src, key=lambda k: len(k[0]))
for i in range(cache_num):
batch_end = min(len(new_cache), (i + 1) * batch_size)
batch_data = new_cache[i * batch_size:batch_end]
src_cache = [w[0] for w in batch_data]
tar_cache = [w[1] for w in batch_data]
src_ids, src_mask = to_pad_np(src_cache, source=True)
tar_ids, tar_mask = to_pad_np(tar_cache)
yield (src_ids, src_mask, tar_ids, tar_mask)
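To see what the iterator yields, here is a rough usage sketch. It assumes the module above is saved as reader.py (the name train.py imports) and feeds a few toy id sequences, so the shapes and masks shown are illustrative only:

# Sketch: run toy id sequences through get_data_iter and inspect the
# padded batches. Assumes the module above is importable as reader.py.
from reader import get_data_iter

src = [[3, 4, 5], [6, 7], [8, 9, 10, 11]]            # toy source id lists
tar = [[1, 3, 4, 2], [1, 5, 2], [1, 6, 7, 8, 2]]     # toy targets with <bos>=1, <eos>=2
for src_ids, src_mask, tar_ids, tar_mask in get_data_iter(
        (src, tar), batch_size=2, mode='infer'):
    # ids are padded with 2 (<eos>); src_mask holds source lengths,
    # tar_mask holds target lengths minus one
    print(src_ids.shape, src_mask, tar_ids.shape, tar_mask)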
python train.py \
--src_lang en --tar_lang vi \
--attention True \
--num_layers 2 \
--hidden_size 512 \
--src_vocab_size 17191 \
--tar_vocab_size 7709 \
--batch_size 128 \
--dropout 0.2 \
--init_scale 0.1 \
--max_grad_norm 5.0 \
--train_data_prefix data/en-vi/train \
--eval_data_prefix data/en-vi/tst2012 \
--test_data_prefix data/en-vi/tst2013 \
--vocab_prefix data/en-vi/vocab \
--use_gpu True \
--model_path attention_models
# used for continuous evaluation
enable_ce: False
eager_run: False
# The frequency to save trained models when training.
save_step: 10000
# The frequency to fetch and print output when training.
print_step: 100
# path of the checkpoint, to resume the previous training
init_from_checkpoint: ""
# path of the pretrain model, to better solve the current task
init_from_pretrain_model: ""
# path of trained parameter, to make prediction
init_from_params: "trained_params/step_100000/"
# the directory for saving model
save_model: "trained_models"
# the directory for saving inference model.
inference_model_dir: "infer_model"
# Set seed for CE or debug
random_seed: None
# The pattern to match training data files.
training_file: "wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de"
# The pattern to match validation data files.
validation_file: "wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de"
# The pattern to match test data files.
predict_file: "wmt16_ende_data_bpe/newstest2016.tok.bpe.32000.en-de"
# The file to output the translation results of predict_file to.
output_file: "predict.txt"
# The path of vocabulary file of source language.
src_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
# The path of vocabulary file of target language.
trg_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
# The <bos>, <eos> and <unk> tokens in the dictionary.
special_token: ["<s>", "<e>", "<unk>"]
# max length of sequences
max_length: 256
# whether to use cuda
use_cuda: True
# args for reader, see reader.py for details
token_delimiter: " "
use_token_batch: True
pool_size: 200000
sort_type: "pool"
shuffle: True
shuffle_batch: True
batch_size: 4096
# Hyperparams for training:
# the number of epochs for training
epoch: 30
# the hyper parameters for the Adam optimizer.
# This static learning_rate will be multiplied by the LearningRateScheduler-
# derived learning rate to get the final learning rate.
learning_rate: 0.001
# Hyperparams for generation:
# the parameters for beam search.
beam_size: 5
max_out_len: 256
# the number of decoded sentences to output.
n_best: 1
# Hyperparams for model:
# The following five vocabulary-related settings will be set automatically
# according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size: 10000
# size of target word dictionary.
trg_vocab_size: 10000
# index for <bos> token
bos_idx: 0
# index for <eos> token
eos_idx: 1
# index for <unk> token
unk_idx: 2
embed_dim: 512
hidden_size: 512
num_layers: 2
dropout: 0.1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import ParamAttr
from paddle.fluid.initializer import UniformInitializer
from paddle.fluid.dygraph import Embedding, Linear, Layer
from paddle.fluid.layers import BeamSearchDecoder
from text import DynamicDecode, RNN, BasicLSTMCell, RNNCell
from model import Model, Loss
from seq2seq_base import Encoder
class AttentionLayer(Layer):
def __init__(self, hidden_size, bias=False, init_scale=0.1):
super(AttentionLayer, self).__init__()
self.input_proj = Linear(
hidden_size,
hidden_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)),
bias_attr=bias)
self.output_proj = Linear(
hidden_size + hidden_size,
hidden_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)),
bias_attr=bias)
def forward(self, hidden, encoder_output, encoder_padding_mask):
query = self.input_proj(hidden)
attn_scores = layers.matmul(
layers.unsqueeze(query, [1]), encoder_output, transpose_y=True)
if encoder_padding_mask is not None:
attn_scores = layers.elementwise_add(attn_scores,
encoder_padding_mask)
attn_scores = layers.softmax(attn_scores)
attn_out = layers.squeeze(
layers.matmul(attn_scores, encoder_output), [1])
attn_out = layers.concat([attn_out, hidden], 1)
attn_out = self.output_proj(attn_out)
return attn_out
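The forward pass above is standard Luong-style attention: project the decoder hidden state, score it against every encoder step, softmax, and mix the encoder outputs. A NumPy sketch of the same shape bookkeeping (sizes and names are illustrative, and the learned projections are left out):

# NumPy sketch of the score/softmax/context flow in AttentionLayer.forward.
# hidden: [batch, hidden_size], encoder_output: [batch, src_len, hidden_size].
import numpy as np

batch, src_len, hidden_size = 2, 5, 8
hidden = np.random.rand(batch, hidden_size).astype("float32")
encoder_output = np.random.rand(batch, src_len, hidden_size).astype("float32")

query = hidden[:, np.newaxis, :]                              # unsqueeze -> [batch, 1, hidden]
scores = np.matmul(query, encoder_output.transpose(0, 2, 1))  # [batch, 1, src_len]
# padded source positions would get a large negative bias added here
weights = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)  # softmax
context = np.matmul(weights, encoder_output)                  # [batch, 1, hidden]
attn_in = np.concatenate([context[:, 0], hidden], axis=1)     # [batch, 2 * hidden]
print(scores.shape, context.shape, attn_in.shape)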
class DecoderCell(RNNCell):
def __init__(self,
num_layers,
input_size,
hidden_size,
dropout_prob=0.,
init_scale=0.1):
super(DecoderCell, self).__init__()
self.dropout_prob = dropout_prob
        # use add_sublayer to register the stacked LSTM cells
self.lstm_cells = []
for i in range(num_layers):
self.lstm_cells.append(
self.add_sublayer(
"lstm_%d" % i,
BasicLSTMCell(
input_size=input_size + hidden_size
if i == 0 else hidden_size,
hidden_size=hidden_size)))
self.attention_layer = AttentionLayer(hidden_size)
def forward(self,
step_input,
states,
encoder_output,
encoder_padding_mask=None):
lstm_states, input_feed = states
new_lstm_states = []
step_input = layers.concat([step_input, input_feed], 1)
for i, lstm_cell in enumerate(self.lstm_cells):
out, new_lstm_state = lstm_cell(step_input, lstm_states[i])
step_input = layers.dropout(
out, self.dropout_prob) if self.dropout_prob > 0 else out
new_lstm_states.append(new_lstm_state)
out = self.attention_layer(step_input, encoder_output,
encoder_padding_mask)
return out, [new_lstm_states, out]
class Decoder(Layer):
def __init__(self,
vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
init_scale=0.1):
super(Decoder, self).__init__()
self.embedder = Embedding(
size=[vocab_size, embed_dim],
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)))
        self.lstm_attention = RNN(
            DecoderCell(num_layers, embed_dim, hidden_size, dropout_prob,
                        init_scale),
            is_reverse=False,
            time_major=False)
self.output_layer = Linear(
hidden_size,
vocab_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)),
bias_attr=False)
def forward(self, target, decoder_initial_states, encoder_output,
encoder_padding_mask):
inputs = self.embedder(target)
decoder_output, _ = self.lstm_attention(
inputs,
initial_states=decoder_initial_states,
encoder_output=encoder_output,
encoder_padding_mask=encoder_padding_mask)
predict = self.output_layer(decoder_output)
return predict
class AttentionModel(Model):
def __init__(self,
src_vocab_size,
trg_vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
init_scale=0.1):
super(AttentionModel, self).__init__()
self.hidden_size = hidden_size
self.encoder = Encoder(src_vocab_size, embed_dim, hidden_size,
num_layers, dropout_prob, init_scale)
self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size,
num_layers, dropout_prob, init_scale)
def forward(self, src, src_length, trg, trg_length):
# encoder
encoder_output, encoder_final_state = self.encoder(src, src_length)
# decoder initial states: use input_feed and the structure is
# [[h,c] * num_layers, input_feed], consistent with DecoderCell.states
decoder_initial_states = [
encoder_final_state,
self.decoder.lstm_attention.cell.get_initial_states(
batch_ref=encoder_output, shape=[self.hidden_size])
]
        # attention mask to avoid attending to the paddings
src_mask = layers.sequence_mask(
src_length,
maxlen=layers.shape(src)[1],
dtype=encoder_output.dtype)
encoder_padding_mask = (src_mask - 1.0) * 1e9
encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
        # decoder with attention
predict = self.decoder(trg, decoder_initial_states, encoder_output,
encoder_padding_mask)
# for target padding mask
mask = layers.sequence_mask(
trg_length, maxlen=layers.shape(trg)[1], dtype=predict.dtype)
return predict, mask
class AttentionInferModel(AttentionModel):
def __init__(self,
vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
bos_id=0,
eos_id=1,
beam_size=4,
max_out_len=256):
args = dict(locals())
args.pop("self")
args.pop("__class__", None) # py3
self.beam_size = args.pop("beam_size")
self.max_out_len = args.pop("max_out_len")
super(AttentionInferModel, self).__init__(**args)
# dynamic decoder for inference
decoder = BeamSearchDecoder(
self.decoder.lstm_attention.cell,
start_token=bos_id,
end_token=eos_id,
beam_size=beam_size,
embedding_fn=self.decoder.embedder,
output_fn=self.decoder.output_layer)
self.beam_search_decoder = DynamicDecode(
decoder, max_step_num=max_out_len, is_test=True)
def forward(self, src, src_length):
# encoding
encoder_output, encoder_final_state = self.encoder(src, src_length)
# decoder initial states
decoder_initial_states = [
encoder_final_state,
self.decoder.lstm_attention.cell.get_initial_states(
batch_ref=encoder_output, shape=[self.hidden_size])
]
        # attention mask to avoid attending to the paddings
src_mask = layers.sequence_mask(
src_length,
maxlen=layers.shape(src)[1],
dtype=encoder_output.dtype)
encoder_padding_mask = (src_mask - 1.0) * 1e9
encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
# Tile the batch dimension with beam_size
encoder_output = BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_output, self.beam_size)
encoder_padding_mask = BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_padding_mask, self.beam_size)
# dynamic decoding with beam search
rs, _ = self.beam_search_decoder(
inits=decoder_initial_states,
encoder_output=encoder_output,
encoder_padding_mask=encoder_padding_mask)
return rs
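For turning the raw beam-search output into text, a hedged post-processing sketch is shown below. It assumes (this is not stated in the code above) that rs comes back as an int64 array of shape [batch, max_len, beam] with beam 0 the best-scoring hypothesis, and that id2word and eos_id match the vocabulary used for training:

# Hypothetical post-processing of beam-search output. Assumption: `rs` has
# shape [batch, max_len, beam] of token ids and beam 0 is the best beam.
import numpy as np

def beam_output_to_sentences(rs, id2word, eos_id=1):
    rs = np.asarray(rs)
    sentences = []
    for sample in rs[:, :, 0]:            # keep only the top-scoring beam
        tokens = []
        for token_id in sample:
            if int(token_id) == eos_id:   # stop at <eos>
                break
            tokens.append(id2word.get(int(token_id), "<unk>"))
        sentences.append(" ".join(tokens))
    return sentences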
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import ParamAttr
from paddle.fluid.initializer import UniformInitializer
from paddle.fluid.dygraph import Embedding, Linear, Layer
from paddle.fluid.layers import BeamSearchDecoder
from text import DynamicDecode, RNN, BasicLSTMCell, RNNCell
from model import Model, Loss
class CrossEntropyCriterion(Loss):
def __init__(self):
super(CrossEntropyCriterion, self).__init__()
def forward(self, outputs, labels):
(predict, mask), label = outputs, labels[0]
cost = layers.softmax_with_cross_entropy(
logits=predict, label=label, soft_label=False)
masked_cost = layers.elementwise_mul(cost, mask, axis=0)
batch_mean_cost = layers.reduce_mean(masked_cost, dim=[0])
seq_cost = layers.reduce_sum(batch_mean_cost)
return seq_cost
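The criterion above computes per-token cross entropy, zeroes out padded target positions with the mask, averages over the batch dimension, and sums over time. A NumPy sketch of that reduction on toy values (shapes and numbers are illustrative):

# NumPy sketch of the masked cross-entropy reduction in CrossEntropyCriterion.
import numpy as np

batch, trg_len, vocab = 2, 4, 6
logits = np.random.rand(batch, trg_len, vocab).astype("float32")
labels = np.random.randint(0, vocab, (batch, trg_len))
mask = np.array([[1, 1, 1, 0],            # last step of sample 0 is padding
                 [1, 1, 1, 1]], dtype="float32")

log_probs = logits - np.log(np.exp(logits).sum(-1, keepdims=True))
token_cost = -np.take_along_axis(log_probs, labels[..., None], axis=-1)[..., 0]
masked_cost = token_cost * mask            # drop the padded positions
seq_cost = masked_cost.mean(axis=0).sum()  # mean over batch, sum over time
print(float(seq_cost))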
class EncoderCell(RNNCell):
def __init__(self,
num_layers,
input_size,
hidden_size,
dropout_prob=0.,
init_scale=0.1):
super(EncoderCell, self).__init__()
self.dropout_prob = dropout_prob
        # use add_sublayer to register the stacked LSTM cells
self.lstm_cells = []
for i in range(num_layers):
self.lstm_cells.append(
self.add_sublayer(
"lstm_%d" % i,
BasicLSTMCell(
input_size=input_size if i == 0 else hidden_size,
hidden_size=hidden_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)))))
def forward(self, step_input, states):
new_states = []
for i, lstm_cell in enumerate(self.lstm_cells):
out, new_state = lstm_cell(step_input, states[i])
step_input = layers.dropout(
out, self.dropout_prob) if self.dropout_prob > 0 else out
new_states.append(new_state)
return step_input, new_states
@property
def state_shape(self):
return [cell.state_shape for cell in self.lstm_cells]
class Encoder(Layer):
def __init__(self,
vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
init_scale=0.1):
super(Encoder, self).__init__()
self.embedder = Embedding(
size=[vocab_size, embed_dim],
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)))
        self.stack_lstm = RNN(
            EncoderCell(num_layers, embed_dim, hidden_size, dropout_prob,
                        init_scale),
            is_reverse=False,
            time_major=False)
def forward(self, sequence, sequence_length):
inputs = self.embedder(sequence)
encoder_output, encoder_state = self.stack_lstm(
inputs, sequence_length=sequence_length)
return encoder_output, encoder_state
DecoderCell = EncoderCell
class Decoder(Layer):
def __init__(self,
vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
init_scale=0.1):
super(Decoder, self).__init__()
self.embedder = Embedding(
size=[vocab_size, embed_dim],
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)))
        self.stack_lstm = RNN(
            DecoderCell(num_layers, embed_dim, hidden_size, dropout_prob,
                        init_scale),
            is_reverse=False,
            time_major=False)
self.output_layer = Linear(
hidden_size,
vocab_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)),
bias_attr=False)
def forward(self, target, decoder_initial_states):
inputs = self.embedder(target)
decoder_output, _ = self.stack_lstm(
inputs, initial_states=decoder_initial_states)
predict = self.output_layer(decoder_output)
return predict
class BaseModel(Model):
def __init__(self,
src_vocab_size,
trg_vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
init_scale=0.1):
super(BaseModel, self).__init__()
self.hidden_size = hidden_size
self.encoder = Encoder(src_vocab_size, embed_dim, hidden_size,
num_layers, dropout_prob, init_scale)
self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size,
num_layers, dropout_prob, init_scale)
def forward(self, src, src_length, trg, trg_length):
# encoder
encoder_output, encoder_final_states = self.encoder(src, src_length)
# decoder
predict = self.decoder(trg, encoder_final_states)
# for target padding mask
mask = layers.sequence_mask(
trg_length, maxlen=layers.shape(trg)[1], dtype=predict.dtype)
return predict, mask
class BaseInferModel(BaseModel):
def __init__(self,
vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
bos_id=0,
eos_id=1,
beam_size=4,
max_out_len=256):
args = dict(locals())
args.pop("self")
args.pop("__class__", None) # py3
self.beam_size = args.pop("beam_size")
self.max_out_len = args.pop("max_out_len")
super(BaseInferModel, self).__init__(**args)
# dynamic decoder for inference
decoder = BeamSearchDecoder(
self.decoder.stack_lstm.cell,
start_token=bos_id,
end_token=eos_id,
beam_size=beam_size,
embedding_fn=self.decoder.embedder,
output_fn=self.decoder.output_layer)
self.beam_search_decoder = DynamicDecode(
decoder, max_step_num=max_out_len, is_test=True)
def forward(self, src, src_length):
# encoding
encoder_output, encoder_final_states = self.encoder(src, src_length)
# dynamic decoding with beam search
rs, _ = self.beam_search_decoder(inits=encoder_final_states)
return rs
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import six
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import time
import contextlib
from functools import partial
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable
from paddle.fluid.io import DataLoader
from paddle.fluid.dygraph_grad_clip import GradClipByGlobalNorm
import reader
from args import parse_args
from seq2seq_base import BaseModel, CrossEntropyCriterion
from seq2seq_attn import AttentionModel
from model import Input, set_device
from callbacks import ProgBarLogger
from metrics import Metric
class PPL(Metric):
pass
def do_train(args):
device = set_device("gpu" if args.use_gpu else "cpu")
fluid.enable_dygraph(device) #if args.eager_run else None
# define model
inputs = [
Input(
[None, None], "int64", name="src_word"),
Input(
[None], "int64", name="src_length"),
Input(
[None, None], "int64", name="trg_word"),
Input(
[None], "int64", name="trg_length"),
]
labels = [Input([None, None, 1], "int64", name="label"), ]
model = AttentionModel(args.src_vocab_size, args.tar_vocab_size,
args.hidden_size, args.hidden_size, args.num_layers,
args.dropout)
model.prepare(
fluid.optimizer.Adam(
learning_rate=args.learning_rate,
parameter_list=model.parameters()),
CrossEntropyCriterion(),
inputs=inputs,
labels=labels)
batch_size = 32
src_seq_len = 10
trg_seq_len = 12
iter_num = 10
def random_generator():
for i in range(iter_num):
src = np.random.randint(2, args.src_vocab_size,
(batch_size, src_seq_len)).astype("int64")
src_length = np.random.randint(1, src_seq_len,
(batch_size, )).astype("int64")
trg = np.random.randint(2, args.tar_vocab_size,
(batch_size, trg_seq_len)).astype("int64")
trg_length = np.random.randint(1, trg_seq_len,
(batch_size, )).astype("int64")
label = np.random.randint(
1, trg_seq_len, (batch_size, trg_seq_len, 1)).astype("int64")
yield src, src_length, trg, trg_length, label
model.fit(train_data=random_generator, log_freq=1)
exit(0)
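    # NOTE: everything below this exit(0) is unreachable; the real data
    # loading for args.training_file/args.validation_file is still a stub.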
data_loaders = [None, None]
data_files = [args.training_file, args.validation_file
] if args.validation_file else [args.training_file]
train_loader, eval_loader = data_loaders
model.fit(train_data=train_loader,
eval_data=None,
epochs=1,
eval_freq=1,
save_freq=1,
verbose=2)
if __name__ == "__main__":
args = parse_args()
do_train(args)