diff --git a/PaddleNLP/PaddleTextGEN/seq2seq_rl/args.py b/PaddleNLP/PaddleTextGEN/seq2seq_rl/args.py
deleted file mode 100644
index ee056e33597651f9e166e4d6399c89bfc36598f7..0000000000000000000000000000000000000000
--- a/PaddleNLP/PaddleTextGEN/seq2seq_rl/args.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import distutils.util
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument(
-        "--train_data_prefix", type=str, help="file prefix for train data")
-    parser.add_argument(
-        "--eval_data_prefix", type=str, help="file prefix for eval data")
-    parser.add_argument(
-        "--test_data_prefix", type=str, help="file prefix for test data")
-    parser.add_argument(
-        "--vocab_prefix", type=str, help="file prefix for vocab")
-    parser.add_argument("--src_lang", type=str, help="source language suffix")
-    parser.add_argument("--tar_lang", type=str, help="target language suffix")
-
-    parser.add_argument(
-        "--attention",
-        type=eval,
-        default=False,
-        help="Whether use attention model")
-
-    parser.add_argument(
-        "--optimizer",
-        type=str,
-        default='adam',
-        help="optimizer to use, only support [sgd|adam]")
-
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=0.001,
-        help="learning rate for optimizer")
-
-    parser.add_argument(
-        "--num_layers",
-        type=int,
-        default=1,
-        help="layers number of encoder and decoder")
-    parser.add_argument(
-        "--hidden_size",
-        type=int,
-        default=100,
-        help="hidden size of encoder and decoder")
-    parser.add_argument("--src_vocab_size", type=int, help="source vocab size")
-    parser.add_argument("--tar_vocab_size", type=int, help="target vocab size")
-
-    parser.add_argument(
-        "--batch_size", type=int, help="batch size of each step")
-
-    parser.add_argument(
-        "--max_epoch", type=int, default=12, help="max epoch for the training")
-
-    parser.add_argument(
-        "--max_len",
-        type=int,
-        default=50,
-        help="max length for source and target sentence")
-    parser.add_argument(
-        "--dropout", type=float, default=0.0, help="drop probability")
-    parser.add_argument(
-        "--init_scale",
-        type=float,
-        default=0.0,
-        help="init scale for parameter")
-    parser.add_argument(
-        "--max_grad_norm",
-        type=float,
-        default=5.0,
-        help="max grad norm for global norm clip")
-
-    parser.add_argument(
-        "--model_path",
-        type=str,
-        default='model',
-        help="model path for model to save")
-
-    parser.add_argument(
-        "--reload_model", type=str, help="reload model to inference")
-
-    parser.add_argument(
-        "--infer_file", type=str, help="file name for inference")
-    parser.add_argument(
-        "--infer_output_file",
-        type=str,
-        default='infer_output',
-        help="file name for inference output")
-    parser.add_argument(
-        "--beam_size", type=int, default=10, help="beam size for beam search")
-
-    parser.add_argument(
-        '--use_gpu',
-        type=eval,
-        default=False,
-        help='Whether using gpu [True|False]')
-
-    parser.add_argument(
-        "--enable_ce",
-        action='store_true',
-        help="The flag indicating whether to run the task "
-        "for continuous evaluation.")
-
-    parser.add_argument(
-        "--profile", action='store_true', help="Whether enable the profile.")
-
-    args = parser.parse_args()
-    return args
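Reviewer note: the boolean flags above go through `type=eval`, so `--use_gpu True` is evaluated as a Python literal rather than parsed with `distutils.util.strtobool` (which is imported but never used). A hypothetical smoke test of the module being deleted — the flag values below are illustrative, not project defaults:

```python
# Hypothetical smoke test for the deleted args.py; values are illustrative.
import sys
from args import parse_args

sys.argv = ["main.py", "--src_lang", "en", "--tar_lang", "vi",
            "--batch_size", "32", "--use_gpu", "False"]
args = parse_args()
print(args.use_gpu, args.beam_size)  # -> False 10 ("False" is eval'd to a bool)
```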
diff --git a/PaddleNLP/PaddleTextGEN/seq2seq_rl/main.py b/PaddleNLP/PaddleTextGEN/seq2seq_rl/main.py
deleted file mode 100644
index 96dcb2ee6c7bc112e3d45f6088e11598241ffc44..0000000000000000000000000000000000000000
--- a/PaddleNLP/PaddleTextGEN/seq2seq_rl/main.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import numpy as np
-import paddle.fluid as fluid
-from pg_agent import SeqPGAgent
-from model import Seq2SeqModel, PolicyGradient, reward_func
-import reader
-from args import parse_args
-
-def main():
-    args = parse_args()
-    num_layers = args.num_layers
-    hidden_size = args.hidden_size
-    dropout_prob = args.dropout
-    src_vocab_size = args.src_vocab_size
-    trg_vocab_size = args.tar_vocab_size
-    batch_size = args.batch_size
-    lr = args.learning_rate
-    train_data_prefix = args.train_data_prefix
-    eval_data_prefix = args.eval_data_prefix
-    test_data_prefix = args.test_data_prefix
-    vocab_prefix = args.vocab_prefix
-    src_lang = args.src_lang
-    tar_lang = args.tar_lang
-    print("begin to load data")
-    raw_data = reader.raw_data(src_lang, tar_lang, vocab_prefix,
-                               train_data_prefix, eval_data_prefix,
-                               test_data_prefix, args.max_len)
-    print("finished load data")
-    train_data, valid_data, test_data, _ = raw_data
-
-    def prepare_input(batch, epoch_id=0, teacher_forcing=False):
-        src_ids, src_mask, tar_ids, tar_mask = batch
-        res = {}
-        src_ids = src_ids.reshape((src_ids.shape[0], src_ids.shape[1]))
-        in_tar = tar_ids[:, :-1]
-        label_tar = tar_ids[:, 1:]
-
-        in_tar = in_tar.reshape((in_tar.shape[0], in_tar.shape[1]))
-        label_tar = label_tar.reshape(
-            (label_tar.shape[0], label_tar.shape[1], 1))
-
-        res['src'] = src_ids
-        res['src_sequence_length'] = src_mask
-        if teacher_forcing:
-            res['tar'] = in_tar
-            res['tar_sequence_length'] = tar_mask
-            res['label'] = label_tar
-
-        return res, np.sum(tar_mask)
-
-    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    ### TODO: pretrain the model with teacher-forcing MLE
-
-    ### fine-tune with policy gradient
-    agent = SeqPGAgent(model_cls=Seq2SeqModel,
-                       alg_cls=PolicyGradient,
-                       reward_func=reward_func,
-                       model_hparams={
-                           "num_layers": num_layers,
-                           "hidden_size": hidden_size,
-                           "dropout_prob": dropout_prob,
-                           "src_vocab_size": src_vocab_size,
-                           "trg_vocab_size": trg_vocab_size,
-                           "bos_token": 1,
-                           "eos_token": 2,
-                       },
-                       alg_hparams={"lr": lr},
-                       executor=exe)
-
-    exe.run(fluid.default_startup_program())
-    if args.reload_model:  # load MLE pre-trained model
-        fluid.io.load_params(exe,
-                             dirname=args.reload_model,
-                             main_program=agent.full_program)
-
-    max_epoch = args.max_epoch
-    for epoch_id in range(max_epoch):
-        if args.enable_ce:
-            train_data_iter = reader.get_data_iter(
-                train_data, batch_size, enable_ce=True)
-        else:
-            train_data_iter = reader.get_data_iter(train_data, batch_size)
-        for batch_id, batch in enumerate(train_data_iter):
-            input_data_feed, word_num = prepare_input(batch, epoch_id=epoch_id)
-            reward, cost = agent.learn(input_data_feed)
-            print("epoch_id: %d, batch_id: %d, reward: %f, cost: %f" %
-                  (epoch_id, batch_id, reward.mean(), cost))
-
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
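Reviewer note: each `agent.learn(feed)` call above runs one REINFORCE step: the graph samples a translation, `reward_func` scores it, and the policy-gradient loss is reward-weighted cross-entropy over the sampled token ids. A standalone NumPy sketch of the loss that `PolicyGradient.learn` (in model.py below) assembles symbolically — names and shapes here are illustrative:

```python
import numpy as np

def pg_loss(act_prob, action, reward, length):
    """REINFORCE loss sketch: -log p(sampled token), weighted by the
    (already discounted) per-step reward, normalized by the total number
    of real (non-padding) decoding steps."""
    batch, time = action.shape
    # probability the model assigned to each sampled token
    picked = act_prob[np.arange(batch)[:, None], np.arange(time)[None, :], action]
    cost = -np.log(picked) * reward
    return cost.sum() / length.sum()

# one sequence, one step, vocabulary of 3; sampled token 1 had probability 0.5
probs = np.array([[[0.2, 0.5, 0.3]]])
print(pg_loss(probs, np.array([[1]]), np.array([[1.0]]), np.array([1])))
# -> 0.693... (= -log 0.5)
```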
diff --git a/PaddleNLP/PaddleTextGEN/seq2seq_rl/model.py b/PaddleNLP/PaddleTextGEN/seq2seq_rl/model.py
deleted file mode 100644
index 7484bbe95309abf36fb5b43335b3f8bfd7fcc7cd..0000000000000000000000000000000000000000
--- a/PaddleNLP/PaddleTextGEN/seq2seq_rl/model.py
+++ /dev/null
@@ -1,281 +0,0 @@
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-
-class EncoderCell(layers.RNNCell):
-    def __init__(self, num_layers, hidden_size, dropout_prob=0.):
-        self.num_layers = num_layers
-        self.hidden_size = hidden_size
-        self.dropout_prob = dropout_prob
-        self.lstm_cells = [
-            layers.LSTMCell(hidden_size) for i in range(num_layers)
-        ]
-
-    def call(self, step_input, states):
-        new_states = []
-        for i in range(self.num_layers):
-            out, new_state = self.lstm_cells[i](step_input, states[i])
-            step_input = layers.dropout(
-                out, self.dropout_prob) if self.dropout_prob > 0 else out
-            new_states.append(new_state)
-        return step_input, new_states
-
-    @property
-    def state_shape(self):
-        return [cell.state_shape for cell in self.lstm_cells]
-
-
-class DecoderCell(layers.RNNCell):
-    def __init__(self, num_layers, hidden_size, dropout_prob=0.):
-        self.num_layers = num_layers
-        self.hidden_size = hidden_size
-        self.dropout_prob = dropout_prob
-        self.lstm_cells = [
-            layers.LSTMCell(hidden_size) for i in range(num_layers)
-        ]
-
-    def attention(self, hidden, encoder_output, encoder_padding_mask):
-        query = layers.fc(hidden,
-                          size=encoder_output.shape[-1],
-                          bias_attr=False)
-        attn_scores = layers.matmul(layers.unsqueeze(query, [1]),
-                                    encoder_output,
-                                    transpose_y=True)
-        if encoder_padding_mask is not None:
-            attn_scores = layers.elementwise_add(attn_scores,
-                                                 encoder_padding_mask)
-        attn_scores = layers.softmax(attn_scores)
-        attn_out = layers.squeeze(layers.matmul(attn_scores, encoder_output),
-                                  [1])
-        attn_out = layers.concat([attn_out, hidden], 1)
-        attn_out = layers.fc(attn_out, size=self.hidden_size, bias_attr=False)
-        return attn_out
-
-    def call(self,
-             step_input,
-             states,
-             encoder_output,
-             encoder_padding_mask=None):
-        lstm_states, input_feed = states
-        new_lstm_states = []
-        step_input = layers.concat([step_input, input_feed], 1)
-        for i in range(self.num_layers):
-            out, new_lstm_state = self.lstm_cells[i](step_input, lstm_states[i])
-            step_input = layers.dropout(
-                out, self.dropout_prob) if self.dropout_prob > 0 else out
-            new_lstm_states.append(new_lstm_state)
-        out = self.attention(step_input, encoder_output, encoder_padding_mask)
-        return out, [new_lstm_states, out]
-
-
-class Encoder(object):
-    def __init__(self, num_layers, hidden_size, dropout_prob=0.):
-        self.encoder_cell = EncoderCell(num_layers, hidden_size, dropout_prob)
-
-    def __call__(self, src_emb, src_sequence_length):
-        encoder_output, encoder_final_state = layers.rnn(
-            cell=self.encoder_cell,
-            inputs=src_emb,
-            sequence_length=src_sequence_length,
-            is_reverse=False)
-        return encoder_output, encoder_final_state
-
-
-class Decoder(object):
-    def __init__(self,
-                 num_layers,
-                 hidden_size,
-                 dropout_prob,
-                 decoding_strategy="infer_sample",
-                 max_decoding_length=100):
-        self.decoder_cell = DecoderCell(num_layers, hidden_size, dropout_prob)
-        self.decoding_strategy = decoding_strategy
-        self.max_decoding_length = None if (
-            self.decoding_strategy == "train_greedy") else max_decoding_length
-
-    def __call__(self, decoder_initial_states, encoder_output,
-                 encoder_padding_mask, **kwargs):
-        output_layer = kwargs.pop("output_layer", None)
-        if self.decoding_strategy == "train_greedy":
-            # for teacher-forcing MLE pre-training
-            helper = layers.TrainingHelper(**kwargs)
-        elif self.decoding_strategy == "infer_sample":
-            helper = layers.SampleEmbeddingHelper(**kwargs)
-        elif self.decoding_strategy == "infer_greedy":
-            helper = layers.GreedyEmbeddingHelper(**kwargs)
-        else:
-            # TODO: Add beam_search training support.
-            raise ValueError("Unknown decoding strategy: {}".format(
-                self.decoding_strategy))
-        decoder = layers.BasicDecoder(self.decoder_cell,
-                                      helper,
-                                      output_fn=output_layer)
-        (decoder_output, decoder_final_state,
-         dec_seq_lengths) = layers.dynamic_decode(
-             decoder,
-             inits=decoder_initial_states,
-             max_step_num=self.max_decoding_length,
-             encoder_output=encoder_output,
-             encoder_padding_mask=encoder_padding_mask)
-        return decoder_output, decoder_final_state, dec_seq_lengths
-
-
-class Seq2SeqModel(object):
-    def __init__(self,
-                 num_layers,
-                 hidden_size,
-                 dropout_prob,
-                 src_vocab_size,
-                 trg_vocab_size,
-                 bos_token,
-                 eos_token,
-                 decoding_strategy="infer_sample",
-                 max_decoding_length=100):
-        self.bos_token, self.eos_token = bos_token, eos_token
-        self.src_embeder = lambda x: fluid.embedding(
-            input=x,
-            size=[src_vocab_size, hidden_size],
-            dtype="float32",
-            param_attr=fluid.ParamAttr(name="source_embedding"))
-        self.trg_embeder = lambda x: fluid.embedding(
-            input=x,
-            size=[trg_vocab_size, hidden_size],
-            dtype="float32",
-            param_attr=fluid.ParamAttr(name="target_embedding"))
-        self.encoder = Encoder(num_layers, hidden_size, dropout_prob)
-        self.decoder = Decoder(num_layers, hidden_size, dropout_prob,
-                               decoding_strategy, max_decoding_length)
-        self.output_layer = lambda x: layers.fc(
-            x,
-            size=trg_vocab_size,
-            num_flatten_dims=len(x.shape) - 1,
-            param_attr=fluid.ParamAttr(name="output_w"),
-            bias_attr=False)
-
-    def __call__(self, src, src_length, trg=None, trg_length=None):
-        # encoder
-        encoder_output, encoder_final_state = self.encoder(
-            self.src_embeder(src), src_length)
-
-        decoder_initial_states = [
-            encoder_final_state,
-            self.decoder.decoder_cell.get_initial_states(
-                batch_ref=encoder_output, shape=[encoder_output.shape[-1]])
-        ]
-        src_mask = layers.sequence_mask(src_length,
-                                        maxlen=layers.shape(src)[1],
-                                        dtype="float32")
-        encoder_padding_mask = (src_mask - 1.0) * 1e9
-        encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
-
-        # decoder
-        decoder_kwargs = {
-            "inputs": self.trg_embeder(trg),
-            "sequence_length": trg_length,
-        } if self.decoder.decoding_strategy == "train_greedy" else {
-            "embedding_fn":
-            self.trg_embeder,
-            "start_tokens":
-            layers.fill_constant_batch_size_like(input=encoder_output,
-                                                 shape=[-1],
-                                                 dtype=src.dtype,
-                                                 value=self.bos_token),
-            "end_token":
-            self.eos_token
-        }
-        decoder_kwargs["output_layer"] = self.output_layer
-
-        (decoder_output, decoder_final_state,
-         dec_seq_lengths) = self.decoder(decoder_initial_states, encoder_output,
-                                         encoder_padding_mask, **decoder_kwargs)
-        logits, samples, sample_length = (decoder_output.cell_outputs,
-                                          decoder_output.sample_ids,
-                                          dec_seq_lengths)
-        probs = layers.softmax(logits)
-        return probs, samples, sample_length
-
-
-class PolicyGradient(object):
-    def __init__(self, model, lr=None):
-        self.model = model
-        self.lr = lr
-
-    def predict(self, src, src_length):
-        return self.model(src, src_length)
-
-    def learn(self, act_prob, action, reward, length=None):
-        """
-        Update the policy model (self.model) with the policy gradient algorithm.
-        """
-        neg_log_prob = layers.cross_entropy(act_prob, action)
-        cost = neg_log_prob * reward
-        cost = (layers.reduce_sum(cost) / layers.reduce_sum(length)
-                ) if length is not None else layers.reduce_mean(cost)
-        optimizer = fluid.optimizer.Adam(self.lr)
-        optimizer.minimize(cost)
-        return cost
-
-
-def reward_func(samples, sample_length):
-    samples = np.array(samples)
-    sample_length = np.array(sample_length)
-    reward = (10 - np.abs(sample_length - 10)).astype("float32")
-    return discount_reward(reward, sample_length, discount=1.).astype("float32")
-
-
-def discount_reward(reward, sequence_length, discount=1.):
-    return discount_reward_1d(reward, sequence_length, discount)
-
-
-def discount_reward_1d(reward, sequence_length, discount=1., dtype=None):
-    if sequence_length is None:
-        raise ValueError('sequence_length must not be `None` for 1D reward.')
-
-    reward = np.array(reward)
-    sequence_length = np.array(sequence_length)
-
-    batch_size = reward.shape[0]
-    max_seq_length = np.max(sequence_length)
-    dtype = dtype or reward.dtype
-
-    if discount == 1.:
-        dmat = np.ones([batch_size, max_seq_length], dtype=dtype)
-    else:
-        steps = np.tile(np.arange(max_seq_length), [batch_size, 1])
-        mask = np.asarray(steps < (sequence_length - 1)[:, None], dtype=dtype)
-        # Make each row = [discount, ..., discount, 1, ..., 1]
-        dmat = mask * discount + (1 - mask)
-        dmat = np.cumprod(dmat[:, ::-1], axis=1)[:, ::-1]
-
-    disc_reward = dmat * reward[:, None]
-    disc_reward = mask_sequences(disc_reward, sequence_length, dtype=dtype)
-
-    return disc_reward
-
-
-def mask_sequences(sequence, sequence_length, dtype=None, time_major=False):
-    sequence = np.array(sequence)
-    sequence_length = np.array(sequence_length)
-
-    rank = sequence.ndim
-    if rank < 2:
-        raise ValueError("`sequence` must be 2D or higher order.")
-    batch_size = sequence.shape[0]
-    max_time = sequence.shape[1]
-    dtype = dtype or sequence.dtype
-
-    if time_major:
-        sequence = np.transpose(sequence, axes=[1, 0, 2])
-
-    steps = np.tile(np.arange(max_time), [batch_size, 1])
-    mask = np.asarray(steps < sequence_length[:, None], dtype=dtype)
-    for _ in range(2, rank):
-        mask = np.expand_dims(mask, -1)
-
-    sequence = sequence * mask
-
-    if time_major:
-        sequence = np.transpose(sequence, axes=[1, 0, 2])
-
-    return sequence
\ No newline at end of file
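Reviewer note: `reward_func` scores a sample by how close its length is to 10 tokens (`10 - |len - 10|`), and `discount_reward_1d` spreads a terminal reward back over the steps: step `t` of a length-`L` sequence receives `reward * discount ** (L - 1 - t)`, with padding positions masked to zero. A quick NumPy check of the `dmat` construction above, with hand-computed values for discount 0.5, length 4, max length 5:

```python
import numpy as np

reward = np.array([1.0])
length = np.array([4])
# reproduces the dmat construction from discount_reward_1d above
steps = np.tile(np.arange(5), [1, 1])
mask = (steps < (length - 1)[:, None]).astype("float32")
dmat = mask * 0.5 + (1 - mask)                     # [0.5, 0.5, 0.5, 1, 1]
dmat = np.cumprod(dmat[:, ::-1], axis=1)[:, ::-1]  # [0.125, 0.25, 0.5, 1, 1]
print(dmat * reward[:, None] * (steps < length[:, None]))
# -> [[0.125 0.25  0.5   1.    0.   ]]  (step 4 is padding, masked to 0)
```

Note that `reward_func` calls this with `discount=1.`, so in practice every real step just receives the raw length-based reward.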
value="0", - dtype=self.probs.dtype) - self.reward = fluid.layers.py_func( - func=reward_func, - x=[self.samples, self.sample_length], - out=reward) - self.cost = self.alg.learn(self.probs, self.samples, self.reward, - self.sample_length) - - # to define the same parameters between different programs - self.pred_program = self.full_program._prune_with_input( - [source.name, source_length.name], - [self.probs, self.samples, self.sample_length]) - - def predict(self, feed_dict): - samples, sample_length = self.executor.run( - self.pred_program, - feed=feed_dict, - fetch_list=[self.samples, self.sample_length]) - return samples, sample_length - - def learn(self, feed_dict): - reward, cost = self.executor.run(self.full_program, - feed=feed_dict, - fetch_list=[self.reward, self.cost]) - return reward, cost diff --git a/PaddleNLP/PaddleTextGEN/seq2seq_rl/reader.py b/PaddleNLP/PaddleTextGEN/seq2seq_rl/reader.py deleted file mode 100644 index 18f17067236b3578c6eb8a17e5ab87ec74e320d6..0000000000000000000000000000000000000000 --- a/PaddleNLP/PaddleTextGEN/seq2seq_rl/reader.py +++ /dev/null @@ -1,216 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utilities for parsing PTB text files.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import os -import io -import sys -import numpy as np - -Py3 = sys.version_info[0] == 3 - -UNK_ID = 0 - - -def _read_words(filename): - data = [] - with io.open(filename, "r", encoding='utf-8') as f: - if Py3: - return f.read().replace("\n", "").split() - else: - return f.read().decode("utf-8").replace(u"\n", u"").split() - - -def read_all_line(filename): - data = [] - with io.open(filename, "r", encoding='utf-8') as f: - for line in f.readlines(): - data.append(line.strip()) - - -def _build_vocab(filename): - - vocab_dict = {} - ids = 0 - with io.open(filename, "r", encoding='utf-8') as f: - for line in f.readlines(): - vocab_dict[line.strip()] = ids - ids += 1 - - print("vocab word num", ids) - - return vocab_dict - - -def _para_file_to_ids(src_file, tar_file, src_vocab, tar_vocab): - - src_data = [] - with io.open(src_file, "r", encoding='utf-8') as f_src: - for line in f_src.readlines(): - arra = line.strip().split() - ids = [src_vocab[w] if w in src_vocab else UNK_ID for w in arra] - ids = ids - - src_data.append(ids) - - tar_data = [] - with io.open(tar_file, "r", encoding='utf-8') as f_tar: - for line in f_tar.readlines(): - arra = line.strip().split() - ids = [tar_vocab[w] if w in tar_vocab else UNK_ID for w in arra] - - ids = [1] + ids + [2] - - tar_data.append(ids) - - return src_data, tar_data - - -def filter_len(src, tar, max_sequence_len=50): - new_src = [] - new_tar = [] - - for id1, id2 in zip(src, tar): - if len(id1) > max_sequence_len: - id1 = id1[:max_sequence_len] - if len(id2) > max_sequence_len + 2: - id2 = id2[:max_sequence_len + 2] - - new_src.append(id1) - 
diff --git a/PaddleNLP/PaddleTextGEN/seq2seq_rl/reader.py b/PaddleNLP/PaddleTextGEN/seq2seq_rl/reader.py
deleted file mode 100644
index 18f17067236b3578c6eb8a17e5ab87ec74e320d6..0000000000000000000000000000000000000000
--- a/PaddleNLP/PaddleTextGEN/seq2seq_rl/reader.py
+++ /dev/null
@@ -1,216 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Utilities for parsing PTB text files."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import os
-import io
-import sys
-import numpy as np
-
-Py3 = sys.version_info[0] == 3
-
-UNK_ID = 0
-
-
-def _read_words(filename):
-    data = []
-    with io.open(filename, "r", encoding='utf-8') as f:
-        if Py3:
-            return f.read().replace("\n", "").split()
-        else:
-            return f.read().decode("utf-8").replace(u"\n", u"").split()
-
-
-def read_all_line(filename):
-    data = []
-    with io.open(filename, "r", encoding='utf-8') as f:
-        for line in f.readlines():
-            data.append(line.strip())
-
-
-def _build_vocab(filename):
-
-    vocab_dict = {}
-    ids = 0
-    with io.open(filename, "r", encoding='utf-8') as f:
-        for line in f.readlines():
-            vocab_dict[line.strip()] = ids
-            ids += 1
-
-    print("vocab word num", ids)
-
-    return vocab_dict
-
-
-def _para_file_to_ids(src_file, tar_file, src_vocab, tar_vocab):
-
-    src_data = []
-    with io.open(src_file, "r", encoding='utf-8') as f_src:
-        for line in f_src.readlines():
-            arra = line.strip().split()
-            ids = [src_vocab[w] if w in src_vocab else UNK_ID for w in arra]
-            ids = ids
-
-            src_data.append(ids)
-
-    tar_data = []
-    with io.open(tar_file, "r", encoding='utf-8') as f_tar:
-        for line in f_tar.readlines():
-            arra = line.strip().split()
-            ids = [tar_vocab[w] if w in tar_vocab else UNK_ID for w in arra]
-
-            ids = [1] + ids + [2]
-
-            tar_data.append(ids)
-
-    return src_data, tar_data
-
-
-def filter_len(src, tar, max_sequence_len=50):
-    new_src = []
-    new_tar = []
-
-    for id1, id2 in zip(src, tar):
-        if len(id1) > max_sequence_len:
-            id1 = id1[:max_sequence_len]
-        if len(id2) > max_sequence_len + 2:
-            id2 = id2[:max_sequence_len + 2]
-
-        new_src.append(id1)
-        new_tar.append(id2)
-
-    return new_src, new_tar
-
-
-def raw_data(src_lang,
-             tar_lang,
-             vocab_prefix,
-             train_prefix,
-             eval_prefix,
-             test_prefix,
-             max_sequence_len=50):
-
-    src_vocab_file = vocab_prefix + "." + src_lang
-    tar_vocab_file = vocab_prefix + "." + tar_lang
-
-    src_train_file = train_prefix + "." + src_lang
-    tar_train_file = train_prefix + "." + tar_lang
-
-    src_eval_file = eval_prefix + "." + src_lang
-    tar_eval_file = eval_prefix + "." + tar_lang
-
-    src_test_file = test_prefix + "." + src_lang
-    tar_test_file = test_prefix + "." + tar_lang
-
-    src_vocab = _build_vocab(src_vocab_file)
-    tar_vocab = _build_vocab(tar_vocab_file)
-
-    train_src, train_tar = _para_file_to_ids(src_train_file, tar_train_file, \
-                                             src_vocab, tar_vocab)
-    train_src, train_tar = filter_len(train_src,
-                                      train_tar,
-                                      max_sequence_len=max_sequence_len)
-    eval_src, eval_tar = _para_file_to_ids(src_eval_file, tar_eval_file, \
-                                           src_vocab, tar_vocab)
-
-    test_src, test_tar = _para_file_to_ids(src_test_file, tar_test_file, \
-                                           src_vocab, tar_vocab)
-
-    return (train_src, train_tar), (eval_src, eval_tar), (test_src, test_tar), \
-        (src_vocab, tar_vocab)
-
-
-def raw_mono_data(vocab_file, file_path):
-
-    src_vocab = _build_vocab(vocab_file)
-
-    test_src, test_tar = _para_file_to_ids(file_path, file_path, \
-                                           src_vocab, src_vocab)
-
-    return (test_src, test_tar)
-
-
-def get_data_iter(raw_data,
-                  batch_size,
-                  mode='train',
-                  enable_ce=False,
-                  cache_num=20):
-
-    src_data, tar_data = raw_data
-
-    data_len = len(src_data)
-
-    index = np.arange(data_len)
-    if mode == "train" and not enable_ce:
-        np.random.shuffle(index)
-
-    def to_pad_np(data, source=False):
-        max_len = 0
-        for ele in data:
-            if len(ele) > max_len:
-                max_len = len(ele)
-        batch_size = len(data)
-        ids = np.ones((batch_size, max_len), dtype='int64') * 2
-        mask = np.zeros((batch_size), dtype='int64')
-
-        for i, ele in enumerate(data):
-            ids[i, :len(ele)] = ele
-            if not source:
-                mask[i] = len(ele) - 1
-            else:
-                mask[i] = len(ele)
-
-        return ids, mask
-
-    b_src = []
-
-    if mode != "train":
-        cache_num = 1
-    for j in range(data_len):
-        if len(b_src) == batch_size * cache_num:
-            # build batch size
-
-            # sort
-            new_cache = sorted(
-                b_src, key=lambda k: len(k[0])) if mode == "train" else b_src
-
-            for i in range(cache_num):
-                batch_data = new_cache[i * batch_size:(i + 1) * batch_size]
-                src_cache = [w[0] for w in batch_data]
-                tar_cache = [w[1] for w in batch_data]
-                src_ids, src_mask = to_pad_np(src_cache, source=True)
-                tar_ids, tar_mask = to_pad_np(tar_cache)
-                yield (src_ids, src_mask, tar_ids, tar_mask)
-
-            b_src = []
-
-        b_src.append((src_data[index[j]], tar_data[index[j]]))
-
-    new_cache = sorted(b_src,
-                       key=lambda k: len(k[0])) if mode == "train" else b_src
-    for i in range(cache_num):
-        batch_data = new_cache[i * batch_size:(i + 1) * batch_size]
-        if len(batch_data) < batch_size:
-            if mode == "train" or len(batch_data) == 0:
-                continue
-        src_cache = [w[0] for w in batch_data]
-        tar_cache = [w[1] for w in batch_data]
-        src_ids, src_mask = to_pad_np(src_cache, source=True)
-        tar_ids, tar_mask = to_pad_np(tar_cache)
-        yield (src_ids, src_mask, tar_ids, tar_mask)
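Reviewer note: `to_pad_np` right-pads every batch with id 2 (the same id `_para_file_to_ids` appends to targets as EOS), and the masks differ by role: source masks hold the true length, while target masks hold `length - 1` because `prepare_input` in main.py splits targets into a decoder input (drops the last token) and a label (drops the first). A standalone rerun of the padding logic:

```python
import numpy as np

def to_pad_np(data, source=False):
    """Minimal copy of the inner helper from get_data_iter above."""
    max_len = max(len(ele) for ele in data)
    ids = np.ones((len(data), max_len), dtype="int64") * 2  # pad with id 2 (EOS)
    mask = np.zeros((len(data),), dtype="int64")
    for i, ele in enumerate(data):
        ids[i, :len(ele)] = ele
        mask[i] = len(ele) if source else len(ele) - 1
    return ids, mask

src_ids, src_mask = to_pad_np([[5, 6, 7], [8, 9]], source=True)
print(src_ids)   # [[5 6 7] [8 9 2]]
print(src_mask)  # [3 2]
```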