diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..808d4516c57593210988542150b74c671e41d5da
--- /dev/null
+++ b/examples/seq2seq/README.md
@@ -0,0 +1,165 @@
+Running the example model in this directory requires PaddlePaddle Fluid 1.7. If your installed version of PaddlePaddle is lower than this, please update it following the instructions in the [installation documentation](https://www.paddlepaddle.org.cn/#quick-start).
+
+# Sequence to Sequence (Seq2Seq)
+
+The directory structure of this example and a brief description of each file:
+
+```
+.
+├── README.md         # This document
+├── args.py           # Configuration of training, prediction and model arguments
+├── reader.py         # Data reading
+├── download.py       # Data downloading
+├── train.py          # Main training script
+├── predict.py        # Main prediction script
+├── seq2seq_attn.py   # Translation model with attention
+└── seq2seq_base.py   # Translation model without attention
+```
+
+## Introduction
+
+Sequence to Sequence (Seq2Seq) uses an encoder-decoder architecture: an encoder encodes the source sequence into a vector, and a decoder decodes that vector into the target sequence. Seq2Seq is widely used in tasks such as machine translation, dialogue systems, automatic document summarization, and image captioning.
+
+This directory contains a classic Seq2Seq application, machine translation, with two implementations: a base model (without attention) and a translation model with attention. A Seq2Seq translation model mimics how humans translate: first parse the source sentence and understand its meaning, then produce a target-language sentence expressing that meaning. For the underlying theory and mathematical formulation of machine translation, we recommend the [machine translation case study](https://www.paddlepaddle.org.cn/documentation/docs/zh/user_guides/nlp_case/machine_translation/README.cn.html) on the PaddlePaddle website.
+
+## Model Overview
+
+In this model, the encoder is a multi-layer LSTM-based RNN encoder; the decoder is an RNN decoder with attention, and a decoder without attention is also provided for comparison. At prediction time, beam search is used to generate the target sentence.
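+
+As a rough illustration of the (Luong-style) attention used by the decoder, the following NumPy sketch scores a decoder hidden state against the encoder outputs and builds a context vector. It is a simplified stand-in for `AttentionLayer` in `seq2seq_attn.py`: the learned input/output projections are omitted, and only the score/softmax/weighted-sum flow is kept:
+
+```python
+import numpy as np
+
+def softmax(x, axis=-1):
+    e = np.exp(x - x.max(axis=axis, keepdims=True))
+    return e / e.sum(axis=axis, keepdims=True)
+
+def attention_step(hidden, encoder_output, padding_mask=None):
+    """hidden: [batch, hidden]; encoder_output: [batch, src_len, hidden]."""
+    # dot-product score between the decoder state and every encoder position
+    scores = np.einsum("bh,bsh->bs", hidden, encoder_output)
+    if padding_mask is not None:
+        scores += padding_mask  # large negative values on padding positions
+    weights = softmax(scores)  # [batch, src_len]
+    # weighted sum of encoder outputs gives the context vector
+    context = np.einsum("bs,bsh->bh", weights, encoder_output)
+    # the real model concatenates context and hidden, then projects back
+    return np.concatenate([context, hidden], axis=-1)  # [batch, 2*hidden]
+```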
+
+## Data
+
+This tutorial uses the English-to-Vietnamese portion of the [IWSLT'15 English-Vietnamese data](https://nlp.stanford.edu/projects/nmt/) as the training corpus, with tst2012 as the development set and tst2013 as the test set.
+
+### Obtaining the Data
+
+```
+python download.py
+```
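+
+After the script finishes, the following files (the list in `download.py`) should be present under `data/en-vi`:
+
+```
+train.en  train.vi  tst2012.en  tst2012.vi  tst2013.en  tst2013.vi  vocab.en  vocab.vi
+```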
+
+## Training
+
+Run the following command to train the Seq2Seq translation model with attention:
+
+```sh
+export CUDA_VISIBLE_DEVICES=0
+
+python train.py \
+ --src_lang en --tar_lang vi \
+ --attention True \
+ --num_layers 2 \
+ --hidden_size 512 \
+ --src_vocab_size 17191 \
+ --tar_vocab_size 7709 \
+ --batch_size 128 \
+ --dropout 0.2 \
+ --init_scale 0.1 \
+ --max_grad_norm 5.0 \
+ --train_data_prefix data/en-vi/train \
+ --eval_data_prefix data/en-vi/tst2012 \
+ --test_data_prefix data/en-vi/tst2013 \
+ --vocab_prefix data/en-vi/vocab \
+ --use_gpu True \
+ --model_path ./attention_models
+```
+
+You can train the Seq2Seq model without attention by setting the `attention` argument to False; see `args.py` for a description of each argument. The training program saves the model once at the end of each epoch.
+
+Training runs in dynamic graph (eager) mode by default; set the `eager_run` argument to False to train in static graph mode, as follows:
+
+```sh
+export CUDA_VISIBLE_DEVICES=0
+
+python train.py \
+ --src_lang en --tar_lang vi \
+ --attention True \
+ --num_layers 2 \
+ --hidden_size 512 \
+ --src_vocab_size 17191 \
+ --tar_vocab_size 7709 \
+ --batch_size 128 \
+ --dropout 0.2 \
+ --init_scale 0.1 \
+ --max_grad_norm 5.0 \
+ --train_data_prefix data/en-vi/train \
+ --eval_data_prefix data/en-vi/tst2012 \
+ --test_data_prefix data/en-vi/tst2013 \
+ --vocab_prefix data/en-vi/vocab \
+ --use_gpu True \
+ --model_path ./attention_models \
+ --eager_run False
+```
+
+## Prediction
+
+After training completes, the saved model (specified by `--reload_model`) can be used to decode the test set (specified by `--infer_file`) with beam search, as follows:
+
+```sh
+export CUDA_VISIBLE_DEVICES=0
+
+python predict.py \
+ --attention True \
+ --src_lang en --tar_lang vi \
+ --num_layers 2 \
+ --hidden_size 512 \
+ --src_vocab_size 17191 \
+ --tar_vocab_size 7709 \
+ --batch_size 128 \
+ --dropout 0.2 \
+ --init_scale 0.1 \
+ --max_grad_norm 5.0 \
+ --vocab_prefix data/en-vi/vocab \
+ --infer_file data/en-vi/tst2013.en \
+ --reload_model attention_models/10 \
+ --infer_output_file infer_output.txt \
+ --beam_size 10 \
+ --use_gpu True
+```
+
+See `args.py` for a description of each argument; note that the model hyperparameters used for prediction must match those used in training. As with training, prediction can also run in static graph mode, as follows:
+
+```sh
+export CUDA_VISIBLE_DEVICES=0
+
+python predict.py \
+ --attention True \
+ --src_lang en --tar_lang vi \
+ --num_layers 2 \
+ --hidden_size 512 \
+ --src_vocab_size 17191 \
+ --tar_vocab_size 7709 \
+ --batch_size 128 \
+ --dropout 0.2 \
+ --init_scale 0.1 \
+ --max_grad_norm 5.0 \
+ --vocab_prefix data/en-vi/vocab \
+ --infer_file data/en-vi/tst2013.en \
+ --reload_model attention_models/10 \
+ --infer_output_file infer_output.txt \
+ --beam_size 10 \
+ --use_gpu True \
+ --eager_run False
+```
+
+## Evaluation
+
+Use the [*multi-bleu.perl*](https://github.com/moses-smt/mosesdecoder.git) script to evaluate the quality of the predicted translations, as follows:
+
+```sh
+mosesdecoder/scripts/generic/multi-bleu.perl tst2013.vi < infer_output.txt
+```
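+
+If Perl is unavailable, the score can be approximated in Python. Below is a minimal sketch using NLTK (an assumed extra dependency, not part of this example); tokenization differences mean it only approximates multi-bleu.perl, and `corpus_bleu` returns a value in [0, 1] where multi-bleu.perl reports it scaled by 100:
+
+```python
+# pip install nltk  (assumed dependency, not used elsewhere in this example)
+from nltk.translate.bleu_score import corpus_bleu
+
+with open("data/en-vi/tst2013.vi") as f:
+    references = [[line.split()] for line in f]  # one reference per sentence
+with open("infer_output.txt") as f:
+    hypotheses = [line.split() for line in f]
+
+print("BLEU: %.2f" % (corpus_bleu(references, hypotheses) * 100))
+```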
+
+Each model was trained 10 times; for each run, the model saved at the 10th epoch was used for prediction with beam_size=10. The results are as follows (the 10 runs are sorted in ascending order for readability):
+
+```
+> no attention
+tst2012 BLEU:
+[10.75 10.85 10.9 10.94 10.97 11.01 11.01 11.04 11.13 11.4]
+tst2013 BLEU:
+[10.71 10.71 10.74 10.76 10.91 10.94 11.02 11.16 11.21 11.44]
+
+> with attention
+tst2012 BLEU:
+[21.14 22.34 22.54 22.65 22.71 22.71 23.08 23.15 23.3 23.4]
+tst2013 BLEU:
+[23.41 24.79 25.11 25.12 25.19 25.24 25.39 25.61 25.61 25.63]
+```
diff --git a/examples/seq2seq/args.py b/examples/seq2seq/args.py
new file mode 100644
index 0000000000000000000000000000000000000000..94b07cd2cfed8a87c6b9bc9b95168ad7f0f6b924
--- /dev/null
+++ b/examples/seq2seq/args.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import distutils.util
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--train_data_prefix", type=str, help="file prefix for train data")
+ parser.add_argument(
+ "--eval_data_prefix", type=str, help="file prefix for eval data")
+ parser.add_argument(
+ "--test_data_prefix", type=str, help="file prefix for test data")
+ parser.add_argument(
+ "--vocab_prefix", type=str, help="file prefix for vocab")
+ parser.add_argument("--src_lang", type=str, help="source language suffix")
+ parser.add_argument("--tar_lang", type=str, help="target language suffix")
+
+ parser.add_argument(
+ "--attention",
+ type=eval,
+ default=False,
+ help="Whether use attention model")
+
+ parser.add_argument(
+ "--optimizer",
+ type=str,
+ default='adam',
+ help="optimizer to use, only supprt[sgd|adam]")
+
+ parser.add_argument(
+ "--learning_rate",
+ type=float,
+ default=0.001,
+ help="learning rate for optimizer")
+
+ parser.add_argument(
+ "--num_layers",
+ type=int,
+ default=1,
+ help="layers number of encoder and decoder")
+ parser.add_argument(
+ "--hidden_size",
+ type=int,
+ default=100,
+ help="hidden size of encoder and decoder")
+ parser.add_argument("--src_vocab_size", type=int, help="source vocab size")
+ parser.add_argument("--tar_vocab_size", type=int, help="target vocab size")
+
+ parser.add_argument(
+ "--batch_size", type=int, help="batch size of each step")
+
+ parser.add_argument(
+ "--max_epoch", type=int, default=12, help="max epoch for the training")
+
+ parser.add_argument(
+ "--max_len",
+ type=int,
+ default=50,
+ help="max length for source and target sentence")
+ parser.add_argument(
+ "--dropout", type=float, default=0.0, help="drop probability")
+ parser.add_argument(
+ "--init_scale",
+ type=float,
+ default=0.0,
+ help="init scale for parameter")
+ parser.add_argument(
+ "--max_grad_norm",
+ type=float,
+ default=5.0,
+ help="max grad norm for global norm clip")
+
+ parser.add_argument(
+ "--log_freq",
+ type=int,
+ default=100,
+ help="The frequency to print training logs")
+
+ parser.add_argument(
+ "--model_path",
+ type=str,
+ default='model',
+ help="model path for model to save")
+
+ parser.add_argument(
+ "--reload_model", type=str, help="reload model to inference")
+
+ parser.add_argument(
+ "--infer_file", type=str, help="file name for inference")
+ parser.add_argument(
+ "--infer_output_file",
+ type=str,
+ default='infer_output',
+ help="file name for inference output")
+ parser.add_argument(
+ "--beam_size", type=int, default=10, help="file name for inference")
+
+ parser.add_argument(
+ '--use_gpu',
+ type=eval,
+ default=False,
+        help='Whether to use gpu [True|False]')
+
+ parser.add_argument(
+ '--eager_run', type=eval, default=False, help='Whether to use dygraph')
+
+ parser.add_argument(
+ "--enable_ce",
+ action='store_true',
+ help="The flag indicating whether to run the task "
+ "for continuous evaluation.")
+
+ parser.add_argument(
+ "--profile", action='store_true', help="Whether enable the profile.")
+ # NOTE: profiler args, used for benchmark
+ parser.add_argument(
+ "--profiler_path",
+ type=str,
+ default='./seq2seq.profile',
+ help="the profiler output file path. (used for benchmark)")
+ args = parser.parse_args()
+ return args
diff --git a/examples/seq2seq/download.py b/examples/seq2seq/download.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d2981f452f832fb70e6962d0f5aec7db967c2d9
--- /dev/null
+++ b/examples/seq2seq/download.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+Script for downloading training data.
+'''
+import os
+import sys
+import urllib
+
+if sys.version_info >= (3, 0):
+    import urllib.request
+
+URLLIB = urllib.request if sys.version_info >= (3, 0) else urllib
+
+remote_path = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi'
+base_path = 'data'
+tar_path = os.path.join(base_path, 'en-vi')
+filenames = [
+ 'train.en', 'train.vi', 'tst2012.en', 'tst2012.vi', 'tst2013.en',
+ 'tst2013.vi', 'vocab.en', 'vocab.vi'
+]
+
+
+def main(arguments):
+ print("Downloading data......")
+
+ if not os.path.exists(tar_path):
+ if not os.path.exists(base_path):
+ os.mkdir(base_path)
+ os.mkdir(tar_path)
+
+ for filename in filenames:
+ url = remote_path + '/' + filename
+ tar_file = os.path.join(tar_path, filename)
+ URLLIB.urlretrieve(url, tar_file)
+ print("Downloaded sucess......")
+
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
diff --git a/examples/seq2seq/predict.py b/examples/seq2seq/predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1e3e87fddf05d453ed984a49d42fcac0f833cab
--- /dev/null
+++ b/examples/seq2seq/predict.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import io
+import random
+from functools import partial
+
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.layers.utils import flatten
+from paddle.fluid.io import DataLoader
+
+from hapi.model import Input, set_device
+from args import parse_args
+from seq2seq_base import BaseInferModel
+from seq2seq_attn import AttentionInferModel
+from reader import Seq2SeqDataset, Seq2SeqBatchSampler, SortType, prepare_infer_input
+
+
+def post_process_seq(seq, bos_idx, eos_idx, output_bos=False,
+ output_eos=False):
+ """
+ Post-process the decoded sequence.
+ """
+ eos_pos = len(seq) - 1
+ for i, idx in enumerate(seq):
+ if idx == eos_idx:
+ eos_pos = i
+ break
+ seq = [
+ idx for idx in seq[:eos_pos + 1]
+ if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)
+ ]
+ return seq
+
+
+def do_predict(args):
+ device = set_device("gpu" if args.use_gpu else "cpu")
+    if args.eager_run:
+        fluid.enable_dygraph(device)
+
+ # define model
+ inputs = [
+ Input(
+ [None, None], "int64", name="src_word"),
+ Input(
+ [None], "int64", name="src_length"),
+ ]
+
+ # def dataloader
+ dataset = Seq2SeqDataset(
+ fpattern=args.infer_file,
+ src_vocab_fpath=args.vocab_prefix + "." + args.src_lang,
+ trg_vocab_fpath=args.vocab_prefix + "." + args.tar_lang,
+ token_delimiter=None,
+ start_mark="",
+ end_mark="",
+ unk_mark="")
+ trg_idx2word = Seq2SeqDataset.load_dict(
+ dict_path=args.vocab_prefix + "." + args.tar_lang, reverse=True)
+    (args.src_vocab_size, args.tar_vocab_size, bos_id, eos_id,
+ unk_id) = dataset.get_vocab_summary()
+ batch_sampler = Seq2SeqBatchSampler(
+ dataset=dataset, use_token_batch=False, batch_size=args.batch_size)
+ data_loader = DataLoader(
+ dataset=dataset,
+ batch_sampler=batch_sampler,
+ places=device,
+ feed_list=None
+ if fluid.in_dygraph_mode() else [x.forward() for x in inputs],
+ collate_fn=partial(
+ prepare_infer_input, bos_id=bos_id, eos_id=eos_id, pad_id=eos_id),
+ num_workers=0,
+ return_list=True)
+
+ model_maker = AttentionInferModel if args.attention else BaseInferModel
+ model = model_maker(
+ args.src_vocab_size,
+ args.tar_vocab_size,
+ args.hidden_size,
+ args.hidden_size,
+ args.num_layers,
+ args.dropout,
+ bos_id=bos_id,
+ eos_id=eos_id,
+ beam_size=args.beam_size,
+ max_out_len=256)
+
+ model.prepare(inputs=inputs)
+
+ # load the trained model
+ assert args.reload_model, (
+ "Please set reload_model to load the infer model.")
+ model.load(args.reload_model)
+
+ # TODO(guosheng): use model.predict when support variant length
+ with io.open(args.infer_output_file, 'w', encoding='utf-8') as f:
+ for data in data_loader():
+ finished_seq = model.test_batch(inputs=flatten(data))[0]
+ finished_seq = finished_seq[:, :, np.newaxis] if len(
+ finished_seq.shape) == 2 else finished_seq
+ finished_seq = np.transpose(finished_seq, [0, 2, 1])
+ for ins in finished_seq:
+ for beam_idx, beam in enumerate(ins):
+ id_list = post_process_seq(beam, bos_id, eos_id)
+ word_list = [trg_idx2word[id] for id in id_list]
+ sequence = " ".join(word_list) + "\n"
+ f.write(sequence)
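+                    # keep only the top-ranked beam for each instance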
+ break
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ do_predict(args)
diff --git a/examples/seq2seq/reader.py b/examples/seq2seq/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..afa88a81058e6304af7e7cbd74a839fb2e46bd41
--- /dev/null
+++ b/examples/seq2seq/reader.py
@@ -0,0 +1,437 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import six
+import os
+import io
+import itertools
+from functools import partial
+
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.fluid.io import BatchSampler, DataLoader, Dataset
+
+
+def create_data_loader(args, device, for_train=True):
+ data_loaders = [None, None]
+ data_prefixes = [args.train_data_prefix, args.eval_data_prefix
+ ] if args.eval_data_prefix else [args.train_data_prefix]
+ for i, data_prefix in enumerate(data_prefixes):
+ dataset = Seq2SeqDataset(
+ fpattern=data_prefix + "." + args.src_lang,
+ trg_fpattern=data_prefix + "." + args.tar_lang,
+ src_vocab_fpath=args.vocab_prefix + "." + args.src_lang,
+ trg_vocab_fpath=args.vocab_prefix + "." + args.tar_lang,
+ token_delimiter=None,
+ start_mark="",
+ end_mark="",
+ unk_mark="",
+ max_length=args.max_len if i == 0 else None,
+ truncate=True,
+ trg_add_bos_eos=True)
+ (args.src_vocab_size, args.tar_vocab_size, bos_id, eos_id,
+ unk_id) = dataset.get_vocab_summary()
+ batch_sampler = Seq2SeqBatchSampler(
+ dataset=dataset,
+ use_token_batch=False,
+ batch_size=args.batch_size,
+ pool_size=args.batch_size * 20,
+ sort_type=SortType.POOL,
+ shuffle=False if args.enable_ce else True,
+ distribute_mode=True if i == 0 else False)
+ data_loader = DataLoader(
+ dataset=dataset,
+ batch_sampler=batch_sampler,
+ places=device,
+ collate_fn=partial(
+ prepare_train_input,
+ bos_id=bos_id,
+ eos_id=eos_id,
+ pad_id=eos_id),
+ num_workers=0,
+ return_list=True)
+ data_loaders[i] = data_loader
+ return data_loaders
+
+
+def prepare_train_input(insts, bos_id, eos_id, pad_id):
+ src, src_length = pad_batch_data(
+ [inst[0] for inst in insts], pad_id=pad_id)
+ trg, trg_length = pad_batch_data(
+ [inst[1] for inst in insts], pad_id=pad_id)
+ trg_length = trg_length - 1
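+    # decoder input is trg[:, :-1] and the label is trg[:, 1:], i.e. the
+    # target shifted by one position (teacher forcing)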
+ return src, src_length, trg[:, :-1], trg_length, trg[:, 1:, np.newaxis]
+
+
+def prepare_infer_input(insts, bos_id, eos_id, pad_id):
+ src, src_length = pad_batch_data(insts, pad_id=pad_id)
+ return src, src_length
+
+
+def pad_batch_data(insts, pad_id):
+ """
+ Pad the instances to the max sequence length in batch, and generate the
+ corresponding position data and attention bias.
+ """
+ inst_lens = np.array([len(inst) for inst in insts], dtype="int64")
+ max_len = np.max(inst_lens)
+ inst_data = np.array(
+ [inst + [pad_id] * (max_len - len(inst)) for inst in insts],
+ dtype="int64")
+ return inst_data, inst_lens
+
+
+class SortType(object):
+ GLOBAL = 'global'
+ POOL = 'pool'
+ NONE = "none"
+
+
+class Converter(object):
+ def __init__(self, vocab, beg, end, unk, delimiter, add_beg, add_end):
+ self._vocab = vocab
+ self._beg = beg
+ self._end = end
+ self._unk = unk
+ self._delimiter = delimiter
+ self._add_beg = add_beg
+ self._add_end = add_end
+
+ def __call__(self, sentence):
+ return ([self._beg] if self._add_beg else []) + [
+ self._vocab.get(w, self._unk)
+ for w in sentence.split(self._delimiter)
+ ] + ([self._end] if self._add_end else [])
+
+
+class ComposedConverter(object):
+ def __init__(self, converters):
+ self._converters = converters
+
+ def __call__(self, fields):
+ return [
+ converter(field)
+ for field, converter in zip(fields, self._converters)
+ ]
+
+
+class SentenceBatchCreator(object):
+ def __init__(self, batch_size):
+ self.batch = []
+ self._batch_size = batch_size
+
+ def append(self, info):
+ self.batch.append(info)
+ if len(self.batch) == self._batch_size:
+ tmp = self.batch
+ self.batch = []
+ return tmp
+
+
+class TokenBatchCreator(object):
+ def __init__(self, batch_size):
+ self.batch = []
+ self.max_len = -1
+ self._batch_size = batch_size
+
+ def append(self, info):
+ cur_len = info.max_len
+ max_len = max(self.max_len, cur_len)
+ if max_len * (len(self.batch) + 1) > self._batch_size:
+ result = self.batch
+ self.batch = [info]
+ self.max_len = cur_len
+ return result
+ else:
+ self.max_len = max_len
+ self.batch.append(info)
+
+
+class SampleInfo(object):
+ def __init__(self, i, lens):
+        self.i = i
+        self.lens = lens
+        self.min_len = min(lens)
+        self.max_len = lens[0]  # to be consistent with the original reader
+
+ def get_ranges(self, min_length=None, max_length=None, truncate=False):
+ ranges = []
+ # source
+ if (min_length is None or self.lens[0] >= min_length) and (
+ max_length is None or self.lens[0] <= max_length or truncate):
+ end = max_length if truncate and max_length else self.lens[0]
+ ranges.append([0, end])
+ # target
+ if len(self.lens) == 2:
+ if (min_length is None or self.lens[1] >= min_length) and (
+ max_length is None or self.lens[1] <= max_length + 2 or
+ truncate):
+ end = max_length + 2 if truncate and max_length else self.lens[
+ 1]
+ ranges.append([0, end])
+ return ranges if len(ranges) == len(self.lens) else None
+
+
+class MinMaxFilter(object):
+ def __init__(self, max_len, min_len, underlying_creator):
+ self._min_len = min_len
+ self._max_len = max_len
+ self._creator = underlying_creator
+
+ def append(self, info):
+ if (self._min_len is None or info.min_len >= self._min_len) and (
+ self._max_len is None or info.max_len <= self._max_len):
+ return self._creator.append(info)
+
+ @property
+ def batch(self):
+ return self._creator.batch
+
+
+class Seq2SeqDataset(Dataset):
+ def __init__(self,
+ src_vocab_fpath,
+ trg_vocab_fpath,
+ fpattern,
+ field_delimiter="\t",
+ token_delimiter=" ",
+ start_mark="",
+ end_mark="",
+ unk_mark="",
+ trg_fpattern=None,
+ trg_add_bos_eos=False,
+ byte_data=False,
+ min_length=None,
+ max_length=None,
+ truncate=False):
+ if byte_data:
+            # The WMT16 BPE data used here seems to include bytes that cannot
+            # be decoded by utf8. Thus convert str to bytes, and use byte data
+ field_delimiter = field_delimiter.encode("utf8")
+ token_delimiter = token_delimiter.encode("utf8")
+ start_mark = start_mark.encode("utf8")
+ end_mark = end_mark.encode("utf8")
+ unk_mark = unk_mark.encode("utf8")
+ self._byte_data = byte_data
+ self._src_vocab = self.load_dict(src_vocab_fpath, byte_data=byte_data)
+ self._trg_vocab = self.load_dict(trg_vocab_fpath, byte_data=byte_data)
+ self._bos_idx = self._src_vocab[start_mark]
+ self._eos_idx = self._src_vocab[end_mark]
+ self._unk_idx = self._src_vocab[unk_mark]
+ self._field_delimiter = field_delimiter
+ self._token_delimiter = token_delimiter
+ self._min_length = min_length
+ self._max_length = max_length
+ self._truncate = truncate
+ self._trg_add_bos_eos = trg_add_bos_eos
+ self.load_src_trg_ids(fpattern, trg_fpattern)
+
+ def load_src_trg_ids(self, fpattern, trg_fpattern=None):
+ src_converter = Converter(
+ vocab=self._src_vocab,
+ beg=self._bos_idx,
+ end=self._eos_idx,
+ unk=self._unk_idx,
+ delimiter=self._token_delimiter,
+ add_beg=False,
+ add_end=False)
+
+ trg_converter = Converter(
+ vocab=self._trg_vocab,
+ beg=self._bos_idx,
+ end=self._eos_idx,
+ unk=self._unk_idx,
+ delimiter=self._token_delimiter,
+ add_beg=True if self._trg_add_bos_eos else False,
+ add_end=True if self._trg_add_bos_eos else False)
+
+ converters = ComposedConverter([src_converter, trg_converter])
+
+ self._src_seq_ids = []
+ self._trg_seq_ids = []
+ self._sample_infos = []
+
+ slots = [self._src_seq_ids, self._trg_seq_ids]
+ for i, line in enumerate(self._load_lines(fpattern, trg_fpattern)):
+ fields = converters(line)
+ lens = [len(field) for field in fields]
+ sample = SampleInfo(i, lens)
+ field_ranges = sample.get_ranges(self._min_length,
+ self._max_length, self._truncate)
+ if field_ranges:
+ for field, field_range, slot in zip(fields, field_ranges,
+ slots):
+ slot.append(field[field_range[0]:field_range[1]])
+ self._sample_infos.append(sample)
+
+ def _load_lines(self, fpattern, trg_fpattern=None):
+ fpaths = glob.glob(fpattern)
+        fpaths = sorted(fpaths)  # TODO: Add custom sort
+ assert len(fpaths) > 0, "no matching file to the provided data path"
+
+ (f_mode, f_encoding,
+ endl) = ("rb", None, b"\n") if self._byte_data else ("r", "utf8",
+ "\n")
+ if trg_fpattern is None:
+ for fpath in fpaths:
+ with io.open(fpath, f_mode, encoding=f_encoding) as f:
+ for line in f:
+ fields = line.strip(endl).split(self._field_delimiter)
+ yield fields
+ else:
+ # separated source and target language data files
+ # assume we can get aligned data by sort the two language files
+ # TODO: Need more rigorous check
+ trg_fpaths = glob.glob(trg_fpattern)
+ trg_fpaths = sorted(trg_fpaths)
+            assert len(fpaths) == len(
+                trg_fpaths
+            ), "the number of source language data files must match \
+                the number of target language data files"
+
+ for fpath, trg_fpath in zip(fpaths, trg_fpaths):
+ with io.open(fpath, f_mode, encoding=f_encoding) as f:
+ with io.open(
+ trg_fpath, f_mode, encoding=f_encoding) as trg_f:
+ for line in zip(f, trg_f):
+ fields = [field.strip(endl) for field in line]
+ yield fields
+
+ @staticmethod
+ def load_dict(dict_path, reverse=False, byte_data=False):
+ word_dict = {}
+ (f_mode, f_encoding,
+ endl) = ("rb", None, b"\n") if byte_data else ("r", "utf8", "\n")
+ with io.open(dict_path, f_mode, encoding=f_encoding) as fdict:
+ for idx, line in enumerate(fdict):
+ if reverse:
+ word_dict[idx] = line.strip(endl)
+ else:
+ word_dict[line.strip(endl)] = idx
+ return word_dict
+
+ def get_vocab_summary(self):
+ return len(self._src_vocab), len(
+ self._trg_vocab), self._bos_idx, self._eos_idx, self._unk_idx
+
+ def __getitem__(self, idx):
+ return (self._src_seq_ids[idx], self._trg_seq_ids[idx]
+ ) if self._trg_seq_ids else self._src_seq_ids[idx]
+
+ def __len__(self):
+ return len(self._sample_infos)
+
+
+class Seq2SeqBatchSampler(BatchSampler):
+ def __init__(self,
+ dataset,
+ batch_size,
+ pool_size=10000,
+ sort_type=SortType.NONE,
+ min_length=None,
+ max_length=None,
+ shuffle=False,
+ shuffle_batch=False,
+ use_token_batch=False,
+ clip_last_batch=False,
+ distribute_mode=True,
+ seed=0):
+ for arg, value in locals().items():
+ if arg != "self":
+ setattr(self, "_" + arg, value)
+ self._random = np.random
+ self._random.seed(seed)
+ # for multi-devices
+ self._distribute_mode = distribute_mode
+ self._nranks = ParallelEnv().nranks
+ self._local_rank = ParallelEnv().local_rank
+ self._device_id = ParallelEnv().dev_id
+
+ def __iter__(self):
+ # global sort or global shuffle
+ if self._sort_type == SortType.GLOBAL:
+ infos = sorted(
+ self._dataset._sample_infos, key=lambda x: x.max_len)
+ else:
+ if self._shuffle:
+ infos = self._dataset._sample_infos
+ self._random.shuffle(infos)
+ else:
+ infos = self._dataset._sample_infos
+
+        if self._sort_type == SortType.POOL:
+            reverse = True
+            for i in range(0, len(infos), self._pool_size):
+                # to avoid placing short next to long sentences
+                reverse = not reverse
+                infos[i:i + self._pool_size] = sorted(
+                    infos[i:i + self._pool_size],
+                    key=lambda x: x.max_len,
+                    reverse=reverse)
+
+ batches = []
+        if self._use_token_batch:
+            batch_creator = TokenBatchCreator(self._batch_size)
+        else:
+            batch_creator = SentenceBatchCreator(self._batch_size *
+                                                 self._nranks)
+ batch_creator = MinMaxFilter(self._max_length, self._min_length,
+ batch_creator)
+
+ for info in infos:
+ batch = batch_creator.append(info)
+ if batch is not None:
+ batches.append(batch)
+
+ if not self._clip_last_batch and len(batch_creator.batch) != 0:
+ batches.append(batch_creator.batch)
+
+ if self._shuffle_batch:
+ self._random.shuffle(batches)
+
+        if not self._use_token_batch:
+            # when producing batches according to sequence number, to ensure
+            # that neighboring batches fed and run in parallel have similar
+            # lengths (and thus similar computational cost) after shuffling,
+            # we take them as a whole when shuffling and split them here
+ batches = [[
+ batch[self._batch_size * i:self._batch_size * (i + 1)]
+ for i in range(self._nranks)
+ ] for batch in batches]
+ batches = list(itertools.chain.from_iterable(batches))
+
+ # for multi-device
+ for batch_id, batch in enumerate(batches):
+ if not self._distribute_mode or (
+ batch_id % self._nranks == self._local_rank):
+ batch_indices = [info.i for info in batch]
+ yield batch_indices
+ if self._distribute_mode and len(batches) % self._nranks != 0:
+ if self._local_rank >= len(batches) % self._nranks:
+ # use previous data to pad
+ yield batch_indices
+
+ def __len__(self):
+ if not self._use_token_batch:
+ batch_number = (
+ len(self._dataset) + self._batch_size * self._nranks - 1) // (
+ self._batch_size * self._nranks)
+ else:
+ # TODO(guosheng): fix the uncertain length
+ batch_number = 1
+ return batch_number
diff --git a/examples/seq2seq/seq2seq_attn.py b/examples/seq2seq/seq2seq_attn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce9cc089ca2133549fbdd08ed600e69d4235e08c
--- /dev/null
+++ b/examples/seq2seq/seq2seq_attn.py
@@ -0,0 +1,241 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+from paddle.fluid import ParamAttr
+from paddle.fluid.initializer import UniformInitializer
+from paddle.fluid.dygraph import Embedding, Linear, Layer
+from paddle.fluid.layers import BeamSearchDecoder
+
+from hapi.model import Model, Loss
+from hapi.text import DynamicDecode, RNN, BasicLSTMCell, RNNCell
+
+from seq2seq_base import Encoder
+
+
+class AttentionLayer(Layer):
+ def __init__(self, hidden_size, bias=False, init_scale=0.1):
+ super(AttentionLayer, self).__init__()
+ self.input_proj = Linear(
+ hidden_size,
+ hidden_size,
+ param_attr=ParamAttr(initializer=UniformInitializer(
+ low=-init_scale, high=init_scale)),
+ bias_attr=bias)
+ self.output_proj = Linear(
+ hidden_size + hidden_size,
+ hidden_size,
+ param_attr=ParamAttr(initializer=UniformInitializer(
+ low=-init_scale, high=init_scale)),
+ bias_attr=bias)
+
+ def forward(self, hidden, encoder_output, encoder_padding_mask):
+ encoder_output = self.input_proj(encoder_output)
+ attn_scores = layers.matmul(
+ layers.unsqueeze(hidden, [1]), encoder_output, transpose_y=True)
+ if encoder_padding_mask is not None:
+ attn_scores = layers.elementwise_add(attn_scores,
+ encoder_padding_mask)
+ attn_scores = layers.softmax(attn_scores)
+ attn_out = layers.squeeze(
+ layers.matmul(attn_scores, encoder_output), [1])
+ attn_out = layers.concat([attn_out, hidden], 1)
+ attn_out = self.output_proj(attn_out)
+ return attn_out
+
+
+class DecoderCell(RNNCell):
+ def __init__(self,
+ num_layers,
+ input_size,
+ hidden_size,
+ dropout_prob=0.,
+ init_scale=0.1):
+ super(DecoderCell, self).__init__()
+ self.dropout_prob = dropout_prob
+ # use add_sublayer to add multi-layers
+ self.lstm_cells = []
+ for i in range(num_layers):
+ self.lstm_cells.append(
+ self.add_sublayer(
+ "lstm_%d" % i,
+ BasicLSTMCell(
+ input_size=input_size + hidden_size
+ if i == 0 else hidden_size,
+ hidden_size=hidden_size,
+ param_attr=ParamAttr(initializer=UniformInitializer(
+ low=-init_scale, high=init_scale)))))
+ self.attention_layer = AttentionLayer(hidden_size)
+
+ def forward(self,
+ step_input,
+ states,
+ encoder_output,
+ encoder_padding_mask=None):
+ lstm_states, input_feed = states
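+        # input_feed is the attentional output of the previous step
+        # (the "input feeding" approach)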
+ new_lstm_states = []
+ step_input = layers.concat([step_input, input_feed], 1)
+ for i, lstm_cell in enumerate(self.lstm_cells):
+ out, new_lstm_state = lstm_cell(step_input, lstm_states[i])
+ step_input = layers.dropout(
+ out,
+ self.dropout_prob,
+ dropout_implementation='upscale_in_train'
+ ) if self.dropout_prob > 0 else out
+ new_lstm_states.append(new_lstm_state)
+ out = self.attention_layer(step_input, encoder_output,
+ encoder_padding_mask)
+ return out, [new_lstm_states, out]
+
+
+class Decoder(Layer):
+ def __init__(self,
+ vocab_size,
+ embed_dim,
+ hidden_size,
+ num_layers,
+ dropout_prob=0.,
+ init_scale=0.1):
+ super(Decoder, self).__init__()
+ self.embedder = Embedding(
+ size=[vocab_size, embed_dim],
+ param_attr=ParamAttr(initializer=UniformInitializer(
+ low=-init_scale, high=init_scale)))
+ self.lstm_attention = RNN(DecoderCell(
+ num_layers, embed_dim, hidden_size, dropout_prob, init_scale),
+ is_reverse=False,
+ time_major=False)
+ self.output_layer = Linear(
+ hidden_size,
+ vocab_size,
+ param_attr=ParamAttr(initializer=UniformInitializer(
+ low=-init_scale, high=init_scale)),
+ bias_attr=False)
+
+ def forward(self, target, decoder_initial_states, encoder_output,
+ encoder_padding_mask):
+ inputs = self.embedder(target)
+ decoder_output, _ = self.lstm_attention(
+ inputs,
+ initial_states=decoder_initial_states,
+ encoder_output=encoder_output,
+ encoder_padding_mask=encoder_padding_mask)
+ predict = self.output_layer(decoder_output)
+ return predict
+
+
+class AttentionModel(Model):
+ def __init__(self,
+ src_vocab_size,
+ trg_vocab_size,
+ embed_dim,
+ hidden_size,
+ num_layers,
+ dropout_prob=0.,
+ init_scale=0.1):
+ super(AttentionModel, self).__init__()
+ self.hidden_size = hidden_size
+ self.encoder = Encoder(src_vocab_size, embed_dim, hidden_size,
+ num_layers, dropout_prob, init_scale)
+ self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size,
+ num_layers, dropout_prob, init_scale)
+
+ def forward(self, src, src_length, trg):
+ # encoder
+ encoder_output, encoder_final_state = self.encoder(src, src_length)
+
+ # decoder initial states: use input_feed and the structure is
+ # [[h,c] * num_layers, input_feed], consistent with DecoderCell.states
+ decoder_initial_states = [
+ encoder_final_state,
+ self.decoder.lstm_attention.cell.get_initial_states(
+ batch_ref=encoder_output, shape=[self.hidden_size])
+ ]
+        # attention mask to avoid attending to paddings
+ src_mask = layers.sequence_mask(
+ src_length,
+ maxlen=layers.shape(src)[1],
+ dtype=encoder_output.dtype)
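+        # padding positions get a large negative bias so that softmax
+        # assigns them near-zero attention weights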
+ encoder_padding_mask = (src_mask - 1.0) * 1e9
+ encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
+
+        # decoder with attention
+ predict = self.decoder(trg, decoder_initial_states, encoder_output,
+ encoder_padding_mask)
+ return predict
+
+
+class AttentionInferModel(AttentionModel):
+ def __init__(self,
+ src_vocab_size,
+ trg_vocab_size,
+ embed_dim,
+ hidden_size,
+ num_layers,
+ dropout_prob=0.,
+ bos_id=0,
+ eos_id=1,
+ beam_size=4,
+ max_out_len=256):
+ args = dict(locals())
+ args.pop("self")
+ args.pop("__class__", None) # py3
+ self.bos_id = args.pop("bos_id")
+ self.eos_id = args.pop("eos_id")
+ self.beam_size = args.pop("beam_size")
+ self.max_out_len = args.pop("max_out_len")
+ super(AttentionInferModel, self).__init__(**args)
+ # dynamic decoder for inference
+ decoder = BeamSearchDecoder(
+ self.decoder.lstm_attention.cell,
+ start_token=bos_id,
+ end_token=eos_id,
+ beam_size=beam_size,
+ embedding_fn=self.decoder.embedder,
+ output_fn=self.decoder.output_layer)
+ self.beam_search_decoder = DynamicDecode(
+ decoder, max_step_num=max_out_len, is_test=True)
+
+ def forward(self, src, src_length):
+ # encoding
+ encoder_output, encoder_final_state = self.encoder(src, src_length)
+
+ # decoder initial states
+ decoder_initial_states = [
+ encoder_final_state,
+ self.decoder.lstm_attention.cell.get_initial_states(
+ batch_ref=encoder_output, shape=[self.hidden_size])
+ ]
+        # attention mask to avoid attending to paddings
+ src_mask = layers.sequence_mask(
+ src_length,
+ maxlen=layers.shape(src)[1],
+ dtype=encoder_output.dtype)
+ encoder_padding_mask = (src_mask - 1.0) * 1e9
+ encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
+
+ # Tile the batch dimension with beam_size
+ encoder_output = BeamSearchDecoder.tile_beam_merge_with_batch(
+ encoder_output, self.beam_size)
+ encoder_padding_mask = BeamSearchDecoder.tile_beam_merge_with_batch(
+ encoder_padding_mask, self.beam_size)
+
+ # dynamic decoding with beam search
+ rs, _ = self.beam_search_decoder(
+ inits=decoder_initial_states,
+ encoder_output=encoder_output,
+ encoder_padding_mask=encoder_padding_mask)
+ return rs
diff --git a/examples/seq2seq/seq2seq_base.py b/examples/seq2seq/seq2seq_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..c28e2dc52935526d69d78ae73bfd48c92528b93c
--- /dev/null
+++ b/examples/seq2seq/seq2seq_base.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+from paddle.fluid import ParamAttr
+from paddle.fluid.initializer import UniformInitializer
+from paddle.fluid.dygraph import Embedding, Linear, Layer
+from paddle.fluid.layers import BeamSearchDecoder
+
+from hapi.model import Model, Loss
+from hapi.text import DynamicDecode, RNN, BasicLSTMCell, RNNCell
+
+
+class CrossEntropyCriterion(Loss):
+ def __init__(self):
+ super(CrossEntropyCriterion, self).__init__()
+
+ def forward(self, outputs, labels):
+ predict, (trg_length, label) = outputs[0], labels
+ # for target padding mask
+ mask = layers.sequence_mask(
+ trg_length, maxlen=layers.shape(predict)[1], dtype=predict.dtype)
+
+ cost = layers.softmax_with_cross_entropy(
+ logits=predict, label=label, soft_label=False)
+ masked_cost = layers.elementwise_mul(cost, mask, axis=0)
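+        # mean over the batch dimension, then sum over time steps, giving
+        # the batch-averaged per-sequence loss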
+ batch_mean_cost = layers.reduce_mean(masked_cost, dim=[0])
+ seq_cost = layers.reduce_sum(batch_mean_cost)
+ return seq_cost
+
+
+class EncoderCell(RNNCell):
+ def __init__(self,
+ num_layers,
+ input_size,
+ hidden_size,
+ dropout_prob=0.,
+ init_scale=0.1):
+ super(EncoderCell, self).__init__()
+ self.dropout_prob = dropout_prob
+ # use add_sublayer to add multi-layers
+ self.lstm_cells = []
+ for i in range(num_layers):
+ self.lstm_cells.append(
+ self.add_sublayer(
+ "lstm_%d" % i,
+ BasicLSTMCell(
+ input_size=input_size if i == 0 else hidden_size,
+ hidden_size=hidden_size,
+ param_attr=ParamAttr(initializer=UniformInitializer(
+ low=-init_scale, high=init_scale)))))
+
+ def forward(self, step_input, states):
+ new_states = []
+ for i, lstm_cell in enumerate(self.lstm_cells):
+ out, new_state = lstm_cell(step_input, states[i])
+ step_input = layers.dropout(
+ out,
+ self.dropout_prob,
+ dropout_implementation='upscale_in_train'
+ ) if self.dropout_prob > 0 else out
+ new_states.append(new_state)
+ return step_input, new_states
+
+ @property
+ def state_shape(self):
+ return [cell.state_shape for cell in self.lstm_cells]
+
+
+class Encoder(Layer):
+ def __init__(self,
+ vocab_size,
+ embed_dim,
+ hidden_size,
+ num_layers,
+ dropout_prob=0.,
+ init_scale=0.1):
+ super(Encoder, self).__init__()
+ self.embedder = Embedding(
+ size=[vocab_size, embed_dim],
+ param_attr=ParamAttr(initializer=UniformInitializer(
+ low=-init_scale, high=init_scale)))
+ self.stack_lstm = RNN(EncoderCell(num_layers, embed_dim, hidden_size,
+ dropout_prob, init_scale),
+ is_reverse=False,
+ time_major=False)
+
+ def forward(self, sequence, sequence_length):
+ inputs = self.embedder(sequence)
+ encoder_output, encoder_state = self.stack_lstm(
+ inputs, sequence_length=sequence_length)
+ return encoder_output, encoder_state
+
+
+DecoderCell = EncoderCell
+
+
+class Decoder(Layer):
+ def __init__(self,
+ vocab_size,
+ embed_dim,
+ hidden_size,
+ num_layers,
+ dropout_prob=0.,
+ init_scale=0.1):
+ super(Decoder, self).__init__()
+ self.embedder = Embedding(
+ size=[vocab_size, embed_dim],
+ param_attr=ParamAttr(initializer=UniformInitializer(
+ low=-init_scale, high=init_scale)))
+ self.stack_lstm = RNN(DecoderCell(num_layers, embed_dim, hidden_size,
+ dropout_prob, init_scale),
+ is_reverse=False,
+ time_major=False)
+ self.output_layer = Linear(
+ hidden_size,
+ vocab_size,
+ param_attr=ParamAttr(initializer=UniformInitializer(
+ low=-init_scale, high=init_scale)),
+ bias_attr=False)
+
+ def forward(self, target, decoder_initial_states):
+ inputs = self.embedder(target)
+ decoder_output, _ = self.stack_lstm(
+ inputs, initial_states=decoder_initial_states)
+ predict = self.output_layer(decoder_output)
+ return predict
+
+
+class BaseModel(Model):
+ def __init__(self,
+ src_vocab_size,
+ trg_vocab_size,
+ embed_dim,
+ hidden_size,
+ num_layers,
+ dropout_prob=0.,
+ init_scale=0.1):
+ super(BaseModel, self).__init__()
+ self.hidden_size = hidden_size
+ self.encoder = Encoder(src_vocab_size, embed_dim, hidden_size,
+ num_layers, dropout_prob, init_scale)
+ self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size,
+ num_layers, dropout_prob, init_scale)
+
+ def forward(self, src, src_length, trg):
+ # encoder
+ encoder_output, encoder_final_states = self.encoder(src, src_length)
+
+ # decoder
+ predict = self.decoder(trg, encoder_final_states)
+ return predict
+
+
+class BaseInferModel(BaseModel):
+ def __init__(self,
+ src_vocab_size,
+ trg_vocab_size,
+ embed_dim,
+ hidden_size,
+ num_layers,
+ dropout_prob=0.,
+ bos_id=0,
+ eos_id=1,
+ beam_size=4,
+ max_out_len=256):
+ args = dict(locals())
+ args.pop("self")
+ args.pop("__class__", None) # py3
+ self.bos_id = args.pop("bos_id")
+ self.eos_id = args.pop("eos_id")
+ self.beam_size = args.pop("beam_size")
+ self.max_out_len = args.pop("max_out_len")
+ super(BaseInferModel, self).__init__(**args)
+ # dynamic decoder for inference
+ decoder = BeamSearchDecoder(
+ self.decoder.stack_lstm.cell,
+ start_token=bos_id,
+ end_token=eos_id,
+ beam_size=beam_size,
+ embedding_fn=self.decoder.embedder,
+ output_fn=self.decoder.output_layer)
+ self.beam_search_decoder = DynamicDecode(
+ decoder, max_step_num=max_out_len, is_test=True)
+
+ def forward(self, src, src_length):
+ # encoding
+ encoder_output, encoder_final_states = self.encoder(src, src_length)
+ # dynamic decoding with beam search
+ rs, _ = self.beam_search_decoder(inits=encoder_final_states)
+ return rs
diff --git a/examples/seq2seq/train.py b/examples/seq2seq/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7dc7698e31b1b5b935a63de66ee632956d3b102
--- /dev/null
+++ b/examples/seq2seq/train.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import random
+from functools import partial
+
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.io import DataLoader
+
+from hapi.model import Input, set_device
+from args import parse_args
+from seq2seq_base import BaseModel, CrossEntropyCriterion
+from seq2seq_attn import AttentionModel
+from reader import create_data_loader
+from utility import PPL, TrainCallback
+
+
+def do_train(args):
+ device = set_device("gpu" if args.use_gpu else "cpu")
+    if args.eager_run:
+        fluid.enable_dygraph(device)
+
+ if args.enable_ce:
+ fluid.default_main_program().random_seed = 102
+ fluid.default_startup_program().random_seed = 102
+
+ # define model
+ inputs = [
+ Input(
+ [None, None], "int64", name="src_word"),
+ Input(
+ [None], "int64", name="src_length"),
+ Input(
+ [None, None], "int64", name="trg_word"),
+ ]
+ labels = [
+ Input(
+ [None], "int64", name="trg_length"),
+ Input(
+ [None, None, 1], "int64", name="label"),
+ ]
+
+ # def dataloader
+ train_loader, eval_loader = create_data_loader(args, device)
+
+ model_maker = AttentionModel if args.attention else BaseModel
+ model = model_maker(args.src_vocab_size, args.tar_vocab_size,
+ args.hidden_size, args.hidden_size, args.num_layers,
+ args.dropout)
+ grad_clip = fluid.clip.GradientClipByGlobalNorm(
+ clip_norm=args.max_grad_norm)
+ optimizer = fluid.optimizer.Adam(
+ learning_rate=args.learning_rate,
+ parameter_list=model.parameters(),
+ grad_clip=grad_clip)
+
+ ppl_metric = PPL(reset_freq=100) # ppl for every 100 batches
+ model.prepare(
+ optimizer,
+ CrossEntropyCriterion(),
+ ppl_metric,
+ inputs=inputs,
+ labels=labels)
+ model.fit(train_data=train_loader,
+ eval_data=eval_loader,
+ epochs=args.max_epoch,
+ eval_freq=1,
+ save_freq=1,
+ save_dir=args.model_path,
+ callbacks=[TrainCallback(ppl_metric, args.log_freq)])
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ do_train(args)
diff --git a/examples/seq2seq/utility.py b/examples/seq2seq/utility.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa0dd4a461d24d8a7799ff47c9d63a65bf87d401
--- /dev/null
+++ b/examples/seq2seq/utility.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle.fluid as fluid
+
+from hapi.metrics import Metric
+from hapi.callbacks import ProgBarLogger
+
+
+class TrainCallback(ProgBarLogger):
+ def __init__(self, ppl, log_freq, verbose=2):
+ super(TrainCallback, self).__init__(log_freq, verbose)
+ self.ppl = ppl
+
+ def on_train_begin(self, logs=None):
+ super(TrainCallback, self).on_train_begin(logs)
+ self.train_metrics = ["ppl"] # remove loss to not print it
+
+ def on_epoch_begin(self, epoch=None, logs=None):
+ super(TrainCallback, self).on_epoch_begin(epoch, logs)
+ self.ppl.reset()
+
+ def on_train_batch_end(self, step, logs=None):
+ logs["ppl"] = self.ppl.cal_acc_ppl(logs["loss"][0], logs["batch_size"])
+ if step > 0 and step % self.ppl.reset_freq == 0:
+ self.ppl.reset()
+ super(TrainCallback, self).on_train_batch_end(step, logs)
+
+ def on_eval_begin(self, logs=None):
+ super(TrainCallback, self).on_eval_begin(logs)
+ self.eval_metrics = ["ppl"]
+ self.ppl.reset()
+
+ def on_eval_batch_end(self, step, logs=None):
+ logs["ppl"] = self.ppl.cal_acc_ppl(logs["loss"][0], logs["batch_size"])
+ super(TrainCallback, self).on_eval_batch_end(step, logs)
+
+
+class PPL(Metric):
+ def __init__(self, reset_freq=100, name=None):
+ super(PPL, self).__init__()
+ self._name = name or "ppl"
+ self.reset_freq = reset_freq
+ self.reset()
+
+ def add_metric_op(self, pred, seq_length, label):
+ word_num = fluid.layers.reduce_sum(seq_length)
+ return word_num
+
+ def update(self, word_num):
+ self.word_count += word_num
+ return word_num
+
+ def reset(self):
+ self.total_loss = 0
+ self.word_count = 0
+
+ def accumulate(self):
+ return self.word_count
+
+ def name(self):
+ return self._name
+
+ def cal_acc_ppl(self, batch_loss, batch_size):
+ self.total_loss += batch_loss * batch_size
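+        # perplexity over everything accumulated since the last reset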
+ ppl = math.exp(self.total_loss / self.word_count)
+ return ppl
\ No newline at end of file
diff --git a/transformer/README.md b/examples/transformer/README.md
similarity index 99%
rename from transformer/README.md
rename to examples/transformer/README.md
index 2c4c22b91788a091fc9c08e303e5bcae7d80a4de..0c785de8a262105a53386c2a6f417e1d499fba34 100644
--- a/transformer/README.md
+++ b/examples/transformer/README.md
@@ -201,7 +201,7 @@ python -u predict.py \
    --special_token '<s>' '<e>' '<unk>' \
--predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \
--batch_size 32 \
- --init_from_params base_model_dygraph/step_100000/transformer \
+ --init_from_params big_model_dygraph/step_100000/transformer \
--beam_size 5 \
--max_out_len 255 \
--output_file predict.txt \
diff --git a/transformer/gen_data.sh b/examples/transformer/gen_data.sh
similarity index 100%
rename from transformer/gen_data.sh
rename to examples/transformer/gen_data.sh
diff --git a/transformer/images/multi_head_attention.png b/examples/transformer/images/multi_head_attention.png
similarity index 100%
rename from transformer/images/multi_head_attention.png
rename to examples/transformer/images/multi_head_attention.png
diff --git a/transformer/images/transformer_network.png b/examples/transformer/images/transformer_network.png
similarity index 100%
rename from transformer/images/transformer_network.png
rename to examples/transformer/images/transformer_network.png
diff --git a/transformer/predict.py b/examples/transformer/predict.py
similarity index 94%
rename from transformer/predict.py
rename to examples/transformer/predict.py
index b83d5403486c1e661a939663bad154735b29b37e..a6e14314f523d78dee2f770e69a21ae808cd8ad1 100644
--- a/transformer/predict.py
+++ b/examples/transformer/predict.py
@@ -14,9 +14,6 @@
import logging
import os
-import six
-import sys
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from functools import partial
import numpy as np
@@ -28,9 +25,9 @@ from paddle.fluid.layers.utils import flatten
from utils.configure import PDConfig
from utils.check import check_gpu, check_version
-from model import Input, set_device
+from hapi.model import Input, set_device
from reader import prepare_infer_input, Seq2SeqDataset, Seq2SeqBatchSampler
-from transformer import InferTransformer, position_encoding_init
+from transformer import InferTransformer
def post_process_seq(seq, bos_idx, eos_idx, output_bos=False,
@@ -132,7 +129,7 @@ def do_predict(args):
# TODO: use model.predict when support variant length
f = open(args.output_file, "wb")
for data in data_loader():
- finished_seq = transformer.test(inputs=flatten(data))[0]
+ finished_seq = transformer.test_batch(inputs=flatten(data))[0]
finished_seq = np.transpose(finished_seq, [0, 2, 1])
for ins in finished_seq:
for beam_idx, beam in enumerate(ins):
diff --git a/transformer/reader.py b/examples/transformer/reader.py
similarity index 97%
rename from transformer/reader.py
rename to examples/transformer/reader.py
index c0d02dcfb5b526ff8407f9320f31836d42ae5e4b..f6891df960b66fb9b48bb65d36af46f4ec601fc9 100644
--- a/transformer/reader.py
+++ b/examples/transformer/reader.py
@@ -13,7 +13,7 @@
# limitations under the License.
import glob
-import six
+import sys
import os
import io
import itertools
@@ -26,7 +26,7 @@ from paddle.io import BatchSampler, DataLoader, Dataset
def create_data_loader(args, device):
- data_loaders = [None, None]
+ data_loaders = [(None, None)] * 2
data_files = [args.training_file, args.validation_file
] if args.validation_file else [args.training_file]
for i, data_file in enumerate(data_files):
@@ -65,7 +65,7 @@ def create_data_loader(args, device):
n_head=args.n_head),
num_workers=0, # TODO: use multi-process
return_list=True)
- data_loaders[i] = data_loader
+ data_loaders[i] = (data_loader, batch_sampler.__len__)
return data_loaders
@@ -289,7 +289,6 @@ class Seq2SeqDataset(Dataset):
start_mark="",
end_mark="",
unk_mark="",
- only_src=False,
trg_fpattern=None,
byte_data=False):
if byte_data:
@@ -477,6 +476,7 @@ class Seq2SeqBatchSampler(BatchSampler):
for i in range(self._nranks)
] for batch in batches]
batches = list(itertools.chain.from_iterable(batches))
+ self.batch_number = (len(batches) + self._nranks - 1) // self._nranks
# for multi-device
for batch_id, batch in enumerate(batches):
@@ -490,11 +490,13 @@ class Seq2SeqBatchSampler(BatchSampler):
yield batch_indices
def __len__(self):
+        if hasattr(self, "batch_number"):
+ return self.batch_number
if not self._use_token_batch:
batch_number = (
len(self._dataset) + self._batch_size * self._nranks - 1) // (
self._batch_size * self._nranks)
else:
- # TODO(guosheng): fix the uncertain length
- batch_number = 1
+ # for uncertain batch number, the actual value is self.batch_number
+ batch_number = sys.maxsize
return batch_number
diff --git a/transformer/train.py b/examples/transformer/train.py
similarity index 83%
rename from transformer/train.py
rename to examples/transformer/train.py
index 04a61f83a0191a944d9b2611b3bca61f0bcf2a0a..94b52b4423839a0d7e01f0243cbb3d0f5907a4b0 100644
--- a/transformer/train.py
+++ b/examples/transformer/train.py
@@ -14,9 +14,6 @@
import logging
import os
-import six
-import sys
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import numpy as np
import paddle
@@ -26,14 +23,18 @@ from paddle.io import DataLoader
from utils.configure import PDConfig
from utils.check import check_gpu, check_version
-from model import Input, set_device
-from callbacks import ProgBarLogger
+from hapi.model import Input, set_device
+from hapi.callbacks import ProgBarLogger
from reader import create_data_loader
from transformer import Transformer, CrossEntropyCriterion
class TrainCallback(ProgBarLogger):
- def __init__(self, args, verbose=2):
+ def __init__(self,
+ args,
+ verbose=2,
+ train_steps_fn=None,
+ eval_steps_fn=None):
# TODO(guosheng): save according to step
super(TrainCallback, self).__init__(args.print_step, verbose)
# the best cross-entropy value with label smoothing
@@ -42,11 +43,17 @@ class TrainCallback(ProgBarLogger):
(1. - args.label_smooth_eps)) + args.label_smooth_eps *
np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
self.loss_normalizer = loss_normalizer
+ self.train_steps_fn = train_steps_fn
+ self.eval_steps_fn = eval_steps_fn
def on_train_begin(self, logs=None):
super(TrainCallback, self).on_train_begin(logs)
self.train_metrics += ["normalized loss", "ppl"]
+ def on_train_batch_begin(self, step, logs=None):
+ if step == 0 and self.train_steps_fn:
+ self.train_progbar._num = self.train_steps_fn()
+
def on_train_batch_end(self, step, logs=None):
logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
logs["ppl"] = np.exp(min(logs["loss"][0], 100))
@@ -57,6 +64,10 @@ class TrainCallback(ProgBarLogger):
self.eval_metrics = list(
self.eval_metrics) + ["normalized loss", "ppl"]
+ def on_eval_batch_begin(self, step, logs=None):
+ if step == 0 and self.eval_steps_fn:
+ self.eval_progbar._num = self.eval_steps_fn()
+
def on_eval_batch_end(self, step, logs=None):
logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
logs["ppl"] = np.exp(min(logs["loss"][0], 100))
@@ -104,7 +115,8 @@ def do_train(args):
]
# def dataloader
- train_loader, eval_loader = create_data_loader(args, device)
+ (train_loader, train_steps_fn), (
+ eval_loader, eval_steps_fn) = create_data_loader(args, device)
# define model
transformer = Transformer(
@@ -142,7 +154,12 @@ def do_train(args):
eval_freq=1,
save_freq=1,
save_dir=args.save_model,
- callbacks=[TrainCallback(args)])
+ callbacks=[
+ TrainCallback(
+ args,
+ train_steps_fn=train_steps_fn,
+ eval_steps_fn=eval_steps_fn)
+ ])
if __name__ == "__main__":
diff --git a/transformer/transformer.py b/examples/transformer/transformer.py
similarity index 99%
rename from transformer/transformer.py
rename to examples/transformer/transformer.py
index 9caf4b04a1a34c5e856a789fbded8a53e917a3da..30bb931d28c3b52467f484f4cb14b5d5601c76d9 100644
--- a/transformer/transformer.py
+++ b/examples/transformer/transformer.py
@@ -20,8 +20,8 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
-from model import Model, CrossEntropy, Loss
-from text import TransformerBeamSearchDecoder, DynamicDecode
+from hapi.model import Model, CrossEntropy, Loss
+from hapi.text import TransformerBeamSearchDecoder, DynamicDecode
def position_encoding_init(n_position, d_pos_vec):
diff --git a/transformer/transformer.yaml b/examples/transformer/transformer.yaml
similarity index 100%
rename from transformer/transformer.yaml
rename to examples/transformer/transformer.yaml
diff --git a/transformer/utils/__init__.py b/examples/transformer/utils/__init__.py
similarity index 100%
rename from transformer/utils/__init__.py
rename to examples/transformer/utils/__init__.py
diff --git a/transformer/utils/check.py b/examples/transformer/utils/check.py
similarity index 100%
rename from transformer/utils/check.py
rename to examples/transformer/utils/check.py
diff --git a/transformer/utils/configure.py b/examples/transformer/utils/configure.py
similarity index 95%
rename from transformer/utils/configure.py
rename to examples/transformer/utils/configure.py
index 67e601282fee572518435eaed38a4ed8e26fc5f9..17dfaa53d8b44a68a2847c4bc1a1934384bb5f82 100644
--- a/transformer/utils/configure.py
+++ b/examples/transformer/utils/configure.py
@@ -195,13 +195,19 @@ class PDConfig(object):
"Whether to perform predicting.")
self.default_g.add_arg("do_eval", bool, False,
"Whether to perform evaluating.")
- self.default_g.add_arg("do_save_inference_model", bool, False,
- "Whether to perform model saving for inference.")
+ self.default_g.add_arg(
+ "do_save_inference_model", bool, False,
+ "Whether to perform model saving for inference.")
# NOTE: args for profiler
- self.default_g.add_arg("is_profiler", int, 0, "the switch of profiler tools. (used for benchmark)")
- self.default_g.add_arg("profiler_path", str, './', "the profiler output file path. (used for benchmark)")
- self.default_g.add_arg("max_iter", int, 0, "the max train batch num.(used for benchmark)")
+ self.default_g.add_arg(
+ "is_profiler", int, 0,
+ "the switch of profiler tools. (used for benchmark)")
+ self.default_g.add_arg(
+ "profiler_path", str, './',
+ "the profiler output file path. (used for benchmark)")
+ self.default_g.add_arg("max_iter", int, 0,
+ "the max train batch num.(used for benchmark)")
self.parser = parser
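These hunks only re-wrap long `add_arg` calls to satisfy the line-length limit; behavior is unchanged. For readers unfamiliar with `PDConfig`, each call registers a typed command-line flag with a default and a help string, much like the plain-`argparse` sketch below (an illustration of the idea, not PDConfig's actual implementation):

```python
# Illustrative argparse analogue of the add_arg calls above. Argument
# groups and PDConfig's YAML merging are simplified away.
import argparse

parser = argparse.ArgumentParser()
g = parser.add_argument_group("default")
g.add_argument("--is_profiler", type=int, default=0,
               help="the switch of profiler tools. (used for benchmark)")
g.add_argument("--profiler_path", type=str, default='./',
               help="the profiler output file path. (used for benchmark)")
g.add_argument("--max_iter", type=int, default=0,
               help="the max train batch num. (used for benchmark)")

args = parser.parse_args(["--max_iter", "100"])
assert args.max_iter == 100
```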
diff --git a/hapi/callbacks.py b/hapi/callbacks.py
index f02eec1ac7b20fe3d5ec771493378b4e74cc3796..62d6402941d0ab0e8af1b3efb3dd77d8ad05604d 100644
--- a/hapi/callbacks.py
+++ b/hapi/callbacks.py
@@ -215,13 +215,13 @@ class ProgBarLogger(Callback):
         if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv(
         ).local_rank == 0:
-            # if steps is not None, last step will update in on_epoch_end
-            if self.steps and self.train_step < self.steps:
+            if self.steps is None or self.train_step < self.steps:
                 self._updates(logs, 'train')

     def on_epoch_end(self, epoch, logs=None):
         logs = logs or {}
-        if self.verbose and ParallelEnv().local_rank == 0:
+        if self.train_step % self.log_freq != 0 and self.verbose and ParallelEnv(
+        ).local_rank == 0:
             self._updates(logs, 'train')

     def on_eval_begin(self, logs=None):
@@ -242,14 +242,14 @@ class ProgBarLogger(Callback):
         if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv(
         ).local_rank == 0:
-            # if steps is not None, last step will update in on_epoch_end
-            if self.eval_steps and self.eval_step < self.eval_steps:
+            if self.eval_steps is None or self.eval_step < self.eval_steps:
                 self._updates(logs, 'eval')

     def on_eval_end(self, logs=None):
         logs = logs or {}
         if self.verbose and ParallelEnv().local_rank == 0:
-            self._updates(logs, 'eval')
+            if self.eval_step % self.log_freq != 0:
+                self._updates(logs, 'eval')
             print('Eval samples: %d' % (self.evaled_samples))
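The two hunks above fix the progress-bar bookkeeping for exactly the case the new `_len_data_loader` helper (later in this patch) introduces: when the total step count is `None`, the old `if self.steps and ...` guard was always falsy, so no per-batch updates were printed at all. Now batch-end updates fire whenever the total is unknown or the step is not final, and the end-of-epoch/eval hooks emit the closing update only if the last step was not already logged. A toy model of the guard logic (standalone, not the hapi class):

```python
# Mirrors the patched conditions; `updates` records when the bar would print.
class ToyLogger(object):
    def __init__(self, steps, log_freq):
        self.steps, self.log_freq = steps, log_freq
        self.train_step, self.updates = 0, []

    def on_train_batch_end(self):
        self.train_step += 1
        if self.train_step % self.log_freq == 0:
            # patched guard: also update when the total is unknown
            if self.steps is None or self.train_step < self.steps:
                self.updates.append(self.train_step)

    def on_epoch_end(self):
        # patched guard: only emit the closing update if the final step
        # was not already logged at batch end
        if self.train_step % self.log_freq != 0:
            self.updates.append(self.train_step)

log = ToyLogger(steps=None, log_freq=3)  # unknown total, the fixed case
for _ in range(10):
    log.on_train_batch_end()
log.on_epoch_end()
assert log.updates == [3, 6, 9, 10]  # progress shown despite unknown total
```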
diff --git a/hapi/model.py b/hapi/model.py
index b9dc4ca441e8c2531df633b946e5c4da30bffa44..cde4ba6040be334f3ba902413ebf6953e2f35140 100644
--- a/hapi/model.py
+++ b/hapi/model.py
@@ -576,14 +576,15 @@ class DynamicGraphAdapter(object):
         if labels is not None:
             labels = [to_variable(l) for l in to_list(labels)]
         if self._nranks > 1:
-            outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs])
+            outputs = self.ddp_model.forward(
+                * [to_variable(x) for x in inputs])
             losses = self.model._loss_function(outputs, labels)
             final_loss = fluid.layers.sum(losses)
             final_loss = self.ddp_model.scale_loss(final_loss)
             final_loss.backward()
             self.ddp_model.apply_collective_grads()
         else:
-            outputs = self.model.forward(*[to_variable(x) for x in inputs])
+            outputs = self.model.forward(* [to_variable(x) for x in inputs])
             losses = self.model._loss_function(outputs, labels)
             final_loss = fluid.layers.sum(losses)
             final_loss.backward()
@@ -592,9 +593,9 @@ class DynamicGraphAdapter(object):
         self.model.clear_gradients()
         metrics = []
         for metric in self.model._metrics:
-            metric_outs = metric.add_metric_op(
-                *(to_list(outputs) + to_list(labels)))
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
+            metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
+                labels)))
+            m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)

         return ([to_numpy(l) for l in losses], metrics) \
@@ -606,7 +607,7 @@ class DynamicGraphAdapter(object):
         inputs = to_list(inputs)
         if labels is not None:
             labels = [to_variable(l) for l in to_list(labels)]
-        outputs = self.model.forward(*[to_variable(x) for x in inputs])
+        outputs = self.model.forward(* [to_variable(x) for x in inputs])
         if self.model._loss_function:
             losses = self.model._loss_function(outputs, labels)
         else:
@@ -632,9 +633,9 @@ class DynamicGraphAdapter(object):
                 self._merge_count[self.mode + '_total'] += samples
                 self._merge_count[self.mode + '_batch'] = samples

-            metric_outs = metric.add_metric_op(
-                *(to_list(outputs) + to_list(labels)))
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
+            metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
+                labels)))
+            m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)

         # To be consistent with static graph
@@ -1009,7 +1010,7 @@ class Model(fluid.dygraph.Layer):
         do_eval = eval_loader is not None
         self._test_dataloader = eval_loader
         metrics_name = self._metrics_name()
-        steps = len(train_loader) if hasattr(train_loader, '__len__') else None
+        steps = self._len_data_loader(train_loader)
         cbks = config_callbacks(
             callbacks,
             model=self,
@@ -1037,8 +1038,7 @@ class Model(fluid.dygraph.Layer):

             if not isinstance(eval_loader, Iterable):
                 loader = eval_loader()
-            eval_steps = len(loader) if hasattr(loader,
-                                                '__len__') else None
+            eval_steps = self._len_data_loader(loader)
             cbks.on_begin('eval', {
                 'steps': eval_steps,
                 'metrics_name': metrics_name
@@ -1114,7 +1114,7 @@ class Model(fluid.dygraph.Layer):

         if not isinstance(eval_loader, Iterable):
             loader = eval_loader()
-        eval_steps = len(loader) if hasattr(loader, '__len__') else None
+        eval_steps = self._len_data_loader(loader)
         cbks.on_begin('eval',
                       {'steps': eval_steps,
                        'metrics_name': metrics_name})
@@ -1214,7 +1214,7 @@ class Model(fluid.dygraph.Layer):
                        mode,
                        metrics_name,
                        epoch=None):
-        size = len(data_loader) if hasattr(data_loader, '__len__') else None
+        size = self._len_data_loader(data_loader)
         logs = {
             'steps': size,
             'metrics_name': metrics_name,
@@ -1289,3 +1289,10 @@ class Model(fluid.dygraph.Layer):
         for m in self._metrics:
             metrics_name.extend(to_list(m.name()))
         return metrics_name
+
+    def _len_data_loader(self, data_loader):
+        try:
+            steps = len(data_loader)
+        except Exception:
+            steps = None
+        return steps
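`_len_data_loader` centralizes the previously scattered `hasattr(loader, '__len__')` checks and is slightly more robust: a loader may lack `__len__` entirely, or define it yet still fail at call time (generator-style loaders raise `TypeError`, for example), and the `try/except` maps any such failure to `None` — the sentinel the patched `ProgBarLogger` now understands. A quick standalone check of the helper's contract:

```python
# Free-function version of the method above, for demonstration only.
def _len_data_loader(data_loader):
    try:
        steps = len(data_loader)
    except Exception:
        steps = None
    return steps

def batch_generator():
    for i in range(3):
        yield i

assert _len_data_loader([1, 2, 3]) == 3              # sized loader
assert _len_data_loader(batch_generator()) is None   # unsized loader
```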
diff --git a/hapi/text/text.py b/hapi/text/text.py
index e5be32bcb531b938c3cc8c21ec7caf2a4f40ee6e..ee74c516437a366e1dd91cde236346ecf2e1b787 100644
--- a/hapi/text/text.py
+++ b/hapi/text/text.py
@@ -238,8 +238,9 @@ class BasicLSTMCell(RNNCell):
         self._bias_attr = bias_attr
         self._gate_activation = gate_activation or layers.sigmoid
         self._activation = activation or layers.tanh
-        self._forget_bias = layers.fill_constant(
-            [1], dtype=dtype, value=forget_bias)
+        # TODO(guosheng): find better way to resolve constants in __init__
+        self._forget_bias = layers.create_global_var(
+            shape=[1], dtype=dtype, value=forget_bias, persistable=True)
         self._forget_bias.stop_gradient = False
         self._dtype = dtype
         self._input_size = input_size
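Per the new TODO, a `fill_constant` executed inside `__init__` creates its constant eagerly, which works in dygraph but leaves nothing behind when the same `Layer` is later run as a static-graph program; `create_global_var` instead registers a persistable variable that exists in both modes (presumably the motivation here, with `stop_gradient = False` kept so gradients can still flow through it). The constant itself is the usual LSTM forget-gate bias; a plain-numpy sketch of where it enters the gate math, not Paddle code:

```python
# The forget gate is biased upward so a fresh cell initially retains its
# state: sigmoid(pre_activation + forget_bias) > 0.5 near zero inputs.
# forget_bias = 1.0 is the conventional default in many LSTM implementations.
import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

forget_bias = 1.0
pre_activation = np.array([0.0, -0.5, 0.5])  # W*x + U*h + b for the f gate
f_gate = sigmoid(pre_activation + forget_bias)
print(f_gate)  # all > 0.5: the cell starts out remembering
```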
@@ -817,7 +818,7 @@ class RNN(fluid.dygraph.Layer):
                 lambda x: fluid.layers.transpose(x, [1, 0] + list(
                     range(2, len(x.shape)))), inputs)

-            if sequence_length:
+            if sequence_length is not None:
                 mask = fluid.layers.sequence_mask(
                     sequence_length,
                     maxlen=time_steps,
@@ -828,7 +829,7 @@ class RNN(fluid.dygraph.Layer):
                 inputs = map_structure(
                     lambda x: fluid.layers.reverse(x, axis=[0]), inputs)
                 mask = fluid.layers.reverse(
-                    mask, axis=[0]) if sequence_length else None
+                    mask, axis=[0]) if sequence_length is not None else None

             states = initial_states
             outputs = []
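All the `sequence_length` hunks in this file make the same correction: `if sequence_length:` tests the truthiness of a tensor-like object, which is ill-defined for multi-element values, whereas masking should be skipped only when no lengths were passed at all. A numpy stand-in for the failure mode:

```python
# Why `is not None` matters: truthiness of array-likes is ambiguous.
import numpy as np

def wants_mask(sequence_length):
    return sequence_length is not None  # the patched check

seq_len = np.array([5, 3, 4])
# bool(seq_len) raises ValueError for multi-element arrays, so the old
# `if sequence_length:` form cannot even be evaluated safely here.
assert wants_mask(seq_len)
assert not wants_mask(None)
```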
@@ -836,7 +837,7 @@ class RNN(fluid.dygraph.Layer):
                 step_inputs = map_structure(lambda x: x[i], inputs)
                 step_outputs, new_states = self.cell(step_inputs, states,
                                                      **kwargs)
-                if sequence_length:
+                if sequence_length is not None:
                     new_states = map_structure(
                         partial(
                             _maybe_copy, step_mask=mask[i]),