diff --git a/ctr/index.html b/ctr/index.html
index c4a8b113376bcd9d27b4e3b011daa921dbfd0ae4..ff0c5d9b19ec046b61f7f38d6eb9e70dff33e1ec 100644
--- a/ctr/index.html
+++ b/ctr/index.html
@@ -40,11 +40,11 @@
-# CTR预估
+# 点击率预估
## 背景介绍
-CTR(Click-Through Rate)\[[1](https://en.wikipedia.org/wiki/Click-through_rate)\] 是用来表示用户点击一个特定链接的概率,
+CTR(Click-Through Rate,点击率)\[[1](https://en.wikipedia.org/wiki/Click-through_rate)\] 是用来表示用户点击一个特定链接的概率,
通常被用来衡量一个在线广告系统的有效性。
当有多个广告位时,CTR 预估一般会作为排序的基准。
diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md
index bb1815c0087064730818f56150445bacc0919cd4..7a372e9bed262d2ee5bc8640a0f480b9ce34cd34 100644
--- a/deep_speech_2/README.md
+++ b/deep_speech_2/README.md
@@ -18,9 +18,14 @@ For some machines, we also need to install libsndfile1. Details to be added.
```
cd data
python librispeech.py
+cat manifest.libri.train-* > manifest.libri.train-all
cd ..
```
+After running librispeech.py, we have several "manifest" json files named with a prefix `manifest.libri.`. A manifest file summarizes a speech data set, with each line containing the meta data (i.e. audio filepath, transcription text, audio duration) of each audio file within the data set, in json format.
+
+By `cat manifest.libri.train-* > manifest.libri.train-all`, we simply merge the three separate sample sets of LibriSpeech (train-clean-100, train-clean-360, train-other-500) into one training set. This is a simple way to merge different data sets.
+
More help for arguments:
```
@@ -32,13 +37,13 @@ python librispeech.py --help
For GPU Training:
```
-CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4
+CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all
```
For CPU Training:
```
-python train.py --trainer_count 8 --use_gpu False
+python train.py --trainer_count 8 --use_gpu False --train_manifest_path ./data/manifest.libri.train-all
```
More help for arguments:
diff --git a/deep_speech_2/data/librispeech.py b/deep_speech_2/data/librispeech.py
index 838fee59786d244ccd0e9ea487911791c52c7cda..653caa9267b62aa8415a26be2143de874bb15e88 100644
--- a/deep_speech_2/data/librispeech.py
+++ b/deep_speech_2/data/librispeech.py
@@ -1,13 +1,14 @@
"""
- Download, unpack and create manifest for Librespeech dataset.
+ Download, unpack and create manifest json files for the LibriSpeech dataset.
- Manifest is a json file with each line containing one audio clip filepath,
- its transcription text string, and its duration. It servers as a unified
- interfance to organize different data sets.
+ A manifest is a json file summarizing filelist in a data set, with each line
+ containing the meta data (i.e. audio filepath, transcription text, audio
+ duration) of each audio file in the data set.
"""
import paddle.v2 as paddle
from paddle.v2.dataset.common import md5file
+import distutils.util
import os
import wget
import tarfile
@@ -27,7 +28,9 @@ URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"
MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
+MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
+MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
@@ -44,6 +47,13 @@ parser.add_argument(
default="manifest.libri",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
+parser.add_argument(
+ "--full_download",
+ default="True",
+ type=distutils.util.strtobool,
+ help="Download all datasets for Librispeech."
+ " If False, only download a minimal requirement (test-clean, dev-clean"
+ " train-clean-100). (default: %(default)s)")
args = parser.parse_args()
@@ -57,7 +67,10 @@ def download(url, md5sum, target_dir):
print("Downloading %s ..." % url)
wget.download(url, target_dir)
print("\nMD5 Chesksum %s ..." % filepath)
- assert md5file(filepath) == md5sum, "MD5 checksum failed."
+ if not md5file(filepath) == md5sum:
+ raise RuntimeError("MD5 checksum failed.")
+ else:
+ print("File exists, skip downloading. (%s)" % filepath)
return filepath
@@ -69,21 +82,17 @@ def unpack(filepath, target_dir):
tar = tarfile.open(filepath)
tar.extractall(target_dir)
tar.close()
- return target_dir
def create_manifest(data_dir, manifest_path):
"""
- Create a manifest file summarizing the dataset (list of filepath and meta
- data).
-
- Each line of the manifest contains one audio clip filepath, its
- transcription text string, and its duration. Manifest file servers as a
- unified interfance to organize data sets.
+ Create a manifest json file summarizing the data set, with each line
+ containing the meta data (i.e. audio filepath, transcription text, audio
+ duration) of each audio file within the data set.
"""
print("Creating manifest %s ..." % manifest_path)
json_lines = []
- for subfolder, _, filelist in os.walk(data_dir):
+ for subfolder, _, filelist in sorted(os.walk(data_dir)):
text_filelist = [
filename for filename in filelist if filename.endswith('trans.txt')
]
@@ -111,9 +120,16 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""
Download, unpack and create summmary manifest file.
"""
- filepath = download(url, md5sum, target_dir)
- unpacked_dir = unpack(filepath, target_dir)
- create_manifest(unpacked_dir, manifest_path)
+ if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
+ # download
+ filepath = download(url, md5sum, target_dir)
+ # unpack
+ unpack(filepath, target_dir)
+ else:
+ print("Skip downloading and unpacking. Data already exists in %s." %
+ target_dir)
+ # create manifest json file
+ create_manifest(target_dir, manifest_path)
def main():
@@ -132,6 +148,27 @@ def main():
md5sum=MD5_TRAIN_CLEAN_100,
target_dir=os.path.join(args.target_dir, "train-clean-100"),
manifest_path=args.manifest_prefix + ".train-clean-100")
+ if args.full_download:
+ prepare_dataset(
+ url=URL_TEST_OTHER,
+ md5sum=MD5_TEST_OTHER,
+ target_dir=os.path.join(args.target_dir, "test-other"),
+ manifest_path=args.manifest_prefix + ".test-other")
+ prepare_dataset(
+ url=URL_DEV_OTHER,
+ md5sum=MD5_DEV_OTHER,
+ target_dir=os.path.join(args.target_dir, "dev-other"),
+ manifest_path=args.manifest_prefix + ".dev-other")
+ prepare_dataset(
+ url=URL_TRAIN_CLEAN_360,
+ md5sum=MD5_TRAIN_CLEAN_360,
+ target_dir=os.path.join(args.target_dir, "train-clean-360"),
+ manifest_path=args.manifest_prefix + ".train-clean-360")
+ prepare_dataset(
+ url=URL_TRAIN_OTHER_500,
+ md5sum=MD5_TRAIN_OTHER_500,
+ target_dir=os.path.join(args.target_dir, "train-other-500"),
+ manifest_path=args.manifest_prefix + ".train-other-500")
if __name__ == '__main__':
diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py
index 55577b0d870d879859436fd78b03b8a2f738e265..eb9b56de7f325a507c00239b38b8bdb1dd985906 100644
--- a/deep_speech_2/train.py
+++ b/deep_speech_2/train.py
@@ -11,6 +11,7 @@ import sys
from model import deep_speech2
from audio_data_utils import DataGenerator
import numpy as np
+import os
#TODO: add WER metric
@@ -78,6 +79,13 @@ parser.add_argument(
default='data/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
+parser.add_argument(
+ "--init_model_path",
+ default=None,
+ type=str,
+ help="If set None, the training will start from scratch. "
+ "Otherwise, the training will resume from "
+ "the existing model of this path. (default: %(default)s)")
args = parser.parse_args()
@@ -118,8 +126,14 @@ def train():
rnn_size=args.rnn_layer_size,
is_inference=False)
- # create parameters and optimizer
- parameters = paddle.parameters.create(cost)
+ # create/load parameters and optimizer
+ if args.init_model_path is None:
+ parameters = paddle.parameters.create(cost)
+ else:
+ if not os.path.isfile(args.init_model_path):
+ raise IOError("Invalid model!")
+ parameters = paddle.parameters.Parameters.from_tar(
+ gzip.open(args.init_model_path))
optimizer = paddle.optimizer.Adam(
learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400)
trainer = paddle.trainer.SGD(
diff --git a/image_classification/caffe2paddle/README.md b/image_classification/caffe2paddle/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c90e000186e974803494cd5d25df1fc71004c37b
--- /dev/null
+++ b/image_classification/caffe2paddle/README.md
@@ -0,0 +1,39 @@
+## 使用说明
+
+`caffe2paddle.py`提供了将Caffe训练的模型转换为PaddlePaddle可使用的模型的接口`ModelConverter`,其封装了图像领域常用的Convolution、BatchNorm等layer的转换函数,可以完成VGG、ResNet等常用模型的转换。模型转换的基本过程是:基于Caffe的Python API加载模型并依次获取每一个layer的信息,将其中的参数根据layer类型与PaddlePaddle适配后序列化保存(对于Pooling等无需训练的layer不做处理),输出可以直接为PaddlePaddle的Python API加载使用的模型文件。
+
+可以按如下方法使用`ModelConverter`接口:
+
+```python
+# 定义以下变量为相应的文件路径和文件名
+caffe_model_file = "./ResNet-50-deploy.prototxt" # Caffe网络配置文件的路径
+caffe_pretrained_file = "./ResNet-50-model.caffemodel" # Caffe模型文件的路径
+paddle_tar_name = "Paddle_ResNet50.tar.gz" # 输出的Paddle模型的文件名
+
+# 初始化,从指定文件加载模型
+converter = ModelConverter(caffe_model_file=caffe_model_file,
+ caffe_pretrained_file=caffe_pretrained_file,
+ paddle_tar_name=paddle_tar_name)
+# 进行模型转换
+converter.convert()
+```
+
+`caffe2paddle.py`中已提供以上步骤,修改其中文件相关变量的值后执行`python caffe2paddle.py`即可完成模型转换。此外,为辅助验证转换结果,`ModelConverter`中封装了使用Caffe API预测的接口`caffe_predict`,使用如下所示,将会打印按类别概率排序的(类别id, 概率)的列表:
+
+```python
+# img为图片路径,mean_file为图像均值文件的路径
+converter.caffe_predict(img="./cat.jpg", mean_file="./imagenet/ilsvrc_2012_mean.npy")
+```
+
+需要注意,在模型转换时会对layer的参数进行命名,这里默认使用PaddlePaddle中默认的layer和参数命名规则:以`wrap_name_default`中的值和该layer类型的调用计数构造layer name,并以此为前缀构造参数名,比如第一个InnerProduct层(相应转换函数说明见下方)的bias参数将被命名为`___fc_layer_0__.wbias`。
+
+```python
+# 对InnerProduct层的参数进行转换,使用name值构造对应layer的参数名
+# wrap_name_default设置默认name值为fc_layer
+@wrap_name_default("fc_layer")
+def convert_InnerProduct_layer(self, params, name=None):
+```
+
+为此,在验证和使用转换得到的模型时,编写PaddlePaddle网络配置无需指定layer name并且要保证和Caffe端模型使用同样的拓扑顺序,尤其是对于ResNet这种有分支的网络结构,要保证两分支在PaddlePaddle和Caffe中先后顺序一致,这样才能够使得模型参数正确加载。
+
+如果不希望使用默认的命名,并且在PaddlePaddle网络配置中指定了layer name,可以建立Caffe和PaddlePaddle网络配置间layer name对应关系的`dict`并在调用`ModelConverter.convert`时作为`name_map`的值传入,这样在命名保存layer中的参数时将使用相应的layer name,不受拓扑顺序的影响。另外这里只针对Caffe网络配置中Convolution、InnerProduct和BatchNorm类别的layer建立`name_map`即可(一方面,对于Pooling等无需训练的layer不需要保存,故这里没有提供转换接口;另一方面,对于Caffe中的Scale类别的layer,由于Caffe和PaddlePaddle在实现上的一些差别,PaddlePaddle中的batch_norm层是BatchNorm和Scale层的复合,故这里对Scale进行了特殊处理)。
diff --git a/image_classification/caffe2paddle/caffe2paddle.py b/image_classification/caffe2paddle/caffe2paddle.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4011f281538d31d076f7f554d1dbb8a2ceb1d5a
--- /dev/null
+++ b/image_classification/caffe2paddle/caffe2paddle.py
@@ -0,0 +1,187 @@
+import os
+import struct
+import gzip
+import tarfile
+import cStringIO
+import numpy as np
+import cv2
+import caffe
+from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+
+
+class ModelConverter(object):
+ def __init__(self, caffe_model_file, caffe_pretrained_file,
+ paddle_tar_name):
+ self.net = caffe.Net(caffe_model_file, caffe_pretrained_file,
+ caffe.TEST)
+ self.tar_name = paddle_tar_name
+ self.params = dict()
+ self.pre_layer_name = ""
+ self.pre_layer_type = ""
+
+ def convert(self, name_map=None):
+ layer_dict = self.net.layer_dict
+ for layer_name in layer_dict.keys():
+ layer = layer_dict[layer_name]
+ layer_params = layer.blobs
+ layer_type = layer.type
+ if len(layer_params) > 0:
+ self.pre_layer_name = getattr(
+ self, "convert_" + layer_type + "_layer")(
+ layer_params,
+ name=None
+ if name_map == None else name_map.get(layer_name))
+ self.pre_layer_type = layer_type
+ with gzip.open(self.tar_name, 'w') as f:
+ self.to_tar(f)
+ return
+
+ def to_tar(self, f):
+ tar = tarfile.TarFile(fileobj=f, mode='w')
+ for param_name in self.params.keys():
+ param_conf, param_data = self.params[param_name]
+
+ confStr = param_conf.SerializeToString()
+ tarinfo = tarfile.TarInfo(name="%s.protobuf" % param_name)
+ tarinfo.size = len(confStr)
+ buf = cStringIO.StringIO(confStr)
+ buf.seek(0)
+ tar.addfile(tarinfo, fileobj=buf)
+
+ buf = cStringIO.StringIO()
+ self.serialize(param_data, buf)
+ tarinfo = tarfile.TarInfo(name=param_name)
+ buf.seek(0)
+ tarinfo.size = len(buf.getvalue())
+ tar.addfile(tarinfo, buf)
+
+ @staticmethod
+ def serialize(data, f):
+ f.write(struct.pack("IIQ", 0, 4, data.size))
+ f.write(data.tobytes())
+
+ @wrap_name_default("conv")
+ def convert_Convolution_layer(self, params, name=None):
+ for i in range(len(params)):
+ data = np.array(params[i].data)
+ if len(params) == 2:
+ suffix = "0" if i == 0 else "bias"
+ file_name = "_%s.w%s" % (name, suffix)
+ else:
+ file_name = "_%s.w%s" % (name, str(i))
+ param_conf = ParameterConfig()
+ param_conf.name = file_name
+ param_conf.size = reduce(lambda a, b: a * b, data.shape)
+ self.params[file_name] = (param_conf, data.flatten())
+
+ return name
+
+ @wrap_name_default("fc_layer")
+ def convert_InnerProduct_layer(self, params, name=None):
+ for i in range(len(params)):
+ data = np.array(params[i].data)
+ if len(params) == 2:
+ suffix = "0" if i == 0 else "bias"
+ file_name = "_%s.w%s" % (name, suffix)
+ else:
+ file_name = "_%s.w%s" % (name, str(i))
+ data = np.transpose(data)
+ param_conf = ParameterConfig()
+ param_conf.name = file_name
+ dims = list(data.shape)
+ if len(dims) < 2:
+ dims.insert(0, 1)
+ param_conf.size = reduce(lambda a, b: a * b, dims)
+ param_conf.dims.extend(dims)
+ self.params[file_name] = (param_conf, data.flatten())
+ return name
+
+ @wrap_name_default("batch_norm")
+ def convert_BatchNorm_layer(self, params, name=None):
+ scale = 1 / np.array(params[-1].data)[0] if np.array(
+ params[-1].data)[0] != 0 else 0
+ for i in range(2):
+ data = np.array(params[i].data) * scale
+ file_name = "_%s.w%s" % (name, str(i + 1))
+ param_conf = ParameterConfig()
+ param_conf.name = file_name
+ dims = list(data.shape)
+ assert len(dims) == 1
+ dims.insert(0, 1)
+ param_conf.size = reduce(lambda a, b: a * b, dims)
+ param_conf.dims.extend(dims)
+ self.params[file_name] = (param_conf, data.flatten())
+ return name
+
+ def convert_Scale_layer(self, params, name=None):
+ assert self.pre_layer_type == "BatchNorm"
+ name = self.pre_layer_name
+ for i in range(len(params)):
+ data = np.array(params[i].data)
+ suffix = "0" if i == 0 else "bias"
+ file_name = "_%s.w%s" % (name, suffix)
+ param_conf = ParameterConfig()
+ param_conf.name = file_name
+ dims = list(data.shape)
+ assert len(dims) == 1
+ dims.insert(0, 1)
+ param_conf.size = reduce(lambda a, b: a * b, dims)
+ if i == 1:
+ param_conf.dims.extend(dims)
+ self.params[file_name] = (param_conf, data.flatten())
+ return name
+
+ def caffe_predict(self,
+ img,
+ mean_file='./caffe/imagenet/ilsvrc_2012_mean.npy'):
+ net = self.net
+
+ net.blobs['data'].data[...] = load_image(img, mean_file=mean_file)
+ out = net.forward()
+
+ output_prob = net.blobs['prob'].data[0].flatten()
+ print zip(np.argsort(output_prob)[::-1], np.sort(output_prob)[::-1])
+
+
+def load_image(file, resize_size=256, crop_size=224, mean_file=None):
+ # load image
+ im = cv2.imread(file)
+ # resize
+ h, w = im.shape[:2]
+ h_new, w_new = resize_size, resize_size
+ if h > w:
+ h_new = resize_size * h / w
+ else:
+ w_new = resize_size * w / h
+ im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
+ # crop
+ h, w = im.shape[:2]
+ h_start = (h - crop_size) / 2
+ w_start = (w - crop_size) / 2
+ h_end, w_end = h_start + crop_size, w_start + crop_size
+ im = im[h_start:h_end, w_start:w_end, :]
+ # transpose to CHW order
+ im = im.transpose((2, 0, 1))
+
+ if mean_file:
+ mu = np.load(mean_file)
+ mu = mu.mean(1).mean(1)
+ im = im - mu[:, None, None]
+ im = im / 255.0
+ return im
+
+
+if __name__ == "__main__":
+ caffe_model_file = "./ResNet-50-deploy.prototxt"
+ caffe_pretrained_file = "./ResNet-50-model.caffemodel"
+ paddle_tar_name = "Paddle_ResNet50.tar.gz"
+
+ converter = ModelConverter(
+ caffe_model_file=caffe_model_file,
+ caffe_pretrained_file=caffe_pretrained_file,
+ paddle_tar_name=paddle_tar_name)
+ converter.convert()
+
+ converter.caffe_predict("./cat.jpg",
+ "./caffe/imagenet/ilsvrc_2012_mean.npy")
diff --git a/nce_cost/README.md b/nce_cost/README.md
index a0990367ef8b03c70c29d285e22ef85907e1d0b7..fce8bdaf80501e5bed650e93efc6c438284031c9 100644
--- a/nce_cost/README.md
+++ b/nce_cost/README.md
@@ -1 +1,115 @@
-TBD
+# 噪声对比估计加速词向量训练
+## 背景介绍
+在自然语言处理领域中,通常使用特征向量来表示一个单词,但是如何使用准确的词向量来表示语义却是一个难点,详细内容可以在[词向量章节](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md)中查阅到,原作者使用神经概率语言模型(Neural Probabilistic Language Model, NPLM)来训练词向量,尽管 NPLM 有优异的精度表现,但是相对于传统的 N-gram 统计模型,训练时间还是太漫长了\[[3](#参考文献)\]。针对这个问题,常用的优化算法主要有两个:一个是 hierarchical-sigmoid \[[2](#参考文献)\],另一个是噪声对比估计(Noise-contrastive estimation, NCE)\[[1](#参考文献)\]。为了克服这个问题,本文引入了 NCE 方法。本文将以训练 NPLM 作为例子来讲述如何使用 NCE。
+
+## NCE 概览
+NCE 是一种快速对离散分布进行估计的方法,应用到本文中的问题:训练 NPLM 计算开销很大,原因是 softmax 函数计算时需要考虑每个类别的指数项,必须计算字典中的所有单词,而在一般语料集上面字典往往非常大\[[3](#参考文献)\],从而导致整个训练过程十分耗时。与常用的 hierarchical-sigmoid \[[2](#参考文献)\] 方法相比,NCE 不再使用复杂的二叉树来构造目标函数,而是采用相对简单的随机负采样,以大幅提升计算效率。
+
+
+假设已知具体的上下文 $h$,并且知道这个分布为 $P^h(w)$ ,并将从中抽样出来的数据作为正样例,而从一个噪音分布 $P_n(w)$ 抽样的数据作为负样例。我们可以任意选择合适的噪音分布,默认为无偏的均匀分布。这里我们同时假设噪音样例 k 倍于数据样例,则训练数据被抽中的概率为\[[1](#参考文献)\]:
+
+$$P^h(D=1|w,\theta)=\frac { P_\theta^h(w) }{ P^h_\theta(w)+kP_n(w) } =\sigma (\Delta s_\theta(w,h))$$
+
+其中 $\Delta s_\theta(w,h)=s_\theta(w,h)-\log (kP_n(w))$ ,$s_\theta(w,h)$ 表示选择在生成 $w$ 字并处于上下文 $h$ 时的特征向量,整体目标函数的目的就是增大正样本的概率同时降低负样本的概率。目标函数如下[[1](#参考文献)]:
+
+$$
+J^h(\theta )=E_{ P_d^h }\left[ \log { P^h(D=1|w,\theta ) } \right] +kE_{ P_n }\left[ \log P^h (D=0|w,\theta ) \right]$$
+$$
+ \\\\\qquad =E_{ P_d^h }\left[ \log { \sigma (\Delta s_\theta(w,h)) } \right] +kE_{ P_n }\left[ \log (1-\sigma (\Delta s_\theta(w,h))) \right]$$
+
+总体上来说,NCE 是通过构造逻辑回归(logistic regression),对正样例和负样例做二分类,对于每一个样本,将自身的预测词 label 作为正样例,同时采样出 $k$ 个其他词 label 作为负样例,从而只需要计算样本在这 $k+1$ 个 label 上的概率。相比原始的 softmax 分类需要计算每个类别的分数,然后归一化得到概率,节约了大量的时间消耗。
+
+## 实验数据
+本文采用 Penn Treebank (PTB) 数据集([Tomas Mikolov预处理版本](http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz))来训练语言模型。PaddlePaddle 提供 [paddle.dataset.imikolov](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/imikolov.py) 接口来方便调用这些数据,如果当前目录没有找到数据它会自动下载并验证文件的完整性。并提供大小为5的滑动窗口对数据做预处理工作,方便后期处理。语料语种为英文,共有42068句训练数据,3761句测试数据。
+
+## 网络结构
+N-gram 神经概率语言模型详细网络结构见图1:
+
+
+![网络配置结构](images/network_conf.png)
+图1. 网络配置结构
+
+可以看到,模型主要分为如下几个部分构成:
+
+1. **输入层**:输入的 ptb 样本由原始的英文单词组成,将每个英文单词转换为字典中的 id 表示,使用唯一的 id 表示可以区分每个单词。
+
+2. **词向量层**:比起原先的 id 表示,词向量表示更能体现词与词之间的语义关系。这里使用可更新的 embedding 矩阵,将原先的 id 表示转换为固定维度的词向量表示。训练完成之后,词语之间的语义相似度可以使用词向量之间的距离来表示,语义越相似,距离越近。
+
+3. **词向量拼接层**:将词向量进行串联,并将词向量首尾相接形成一个长向量。这样可以方便后面全连接层的处理。
+
+4. **全连接隐层**:将上一层获得的长向量输入到一层隐层的神经网络,输出特征向量。全连接的隐层可以增强网络的学习能力。
+
+5. **NCE层**:训练时可以直接使用 PaddlePaddle 提供的 NCE Layer。
+
+
+## 训练阶段
+训练直接运行``` python train.py ```。程序第一次运行会检测用户缓存文件夹中是否包含 ptb 数据集,如果未包含,则自动下载。运行过程中,每1000个 iteration 会打印模型训练信息,主要包含训练损失,每个 pass 会计算测试数据集上的损失,并同时会保存最新的模型快照。在 PaddlePaddle 中有已经实现好的 NCE Layer,一些参数需要自行根据实际场景进行设计,可参考的调参方案如下:
+
+
+| 参数名 | 参数作用 | 介绍 |
+|:------ |:-------| :--------|
+| param\_attr / bias\_attr | 用来设置参数名字 | 可以方便后面预测阶段好来实现网络的参数共享,具体内容在下一个章节里会陈述。|
+| num\_neg\_samples | 参数负责控制对负样例的采样个数。 | 可以控制正负样本比例,这个值取值区间为 [1, 字典大小-1],负样本个数越多则整个模型的训练速度越慢,模型精度也会越高 |
+| neg\_distribution | 控制生成负样例标签的分布,默认是一个均匀分布。 | 可以自行控制负样本采样时各个类别的采样权重,比如希望正样例为“晴天”时,负样例“洪水”在训练时更被着重区分,则可以将“洪水”这个类别的采样权重增加。 |
+| act | 表示使用何种激活函数。 | 根据 NCE 的原理,这里应该使用 sigmoid 函数。 |
+
+
+具体代码实现如下:
+
+```python
+cost = paddle.layer.nce(
+ input=hidden_layer,
+ label=next_word,
+ num_classes=dict_size,
+ param_attr=paddle.attr.Param(name='nce_w'),
+ bias_attr=paddle.attr.Param(name='nce_b'),
+ act=paddle.activation.Sigmoid(),
+ num_neg_samples=25,
+ neg_distribution=None)
+```
+
+
+## 预测阶段
+预测直接运行` python infer.py `,程序首先会加载最新模型,然后按照 batch 大小依次进行预测,并打印预测结果。因为训练和预测计算逻辑不一样,预测阶段需要共享 NCE Layer 中的逻辑回归训练时得到的参数,所以要写一个推断层,推断层的参数为预先训练好的参数。
+
+具体实现推断层的方法:先是通过 `paddle.attr.Param` 方法获取参数值,然后使用 `paddle.layer.trans_full_matrix_projection` 对隐层输出向量 `hidden_layer` 做一个矩阵右乘,PaddlePaddle 会自行在模型中寻找相同参数名的参数并获取。右乘求和后得到类别向量,将类别向量输入 softmax 做一个归一操作,和为1,从而得到最后的类别概率分布。
+
+代码实现如下:
+
+```python
+with paddle.layer.mixed(
+ size=dict_size,
+ act=paddle.activation.Softmax(),
+ bias_attr=paddle.attr.Param(name='nce_b')) as prediction:
+ prediction += paddle.layer.trans_full_matrix_projection(
+ input=hidden_layer, param_attr=paddle.attr.Param(name='nce_w'))
+```
+
+预测的输出形式为:
+
+```
+--------------------------
+No.68 Input: ' <unk> for possible
+Ground Truth Output: <unk>
+Predict Output: <unk>
+
+--------------------------
+No.69 Input: <unk> for possible <unk>
+Ground Truth Output: on
+Predict Output: <unk>
+
+--------------------------
+No.70 Input: for possible <unk> on
+Ground Truth Output: the
+Predict Output: the
+
+```
+
+每一个短线表示一次的预测,第二行显示第几条测试样例,并给出输入的4个单词,第三行为真实的标签,第四行为预测的标签。
+
+## 参考文献
+1. Mnih A, Kavukcuoglu K. [Learning word embeddings efficiently with noise-contrastive estimation](https://papers.nips.cc/paper/5165-learning-word-embeddings-efficiently-with-noise-contrastive-estimation.pdf)[C]//Advances in neural information processing systems. 2013: 2265-2273.
+
+2. Morin, F., & Bengio, Y. (2005, January). [Hierarchical Probabilistic Neural Network Language Model](http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf). In Aistats (Vol. 5, pp. 246-252).
+
+3. Mnih A, Teh Y W. [A Fast and Simple Algorithm for Training Neural Probabilistic Language Models](https://arxiv.org/abs/1206.6426)[J]. Computer Science, 2012:1751-1758.
diff --git a/nce_cost/images/network_conf.png b/nce_cost/images/network_conf.png
new file mode 100644
index 0000000000000000000000000000000000000000..749f8a365db1e1c18d829a460de7c45b27892d19
Binary files /dev/null and b/nce_cost/images/network_conf.png differ
diff --git a/nce_cost/infer.py b/nce_cost/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..53e3aef45fc02ac008caa7102836ac47915be1fc
--- /dev/null
+++ b/nce_cost/infer.py
@@ -0,0 +1,70 @@
+# -*- encoding:utf-8 -*-
+import numpy as np
+import glob
+import gzip
+import paddle.v2 as paddle
+from nce_conf import network_conf
+
+
+def main():
+ paddle.init(use_gpu=False, trainer_count=1)
+ word_dict = paddle.dataset.imikolov.build_dict()
+ dict_size = len(word_dict)
+
+ prediction_layer = network_conf(
+ is_train=False,
+ hidden_size=128,
+ embedding_size=512,
+ dict_size=dict_size)
+
+ models_list = glob.glob('./models/*')
+ models_list = sorted(models_list)
+
+ with gzip.open(models_list[-1], 'r') as f:
+ parameters = paddle.parameters.Parameters.from_tar(f)
+
+ idx_word_dict = dict((v, k) for k, v in word_dict.items())
+ batch_size = 64
+ batch_ins = []
+ ins_iter = paddle.dataset.imikolov.test(word_dict, 5)
+
+ infer_data = []
+ infer_data_label = []
+ for item in paddle.dataset.imikolov.test(word_dict, 5)():
+ infer_data.append((item[:4]))
+ infer_data_label.append(item[4])
+ # Choose 100 samples from the test set to show how to infer.
+ if len(infer_data_label) == 100:
+ break
+
+ feeding = {
+ 'firstw': 0,
+ 'secondw': 1,
+ 'thirdw': 2,
+ 'fourthw': 3,
+ 'fifthw': 4
+ }
+
+ predictions = paddle.infer(
+ output_layer=prediction_layer,
+ parameters=parameters,
+ input=infer_data,
+ feeding=feeding,
+ field=['value'])
+
+ for i, (prob, data,
+ label) in enumerate(zip(predictions, infer_data, infer_data_label)):
+ print '--------------------------'
+ print "No.%d Input: " % (i+1) + \
+ idx_word_dict[data[0]] + ' ' + \
+ idx_word_dict[data[1]] + ' ' + \
+ idx_word_dict[data[2]] + ' ' + \
+ idx_word_dict[data[3]]
+ print 'Ground Truth Output: ' + idx_word_dict[label]
+ print 'Predict Output: ' + idx_word_dict[prob.argsort(
+ kind='heapsort', axis=0)[-1]]
+ print
+
+
+if __name__ == '__main__':
+ main()
diff --git a/nce_cost/nce_conf.py b/nce_cost/nce_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..962a9ccc80906bc2272245d0e297142397ffb024
--- /dev/null
+++ b/nce_cost/nce_conf.py
@@ -0,0 +1,61 @@
+# -*- encoding:utf-8 -*-
+import math
+import paddle.v2 as paddle
+
+
+def network_conf(hidden_size, embedding_size, dict_size, is_train):
+
+ first_word = paddle.layer.data(
+ name="firstw", type=paddle.data_type.integer_value(dict_size))
+ second_word = paddle.layer.data(
+ name="secondw", type=paddle.data_type.integer_value(dict_size))
+ third_word = paddle.layer.data(
+ name="thirdw", type=paddle.data_type.integer_value(dict_size))
+ fourth_word = paddle.layer.data(
+ name="fourthw", type=paddle.data_type.integer_value(dict_size))
+ next_word = paddle.layer.data(
+ name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+ embed_param_attr = paddle.attr.Param(
+ name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
+ first_embedding = paddle.layer.embedding(
+ input=first_word, size=embedding_size, param_attr=embed_param_attr)
+ second_embedding = paddle.layer.embedding(
+ input=second_word, size=embedding_size, param_attr=embed_param_attr)
+ third_embedding = paddle.layer.embedding(
+ input=third_word, size=embedding_size, param_attr=embed_param_attr)
+ fourth_embedding = paddle.layer.embedding(
+ input=fourth_word, size=embedding_size, param_attr=embed_param_attr)
+
+ context_embedding = paddle.layer.concat(input=[
+ first_embedding, second_embedding, third_embedding, fourth_embedding
+ ])
+
+ hidden_layer = paddle.layer.fc(
+ input=context_embedding,
+ size=hidden_size,
+ act=paddle.activation.Tanh(),
+ bias_attr=paddle.attr.Param(learning_rate=1),
+ param_attr=paddle.attr.Param(
+ initial_std=1. / math.sqrt(embedding_size * 8), learning_rate=1))
+
+ if is_train == True:
+ cost = paddle.layer.nce(
+ input=hidden_layer,
+ label=next_word,
+ num_classes=dict_size,
+ param_attr=paddle.attr.Param(name='nce_w'),
+ bias_attr=paddle.attr.Param(name='nce_b'),
+ act=paddle.activation.Sigmoid(),
+ num_neg_samples=25,
+ neg_distribution=None)
+ return cost
+ else:
+ with paddle.layer.mixed(
+ size=dict_size,
+ act=paddle.activation.Softmax(),
+ bias_attr=paddle.attr.Param(name='nce_b')) as prediction:
+ prediction += paddle.layer.trans_full_matrix_projection(
+ input=hidden_layer, param_attr=paddle.attr.Param(name='nce_w'))
+
+ return prediction
diff --git a/nce_cost/train.py b/nce_cost/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8b437c1dd9bfc89fd03598b9a4201693c3074d7
--- /dev/null
+++ b/nce_cost/train.py
@@ -0,0 +1,52 @@
+# -*- encoding:utf-8 -*-
+import paddle.v2 as paddle
+import gzip
+
+from nce_conf import network_conf
+
+
+def main():
+ paddle.init(use_gpu=False, trainer_count=1)
+ word_dict = paddle.dataset.imikolov.build_dict()
+ dict_size = len(word_dict)
+
+ cost = network_conf(
+ is_train=True, hidden_size=128, embedding_size=512, dict_size=dict_size)
+
+ parameters = paddle.parameters.create(cost)
+ adagrad = paddle.optimizer.Adam(learning_rate=1e-4)
+ trainer = paddle.trainer.SGD(cost, parameters, adagrad)
+
+ def event_handler(event):
+ if isinstance(event, paddle.event.EndIteration):
+ if event.batch_id % 1000 == 0:
+ print "Pass %d, Batch %d, Cost %f" % (
+ event.pass_id, event.batch_id, event.cost)
+
+ if isinstance(event, paddle.event.EndPass):
+ result = trainer.test(
+ paddle.batch(paddle.dataset.imikolov.test(word_dict, 5), 64))
+ print "Test here.. Pass %d, Cost %f" % (event.pass_id, result.cost)
+
+ model_name = "./models/model_pass_%05d.tar.gz" % event.pass_id
+ print "Save model into %s ..." % model_name
+ with gzip.open(model_name, 'w') as f:
+ parameters.to_tar(f)
+
+ feeding = {
+ 'firstw': 0,
+ 'secondw': 1,
+ 'thirdw': 2,
+ 'fourthw': 3,
+ 'fifthw': 4
+ }
+
+ trainer.train(
+ paddle.batch(paddle.dataset.imikolov.train(word_dict, 5), 64),
+ num_passes=1000,
+ event_handler=event_handler,
+ feeding=feeding)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/nmt_without_attention/README.md b/nmt_without_attention/README.md
index 38361bbfbc3e029de872eba967a17453c5e7dac1..a54b715102574dae1b619997a1ed7a2bfc14131c 100644
--- a/nmt_without_attention/README.md
+++ b/nmt_without_attention/README.md
@@ -91,11 +91,11 @@ PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08
```python
#### Decoder
encoder_last = paddle.layer.last_seq(input=encoded_vector)
-with paddle.layer.mixed(
+encoder_last_projected = paddle.layer.mixed(
size=decoder_size,
- act=paddle.activation.Tanh()) as encoder_last_projected:
- encoder_last_projected += paddle.layer.full_matrix_projection(
- input=encoder_last)
+ act=paddle.activation.Tanh(),
+ input=paddle.layer.full_matrix_projection(input=encoder_last))
+
# gru step
def gru_decoder_without_attention(enc_vec, current_word):
'''
@@ -112,10 +112,12 @@ def gru_decoder_without_attention(enc_vec, current_word):
context = paddle.layer.last_seq(input=enc_vec)
- with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
- decoder_inputs +=paddle.layer.full_matrix_projection(input=context)
- decoder_inputs += paddle.layer.full_matrix_projection(
- input=current_word)
+ decoder_inputs = paddle.layer.mixed(
+ size=decoder_size * 3,
+ input=[
+ paddle.layer.full_matrix_projection(input=context),
+ paddle.layer.full_matrix_projection(input=current_word)
+ ])
gru_step = paddle.layer.gru_step(
name='gru_decoder',
@@ -125,24 +127,24 @@ def gru_decoder_without_attention(enc_vec, current_word):
output_mem=decoder_mem,
size=decoder_size)
- with paddle.layer.mixed(
- size=target_dict_dim,
- bias_attr=True,
- act=paddle.activation.Softmax()) as out:
- out += paddle.layer.full_matrix_projection(input=gru_step)
+ out = paddle.layer.mixed(
+ size=target_dict_dim,
+ bias_attr=True,
+ act=paddle.activation.Softmax(),
+ input=paddle.layer.full_matrix_projection(input=gru_step))
return out
```
在模型训练和测试阶段,解码器的行为有很大的不同:
- **训练阶段**:目标翻译结果的词向量`trg_embedding`作为参数传递给单步逻辑`gru_decoder_without_attention()`,函数`recurrent_group()`循环调用单步逻辑执行,最后计算目标翻译与实际解码的差异cost并返回;
-- **测试阶段**:解码器根据最后一个生成的词预测下一个词,`GeneratedInputV2()`自动取出模型预测出的概率最高的$k$个词的词向量传递给单步逻辑,`beam_search()`函数调用单步逻辑函数`gru_decoder_without_attention()`完成柱搜索并作为结果返回。
+- **测试阶段**:解码器根据最后一个生成的词预测下一个词,`GeneratedInput()`自动取出模型预测出的概率最高的$k$个词的词向量传递给单步逻辑,`beam_search()`函数调用单步逻辑函数`gru_decoder_without_attention()`完成柱搜索并作为结果返回。
训练和生成的逻辑分别实现在如下的`if-else`条件分支中:
```python
decoder_group_name = "decoder_group"
-group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_inputs = [group_input1]
if not generating:
trg_embedding = paddle.layer.embedding(
@@ -166,7 +168,7 @@ if not generating:
return cost
else:
- trg_embedding = paddle.layer.GeneratedInputV2(
+ trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
diff --git a/nmt_without_attention/index.html b/nmt_without_attention/index.html
index d749ff5722aa4144743fdca45f2ac0418c9db0b3..35177ee5a679fe4f826dfd219721ef2e36b7df83 100644
--- a/nmt_without_attention/index.html
+++ b/nmt_without_attention/index.html
@@ -133,11 +133,11 @@ PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08
```python
#### Decoder
encoder_last = paddle.layer.last_seq(input=encoded_vector)
-with paddle.layer.mixed(
+encoder_last_projected = paddle.layer.mixed(
size=decoder_size,
- act=paddle.activation.Tanh()) as encoder_last_projected:
- encoder_last_projected += paddle.layer.full_matrix_projection(
- input=encoder_last)
+ act=paddle.activation.Tanh(),
+ input=paddle.layer.full_matrix_projection(input=encoder_last))
+
# gru step
def gru_decoder_without_attention(enc_vec, current_word):
'''
@@ -154,10 +154,12 @@ def gru_decoder_without_attention(enc_vec, current_word):
context = paddle.layer.last_seq(input=enc_vec)
- with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
- decoder_inputs +=paddle.layer.full_matrix_projection(input=context)
- decoder_inputs += paddle.layer.full_matrix_projection(
- input=current_word)
+ decoder_inputs = paddle.layer.mixed(
+ size=decoder_size * 3,
+ input=[
+ paddle.layer.full_matrix_projection(input=context),
+ paddle.layer.full_matrix_projection(input=current_word)
+ ])
gru_step = paddle.layer.gru_step(
name='gru_decoder',
@@ -167,24 +169,24 @@ def gru_decoder_without_attention(enc_vec, current_word):
output_mem=decoder_mem,
size=decoder_size)
- with paddle.layer.mixed(
- size=target_dict_dim,
- bias_attr=True,
- act=paddle.activation.Softmax()) as out:
- out += paddle.layer.full_matrix_projection(input=gru_step)
+ out = paddle.layer.mixed(
+ size=target_dict_dim,
+ bias_attr=True,
+ act=paddle.activation.Softmax(),
+ input=paddle.layer.full_matrix_projection(input=gru_step))
return out
```
在模型训练和测试阶段,解码器的行为有很大的不同:
- **训练阶段**:目标翻译结果的词向量`trg_embedding`作为参数传递给单步逻辑`gru_decoder_without_attention()`,函数`recurrent_group()`循环调用单步逻辑执行,最后计算目标翻译与实际解码的差异cost并返回;
-- **测试阶段**:解码器根据最后一个生成的词预测下一个词,`GeneratedInputV2()`自动取出模型预测出的概率最高的$k$个词的词向量传递给单步逻辑,`beam_search()`函数调用单步逻辑函数`gru_decoder_without_attention()`完成柱搜索并作为结果返回。
+- **测试阶段**:解码器根据最后一个生成的词预测下一个词,`GeneratedInput()`自动取出模型预测出的概率最高的$k$个词的词向量传递给单步逻辑,`beam_search()`函数调用单步逻辑函数`gru_decoder_without_attention()`完成柱搜索并作为结果返回。
训练和生成的逻辑分别实现在如下的`if-else`条件分支中:
```python
decoder_group_name = "decoder_group"
-group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_inputs = [group_input1]
if not generating:
trg_embedding = paddle.layer.embedding(
@@ -208,7 +210,7 @@ if not generating:
return cost
else:
- trg_embedding = paddle.layer.GeneratedInputV2(
+ trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
diff --git a/nmt_without_attention/nmt_without_attention.py b/nmt_without_attention/nmt_without_attention.py
index e5a4e1b602226da802c5903d83c0d963ae37bd44..5a61b525e67f7d07f66ae8cc5064c0244bc0b6f3 100644
--- a/nmt_without_attention/nmt_without_attention.py
+++ b/nmt_without_attention/nmt_without_attention.py
@@ -16,7 +16,7 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
'''
Define the network structure of NMT, including encoder and decoder.
- :param source_dict_dim: size of source dictionary
+ :param source_dict_dim: size of source dictionary
:type source_dict_dim : int
:param target_dict_dim: size of target dictionary
:type target_dict_dim: int
@@ -41,11 +41,11 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
return_seq=True)
#### Decoder
encoder_last = paddle.layer.last_seq(input=encoded_vector)
- with paddle.layer.mixed(
- size=decoder_size,
- act=paddle.activation.Tanh()) as encoder_last_projected:
- encoder_last_projected += paddle.layer.full_matrix_projection(
- input=encoder_last)
+ encoder_last_projected = paddle.layer.mixed(
+ size=decoder_size,
+ act=paddle.activation.Tanh(),
+ input=paddle.layer.full_matrix_projection(input=encoder_last))
+
# gru step
def gru_decoder_without_attention(enc_vec, current_word):
'''
@@ -63,10 +63,12 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
context = paddle.layer.last_seq(input=enc_vec)
- with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
- decoder_inputs += paddle.layer.full_matrix_projection(input=context)
- decoder_inputs += paddle.layer.full_matrix_projection(
- input=current_word)
+ decoder_inputs = paddle.layer.mixed(
+ size=decoder_size * 3,
+ input=[
+ paddle.layer.full_matrix_projection(input=context),
+ paddle.layer.full_matrix_projection(input=current_word)
+ ])
gru_step = paddle.layer.gru_step(
name='gru_decoder',
@@ -76,15 +78,15 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
output_mem=decoder_mem,
size=decoder_size)
- with paddle.layer.mixed(
- size=target_dict_dim,
- bias_attr=True,
- act=paddle.activation.Softmax()) as out:
- out += paddle.layer.full_matrix_projection(input=gru_step)
+ out = paddle.layer.mixed(
+ size=target_dict_dim,
+ bias_attr=True,
+ act=paddle.activation.Softmax(),
+ input=paddle.layer.full_matrix_projection(input=gru_step))
return out
decoder_group_name = "decoder_group"
- group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+ group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_inputs = [group_input1]
if not generating:
@@ -109,7 +111,7 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
return cost
else:
- trg_embedding = paddle.layer.GeneratedInputV2(
+ trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
@@ -194,7 +196,7 @@ def generate(source_dict_dim, target_dict_dim, init_models_path):
beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)
with gzip.open(init_models_path) as f:
parameters = paddle.parameters.Parameters.from_tar(f)
- # prob is the prediction probabilities, and id is the prediction word.
+ # prob is the prediction probabilities, and id is the prediction word.
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
@@ -244,10 +246,10 @@ def main():
target_language_dict_dim = 30000
if generating:
- # shoud pass the right generated model's path here
+ # modify this path to specify a trained model.
init_models_path = 'models/nmt_without_att_params_batch_1800.tar.gz'
if not os.path.exists(init_models_path):
- print "Cannot find models for generation"
+ print "trained model cannot be found."
exit(1)
generate(source_language_dict_dim, target_language_dict_dim,
init_models_path)