Commit 7fb1fddb authored by D dangqingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/models into ds2

......@@ -40,11 +40,11 @@
<!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
<div id="markdown" style='display:none'>
# CTR Prediction
# Click-Through Rate Prediction
## Background
CTR (Click-Through Rate) \[[1](https://en.wikipedia.org/wiki/Click-through_rate)\] denotes the probability that a user clicks on a specific link,
CTR (Click-Through Rate) prediction \[[1](https://en.wikipedia.org/wiki/Click-through_rate)\] denotes the probability that a user clicks on a specific link,
and is commonly used to measure the effectiveness of an online advertising system.
When multiple ad slots are available, CTR estimates usually serve as the basis for ranking.
......
......@@ -18,9 +18,14 @@ For some machines, we also need to install libsndfile1. Details to be added.
```
cd data
python librispeech.py
cat manifest.libri.train-* > manifest.libri.train-all
cd ..
```
After running librispeech.py, we have several "manifest" JSON files named with the prefix `manifest.libri.`. A manifest file summarizes a speech data set: each line contains, in JSON format, the metadata (i.e., audio filepath, transcription text, and audio duration) of one audio file in the data set.
With `cat manifest.libri.train-* > manifest.libri.train-all`, we merge the three separate training subsets of LibriSpeech (train-clean-100, train-clean-360, train-other-500) into one training set. This is a simple way to merge different data sets.
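The merged manifest can be inspected directly. Below is a minimal sketch; the field names (audio filepath, transcription text, duration) are described above, but the exact JSON keys are defined in `librispeech.py`, so the sketch only prints the raw entry:

```python
# Print the first entry of the merged training manifest (one JSON object per line).
import json

with open("data/manifest.libri.train-all") as f:
    entry = json.loads(f.readline())
print(entry)
```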
More help for arguments:
```
......@@ -32,13 +37,13 @@ python librispeech.py --help
For GPU Training:
```
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all
```
For CPU Training:
```
python train.py --trainer_count 8 --use_gpu False
python train.py --trainer_count 8 --use_gpu False --train_manifest_path ./data/manifest.libri.train-all
```
More help for arguments:
......
"""
Download, unpack and create manifest for the LibriSpeech dataset.
Download, unpack and create manifest json files for the LibriSpeech dataset.
Manifest is a json file with each line containing one audio clip filepath,
its transcription text string, and its duration. It serves as a unified
interface to organize different data sets.
A manifest is a json file summarizing filelist in a data set, with each line
containing the meta data (i.e. audio filepath, transcription text, audio
duration) of each audio file in the data set.
"""
import paddle.v2 as paddle
from paddle.v2.dataset.common import md5file
import distutils.util
import os
import wget
import tarfile
......@@ -27,7 +28,9 @@ URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"
MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
......@@ -44,6 +47,13 @@ parser.add_argument(
default="manifest.libri",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
"--full_download",
default="True",
type=distutils.util.strtobool,
help="Download all datasets for Librispeech."
" If False, only download a minimal requirement (test-clean, dev-clean"
" train-clean-100). (default: %(default)s)")
args = parser.parse_args()
......@@ -57,7 +67,10 @@ def download(url, md5sum, target_dir):
print("Downloading %s ..." % url)
wget.download(url, target_dir)
print("\nMD5 Chesksum %s ..." % filepath)
assert md5file(filepath) == md5sum, "MD5 checksum failed."
if not md5file(filepath) == md5sum:
raise RuntimeError("MD5 checksum failed.")
else:
print("File exists, skip downloading. (%s)" % filepath)
return filepath
......@@ -69,21 +82,17 @@ def unpack(filepath, target_dir):
tar = tarfile.open(filepath)
tar.extractall(target_dir)
tar.close()
return target_dir
def create_manifest(data_dir, manifest_path):
"""
Create a manifest file summarizing the dataset (list of filepath and meta
data).
Each line of the manifest contains one audio clip filepath, its
transcription text string, and its duration. The manifest file serves as a
unified interface to organize data sets.
Create a manifest json file summarizing the data set, with each line
containing the meta data (i.e. audio filepath, transcription text, audio
duration) of each audio file within the data set.
"""
print("Creating manifest %s ..." % manifest_path)
json_lines = []
for subfolder, _, filelist in os.walk(data_dir):
for subfolder, _, filelist in sorted(os.walk(data_dir)):
text_filelist = [
filename for filename in filelist if filename.endswith('trans.txt')
]
......@@ -111,9 +120,16 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""
Download, unpack and create the summary manifest file.
"""
filepath = download(url, md5sum, target_dir)
unpacked_dir = unpack(filepath, target_dir)
create_manifest(unpacked_dir, manifest_path)
if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
# download
filepath = download(url, md5sum, target_dir)
# unpack
unpack(filepath, target_dir)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
# create manifest json file
create_manifest(target_dir, manifest_path)
def main():
......@@ -132,6 +148,27 @@ def main():
md5sum=MD5_TRAIN_CLEAN_100,
target_dir=os.path.join(args.target_dir, "train-clean-100"),
manifest_path=args.manifest_prefix + ".train-clean-100")
if args.full_download:
prepare_dataset(
url=URL_TEST_OTHER,
md5sum=MD5_TEST_OTHER,
target_dir=os.path.join(args.target_dir, "test-other"),
manifest_path=args.manifest_prefix + ".test-other")
prepare_dataset(
url=URL_DEV_OTHER,
md5sum=MD5_DEV_OTHER,
target_dir=os.path.join(args.target_dir, "dev-other"),
manifest_path=args.manifest_prefix + ".dev-other")
prepare_dataset(
url=URL_TRAIN_CLEAN_360,
md5sum=MD5_TRAIN_CLEAN_360,
target_dir=os.path.join(args.target_dir, "train-clean-360"),
manifest_path=args.manifest_prefix + ".train-clean-360")
prepare_dataset(
url=URL_TRAIN_OTHER_500,
md5sum=MD5_TRAIN_OTHER_500,
target_dir=os.path.join(args.target_dir, "train-other-500"),
manifest_path=args.manifest_prefix + ".train-other-500")
if __name__ == '__main__':
......
......@@ -11,6 +11,7 @@ import sys
from model import deep_speech2
from audio_data_utils import DataGenerator
import numpy as np
import os
#TODO: add WER metric
......@@ -78,6 +79,13 @@ parser.add_argument(
default='data/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--init_model_path",
default=None,
type=str,
help="If set None, the training will start from scratch. "
"Otherwise, the training will resume from "
"the existing model of this path. (default: %(default)s)")
args = parser.parse_args()
......@@ -118,8 +126,14 @@ def train():
rnn_size=args.rnn_layer_size,
is_inference=False)
# create parameters and optimizer
parameters = paddle.parameters.create(cost)
# create/load parameters and optimizer
if args.init_model_path is None:
parameters = paddle.parameters.create(cost)
else:
if not os.path.isfile(args.init_model_path):
raise IOError("Invalid model!")
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.init_model_path))
optimizer = paddle.optimizer.Adam(
learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400)
trainer = paddle.trainer.SGD(
......
## Usage
`caffe2paddle.py` provides the `ModelConverter` interface for converting a model trained with Caffe into a model usable by PaddlePaddle. It wraps conversion functions for layers commonly used in computer vision, such as Convolution and BatchNorm, and can convert popular models such as VGG and ResNet. The basic conversion process is: load the model through Caffe's Python API, walk through each layer in turn, adapt each layer's parameters to PaddlePaddle according to the layer type and serialize them (layers without trainable parameters, such as Pooling, are skipped), and output a model file that can be loaded directly by PaddlePaddle's Python API.
The `ModelConverter` interface can be used as follows:
```python
# Set the following variables to the corresponding file paths and names
caffe_model_file = "./ResNet-50-deploy.prototxt" # path to the Caffe network definition file
caffe_pretrained_file = "./ResNet-50-model.caffemodel" # path to the Caffe model weights file
paddle_tar_name = "Paddle_ResNet50.tar.gz" # file name of the output Paddle model
# Initialize the converter, loading the model from the specified files
converter = ModelConverter(caffe_model_file=caffe_model_file,
caffe_pretrained_file=caffe_pretrained_file,
paddle_tar_name=paddle_tar_name)
# Perform the model conversion
converter.convert()
```
`caffe2paddle.py` already contains the steps above; after editing the file-path variables, run `python caffe2paddle.py` to complete the conversion. In addition, to help verify the conversion result, `ModelConverter` wraps a `caffe_predict` interface that runs prediction through the Caffe API; used as shown below, it prints a list of (class id, probability) pairs sorted by class probability:
```python
# img is the image path; mean_file is the path to the image mean file
converter.caffe_predict(img="./cat.jpg", mean_file="./imagenet/ilsvrc_2012_mean.npy")
```
Note that the layer parameters are named during conversion, and by default PaddlePaddle's default layer and parameter naming rules are used: the layer name is built from the value passed to `wrap_name_default` plus the invocation count of that layer type, and parameter names use this layer name as a prefix. For example, the bias parameter of the first InnerProduct layer (the corresponding conversion function is described below) is named `___fc_layer_0__.wbias`.
```python
# Convert the parameters of an InnerProduct layer; the name value is used to build the parameter names of the corresponding layer
# wrap_name_default sets the default name to fc_layer
@wrap_name_default("fc_layer")
def convert_InnerProduct_layer(self, params, name=None)
```
Therefore, when verifying and using the converted model, the PaddlePaddle network configuration should not specify layer names and must follow the same topological order as the Caffe model. This is especially important for branched architectures such as ResNet: the branches must appear in the same order in PaddlePaddle as in Caffe, otherwise the model parameters cannot be loaded correctly.
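For that verification step, a minimal sketch of loading the converted parameters on the PaddlePaddle side is shown below. It assumes a PaddlePaddle network output layer, here called `resnet_out`, has already been defined with the same topological order as the Caffe model; `resnet_out` is a placeholder, not something provided by this script:

```python
# Load the converted parameters for verification on the PaddlePaddle side.
import gzip
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)
with gzip.open("Paddle_ResNet50.tar.gz") as f:
    parameters = paddle.parameters.Parameters.from_tar(f)
# probs = paddle.infer(output_layer=resnet_out, parameters=parameters, input=[(im,)])
```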
If you do not want the default naming and have specified layer names in the PaddlePaddle network configuration, you can build a `dict` that maps layer names in the Caffe configuration to those in the PaddlePaddle configuration and pass it as the `name_map` argument when calling `ModelConverter.convert`. The saved parameters will then be named after the corresponding layer names and are no longer affected by the topological order. The `name_map` only needs to cover the Convolution, InnerProduct, and BatchNorm layers of the Caffe configuration (on one hand, layers without trainable parameters such as Pooling do not need to be saved, so no conversion interface is provided for them; on the other hand, because of implementation differences between Caffe and PaddlePaddle, the batch_norm layer in PaddlePaddle is a composite of Caffe's BatchNorm and Scale layers, so Scale layers receive special handling).
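A hypothetical `name_map` might look like the following; the Caffe-side keys (`conv1`, `bn_conv1`, `fc1000`) and the PaddlePaddle-side values are placeholders for illustration only:

```python
# Map Caffe layer names to the layer names used in the PaddlePaddle config.
name_map = {
    "conv1": "my_conv1",    # Convolution layer
    "bn_conv1": "my_bn1",   # BatchNorm layer (its following Scale layer reuses this name)
    "fc1000": "my_fc",      # InnerProduct layer
}
converter.convert(name_map=name_map)
```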
import os
import struct
import gzip
import tarfile
import cStringIO
import numpy as np
import cv2
import caffe
from paddle.proto.ParameterConfig_pb2 import ParameterConfig
from paddle.trainer_config_helpers.default_decorators import wrap_name_default
class ModelConverter(object):
def __init__(self, caffe_model_file, caffe_pretrained_file,
paddle_tar_name):
self.net = caffe.Net(caffe_model_file, caffe_pretrained_file,
caffe.TEST)
self.tar_name = paddle_tar_name
self.params = dict()
self.pre_layer_name = ""
self.pre_layer_type = ""
def convert(self, name_map=None):
layer_dict = self.net.layer_dict
for layer_name in layer_dict.keys():
layer = layer_dict[layer_name]
layer_params = layer.blobs
layer_type = layer.type
if len(layer_params) > 0:
self.pre_layer_name = getattr(
self, "convert_" + layer_type + "_layer")(
layer_params,
name=None
if name_map == None else name_map.get(layer_name))
self.pre_layer_type = layer_type
with gzip.open(self.tar_name, 'w') as f:
self.to_tar(f)
return
def to_tar(self, f):
tar = tarfile.TarFile(fileobj=f, mode='w')
for param_name in self.params.keys():
param_conf, param_data = self.params[param_name]
confStr = param_conf.SerializeToString()
tarinfo = tarfile.TarInfo(name="%s.protobuf" % param_name)
tarinfo.size = len(confStr)
buf = cStringIO.StringIO(confStr)
buf.seek(0)
tar.addfile(tarinfo, fileobj=buf)
buf = cStringIO.StringIO()
self.serialize(param_data, buf)
tarinfo = tarfile.TarInfo(name=param_name)
buf.seek(0)
tarinfo.size = len(buf.getvalue())
tar.addfile(tarinfo, buf)
@staticmethod
def serialize(data, f):
f.write(struct.pack("IIQ", 0, 4, data.size))
f.write(data.tobytes())
@wrap_name_default("conv")
def convert_Convolution_layer(self, params, name=None):
for i in range(len(params)):
data = np.array(params[i].data)
if len(params) == 2:
suffix = "0" if i == 0 else "bias"
file_name = "_%s.w%s" % (name, suffix)
else:
file_name = "_%s.w%s" % (name, str(i))
param_conf = ParameterConfig()
param_conf.name = file_name
param_conf.size = reduce(lambda a, b: a * b, data.shape)
self.params[file_name] = (param_conf, data.flatten())
return name
@wrap_name_default("fc_layer")
def convert_InnerProduct_layer(self, params, name=None):
for i in range(len(params)):
data = np.array(params[i].data)
if len(params) == 2:
suffix = "0" if i == 0 else "bias"
file_name = "_%s.w%s" % (name, suffix)
else:
file_name = "_%s.w%s" % (name, str(i))
data = np.transpose(data)
param_conf = ParameterConfig()
param_conf.name = file_name
dims = list(data.shape)
if len(dims) < 2:
dims.insert(0, 1)
param_conf.size = reduce(lambda a, b: a * b, dims)
param_conf.dims.extend(dims)
self.params[file_name] = (param_conf, data.flatten())
return name
@wrap_name_default("batch_norm")
def convert_BatchNorm_layer(self, params, name=None):
scale = 1 / np.array(params[-1].data)[0] if np.array(
params[-1].data)[0] != 0 else 0
for i in range(2):
data = np.array(params[i].data) * scale
file_name = "_%s.w%s" % (name, str(i + 1))
param_conf = ParameterConfig()
param_conf.name = file_name
dims = list(data.shape)
assert len(dims) == 1
dims.insert(0, 1)
param_conf.size = reduce(lambda a, b: a * b, dims)
param_conf.dims.extend(dims)
self.params[file_name] = (param_conf, data.flatten())
return name
def convert_Scale_layer(self, params, name=None):
assert self.pre_layer_type == "BatchNorm"
name = self.pre_layer_name
for i in range(len(params)):
data = np.array(params[i].data)
suffix = "0" if i == 0 else "bias"
file_name = "_%s.w%s" % (name, suffix)
param_conf = ParameterConfig()
param_conf.name = file_name
dims = list(data.shape)
assert len(dims) == 1
dims.insert(0, 1)
param_conf.size = reduce(lambda a, b: a * b, dims)
if i == 1:
param_conf.dims.extend(dims)
self.params[file_name] = (param_conf, data.flatten())
return name
def caffe_predict(self,
img,
mean_file='./caffe/imagenet/ilsvrc_2012_mean.npy'):
net = self.net
net.blobs['data'].data[...] = load_image(img, mean_file=mean_file)
out = net.forward()
output_prob = net.blobs['prob'].data[0].flatten()
print zip(np.argsort(output_prob)[::-1], np.sort(output_prob)[::-1])
def load_image(file, resize_size=256, crop_size=224, mean_file=None):
# load image
im = cv2.imread(file)
# resize
h, w = im.shape[:2]
h_new, w_new = resize_size, resize_size
if h > w:
h_new = resize_size * h / w
else:
w_new = resize_size * w / h
im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
# crop
h, w = im.shape[:2]
h_start = (h - crop_size) / 2
w_start = (w - crop_size) / 2
h_end, w_end = h_start + crop_size, w_start + crop_size
im = im[h_start:h_end, w_start:w_end, :]
# transpose to CHW order
im = im.transpose((2, 0, 1))
if mean_file:
mu = np.load(mean_file)
mu = mu.mean(1).mean(1)
im = im - mu[:, None, None]
im = im / 255.0
return im
if __name__ == "__main__":
caffe_model_file = "./ResNet-50-deploy.prototxt"
caffe_pretrained_file = "./ResNet-50-model.caffemodel"
paddle_tar_name = "Paddle_ResNet50.tar.gz"
converter = ModelConverter(
caffe_model_file=caffe_model_file,
caffe_pretrained_file=caffe_pretrained_file,
paddle_tar_name=paddle_tar_name)
converter.convert()
converter.caffe_predict("./cat.jpg",
"./caffe/imagenet/ilsvrc_2012_mean.npy")
TBD
# Accelerating Word Embedding Training with Noise-Contrastive Estimation
## Background
In natural language processing, a word is usually represented by a feature vector, but how to obtain accurate word vectors that capture semantics remains difficult; details can be found in the [word embedding chapter](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md). There, the authors train word embeddings with a Neural Probabilistic Language Model (NPLM). Although NPLM achieves excellent accuracy, its training is still far too slow compared with traditional N-gram statistical models \[[3](#references)\]. Two algorithms are commonly used to alleviate this problem: hierarchical-sigmoid \[[2](#references)\] and Noise-contrastive estimation (NCE) \[[1](#references)\]. To overcome the problem, this tutorial adopts NCE, and uses NPLM training as an example to show how to use it.
## NCE Overview
NCE is a fast method for estimating discrete distributions. Applied to the problem at hand: training an NPLM is computationally expensive because evaluating the softmax requires the exponential term of every class, i.e., every word in the dictionary, and on common corpora the dictionary is usually very large \[[3](#references)\], making the whole training process very time-consuming. Compared with the commonly used hierarchical-sigmoid method \[[2](#references)\], NCE no longer uses a complex binary tree to construct the objective function; instead it uses relatively simple random negative sampling, which greatly improves computational efficiency.
Suppose a specific context $h$ is given and its data distribution is $P^h(w)$; samples drawn from it serve as positive examples, while samples drawn from a noise distribution $P_n(w)$ serve as negative examples. Any suitable noise distribution can be chosen; by default an unbiased uniform distribution is used. Assuming further that there are $k$ noise examples for each data example, the probability that a sample comes from the training data is \[[1](#references)\]:
$$P^h(D=1|w,\theta)=\frac { P_\theta^h(w) }{ P^h_\theta(w)+kP_n(w) } =\sigma (\Delta s_\theta(w,h))$$
where $\Delta s_\theta(w,h)=s_\theta(w,h)-\log (kP_n(w))$, and $s_\theta(w,h)$ is the model's score for generating word $w$ in context $h$ (computed from the feature vector). The overall objective is to increase the probability of positive examples while decreasing that of negative examples. The objective function is \[[1](#references)\]:
$$J^h(\theta )=E_{ P_d^h }\left[ \log { P^h(D=1|w,\theta ) } \right] +kE_{ P_n }\left[ \log P^h (D=0|w,\theta ) \right]$$
$$\qquad\quad =E_{ P_d^h }\left[ \log { \sigma (\Delta s_\theta(w,h)) } \right] +kE_{ P_n }\left[ \log (1-\sigma (\Delta s_\theta(w,h))) \right]$$
Overall, NCE constructs a logistic regression that performs binary classification between positive and negative examples. For each sample, the word it actually predicts is taken as the positive label, and $k$ other word labels are sampled as negatives, so probabilities only need to be computed over these $k+1$ labels. Compared with the original softmax classification, which computes a score for every class and then normalizes to obtain probabilities, this saves a great deal of time.
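To make the objective above concrete, the toy sketch below evaluates the per-context NCE loss for one positive word and $k$ sampled negatives, assuming a uniform noise distribution $P_n(w)=1/V$; the scores are made-up numbers, not real model outputs:

```python
# Toy illustration of the NCE objective above; not the PaddlePaddle kernel.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

V, k = 10000, 25                   # vocabulary size, number of negative samples
s_pos = 2.3                        # model score s_theta(w, h) of the true next word (made up)
s_neg = np.random.randn(k)         # scores of k sampled noise words (made up)
log_kpn = np.log(k * (1.0 / V))    # log(k * P_n(w)) for uniform noise

# loss = -[log sigma(delta_s_pos) + sum log(1 - sigma(delta_s_neg))]
loss = -(np.log(sigmoid(s_pos - log_kpn)) +
         np.sum(np.log(1.0 - sigmoid(s_neg - log_kpn))))
print(loss)
```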
## Data
This tutorial trains the language model on the Penn Treebank (PTB) dataset ([Tomas Mikolov's preprocessed version](http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz)). PaddlePaddle provides the [paddle.dataset.imikolov](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/imikolov.py) interface for convenient access to the data; if the data is not found locally, it is downloaded automatically and its file integrity is verified. The interface also preprocesses the data with a sliding window of size 5 for later use. The corpus is in English, with 42,068 training sentences and 3,761 test sentences.
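As a quick check of the preprocessing, the small sketch below reads one sample from the training reader; each sample is a tuple of 5 word ids (4 context words plus the next word):

```python
# Minimal sketch: fetch one preprocessed PTB sample (a 5-word window of ids).
import paddle.v2 as paddle

word_dict = paddle.dataset.imikolov.build_dict()
sample = next(paddle.dataset.imikolov.train(word_dict, 5)())
print(sample)  # a tuple of 5 integer word ids
```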
## Network Architecture
The detailed network structure of the N-gram neural probabilistic language model is shown in Figure 1:
<p align="center">
<img src="images/network_conf.png" width = "70%" align="center"/><br/>
Figure 1. Network configuration
</p>
The model consists of the following parts:
1. **Input layer**: a PTB sample consists of raw English words; each word is converted to its id in the dictionary, and this unique id distinguishes one word from another.
2. **Word embedding layer**: compared with the raw id representation, word embeddings better capture the semantic relations between words. A trainable embedding matrix maps each id to a fixed-dimensional word vector. After training, the semantic similarity between words can be measured by the distance between their vectors: the closer in meaning, the smaller the distance.
3. **Concatenation layer**: the word vectors are joined end to end into one long vector, which makes the subsequent fully connected layer easier to apply.
4. **Fully connected hidden layer**: the long vector from the previous layer is fed into a network with one hidden layer, which outputs a feature vector. The fully connected hidden layer strengthens the network's learning capacity.
5. **NCE layer**: during training, the NCE Layer provided by PaddlePaddle can be used directly.
## Training
To train, simply run ``` python train.py ```. On the first run, the program checks whether the PTB dataset is present in the user's cache directory and downloads it automatically if not. During training, model information (mainly the training loss) is printed every 1000 iterations; at the end of each pass, the loss on the test set is computed and the latest model snapshot is saved. PaddlePaddle already provides an implemented NCE Layer, but some of its parameters need to be set for the actual scenario; the following can serve as a tuning reference:
| Parameter | Role | Notes |
|:------ |:-------| :--------|
| param\_attr / bias\_attr | Sets the parameter names. | Named parameters make it convenient to share the network parameters at the prediction stage, as described in the next section. |
| num\_neg\_samples | Controls how many negative examples are sampled. | Controls the positive/negative sample ratio; the valid range is [1, dictionary size - 1]. More negative samples slow down training but yield higher model accuracy. |
| neg\_distribution | The distribution used to generate negative labels; defaults to a uniform distribution. | Lets you set per-class sampling weights for negative samples. For example, if the positive label is "sunny" and you want the negative label "flood" to be distinguished more strongly during training, increase the sampling weight of the "flood" class. |
| act | The activation function to use. | According to the NCE formulation, the sigmoid function should be used here. |
The corresponding code is as follows:
```python
cost = paddle.layer.nce(
input=hidden_layer,
label=next_word,
num_classes=dict_size,
param_attr=paddle.attr.Param(name='nce_w'),
bias_attr=paddle.attr.Param(name='nce_b'),
act=paddle.activation.Sigmoid(),
num_neg_samples=25,
neg_distribution=None)
```
## Prediction
To run prediction, simply execute ` python infer.py `. The program first loads the latest model, then predicts batch by batch and prints the results. Because the computation logic differs between training and prediction, the prediction stage needs to share the logistic-regression parameters learned in the NCE Layer during training, so an inference layer has to be written whose parameters are the pretrained ones.
The inference layer works as follows: the parameters are referenced via `paddle.attr.Param`, and `paddle.layer.trans_full_matrix_projection` right-multiplies the hidden-layer output vector `hidden_layer` by the transposed parameter matrix; PaddlePaddle automatically looks up and loads the parameter with the same name in the model. The resulting class score vector is then passed through a softmax for normalization (so the values sum to 1), giving the final class probability distribution.
The code is as follows:
```python
with paddle.layer.mixed(
size=dict_size,
act=paddle.activation.Softmax(),
bias_attr=paddle.attr.Param(name='nce_b')) as prediction:
prediction += paddle.layer.trans_full_matrix_projection(
input=hidden_layer, param_attr=paddle.attr.Param(name='nce_w'))
```
The prediction output has the following form:
```
--------------------------
No.68 Input: ' <unk> for possible
Ground Truth Output: <unk>
Predict Output: <unk>
--------------------------
No.69 Input: <unk> for possible <unk>
Ground Truth Output: on
Predict Output: <e>
--------------------------
No.70 Input: for possible <unk> on
Ground Truth Output: the
Predict Output: the
```
Each dashed line marks one prediction: the second line shows the index of the test sample and its 4 input words, the third line gives the ground-truth label, and the fourth line gives the predicted label.
## References
1. Mnih A, Kavukcuoglu K. [Learning word embeddings efficiently with noise-contrastive estimation](https://papers.nips.cc/paper/5165-learning-word-embeddings-efficiently-with-noise-contrastive-estimation.pdf)[C]//Advances in neural information processing systems. 2013: 2265-2273.
2. Morin, F., & Bengio, Y. (2005, January). [Hierarchical Probabilistic Neural Network Language Model](http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf). In Aistats (Vol. 5, pp. 246-252).
3. Mnih A, Teh Y W. [A Fast and Simple Algorithm for Training Neural Probabilistic Language Models](http://xueshu.baidu.com/s?wd=paperuri%3A%280735b97df93976efb333ac8c266a1eb2%29&filter=sc_long_sign&tn=SE_xueshusource_2kduw22v&sc_vurl=http%3A%2F%2Farxiv.org%2Fabs%2F1206.6426&ie=utf-8&sc_us=5770715420073315630)[J]. Computer Science, 2012:1751-1758.
# -*- encoding:utf-8 -*-
import numpy as np
import glob
import gzip
import paddle.v2 as paddle
from nce_conf import network_conf
def main():
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
prediction_layer = network_conf(
is_train=False,
hidden_size=128,
embedding_size=512,
dict_size=dict_size)
models_list = glob.glob('./models/*')
models_list = sorted(models_list)
with gzip.open(models_list[-1], 'r') as f:
parameters = paddle.parameters.Parameters.from_tar(f)
idx_word_dict = dict((v, k) for k, v in word_dict.items())
batch_size = 64
batch_ins = []
ins_iter = paddle.dataset.imikolov.test(word_dict, 5)
infer_data = []
infer_data_label = []
for item in paddle.dataset.imikolov.test(word_dict, 5)():
infer_data.append((item[:4]))
infer_data_label.append(item[4])
# Choose 100 samples from the test set to show how to infer.
if len(infer_data_label) == 100:
break
feeding = {
'firstw': 0,
'secondw': 1,
'thirdw': 2,
'fourthw': 3,
'fifthw': 4
}
predictions = paddle.infer(
output_layer=prediction_layer,
parameters=parameters,
input=infer_data,
feeding=feeding,
field=['value'])
for i, (prob, data,
label) in enumerate(zip(predictions, infer_data, infer_data_label)):
print '--------------------------'
print "No.%d Input: " % (i+1) + \
idx_word_dict[data[0]] + ' ' + \
idx_word_dict[data[1]] + ' ' + \
idx_word_dict[data[2]] + ' ' + \
idx_word_dict[data[3]]
print 'Ground Truth Output: ' + idx_word_dict[label]
print 'Predict Output: ' + idx_word_dict[prob.argsort(
kind='heapsort', axis=0)[-1]]
print
if __name__ == '__main__':
main()
# -*- encoding:utf-8 -*-
import math
import paddle.v2 as paddle
def network_conf(hidden_size, embedding_size, dict_size, is_train):
first_word = paddle.layer.data(
name="firstw", type=paddle.data_type.integer_value(dict_size))
second_word = paddle.layer.data(
name="secondw", type=paddle.data_type.integer_value(dict_size))
third_word = paddle.layer.data(
name="thirdw", type=paddle.data_type.integer_value(dict_size))
fourth_word = paddle.layer.data(
name="fourthw", type=paddle.data_type.integer_value(dict_size))
next_word = paddle.layer.data(
name="fifthw", type=paddle.data_type.integer_value(dict_size))
embed_param_attr = paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
first_embedding = paddle.layer.embedding(
input=first_word, size=embedding_size, param_attr=embed_param_attr)
second_embedding = paddle.layer.embedding(
input=second_word, size=embedding_size, param_attr=embed_param_attr)
third_embedding = paddle.layer.embedding(
input=third_word, size=embedding_size, param_attr=embed_param_attr)
fourth_embedding = paddle.layer.embedding(
input=fourth_word, size=embedding_size, param_attr=embed_param_attr)
context_embedding = paddle.layer.concat(input=[
first_embedding, second_embedding, third_embedding, fourth_embedding
])
hidden_layer = paddle.layer.fc(
input=context_embedding,
size=hidden_size,
act=paddle.activation.Tanh(),
bias_attr=paddle.attr.Param(learning_rate=1),
param_attr=paddle.attr.Param(
initial_std=1. / math.sqrt(embedding_size * 8), learning_rate=1))
if is_train:
cost = paddle.layer.nce(
input=hidden_layer,
label=next_word,
num_classes=dict_size,
param_attr=paddle.attr.Param(name='nce_w'),
bias_attr=paddle.attr.Param(name='nce_b'),
act=paddle.activation.Sigmoid(),
num_neg_samples=25,
neg_distribution=None)
return cost
else:
with paddle.layer.mixed(
size=dict_size,
act=paddle.activation.Softmax(),
bias_attr=paddle.attr.Param(name='nce_b')) as prediction:
prediction += paddle.layer.trans_full_matrix_projection(
input=hidden_layer, param_attr=paddle.attr.Param(name='nce_w'))
return prediction
# -*- encoding:utf-8 -*-
import paddle.v2 as paddle
import gzip
from nce_conf import network_conf
def main():
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
cost = network_conf(
is_train=True, hidden_size=128, embedding_size=512, dict_size=dict_size)
parameters = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
trainer = paddle.trainer.SGD(cost, parameters, optimizer)
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1000 == 0:
print "Pass %d, Batch %d, Cost %f" % (
event.pass_id, event.batch_id, event.cost)
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
paddle.batch(paddle.dataset.imikolov.test(word_dict, 5), 64))
print "Test here.. Pass %d, Cost %f" % (event.pass_id, result.cost)
model_name = "./models/model_pass_%05d.tar.gz" % event.pass_id
print "Save model into %s ..." % model_name
with gzip.open(model_name, 'w') as f:
parameters.to_tar(f)
feeding = {
'firstw': 0,
'secondw': 1,
'thirdw': 2,
'fourthw': 3,
'fifthw': 4
}
trainer.train(
paddle.batch(paddle.dataset.imikolov.train(word_dict, 5), 64),
num_passes=1000,
event_handler=event_handler,
feeding=feeding)
if __name__ == '__main__':
main()
......@@ -91,11 +91,11 @@ In PaddleBook, the [Machine Translation](https://github.com/PaddlePaddle/book/blob/develop/08
```python
#### Decoder
encoder_last = paddle.layer.last_seq(input=encoded_vector)
with paddle.layer.mixed(
encoder_last_projected = paddle.layer.mixed(
size=decoder_size,
act=paddle.activation.Tanh()) as encoder_last_projected:
encoder_last_projected += paddle.layer.full_matrix_projection(
input=encoder_last)
act=paddle.activation.Tanh(),
input=paddle.layer.full_matrix_projection(input=encoder_last))
# gru step
def gru_decoder_without_attention(enc_vec, current_word):
'''
......@@ -112,10 +112,12 @@ def gru_decoder_without_attention(enc_vec, current_word):
context = paddle.layer.last_seq(input=enc_vec)
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs +=paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
decoder_inputs = paddle.layer.mixed(
size=decoder_size * 3,
input=[
paddle.layer.full_matrix_projection(input=context),
paddle.layer.full_matrix_projection(input=current_word)
])
gru_step = paddle.layer.gru_step(
name='gru_decoder',
......@@ -125,24 +127,24 @@ def gru_decoder_without_attention(enc_vec, current_word):
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
out = paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax(),
input=paddle.layer.full_matrix_projection(input=gru_step))
return out
```
The decoder behaves quite differently during model training and testing:
- **Training stage**: the word embeddings of the target translation, `trg_embedding`, are passed as an argument to the step function `gru_decoder_without_attention()`; `recurrent_group()` calls the step function in a loop, and finally the cost between the target translation and the actual decoding result is computed and returned;
- **Testing stage**: the decoder predicts the next word from the last generated word; `GeneratedInputV2()` automatically fetches the embeddings of the $k$ words with the highest predicted probabilities and passes them to the step function, and `beam_search()` calls the step function `gru_decoder_without_attention()` to perform beam search and return the result.
- **Testing stage**: the decoder predicts the next word from the last generated word; `GeneratedInput()` automatically fetches the embeddings of the $k$ words with the highest predicted probabilities and passes them to the step function, and `beam_search()` calls the step function `gru_decoder_without_attention()` to perform beam search and return the result.
The training and generation logic are implemented in the following `if-else` branches:
```python
decoder_group_name = "decoder_group"
group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_inputs = [group_input1]
if not generating:
trg_embedding = paddle.layer.embedding(
......@@ -166,7 +168,7 @@ if not generating:
return cost
else:
trg_embedding = paddle.layer.GeneratedInputV2(
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
......
......@@ -133,11 +133,11 @@ In PaddleBook, the [Machine Translation](https://github.com/PaddlePaddle/book/blob/develop/08
```python
#### Decoder
encoder_last = paddle.layer.last_seq(input=encoded_vector)
with paddle.layer.mixed(
encoder_last_projected = paddle.layer.mixed(
size=decoder_size,
act=paddle.activation.Tanh()) as encoder_last_projected:
encoder_last_projected += paddle.layer.full_matrix_projection(
input=encoder_last)
act=paddle.activation.Tanh(),
input=paddle.layer.full_matrix_projection(input=encoder_last))
# gru step
def gru_decoder_without_attention(enc_vec, current_word):
'''
......@@ -154,10 +154,12 @@ def gru_decoder_without_attention(enc_vec, current_word):
context = paddle.layer.last_seq(input=enc_vec)
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs +=paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
decoder_inputs = paddle.layer.mixed(
size=decoder_size * 3,
input=[
paddle.layer.full_matrix_projection(input=context),
paddle.layer.full_matrix_projection(input=current_word)
])
gru_step = paddle.layer.gru_step(
name='gru_decoder',
......@@ -167,24 +169,24 @@ def gru_decoder_without_attention(enc_vec, current_word):
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
out = paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax(),
input=paddle.layer.full_matrix_projection(input=gru_step))
return out
```
The decoder behaves quite differently during model training and testing:
- **Training stage**: the word embeddings of the target translation, `trg_embedding`, are passed as an argument to the step function `gru_decoder_without_attention()`; `recurrent_group()` calls the step function in a loop, and finally the cost between the target translation and the actual decoding result is computed and returned;
- **Testing stage**: the decoder predicts the next word from the last generated word; `GeneratedInputV2()` automatically fetches the embeddings of the $k$ words with the highest predicted probabilities and passes them to the step function, and `beam_search()` calls the step function `gru_decoder_without_attention()` to perform beam search and return the result.
- **Testing stage**: the decoder predicts the next word from the last generated word; `GeneratedInput()` automatically fetches the embeddings of the $k$ words with the highest predicted probabilities and passes them to the step function, and `beam_search()` calls the step function `gru_decoder_without_attention()` to perform beam search and return the result.
The training and generation logic are implemented in the following `if-else` branches:
```python
decoder_group_name = "decoder_group"
group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_inputs = [group_input1]
if not generating:
trg_embedding = paddle.layer.embedding(
......@@ -208,7 +210,7 @@ if not generating:
return cost
else:
trg_embedding = paddle.layer.GeneratedInputV2(
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
......
......@@ -16,7 +16,7 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
'''
Define the network structure of NMT, including encoder and decoder.
:param source_dict_dim: size of source dictionary
:param source_dict_dim: size of source dictionary
:type source_dict_dim : int
:param target_dict_dim: size of target dictionary
:type target_dict_dim: int
......@@ -41,11 +41,11 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
return_seq=True)
#### Decoder
encoder_last = paddle.layer.last_seq(input=encoded_vector)
with paddle.layer.mixed(
size=decoder_size,
act=paddle.activation.Tanh()) as encoder_last_projected:
encoder_last_projected += paddle.layer.full_matrix_projection(
input=encoder_last)
encoder_last_projected = paddle.layer.mixed(
size=decoder_size,
act=paddle.activation.Tanh(),
input=paddle.layer.full_matrix_projection(input=encoder_last))
# gru step
def gru_decoder_without_attention(enc_vec, current_word):
'''
......@@ -63,10 +63,12 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
context = paddle.layer.last_seq(input=enc_vec)
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
decoder_inputs = paddle.layer.mixed(
size=decoder_size * 3,
input=[
paddle.layer.full_matrix_projection(input=context),
paddle.layer.full_matrix_projection(input=current_word)
])
gru_step = paddle.layer.gru_step(
name='gru_decoder',
......@@ -76,15 +78,15 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
out = paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax(),
input=paddle.layer.full_matrix_projection(input=gru_step))
return out
decoder_group_name = "decoder_group"
group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_inputs = [group_input1]
if not generating:
......@@ -109,7 +111,7 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
return cost
else:
trg_embedding = paddle.layer.GeneratedInputV2(
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
......@@ -194,7 +196,7 @@ def generate(source_dict_dim, target_dict_dim, init_models_path):
beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)
with gzip.open(init_models_path) as f:
parameters = paddle.parameters.Parameters.from_tar(f)
# prob is the prediction probabilities, and id is the predicted word.
# prob is the prediction probabilities, and id is the predicted word.
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
......@@ -244,10 +246,10 @@ def main():
target_language_dict_dim = 30000
if generating:
# should pass the right generated model's path here
# modify this path to specify a trained model.
init_models_path = 'models/nmt_without_att_params_batch_1800.tar.gz'
if not os.path.exists(init_models_path):
print "Cannot find models for generation"
print "trained model cannot be found."
exit(1)
generate(source_language_dict_dim, target_language_dict_dim,
init_models_path)
......