diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4efc176ff712c7d212dc0bec9a7d1aefc9d86d7f..5ed1f4c4beb6eae16d319b25ec9959b61fe3fbc3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,3 +25,11 @@ files: \.md$ - id: remove-tabs files: \.md$ +- repo: local + hooks: + - id: convert-markdown-into-html + name: convert-markdown-into-html + description: Convert README.md into index.html + entry: python .pre-commit-hooks/convert_markdown_into_html.py + language: system + files: .+README\.md$ diff --git a/.pre-commit-hooks/convert_markdown_into_html.py b/.pre-commit-hooks/convert_markdown_into_html.py new file mode 100644 index 0000000000000000000000000000000000000000..66f44ef23c5d9a82436dfbe4b6bcdfc4e69ab55a --- /dev/null +++ b/.pre-commit-hooks/convert_markdown_into_html.py @@ -0,0 +1,95 @@ +import argparse +import re +import sys + +HEAD = """ + + + + + + + + + + + + + + + + +
+
+ + + + + + + +""" + + +def convert_markdown_into_html(argv=None): + parser = argparse.ArgumentParser() + parser.add_argument('filenames', nargs='*', help='Filenames to fix') + args = parser.parse_args(argv) + + retv = 0 + + for filename in args.filenames: + with open( + re.sub(r"README", "index", re.sub(r"\.md$", ".html", filename)), + "w") as output: + output.write(HEAD) + with open(filename) as input: + for line in input: + output.write(line) + output.write(TAIL) + + return retv + + +if __name__ == '__main__': + sys.exit(convert_markdown_into_html()) diff --git a/README.md b/README.md index 4c97f74abe3b86abbe464fcb2b1332ae06f0cf7c..8fd9edfec30dfde19a8c79a979dc29abfbe2ce47 100644 --- a/README.md +++ b/README.md @@ -6,110 +6,54 @@ PaddlePaddle提供了丰富的运算单元,帮助大家以模块化的方式构建起千变万化的深度学习模型来解决不同的应用问题。这里,我们针对常见的机器学习任务,提供了不同的神经网络模型供大家学习和使用。 -## [词向量](https://github.com/PaddlePaddle/models/tree/develop/word_embedding) -- **介绍** +## 1. 词向量 - [词向量](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md) 是深度学习应用于自然语言处理领域最成功的概念和成果之一,是一种分散式表示(distributed representation)法。分散式表示法用一个更低维度的实向量表示词语,向量的每个维度在实数域取值,都表示文本的某种潜在语法或语义特征。广义地讲,词向量也可以应用于普通离散特征。词向量的学习通常都是一个无监督的学习过程,因此,可以充分利用海量的无标记数据以捕获特征之间的关系,也可以有效地解决特征稀疏、标签数据缺失、数据噪声等问题。 +词向量用一个实向量表示词语,向量的每个维都表示文本的某种潜在语法或语义特征,是深度学习应用于自然语言处理领域最成功的概念和成果之一。广义的,词向量也可以应用于普通离散特征。词向量的学习通常都是一个无监督的学习过程,因此,可以充分利用海量的无标记数据以捕获特征之间的关系,也可以有效地解决特征稀疏、标签数据缺失、数据噪声等问题。然而,在常见词向量学习方法中,模型最后一层往往会遇到一个超大规模的分类问题,是计算性能的瓶颈。 - 然而,在常见词向量学习方法中,模型最后一层往往会遇到一个超大规模的分类问题,是计算性能的瓶颈。在词向量的例子中,我们向大家展示如何使用Hierarchical-Sigmoid 和噪声对比估计(Noise Contrastive Estimation,NCE)来加速词向量的学习。 +在词向量的例子中,我们向大家展示如何使用Hierarchical-Sigmoid 和噪声对比估计(Noise Contrastive Estimation,NCE)来加速词向量的学习。 -- **应用领域** +- 1.1 [Hsigmoid加速词向量训练](https://github.com/PaddlePaddle/models/tree/develop/word_embedding) - 词向量是深度学习方法引入自然语言处理领域的核心技术之一,在大规模无标记语料上训练的词向量常作为各种自然语言处理任务的预训练参数,是一种较为通用的资源,对任务性能的进一步提升有一定的帮助。同时,词嵌入的思想也是深度学习模型处理离散特征的重要方法,有着广泛地借鉴和参考意义。 +## 2. 点击率预估 - 词向量是搜索引擎、广告系统、推荐系统等互联网服务背后的常见基础技术之一。 +点击率预估模型预判用户对一条广告点击的概率,对每次广告的点击情况做出预测,是广告技术的核心算法之一。逻谛斯克回归对大规模稀疏特征有着很好的学习能力,在点击率预估任务发展的早期一统天下。近年来,DNN 模型由于其强大的学习能力逐渐接过点击率预估任务的大旗。 -- **模型配置** +在点击率预估的例子中,我们给出谷歌提出的 Wide & Deep 模型。这一模型融合了适用于学习抽象特征的 DNN 和适用于大规模稀疏特征的逻谛斯克回归两者模型的优点,可以作为一种相对成熟的模型框架使用, 在工业界也有一定的应用。 - 1. [Hsigmoid加速词向量训练](https://github.com/PaddlePaddle/models/blob/develop/word_embedding/hsigmoid_train.py) - 2. [噪声对比估计加速词向量训练]() +- 2.1 [Wide & deep 点击率预估模型](https://github.com/PaddlePaddle/models/tree/develop/ctr) -## [文本分类](https://github.com/PaddlePaddle/models/tree/develop/text_classification) +## 3. 文本分类 -- **介绍** +文本分类是自然语言处理领域最基础的任务之一,深度学习方法能够免除复杂的特征工程,直接使用原始文本作为输入,数据驱动地最优化分类准确率。 - 文本分类是自然语言处理领域最基础的任务之一,深度学习方法能够免除复杂的特征工程,直接使用原始文本作为输入,数据驱动地最优化分类准确率。我们以情感分类任务为例,提供了基于DNN的非序列文本分类模型,基于CNN和LSTM的序列模型供大家学习和使用。 +在文本分类的例子中,我们以情感分类任务为例,提供了基于DNN的非序列文本分类模型,以及基于CNN的序列模型供大家学习和使用(基于LSTM的模型见PaddleBook中[情感分类](https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/README.cn.md)一课)。 -- **应用领域** +- 3.1 [基于 DNN / CNN 的情感分类](https://github.com/PaddlePaddle/models/tree/develop/text_classification) - 分类是机器学习基础任务之一。文本分类模型在SPAM检测,文本打标签,文本意图识别,文章质量评估,色情暴力文章识别,评论情绪识别,广告物料风险控制等领域都有着广泛的应用。 +## 4. 排序学习 -- **模型配置** +排序学习(Learning to Rank, LTR)是信息检索和搜索引擎研究的核心问题之一,通过机器学习方法学习一个分值函数对待排序的候选进行打分,再根据分值的高低确定序关系。深度神经网络可以用来建模分值函数,构成各类基于深度学习的LTR模型。 - 1. [基于 DNN 的文本分类](https://github.com/PaddlePaddle/models/blob/develop/text_classification/text_classification_dnn.py) - 2. [基于 CNN 的文本分类](https://github.com/PaddlePaddle/models/blob/develop/text_classification/text_classification_cnn.py) - 3. 
[基于 LSTM 的文本分类](https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/train.py) +在排序学习的例子中,我们介绍基于 RankLoss 损失函数的 Pairwise 排序模型和基于LambdaRank损失函数的Listwise排序模型(Pointwise学习策略见PaddleBook中[推荐系统](https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/README.cn.md)一课)。 -## [序列标注](https://github.com/PaddlePaddle/models/tree/develop/sequence_tagging_for_ner) +- 4.1 [基于 Pairwise 和 Listwise 的排序学习](https://github.com/PaddlePaddle/models/tree/develop/ltr) -- **介绍** +## 5. 序列标注 - 序列标注是自然语言处理中最常见的问题之一。在这一任务中,给定输入序列,模型为序列中每一个元素贴上一个类别标签。随着深度学习的不断探索和发展,利用循环神经网络模型学习输入序列的特征表示,条件随机场(Conditional Random Field, CRF)在特征基础上完成序列标注任务,逐渐成为解决序列标注问题的标配解决方案。深度学习的巨大优势在于:从原始文本中学习,避免复杂的特征工程,只要构建好这样一套深度学习模型,绝大多数序列标注问题都可以直接套用,只需要相应地替换不同问题对应的训练数据。 +给定输入序列,序列标注模型为序列中每一个元素贴上一个类别标签,是自然语言处理领域最基础的任务之一。随着深度学习的不断探索和发展,利用循环神经网络学习输入序列的特征表示,条件随机场(Conditional Random Field, CRF)在特征基础上完成序列标注任务,逐渐成为解决序列标注问题的标配解决方案。 - 这里,我们以命名实体识别(Named Entity Recognition,NER)任务为例,向大家介绍如何使用 PaddlePaddle 训练一个端到端(End-to-End)的序列标注模型。 +在序列标注的例子中,我们以命名实体识别(Named Entity Recognition,NER)任务为例,介绍如何训练一个端到端的序列标注模型。 -- **应用领域** +- 5.1 [命名实体识别](https://github.com/PaddlePaddle/models/tree/develop/sequence_tagging_for_ner) - 序列标注是自然语言处理领域最重要的基础任务之一,有着广泛地应用:自动分词,语言角色标注,命名实体识别,深度问答(Deep QA),关键词提取等问题都可以转化为序列标注问题直接套用本例中的模型。 +## 6. 序列到序列学习 -- **模型配置** +序列到序列学习实现两个甚至是多个不定长模型之间的映射,有着广泛的应用,包括:机器翻译、智能对话与问答、广告创意语料生成、自动编码(如金融画像编码)、判断多个文本串之间的语义相关性等。 - 1. [命名实体识别](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/ner.py) +在序列到序列学习的例子中,我们以机器翻译任务为例,提供了多种改进模型,供大家学习和使用。包括:不带注意力机制的序列到序列映射模型,这一模型是所有序列到序列学习模型的基础;使用 scheduled sampling 改善 RNN 模型在生成任务中的错误累积问题;带外部记忆机制的神经机器翻译,通过增强神经网络的记忆能力,来完成复杂的序列到序列学习任务。 -## [排序学习](https://github.com/PaddlePaddle/models/tree/develop/ltr) - -- **介绍** - - 排序学习(Learning to Rank,下简称LTR)是信息检索和搜索引擎研究的核心问题之一,通过机器学习方法学习一个分值函数(Scoring Function)对待排序的候选进行打分,再根据分值的高低确定序关系。深度神经网络可以用来建模分值函数,构成各类基于深度学习的LTR模型。 - - 以信息检索任务为例,给定查询以及检索到的候选文档列表,LTR系统需要按照查询与候选文档的相关性,对候选文档进行打分并排序。流行的LTR学习方法分为以下三种: - - - [Pointwise](https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/README.cn.md) - - Pointwise 学习方法将LTR被转化为回归或是分类问题。 - - 给定查询以及一个候选文档,模型基于序数进行二分类、多分类或者回归拟合,是一种基础的LTR学习策略。 - - Pairwise - - Pairwise学习方法将排序问题归约为对有序对(ordered pair)的分类,比Pointwise方法更近了一步。 - - 模型判断一对候选文档中,哪一个与给定查询更相关,学习的目标为是最小化误分类文档对的数量。 - - 理想情况下,如果所有文档对都能被正确的分类,那么原始的候选文档也会被正确的排序。 - - Listwise - - 与Pointwise与Pairwise 学习方法相比,Listwise方法将给定查询对应的整个候选文档集合列表(list)作为输入,直接对排序结果列表进行优化。Listwise方法在损失函数中考虑了文档排序的位置因素,是前两种方法所不具备的。 - - Pointwise 学习策略是PaddleBook中[推荐系统](https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/README.cn.md)一节介绍过的方法。这里,我们一进步提供了基于 RankLoss 损失函数的 Pairwise 排序模型和基于LambdaRank损失函数的Listwise排序模型。 - -- **应用领域** - - LTR模型在搜索排序,包括:图片搜索排序、外卖美食搜索排序、App搜索排序、酒店搜索排序等场景中有着广泛的应用,还可以扩展应用于:关键词推荐、各类业务榜单、个性化推荐等任务。 - -- **模型配置说明** - - 1. [Pointwise 排序模型](https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/train.py) - 2. [Pairwise 排序模型](https://github.com/PaddlePaddle/models/blob/develop/ltr/ranknet.py) - 3. [Listwise 排序模型](https://github.com/PaddlePaddle/models/blob/develop/ltr/lambda_rank.py) - -## [文本生成](https://github.com/PaddlePaddle/models/tree/develop/nmt_without_attention) - -- **介绍** - - 我们期待有一天机器可以使用自然语言与人们进行交流,像人一样能够撰写高质量的自然语言文本,自动文本生成是实现这一目标的关键技术,可以应用于机器翻译系统、对话系统、问答系统等,为人们带来更加有趣地交互体验,也可以自动撰写新闻摘要,撰写歌词,简单的故事等等。或许未来的某一天,机器能够代替编辑,作家,歌词作者,颠覆这些内容创作领域的工作方式。 - - 基于神经网络生成文本常使用两类方法:1. 语言模型;2. 序列到序列(sequence to sequence)映射模型。在文本生成的例子中,我们为大家展示如何使用以上两种模型来自动生成文本。 - - 特别的,对序列到序列映射模型,我们以机器翻译任务为例提供了多种改进模型,供大家学习和使用,包括: - 1. 
不带注意力机制的序列到序列映射模型,这一模型是所有序列到序列学习模型的基础。 - 2. 带注意力机制使用 scheduled sampling 改善生成质量,用来改善RNN模型在文本生成过程中的错误累积问题。 - 3. 带外部记忆机制的神经机器翻译,通过增强神经网络的记忆能力,来完成复杂的序列到序列学习任务。 - - -- **应用领域** - - 文本生成模型实现了两个甚至是多个不定长模型之间的映射,有着广泛地应用,包括机器翻译、智能对话与问答、广告创意语料生成、自动编码(如金融画像编码)、判断多个文本串之间的语义相关性等。 - -- **模型配置** - - 1. [无注意力机制的编码器解码器模型](https://github.com/PaddlePaddle/models/blob/develop/nmt_without_attention/nmt_without_attention.py) - 2. [使用 scheduled sampling 改善生成质量]() - 3. [ 带外部记忆机制的神经机器翻译]() +- 6.1 [无注意力机制的编码器解码器模型](https://github.com/PaddlePaddle/models/tree/develop/nmt_without_attention) ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). diff --git a/ctr/README.md b/ctr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9332a8516e72d9df6c81ee0faf7c44264e43c0a6 --- /dev/null +++ b/ctr/README.md @@ -0,0 +1,236 @@ +# 点击率预估 + +## 背景介绍 + +CTR(Click-Through Rate,点击率预估)\[[1](https://en.wikipedia.org/wiki/Click-through_rate)\] 是用来表示用户点击一个特定链接的概率, +通常被用来衡量一个在线广告系统的有效性。 + +当有多个广告位时,CTR 预估一般会作为排序的基准。 +比如在搜索引擎的广告系统里,当用户输入一个带商业价值的搜索词(query)时,系统大体上会执行下列步骤来展示广告: + +1. 召回满足 query 的广告集合 +2. 业务规则和相关性过滤 +3. 根据拍卖机制和 CTR 排序 +4. 展出广告 + +可以看到,CTR 在最终排序中起到了很重要的作用。 + +### 发展阶段 +在业内,CTR 模型经历了如下的发展阶段: + +- Logistic Regression(LR) / GBDT + 特征工程 +- LR + DNN 特征 +- DNN + 特征工程 + +在发展早期时 LR 一统天下,但最近 DNN 模型由于其强大的学习能力和逐渐成熟的性能优化, +逐渐地接过 CTR 预估任务的大旗。 + + +### LR vs DNN + +下图展示了 LR 和一个 \(3x2\) 的 DNN 模型的结构: + +

+<p align="center">
+<img src="images/lr_vs_dnn.jpg"/><br/>
+Figure 1. LR 和 DNN 模型结构对比
+</p>

+ +LR 的蓝色箭头部分可以直接类比到 DNN 中对应的结构,可以看到 LR 和 DNN 有一些共通之处(比如权重累加), +但前者的模型复杂度在相同输入维度下比后者可能低很多(从某方面讲,模型越复杂,越有潜力学习到更复杂的信息)。 + +如果 LR 要达到匹敌 DNN 的学习能力,必须增加输入的维度,也就是增加特征的数量, +这也就是为何 LR 和大规模的特征工程必须绑定在一起的原因。 + +LR 对于 DNN 模型的优势是对大规模稀疏特征的容纳能力,包括内存和计算量等方面,工业界都有非常成熟的优化方法。 + +而 DNN 模型具有自己学习新特征的能力,一定程度上能够提升特征使用的效率, +这使得 DNN 模型在同样规模特征的情况下,更有可能达到更好的学习效果。 + +本文后面的章节会演示如何使用 PaddlePaddle 编写一个结合两者优点的模型。 + + +## 数据和任务抽象 + +我们可以将 `click` 作为学习目标,任务可以有以下几种方案: + +1. 直接学习 click,0,1 作二元分类 +2. Learning to rank, 具体用 pairwise rank(标签 1>0)或者 listwise rank +3. 统计每个广告的点击率,将同一个 query 下的广告两两组合,点击率高的>点击率低的,做 rank 或者分类 + +我们直接使用第一种方法做分类任务。 + +我们使用 Kaggle 上 `Click-through rate prediction` 任务的数据集\[[2](https://www.kaggle.com/c/avazu-ctr-prediction/data)\] 来演示模型。 + +具体的特征处理方法参看 [data process](./dataset.md) + + +## Wide & Deep Learning Model + +谷歌在 16 年提出了 Wide & Deep Learning 的模型框架,用于融合适合学习抽象特征的 DNN 和 适用于大规模稀疏特征的 LR 两种模型的优点。 + + +### 模型简介 + +Wide & Deep Learning Model\[[3](#参考文献)\] 可以作为一种相对成熟的模型框架使用, +在 CTR 预估的任务中工业界也有一定的应用,因此本文将演示使用此模型来完成 CTR 预估的任务。 + +模型结构如下: + +

+<p align="center">
+<img src="images/wide_deep.png"/><br/>
+Figure 2. Wide & Deep Model
+</p>

+ +模型左边的 Wide 部分,可以容纳大规模系数特征,并且对一些特定的信息(比如 ID)有一定的记忆能力; +而模型右边的 Deep 部分,能够学习特征间的隐含关系,在相同数量的特征下有更好的学习和推导能力。 + + +### 编写模型输入 + +模型只接受 3 个输入,分别是 + +- `dnn_input` ,也就是 Deep 部分的输入 +- `lr_input` ,也就是 Wide 部分的输入 +- `click` , 点击与否,作为二分类模型学习的标签 + +```python +dnn_merged_input = layer.data( + name='dnn_input', + type=paddle.data_type.sparse_binary_vector(data_meta_info['dnn_input'])) + +lr_merged_input = layer.data( + name='lr_input', + type=paddle.data_type.sparse_binary_vector(data_meta_info['lr_input'])) + +click = paddle.layer.data(name='click', type=dtype.dense_vector(1)) +``` + +### 编写 Wide 部分 + +Wide 部分直接使用了 LR 模型,但激活函数改成了 `RELU` 来加速 + +```python +def build_lr_submodel(): + fc = layer.fc( + input=lr_merged_input, size=1, name='lr', act=paddle.activation.Relu()) + return fc +``` + +### 编写 Deep 部分 + +Deep 部分使用了标准的多层前向传导的 DNN 模型 + +```python +def build_dnn_submodel(dnn_layer_dims): + dnn_embedding = layer.fc(input=dnn_merged_input, size=dnn_layer_dims[0]) + _input_layer = dnn_embedding + for i, dim in enumerate(dnn_layer_dims[1:]): + fc = layer.fc( + input=_input_layer, + size=dim, + act=paddle.activation.Relu(), + name='dnn-fc-%d' % i) + _input_layer = fc + return _input_layer +``` + +### 两者融合 + +两个 submodel 的最上层输出加权求和得到整个模型的输出,输出部分使用 `sigmoid` 作为激活函数,得到区间 (0,1) 的预测值, +来逼近训练数据中二元类别的分布,并最终作为 CTR 预估的值使用。 + +```python +# conbine DNN and LR submodels +def combine_submodels(dnn, lr): + merge_layer = layer.concat(input=[dnn, lr]) + fc = layer.fc( + input=merge_layer, + size=1, + name='output', + # use sigmoid function to approximate ctr, wihch is a float value between 0 and 1. + act=paddle.activation.Sigmoid()) + return fc +``` + +### 训练任务的定义 +```python +dnn = build_dnn_submodel(dnn_layer_dims) +lr = build_lr_submodel() +output = combine_submodels(dnn, lr) + +# ============================================================================== +# cost and train period +# ============================================================================== +classification_cost = paddle.layer.multi_binary_label_cross_entropy_cost( + input=output, label=click) + + +paddle.init(use_gpu=False, trainer_count=11) + +params = paddle.parameters.create(classification_cost) + +optimizer = paddle.optimizer.Momentum(momentum=0) + +trainer = paddle.trainer.SGD( + cost=classification_cost, parameters=params, update_equation=optimizer) + +dataset = AvazuDataset(train_data_path, n_records_as_test=test_set_size) + +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + logging.warning("Pass %d, Samples %d, Cost %f" % ( + event.pass_id, event.batch_id * batch_size, event.cost)) + + if event.batch_id % 1000 == 0: + result = trainer.test( + reader=paddle.batch(dataset.test, batch_size=1000), + feeding=field_index) + logging.warning("Test %d-%d, Cost %f" % (event.pass_id, event.batch_id, + result.cost)) + + +trainer.train( + reader=paddle.batch( + paddle.reader.shuffle(dataset.train, buf_size=500), + batch_size=batch_size), + feeding=field_index, + event_handler=event_handler, + num_passes=100) +``` +## 运行训练和测试 +训练模型需要如下步骤: + +1. 下载训练数据,可以使用 Kaggle 上 CTR 比赛的数据\[[2](#参考文献)\] + 1. 从 [Kaggle CTR](https://www.kaggle.com/c/avazu-ctr-prediction/data) 下载 train.gz + 2. 解压 train.gz 得到 train.txt +2. 
执行 `python train.py --train_data_path train.txt` ,开始训练 + +上面第2个步骤可以为 `train.py` 填充命令行参数来定制模型的训练过程,具体的命令行参数及用法如下 + +``` +usage: train.py [-h] --train_data_path TRAIN_DATA_PATH + [--batch_size BATCH_SIZE] [--test_set_size TEST_SET_SIZE] + [--num_passes NUM_PASSES] + [--num_lines_to_detact NUM_LINES_TO_DETACT] + +PaddlePaddle CTR example + +optional arguments: + -h, --help show this help message and exit + --train_data_path TRAIN_DATA_PATH + path of training dataset + --batch_size BATCH_SIZE + size of mini-batch (default:10000) + --test_set_size TEST_SET_SIZE + size of the validation dataset(default: 10000) + --num_passes NUM_PASSES + number of passes to train + --num_lines_to_detact NUM_LINES_TO_DETACT + number of records to detect dataset's meta info +``` + +## 参考文献 +1. +2. +3. Cheng H T, Koc L, Harmsen J, et al. [Wide & deep learning for recommender systems](https://arxiv.org/pdf/1606.07792.pdf)[C]//Proceedings of the 1st Workshop on Deep Learning for Recommender Systems. ACM, 2016: 7-10. diff --git a/ctr/data_provider.py b/ctr/data_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..f02d3d33e75163cf772921ef54729a3fc8da022b --- /dev/null +++ b/ctr/data_provider.py @@ -0,0 +1,277 @@ +import sys +import csv +import numpy as np +''' +The fields of the dataset are: + + 0. id: ad identifier + 1. click: 0/1 for non-click/click + 2. hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC. + 3. C1 -- anonymized categorical variable + 4. banner_pos + 5. site_id + 6. site_domain + 7. site_category + 8. app_id + 9. app_domain + 10. app_category + 11. device_id + 12. device_ip + 13. device_model + 14. device_type + 15. device_conn_type + 16. C14-C21 -- anonymized categorical variables + +We will treat following fields as categorical features: + + - C1 + - banner_pos + - site_category + - app_category + - device_type + - device_conn_type + +and some other features as id features: + + - id + - site_id + - app_id + - device_id + +The `hour` field will be treated as a continuous feature and will be transformed +to one-hot representation which has 24 bits. +''' + +feature_dims = {} + +categorial_features = ('C1 banner_pos site_category app_category ' + + 'device_type device_conn_type').split() + +id_features = 'id site_id app_id device_id _device_id_cross_site_id'.split() + + +def get_all_field_names(mode=0): + ''' + @mode: int + 0 for train, 1 for test + @return: list of str + ''' + return categorial_features + ['hour'] + id_features + ['click'] \ + if mode == 0 else [] + + +class CategoryFeatureGenerator(object): + ''' + Generator category features. + + Register all records by calling `register` first, then call `gen` to generate + one-hot representation for a record. + ''' + + def __init__(self): + self.dic = {'unk': 0} + self.counter = 1 + + def register(self, key): + ''' + Register record. + ''' + if key not in self.dic: + self.dic[key] = self.counter + self.counter += 1 + + def size(self): + return len(self.dic) + + def gen(self, key): + ''' + Generate one-hot representation for a record. 
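        Keys that were never registered fall back to the reserved 'unk'
        index 0.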
+ ''' + if key not in self.dic: + res = self.dic['unk'] + else: + res = self.dic[key] + return [res] + + def __repr__(self): + return '' % len(self.dic) + + +class IDfeatureGenerator(object): + def __init__(self, max_dim, cross_fea0=None, cross_fea1=None): + ''' + @max_dim: int + Size of the id elements' space + ''' + self.max_dim = max_dim + self.cross_fea0 = cross_fea0 + self.cross_fea1 = cross_fea1 + + def gen(self, key): + ''' + Generate one-hot representation for records + ''' + return [hash(key) % self.max_dim] + + def gen_cross_fea(self, fea1, fea2): + key = str(fea1) + str(fea2) + return self.gen(key) + + def size(self): + return self.max_dim + + +class ContinuousFeatureGenerator(object): + def __init__(self, n_intervals): + self.min = sys.maxint + self.max = sys.minint + self.n_intervals = n_intervals + + def register(self, val): + self.min = min(self.minint, val) + self.max = max(self.maxint, val) + + def gen(self, val): + self.len_part = (self.max - self.min) / self.n_intervals + return (val - self.min) / self.len_part + + +# init all feature generators +fields = {} +for key in categorial_features: + fields[key] = CategoryFeatureGenerator() +for key in id_features: + # for cross features + if 'cross' in key: + feas = key[1:].split('_cross_') + fields[key] = IDfeatureGenerator(10000000, *feas) + # for normal ID features + else: + fields[key] = IDfeatureGenerator(10000) + +# used as feed_dict in PaddlePaddle +field_index = dict((key, id) + for id, key in enumerate(['dnn_input', 'lr_input', 'click'])) + + +def detect_dataset(path, topn, id_fea_space=10000): + ''' + Parse the first `topn` records to collect meta information of this dataset. + + NOTE the records should be randomly shuffled first. + ''' + # create categorical statis objects. + + with open(path, 'rb') as csvfile: + reader = csv.DictReader(csvfile) + for row_id, row in enumerate(reader): + if row_id > topn: + break + + for key in categorial_features: + fields[key].register(row[key]) + + for key, item in fields.items(): + feature_dims[key] = item.size() + + #for key in id_features: + #feature_dims[key] = id_fea_space + + feature_dims['hour'] = 24 + feature_dims['click'] = 1 + + feature_dims['dnn_input'] = np.sum( + feature_dims[key] for key in categorial_features + ['hour']) + 1 + feature_dims['lr_input'] = np.sum(feature_dims[key] + for key in id_features) + 1 + + return feature_dims + + +def concat_sparse_vectors(inputs, dims): + ''' + Concaterate more than one sparse vectors into one. + + @inputs: list + list of sparse vector + @dims: list of int + dimention of each sparse vector + ''' + res = [] + assert len(inputs) == len(dims) + start = 0 + for no, vec in enumerate(inputs): + for v in vec: + res.append(v + start) + start += dims[no] + return res + + +class AvazuDataset(object): + ''' + Load AVAZU dataset as train set. 
+ ''' + TRAIN_MODE = 0 + TEST_MODE = 1 + + def __init__(self, train_path, n_records_as_test=-1): + self.train_path = train_path + self.n_records_as_test = n_records_as_test + # task model: 0 train, 1 test + self.mode = 0 + + def train(self): + self.mode = self.TRAIN_MODE + return self._parse(self.train_path, skip_n_lines=self.n_records_as_test) + + def test(self): + self.mode = self.TEST_MODE + return self._parse(self.train_path, top_n_lines=self.n_records_as_test) + + def _parse(self, path, skip_n_lines=-1, top_n_lines=-1): + with open(path, 'rb') as csvfile: + reader = csv.DictReader(csvfile) + + categorial_dims = [ + feature_dims[key] for key in categorial_features + ['hour'] + ] + id_dims = [feature_dims[key] for key in id_features] + + for row_id, row in enumerate(reader): + if skip_n_lines > 0 and row_id < skip_n_lines: + continue + if top_n_lines > 0 and row_id > top_n_lines: + break + + record = [] + for key in categorial_features: + record.append(fields[key].gen(row[key])) + record.append([int(row['hour'][-2:])]) + dense_input = concat_sparse_vectors(record, categorial_dims) + + record = [] + for key in id_features: + if 'cross' not in key: + record.append(fields[key].gen(row[key])) + else: + fea0 = fields[key].cross_fea0 + fea1 = fields[key].cross_fea1 + record.append( + fields[key].gen_cross_fea(row[fea0], row[fea1])) + + sparse_input = concat_sparse_vectors(record, id_dims) + + record = [dense_input, sparse_input] + + record.append(list((int(row['click']), ))) + yield record + + +if __name__ == '__main__': + path = 'train.txt' + print detect_dataset(path, 400000) + + filereader = AvazuDataset(path) + for no, rcd in enumerate(filereader.train()): + print no, rcd + if no > 1000: break diff --git a/ctr/dataset.md b/ctr/dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..dd6443d56adaf548d6c39458900c711c7f274def --- /dev/null +++ b/ctr/dataset.md @@ -0,0 +1,289 @@ +# 数据及处理 +## 数据集介绍 + +数据集使用 `csv` 格式存储,其中各个字段内容如下: + +- `id` : ad identifier +- `click` : 0/1 for non-click/click +- `hour` : format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC. +- `C1` : anonymized categorical variable +- `banner_pos` +- `site_id` +- `site_domain` +- `site_category` +- `app_id` +- `app_domain` +- `app_category` +- `device_id` +- `device_ip` +- `device_model` +- `device_type` +- `device_conn_type` +- `C14-C21` : anonymized categorical variables + + +## 特征提取 + +下面我们会简单演示几种特征的提取方式。 + +原始数据中的特征可以分为以下几类: + +1. ID 类特征(稀疏,数量多) +- `id` +- `site_id` +- `app_id` +- `device_id` + +2. 类别类特征(稀疏,但数量有限) + +- `C1` +- `site_category` +- `device_type` +- `C14-C21` + +3. 数值型特征转化为类别型特征 + +- hour (可以转化成数值,也可以按小时为单位转化为类别) + +### 类别类特征 + +类别类特征的提取方法有以下两种: + +1. One-hot 表示作为特征 +2. 类似词向量,用一个 Embedding 将每个类别映射到对应的向量 + + +### ID 类特征 + +ID 类特征的特点是稀疏数据,但量比较大,直接使用 One-hot 表示时维度过大。 + +一般会作如下处理: + +1. 确定表示的最大维度 N +2. newid = id % N +3. 用 newid 作为类别类特征使用 + +上面的方法尽管存在一定的碰撞概率,但能够处理任意数量的 ID 特征,并保留一定的效果\[[2](#参考文献)\]。 + +### 数值型特征 + +一般会做如下处理: + +- 归一化,直接作为特征输入模型 +- 用区间分割处理成类别类特征,稀疏化表示,模糊细微上的差别 + +## 特征处理 + + +### 类别型特征 + +类别型特征有有限多种值,在模型中,我们一般使用 Embedding将每种值映射为连续值的向量。 + +这种特征在输入到模型时,一般使用 One-hot 表示,相关处理方法如下: + +```python +class CategoryFeatureGenerator(object): + ''' + Generator category features. + + Register all records by calling ~register~ first, then call ~gen~ to generate + one-hot representation for a record. + ''' + + def __init__(self): + self.dic = {'unk': 0} + self.counter = 1 + + def register(self, key): + ''' + Register record. 
+ ''' + if key not in self.dic: + self.dic[key] = self.counter + self.counter += 1 + + def size(self): + return len(self.dic) + + def gen(self, key): + ''' + Generate one-hot representation for a record. + ''' + if key not in self.dic: + res = self.dic['unk'] + else: + res = self.dic[key] + return [res] + + def __repr__(self): + return '' % len(self.dic) +``` + +`CategoryFeatureGenerator` 需要先扫描数据集,得到该类别对应的项集合,之后才能开始生成特征。 + +我们的实验数据集\[[3](https://www.kaggle.com/c/avazu-ctr-prediction/data)\]已经经过shuffle,可以扫描前面一定数目的记录来近似总的类别项集合(等价于随机抽样), +对于没有抽样上的低频类别项,可以用一个 UNK 的特殊值表示。 + +```python +fields = {} +for key in categorial_features: + fields[key] = CategoryFeatureGenerator() + +def detect_dataset(path, topn, id_fea_space=10000): + ''' + Parse the first `topn` records to collect meta information of this dataset. + + NOTE the records should be randomly shuffled first. + ''' + # create categorical statis objects. + + with open(path, 'rb') as csvfile: + reader = csv.DictReader(csvfile) + for row_id, row in enumerate(reader): + if row_id > topn: + break + + for key in categorial_features: + fields[key].register(row[key]) +``` + +`CategoryFeatureGenerator` 在注册得到数据集中对应类别信息后,可以对相应记录生成对应的特征表示: + +```python +record = [] +for key in categorial_features: + record.append(fields[key].gen(row[key])) +``` + +本任务中,类别类特征会输入到 DNN 中使用。 + +### ID 类特征 + +ID 类特征代稀疏值,且值的空间很大的情况,一般用模操作规约到一个有限空间, +之后可以当成类别类特征使用,这里我们会将 ID 类特征输入到 LR 模型中使用。 + +```python +class IDfeatureGenerator(object): + def __init__(self, max_dim): + ''' + @max_dim: int + Size of the id elements' space + ''' + self.max_dim = max_dim + + def gen(self, key): + ''' + Generate one-hot representation for records + ''' + return [hash(key) % self.max_dim] + + def size(self): + return self.max_dim +``` + +`IDfeatureGenerator` 不需要预先初始化,可以直接生成特征,比如 + +```python +record = [] +for key in id_features: + if 'cross' not in key: + record.append(fields[key].gen(row[key])) +``` + +### 交叉类特征 + +LR 模型作为 Wide & Deep model 的 `wide` 部分,可以输入很 wide 的数据(特征空间的维度很大), +为了充分利用这个优势,我们将演示交叉组合特征构建成更大维度特征的情况,之后塞入到模型中训练。 + +这里我们依旧使用模操作来约束最终组合出的特征空间的大小,具体实现是直接在 `IDfeatureGenerator` 中添加一个 `gen_cross_feature` 的方法: + +```python +def gen_cross_fea(self, fea1, fea2): + key = str(fea1) + str(fea2) + return self.gen(key) +``` + +比如,我们觉得原始数据中, `device_id` 和 `site_id` 有一些关联(比如某个 device 倾向于浏览特定 site), +我们通过组合出两者组合来捕捉这类信息。 + +```python +fea0 = fields[key].cross_fea0 +fea1 = fields[key].cross_fea1 +record.append( + fields[key].gen_cross_fea(row[fea0], row[fea1])) +``` + +### 特征维度 +#### Deep submodel(DNN)特征 +| feature | dimention | +|------------------|-----------| +| app_category | 21 | +| site_category | 22 | +| device_conn_type | 5 | +| hour | 24 | +| banner_pos | 7 | +| **Total** | 79 | + +#### Wide submodel(LR)特征 +| Feature | Dimention | +|---------------------|-----------| +| id | 10000 | +| site_id | 10000 | +| app_id | 10000 | +| device_id | 10000 | +| device_id X site_id | 1000000 | +| **Total** | 1,040,000 | + +## 输入到 PaddlePaddle 中 + +Deep 和 Wide 两部分均以 `sparse_binary_vector` 的格式 \[[1](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/api/v1/data_provider/pydataprovider2_en.rst)\] 输入,输入前需要将相关特征拼合,模型最终只接受 3 个 input, +分别是 + +1. `dnn input` ,DNN 的输入 +2. `lr input` , LR 的输入 +3. 
`click` , 标签 + +拼合特征的方法: + +```python +def concat_sparse_vectors(inputs, dims): + ''' + concaterate sparse vectors into one + + @inputs: list + list of sparse vector + @dims: list of int + dimention of each sparse vector + ''' + res = [] + assert len(inputs) == len(dims) + start = 0 + for no, vec in enumerate(inputs): + for v in vec: + res.append(v + start) + start += dims[no] + return res +``` + +生成最终特征的代码如下: + +```python +# dimentions of the features +categorial_dims = [ + feature_dims[key] for key in categorial_features + ['hour'] +] +id_dims = [feature_dims[key] for key in id_features] + +dense_input = concat_sparse_vectors(record, categorial_dims) +sparse_input = concat_sparse_vectors(record, id_dims) + +record = [dense_input, sparse_input] +record.append(list((int(row['click']), ))) +yield record +``` + +## 参考文献 + +1. +2. Mikolov T, Deoras A, Povey D, et al. [Strategies for training large scale neural network language models](https://www.researchgate.net/profile/Lukas_Burget/publication/241637478_Strategies_for_training_large_scale_neural_network_language_models/links/542c14960cf27e39fa922ed3.pdf)[C]//Automatic Speech Recognition and Understanding (ASRU), 2011 IEEE Workshop on. IEEE, 2011: 196-201. +3. diff --git a/ctr/images/lr_vs_dnn.jpg b/ctr/images/lr_vs_dnn.jpg new file mode 100644 index 0000000000000000000000000000000000000000..50a0db583cd9b6e1a5bc0f83a28ab6e22d649931 Binary files /dev/null and b/ctr/images/lr_vs_dnn.jpg differ diff --git a/ctr/images/wide_deep.png b/ctr/images/wide_deep.png new file mode 100644 index 0000000000000000000000000000000000000000..03c4afcfc6cea0b5abf4c4554ecf9810843e75e2 Binary files /dev/null and b/ctr/images/wide_deep.png differ diff --git a/ctr/index.html b/ctr/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ff0c5d9b19ec046b61f7f38d6eb9e70dff33e1ec --- /dev/null +++ b/ctr/index.html @@ -0,0 +1,300 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/ctr/train.py b/ctr/train.py new file mode 100644 index 0000000000000000000000000000000000000000..da6dc9dd6d9e386a87693b5a5bc0cbf95da0b069 --- /dev/null +++ b/ctr/train.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import argparse +import logging +import paddle.v2 as paddle +from paddle.v2 import layer +from paddle.v2 import data_type as dtype +from data_provider import field_index, detect_dataset, AvazuDataset + +parser = argparse.ArgumentParser(description="PaddlePaddle CTR example") +parser.add_argument( + '--train_data_path', + type=str, + required=True, + help="path of training dataset") +parser.add_argument( + '--batch_size', + type=int, + default=10000, + help="size of mini-batch (default:10000)") +parser.add_argument( + '--test_set_size', + type=int, + default=10000, + help="size of the validation dataset(default: 10000)") +parser.add_argument( + '--num_passes', type=int, default=10, help="number of passes to train") +parser.add_argument( + '--num_lines_to_detact', + type=int, + default=500000, + help="number of records to detect dataset's meta info") + +args = parser.parse_args() + +dnn_layer_dims = [128, 64, 32, 1] +data_meta_info = detect_dataset(args.train_data_path, args.num_lines_to_detact) + +logging.warning('detect categorical fields in dataset %s' % + args.train_data_path) +for key, item in data_meta_info.items(): + logging.warning(' - {}\t{}'.format(key, item)) + +paddle.init(use_gpu=False, trainer_count=1) + +# ============================================================================== +# input layers +# ============================================================================== +dnn_merged_input = layer.data( + name='dnn_input', + type=paddle.data_type.sparse_binary_vector(data_meta_info['dnn_input'])) + +lr_merged_input = layer.data( + name='lr_input', + type=paddle.data_type.sparse_binary_vector(data_meta_info['lr_input'])) + +click = paddle.layer.data(name='click', type=dtype.dense_vector(1)) + + +# ============================================================================== +# network structure +# ============================================================================== +def build_dnn_submodel(dnn_layer_dims): + dnn_embedding = layer.fc(input=dnn_merged_input, size=dnn_layer_dims[0]) + _input_layer = dnn_embedding + for i, dim in enumerate(dnn_layer_dims[1:]): + fc = layer.fc( + input=_input_layer, + size=dim, + act=paddle.activation.Relu(), + name='dnn-fc-%d' % i) + _input_layer = fc + return _input_layer + + +# config LR submodel +def build_lr_submodel(): + fc = layer.fc( + input=lr_merged_input, size=1, name='lr', act=paddle.activation.Relu()) + return fc + + +# conbine DNN and LR submodels +def combine_submodels(dnn, lr): + merge_layer = layer.concat(input=[dnn, lr]) + fc = layer.fc( + input=merge_layer, + size=1, + name='output', + # use sigmoid function to approximate ctr rate, a float value between 0 and 1. 
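        # the (0, 1) output is paired with the logistic cost
        # (multi_binary_label_cross_entropy_cost) defined below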
+ act=paddle.activation.Sigmoid()) + return fc + + +dnn = build_dnn_submodel(dnn_layer_dims) +lr = build_lr_submodel() +output = combine_submodels(dnn, lr) + +# ============================================================================== +# cost and train period +# ============================================================================== +classification_cost = paddle.layer.multi_binary_label_cross_entropy_cost( + input=output, label=click) + +params = paddle.parameters.create(classification_cost) + +optimizer = paddle.optimizer.Momentum(momentum=0.01) + +trainer = paddle.trainer.SGD( + cost=classification_cost, parameters=params, update_equation=optimizer) + +dataset = AvazuDataset( + args.train_data_path, n_records_as_test=args.test_set_size) + + +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + num_samples = event.batch_id * args.batch_size + if event.batch_id % 100 == 0: + logging.warning("Pass %d, Samples %d, Cost %f" % + (event.pass_id, num_samples, event.cost)) + + if event.batch_id % 1000 == 0: + result = trainer.test( + reader=paddle.batch(dataset.test, batch_size=args.batch_size), + feeding=field_index) + logging.warning("Test %d-%d, Cost %f" % + (event.pass_id, event.batch_id, result.cost)) + + +trainer.train( + reader=paddle.batch( + paddle.reader.shuffle(dataset.train, buf_size=500), + batch_size=args.batch_size), + feeding=field_index, + event_handler=event_handler, + num_passes=args.num_passes) diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7a372e9bed262d2ee5bc8640a0f480b9ce34cd34 --- /dev/null +++ b/deep_speech_2/README.md @@ -0,0 +1,65 @@ +# Deep Speech 2 on PaddlePaddle + +## Installation + +Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory. + +``` +pip install -r requirements.txt +export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH +``` + +For some machines, we also need to install libsndfile1. Details to be added. + +## Usage + +### Preparing Data + +``` +cd data +python librispeech.py +cat manifest.libri.train-* > manifest.libri.train-all +cd .. +``` + +After running librispeech.py, we have several "manifest" json files named with a prefix `manifest.libri.`. A manifest file summarizes a speech data set, with each line containing the meta data (i.e. audio filepath, transcription text, audio duration) of each audio file within the data set, in json format. + +By `cat manifest.libri.train-* > manifest.libri.train-all`, we simply merge the three seperate sample sets of LibriSpeech (train-clean-100, train-clean-360, train-other-500) into one training set. This is a simple way for merging different data sets. 
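+
+For illustration, each manifest line is a standalone json object and might look like the following (the file path and field values here are made up, but the keys match what librispeech.py writes):
+
+```
+{"audio_filepath": "/path/to/LibriSpeech/train-clean-100/xxx.flac", "duration": 5.86, "text": "an example transcription in lowercase"}
+```
+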
+ +More help for arguments: + +``` +python librispeech.py --help +``` + +### Traininig + +For GPU Training: + +``` +CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all +``` + +For CPU Training: + +``` +python train.py --trainer_count 8 --use_gpu False -- train_manifest_path ./data/manifest.libri.train-all +``` + +More help for arguments: + +``` +python train.py --help +``` + +### Inferencing + +``` +python infer.py +``` + +More help for arguments: + +``` +python infer.py --help +``` diff --git a/deep_speech_2/audio_data_utils.py b/deep_speech_2/audio_data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c717bcf182811d1b043bf0e83e2e31209be18e46 --- /dev/null +++ b/deep_speech_2/audio_data_utils.py @@ -0,0 +1,383 @@ +""" + Providing basic audio data preprocessing pipeline, and offering + both instance-level and batch-level data reader interfaces. +""" +import paddle.v2 as paddle +import logging +import json +import random +import soundfile +import numpy as np +import os + +RANDOM_SEED = 0 +logger = logging.getLogger(__name__) + + +class DataGenerator(object): + """ + DataGenerator provides basic audio data preprocessing pipeline, and offers + both instance-level and batch-level data reader interfaces. + Normalized FFT are used as audio features here. + + :param vocab_filepath: Vocabulary file path for indexing tokenized + transcriptions. + :type vocab_filepath: basestring + :param normalizer_manifest_path: Manifest filepath for collecting feature + normalization statistics, e.g. mean, std. + :type normalizer_manifest_path: basestring + :param normalizer_num_samples: Number of instances sampled for collecting + feature normalization statistics. + Default is 100. + :type normalizer_num_samples: int + :param max_duration: Audio clips with duration (in seconds) greater than + this will be discarded. Default is 20.0. + :type max_duration: float + :param min_duration: Audio clips with duration (in seconds) smaller than + this will be discarded. Default is 0.0. + :type min_duration: float + :param stride_ms: Striding size (in milliseconds) for generating frames. + Default is 10.0. + :type stride_ms: float + :param window_ms: Window size (in milliseconds) for frames. Default is 20.0. + :type window_ms: float + :param max_frequency: Maximun frequency for FFT features. FFT features of + frequency larger than this will be discarded. + If set None, all features will be kept. + Default is None. + :type max_frequency: float + """ + + def __init__(self, + vocab_filepath, + normalizer_manifest_path, + normalizer_num_samples=100, + max_duration=20.0, + min_duration=0.0, + stride_ms=10.0, + window_ms=20.0, + max_frequency=None): + self.__max_duration__ = max_duration + self.__min_duration__ = min_duration + self.__stride_ms__ = stride_ms + self.__window_ms__ = window_ms + self.__max_frequency__ = max_frequency + self.__random__ = random.Random(RANDOM_SEED) + # load vocabulary (dictionary) + self.__vocab_dict__, self.__vocab_list__ = \ + self.__load_vocabulary_from_file__(vocab_filepath) + # collect normalizer statistics + self.__mean__, self.__std__ = self.__collect_normalizer_statistics__( + manifest_path=normalizer_manifest_path, + num_samples=normalizer_num_samples) + + def __audio_featurize__(self, audio_filename): + """ + Preprocess audio data, including feature extraction, normalization etc.. 
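        Returns a 2-D ndarray of shape (frequency_bins, time_frames).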
+ """ + features = self.__audio_basic_featurize__(audio_filename) + return self.__normalize__(features) + + def __text_featurize__(self, text): + """ + Preprocess text data, including tokenizing and token indexing etc.. + """ + return self.__convert_text_to_char_index__( + text=text, vocabulary=self.__vocab_dict__) + + def __audio_basic_featurize__(self, audio_filename): + """ + Compute basic (without normalization etc.) features for audio data. + """ + return self.__spectrogram_from_file__( + filename=audio_filename, + stride_ms=self.__stride_ms__, + window_ms=self.__window_ms__, + max_freq=self.__max_frequency__) + + def __collect_normalizer_statistics__(self, manifest_path, num_samples=100): + """ + Compute feature normalization statistics, i.e. mean and stddev. + """ + # read manifest + manifest = self.__read_manifest__( + manifest_path=manifest_path, + max_duration=self.__max_duration__, + min_duration=self.__min_duration__) + # sample for statistics + sampled_manifest = self.__random__.sample(manifest, num_samples) + # extract spectrogram feature + features = [] + for instance in sampled_manifest: + spectrogram = self.__audio_basic_featurize__( + instance["audio_filepath"]) + features.append(spectrogram) + features = np.hstack(features) + mean = np.mean(features, axis=1).reshape([-1, 1]) + std = np.std(features, axis=1).reshape([-1, 1]) + return mean, std + + def __normalize__(self, features, eps=1e-14): + """ + Normalize features to be of zero mean and unit stddev. + """ + return (features - self.__mean__) / (self.__std__ + eps) + + def __spectrogram_from_file__(self, + filename, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + eps=1e-14): + """ + Laod audio data and calculate the log of spectrogram by FFT. + Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech + """ + audio, sample_rate = soundfile.read(filename) + if audio.ndim >= 2: + audio = np.mean(audio, 1) + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + stride_size = int(0.001 * sample_rate * stride_ms) + window_size = int(0.001 * sample_rate * window_ms) + spectrogram, freqs = self.__extract_spectrogram__( + audio, + window_size=window_size, + stride_size=stride_size, + sample_rate=sample_rate) + ind = np.where(freqs <= max_freq)[0][-1] + 1 + return np.log(spectrogram[:ind, :] + eps) + + def __extract_spectrogram__(self, samples, window_size, stride_size, + sample_rate): + """ + Compute the spectrogram by FFT for a discrete real signal. 
+ Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech + """ + # extract strided windows + truncate_size = (len(samples) - window_size) % stride_size + samples = samples[:len(samples) - truncate_size] + nshape = (window_size, (len(samples) - window_size) // stride_size + 1) + nstrides = (samples.strides[0], samples.strides[0] * stride_size) + windows = np.lib.stride_tricks.as_strided( + samples, shape=nshape, strides=nstrides) + assert np.all( + windows[:, 1] == samples[stride_size:(stride_size + window_size)]) + # window weighting, squared Fast Fourier Transform (fft), scaling + weighting = np.hanning(window_size)[:, None] + fft = np.fft.rfft(windows * weighting, axis=0) + fft = np.absolute(fft)**2 + scale = np.sum(weighting**2) * sample_rate + fft[1:-1, :] *= (2.0 / scale) + fft[(0, -1), :] /= scale + # prepare fft frequency list + freqs = float(sample_rate) / window_size * np.arange(fft.shape[0]) + return fft, freqs + + def __load_vocabulary_from_file__(self, vocabulary_path): + """ + Load vocabulary from file. + """ + if not os.path.exists(vocabulary_path): + raise ValueError("Vocabulary file %s not found.", vocabulary_path) + vocab_lines = [] + with open(vocabulary_path, 'r') as file: + vocab_lines.extend(file.readlines()) + vocab_list = [line[:-1] for line in vocab_lines] + vocab_dict = dict( + [(token, id) for (id, token) in enumerate(vocab_list)]) + return vocab_dict, vocab_list + + def __convert_text_to_char_index__(self, text, vocabulary): + """ + Convert text string to a list of character index integers. + """ + return [vocabulary[w] for w in text] + + def __read_manifest__(self, manifest_path, max_duration, min_duration): + """ + Load and parse manifest file. + """ + manifest = [] + for json_line in open(manifest_path): + try: + json_data = json.loads(json_line) + except Exception as e: + raise ValueError("Error reading manifest: %s" % str(e)) + if (json_data["duration"] <= max_duration and + json_data["duration"] >= min_duration): + manifest.append(json_data) + return manifest + + def __padding_batch__(self, batch, padding_to=-1, flatten=False): + """ + Padding audio part of features (only in the time axis -- column axis) + with zeros, to make each instance in the batch share the same + audio feature shape. + + If `padding_to` is set -1, the maximun column numbers in the batch will + be used as the target size. Otherwise, `padding_to` will be the target + size. Default is -1. + + If `flatten` is set True, audio data will be flatten to be a 1-dim + ndarray. Default is False. + """ + new_batch = [] + # get target shape + max_length = max([audio.shape[1] for audio, text in batch]) + if padding_to != -1: + if padding_to < max_length: + raise ValueError("If padding_to is not -1, it should be greater" + " or equal to the original instance length.") + max_length = padding_to + # padding + for audio, text in batch: + padded_audio = np.zeros([audio.shape[0], max_length]) + padded_audio[:, :audio.shape[1]] = audio + if flatten: + padded_audio = padded_audio.flatten() + new_batch.append((padded_audio, text)) + return new_batch + + def instance_reader_creator(self, + manifest_path, + sort_by_duration=True, + shuffle=False): + """ + Instance reader creator for audio data. Creat a callable function to + produce instances of data. + + Instance: a tuple of a numpy ndarray of audio spectrogram and a list of + tokenized and indexed transcription text. + + :param manifest_path: Filepath of manifest for audio clip files. 
+ :type manifest_path: basestring + :param sort_by_duration: Sort the audio clips by duration if set True + (for SortaGrad). + :type sort_by_duration: bool + :param shuffle: Shuffle the audio clips if set True. + :type shuffle: bool + :return: Data reader function. + :rtype: callable + """ + if sort_by_duration and shuffle: + sort_by_duration = False + logger.warn("When shuffle set to true, " + "sort_by_duration is forced to set False.") + + def reader(): + # read manifest + manifest = self.__read_manifest__( + manifest_path=manifest_path, + max_duration=self.__max_duration__, + min_duration=self.__min_duration__) + # sort (by duration) or shuffle manifest + if sort_by_duration: + manifest.sort(key=lambda x: x["duration"]) + if shuffle: + self.__random__.shuffle(manifest) + # extract spectrogram feature + for instance in manifest: + spectrogram = self.__audio_featurize__( + instance["audio_filepath"]) + transcript = self.__text_featurize__(instance["text"]) + yield (spectrogram, transcript) + + return reader + + def batch_reader_creator(self, + manifest_path, + batch_size, + padding_to=-1, + flatten=False, + sort_by_duration=True, + shuffle=False): + """ + Batch data reader creator for audio data. Creat a callable function to + produce batches of data. + + Audio features will be padded with zeros to make each instance in the + batch to share the same audio feature shape. + + :param manifest_path: Filepath of manifest for audio clip files. + :type manifest_path: basestring + :param batch_size: Instance number in a batch. + :type batch_size: int + :param padding_to: If set -1, the maximun column numbers in the batch + will be used as the target size for padding. + Otherwise, `padding_to` will be the target size. + Default is -1. + :type padding_to: int + :param flatten: If set True, audio data will be flatten to be a 1-dim + ndarray. Otherwise, 2-dim ndarray. Default is False. + :type flatten: bool + :param sort_by_duration: Sort the audio clips by duration if set True + (for SortaGrad). + :type sort_by_duration: bool + :param shuffle: Shuffle the audio clips if set True. + :type shuffle: bool + :return: Batch reader function, producing batches of data when called. + :rtype: callable + """ + + def batch_reader(): + instance_reader = self.instance_reader_creator( + manifest_path=manifest_path, + sort_by_duration=sort_by_duration, + shuffle=shuffle) + batch = [] + for instance in instance_reader(): + batch.append(instance) + if len(batch) == batch_size: + yield self.__padding_batch__(batch, padding_to, flatten) + batch = [] + if len(batch) > 0: + yield self.__padding_batch__(batch, padding_to, flatten) + + return batch_reader + + def vocabulary_size(self): + """ + Get vocabulary size. + + :return: Vocabulary size. + :rtype: int + """ + return len(self.__vocab_list__) + + def vocabulary_dict(self): + """ + Get vocabulary in dict. + + :return: Vocabulary in dict. + :rtype: dict + """ + return self.__vocab_dict__ + + def vocabulary_list(self): + """ + Get vocabulary in list. + + :return: Vocabulary in list + :rtype: list + """ + return self.__vocab_list__ + + def data_name_feeding(self): + """ + Get feeddings (data field name and corresponding field id). + + :return: Feeding dict. 
+ :rtype: dict + """ + feeding = { + "audio_spectrogram": 0, + "transcript_text": 1, + } + return feeding diff --git a/deep_speech_2/data/eng_vocab.txt b/deep_speech_2/data/eng_vocab.txt new file mode 100644 index 0000000000000000000000000000000000000000..8268f3f3301047f2b4354d60a4bd1d5ef58619a2 --- /dev/null +++ b/deep_speech_2/data/eng_vocab.txt @@ -0,0 +1,28 @@ +' + +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z diff --git a/deep_speech_2/data/librispeech.py b/deep_speech_2/data/librispeech.py new file mode 100644 index 0000000000000000000000000000000000000000..653caa9267b62aa8415a26be2143de874bb15e88 --- /dev/null +++ b/deep_speech_2/data/librispeech.py @@ -0,0 +1,175 @@ +""" + Download, unpack and create manifest json files for the Librespeech dataset. + + A manifest is a json file summarizing filelist in a data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file in the data set. +""" + +import paddle.v2 as paddle +from paddle.v2.dataset.common import md5file +import distutils.util +import os +import wget +import tarfile +import argparse +import soundfile +import json + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = "http://www.openslr.org/resources/12" +URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz" +URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz" +URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz" +URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz" +URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz" +URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz" +URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz" + +MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9" +MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135" +MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" +MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931" +MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" +MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" +MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" + +parser = argparse.ArgumentParser( + description='Downloads and prepare LibriSpeech dataset.') +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/Libri", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest.libri", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +parser.add_argument( + "--full_download", + default="True", + type=distutils.util.strtobool, + help="Download all datasets for Librispeech." + " If False, only download a minimal requirement (test-clean, dev-clean" + " train-clean-100). (default: %(default)s)") +args = parser.parse_args() + + +def download(url, md5sum, target_dir): + """ + Download file from url to target_dir, and check md5sum. + """ + if not os.path.exists(target_dir): os.makedirs(target_dir) + filepath = os.path.join(target_dir, url.split("/")[-1]) + if not (os.path.exists(filepath) and md5file(filepath) == md5sum): + print("Downloading %s ..." % url) + wget.download(url, target_dir) + print("\nMD5 Chesksum %s ..." % filepath) + if not md5file(filepath) == md5sum: + raise RuntimeError("MD5 checksum failed.") + else: + print("File exists, skip downloading. (%s)" % filepath) + return filepath + + +def unpack(filepath, target_dir): + """ + Unpack the file to the target_dir. + """ + print("Unpacking %s ..." 
% filepath) + tar = tarfile.open(filepath) + tar.extractall(target_dir) + tar.close() + + +def create_manifest(data_dir, manifest_path): + """ + Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." % manifest_path) + json_lines = [] + for subfolder, _, filelist in sorted(os.walk(data_dir)): + text_filelist = [ + filename for filename in filelist if filename.endswith('trans.txt') + ] + if len(text_filelist) > 0: + text_filepath = os.path.join(data_dir, subfolder, text_filelist[0]) + for line in open(text_filepath): + segments = line.strip().split() + text = ' '.join(segments[1:]).lower() + audio_filepath = os.path.join(data_dir, subfolder, + segments[0] + '.flac') + audio_data, samplerate = soundfile.read(audio_filepath) + duration = float(len(audio_data)) / samplerate + json_lines.append( + json.dumps({ + 'audio_filepath': audio_filepath, + 'duration': duration, + 'text': text + })) + with open(manifest_path, 'w') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """ + Download, unpack and create summmary manifest file. + """ + if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + # download + filepath = download(url, md5sum, target_dir) + # unpack + unpack(filepath, target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + # create manifest json file + create_manifest(target_dir, manifest_path) + + +def main(): + prepare_dataset( + url=URL_TEST_CLEAN, + md5sum=MD5_TEST_CLEAN, + target_dir=os.path.join(args.target_dir, "test-clean"), + manifest_path=args.manifest_prefix + ".test-clean") + prepare_dataset( + url=URL_DEV_CLEAN, + md5sum=MD5_DEV_CLEAN, + target_dir=os.path.join(args.target_dir, "dev-clean"), + manifest_path=args.manifest_prefix + ".dev-clean") + prepare_dataset( + url=URL_TRAIN_CLEAN_100, + md5sum=MD5_TRAIN_CLEAN_100, + target_dir=os.path.join(args.target_dir, "train-clean-100"), + manifest_path=args.manifest_prefix + ".train-clean-100") + if args.full_download: + prepare_dataset( + url=URL_TEST_OTHER, + md5sum=MD5_TEST_OTHER, + target_dir=os.path.join(args.target_dir, "test-other"), + manifest_path=args.manifest_prefix + ".test-other") + prepare_dataset( + url=URL_DEV_OTHER, + md5sum=MD5_DEV_OTHER, + target_dir=os.path.join(args.target_dir, "dev-other"), + manifest_path=args.manifest_prefix + ".dev-other") + prepare_dataset( + url=URL_TRAIN_CLEAN_360, + md5sum=MD5_TRAIN_CLEAN_360, + target_dir=os.path.join(args.target_dir, "train-clean-360"), + manifest_path=args.manifest_prefix + ".train-clean-360") + prepare_dataset( + url=URL_TRAIN_OTHER_500, + md5sum=MD5_TRAIN_OTHER_500, + target_dir=os.path.join(args.target_dir, "train-other-500"), + manifest_path=args.manifest_prefix + ".train-other-500") + + +if __name__ == '__main__': + main() diff --git a/deep_speech_2/decoder.py b/deep_speech_2/decoder.py new file mode 100755 index 0000000000000000000000000000000000000000..7c4b952636f3e94167bbd00880673a8dc5635803 --- /dev/null +++ b/deep_speech_2/decoder.py @@ -0,0 +1,60 @@ +""" + CTC-like decoder utilitis. +""" + +from itertools import groupby +import numpy as np + + +def ctc_best_path_decode(probs_seq, vocabulary): + """ + Best path decoding, also called argmax decoding or greedy decoding. 
+ Path consisting of the most probable tokens are further post-processed to + remove consecutive repetitions and all blanks. + + :param probs_seq: 2-D list of probabilities over the vocabulary for each + character. Each element is a list of float probabilities + for one character. + :type probs_seq: list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :return: Decoding result string. + :rtype: baseline + """ + # dimension verification + for probs in probs_seq: + if not len(probs) == len(vocabulary) + 1: + raise ValueError("probs_seq dimension mismatchedd with vocabulary") + # argmax to get the best index for each time step + max_index_list = list(np.array(probs_seq).argmax(axis=1)) + # remove consecutive duplicate indexes + index_list = [index_group[0] for index_group in groupby(max_index_list)] + # remove blank indexes + blank_index = len(vocabulary) + index_list = [index for index in index_list if index != blank_index] + # convert index list to string + return ''.join([vocabulary[index] for index in index_list]) + + +def ctc_decode(probs_seq, vocabulary, method): + """ + CTC-like sequence decoding from a sequence of likelihood probablilites. + + :param probs_seq: 2-D list of probabilities over the vocabulary for each + character. Each element is a list of float probabilities + for one character. + :type probs_seq: list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :param method: Decoding method name, with options: "best_path". + :type method: basestring + :return: Decoding result string. + :rtype: baseline + """ + for prob_list in probs_seq: + if not len(prob_list) == len(vocabulary) + 1: + raise ValueError("probs dimension mismatchedd with vocabulary") + if method == "best_path": + return ctc_best_path_decode(probs_seq, vocabulary) + else: + raise ValueError("Decoding method [%s] is not supported.") diff --git a/deep_speech_2/infer.py b/deep_speech_2/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..598c348b063c6b5fb98bd6f3b287f95d64ef121e --- /dev/null +++ b/deep_speech_2/infer.py @@ -0,0 +1,137 @@ +""" + Inference for a simplifed version of Baidu DeepSpeech2 model. +""" + +import paddle.v2 as paddle +import distutils.util +import argparse +import gzip +from audio_data_utils import DataGenerator +from model import deep_speech2 +from decoder import ctc_decode + +parser = argparse.ArgumentParser( + description='Simplified version of DeepSpeech2 inference.') +parser.add_argument( + "--num_samples", + default=10, + type=int, + help="Number of samples for inference. (default: %(default)s)") +parser.add_argument( + "--num_conv_layers", + default=2, + type=int, + help="Convolution layer number. (default: %(default)s)") +parser.add_argument( + "--num_rnn_layers", + default=3, + type=int, + help="RNN layer number. (default: %(default)s)") +parser.add_argument( + "--rnn_layer_size", + default=512, + type=int, + help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gpu", + default=True, + type=distutils.util.strtobool, + help="Use gpu or not. (default: %(default)s)") +parser.add_argument( + "--normalizer_manifest_path", + default='data/manifest.libri.train-clean-100', + type=str, + help="Manifest path for normalizer. (default: %(default)s)") +parser.add_argument( + "--decode_manifest_path", + default='data/manifest.libri.test-clean', + type=str, + help="Manifest path for decoding. 
(default: %(default)s)") +parser.add_argument( + "--model_filepath", + default='./params.tar.gz', + type=str, + help="Model filepath. (default: %(default)s)") +parser.add_argument( + "--vocab_filepath", + default='data/eng_vocab.txt', + type=str, + help="Vocabulary filepath. (default: %(default)s)") +args = parser.parse_args() + + +def infer(): + """ + Max-ctc-decoding for DeepSpeech2. + """ + # initialize data generator + data_generator = DataGenerator( + vocab_filepath=args.vocab_filepath, + normalizer_manifest_path=args.normalizer_manifest_path, + normalizer_num_samples=200, + max_duration=20.0, + min_duration=0.0, + stride_ms=10, + window_ms=20) + + # create network config + dict_size = data_generator.vocabulary_size() + vocab_list = data_generator.vocabulary_list() + audio_data = paddle.layer.data( + name="audio_spectrogram", + height=161, + width=2000, + type=paddle.data_type.dense_vector(322000)) + text_data = paddle.layer.data( + name="transcript_text", + type=paddle.data_type.integer_value_sequence(dict_size)) + output_probs = deep_speech2( + audio_data=audio_data, + text_data=text_data, + dict_size=dict_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_size=args.rnn_layer_size, + is_inference=True) + + # load parameters + parameters = paddle.parameters.Parameters.from_tar( + gzip.open(args.model_filepath)) + + # prepare infer data + feeding = data_generator.data_name_feeding() + test_batch_reader = data_generator.batch_reader_creator( + manifest_path=args.decode_manifest_path, + batch_size=args.num_samples, + padding_to=2000, + flatten=True, + sort_by_duration=False, + shuffle=False) + infer_data = test_batch_reader().next() + + # run inference + infer_results = paddle.infer( + output_layer=output_probs, parameters=parameters, input=infer_data) + num_steps = len(infer_results) / len(infer_data) + probs_split = [ + infer_results[i * num_steps:(i + 1) * num_steps] + for i in xrange(0, len(infer_data)) + ] + + # decode and print + for i, probs in enumerate(probs_split): + output_transcription = ctc_decode( + probs_seq=probs, vocabulary=vocab_list, method="best_path") + target_transcription = ''.join( + [vocab_list[index] for index in infer_data[i][1]]) + print("Target Transcription: %s \nOutput Transcription: %s \n" % + (target_transcription, output_transcription)) + + +def main(): + paddle.init(use_gpu=args.use_gpu, trainer_count=1) + infer() + + +if __name__ == '__main__': + main() diff --git a/deep_speech_2/model.py b/deep_speech_2/model.py new file mode 100644 index 0000000000000000000000000000000000000000..13ff829b9a6b947253a40a1d3ea524de141bd9d1 --- /dev/null +++ b/deep_speech_2/model.py @@ -0,0 +1,144 @@ +""" + A simplifed version of Baidu DeepSpeech2 model. +""" + +import paddle.v2 as paddle + +#TODO: add bidirectional rnn. + + +def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, + padding, act): + """ + Convolution layer with batch normalization. + """ + conv_layer = paddle.layer.img_conv( + input=input, + filter_size=filter_size, + num_channels=num_channels_in, + num_filters=num_channels_out, + stride=stride, + padding=padding, + act=paddle.activation.Linear(), + bias_attr=False) + return paddle.layer.batch_norm(input=conv_layer, act=act) + + +def bidirectional_simple_rnn_bn_layer(name, input, size, act): + """ + Bidirectonal simple rnn layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. 
+ """ + # input-hidden weights shared across bi-direcitonal rnn. + input_proj = paddle.layer.fc( + input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) + # batch norm is only performed on input-state projection + input_proj_bn = paddle.layer.batch_norm( + input=input_proj, act=paddle.activation.Linear()) + # forward and backward in time + forward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=False) + backward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=True) + return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) + + +def conv_group(input, num_stacks): + """ + Convolution group with several stacking convolution layers. + """ + conv = conv_bn_layer( + input=input, + filter_size=(11, 41), + num_channels_in=1, + num_channels_out=32, + stride=(3, 2), + padding=(5, 20), + act=paddle.activation.BRelu()) + for i in xrange(num_stacks - 1): + conv = conv_bn_layer( + input=conv, + filter_size=(11, 21), + num_channels_in=32, + num_channels_out=32, + stride=(1, 2), + padding=(5, 10), + act=paddle.activation.BRelu()) + output_num_channels = 32 + output_height = 160 // pow(2, num_stacks) + 1 + return conv, output_num_channels, output_height + + +def rnn_group(input, size, num_stacks): + """ + RNN group with several stacking RNN layers. + """ + output = input + for i in xrange(num_stacks): + output = bidirectional_simple_rnn_bn_layer( + name=str(i), input=output, size=size, act=paddle.activation.BRelu()) + return output + + +def deep_speech2(audio_data, + text_data, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=256, + is_inference=False): + """ + The whole DeepSpeech2 model structure (a simplified version). + + :param audio_data: Audio spectrogram data layer. + :type audio_data: LayerOutput + :param text_data: Transcription text data layer. + :type text_data: LayerOutput + :param dict_size: Dictionary size for tokenized transcription. + :type dict_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_size: RNN layer size (number of RNN cells). + :type rnn_size: int + :param is_inference: False in the training mode, and True in the + inferene mode. + :type is_inference: bool + :return: If is_inference set False, return a ctc cost layer; + if is_inference set True, return a sequence layer of output + probability distribution. 
+    :rtype: LayerOutput
+    """
+    # convolution group
+    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
+        input=audio_data, num_stacks=num_conv_layers)
+    # convert data from convolution feature map to sequence of vectors
+    conv2seq = paddle.layer.block_expand(
+        input=conv_group_output,
+        num_channels=conv_group_num_channels,
+        stride_x=1,
+        stride_y=1,
+        block_x=1,
+        block_y=conv_group_height)
+    # rnn group
+    rnn_group_output = rnn_group(
+        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
+    fc = paddle.layer.fc(
+        input=rnn_group_output,
+        size=dict_size + 1,
+        act=paddle.activation.Linear(),
+        bias_attr=True)
+    if is_inference:
+        # probability distribution with softmax
+        return paddle.layer.mixed(
+            input=paddle.layer.identity_projection(input=fc),
+            act=paddle.activation.Softmax())
+    else:
+        # ctc cost
+        return paddle.layer.warp_ctc(
+            input=fc,
+            label=text_data,
+            size=dict_size + 1,
+            blank=dict_size,
+            norm_by_times=True)
diff --git a/deep_speech_2/requirements.txt b/deep_speech_2/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..58a93debe49fca93e5df72164a5f8cf31291f0bd --- /dev/null +++ b/deep_speech_2/requirements.txt @@ -0,0 +1,2 @@
+SoundFile==0.9.0.post1
+wget==3.2
diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py new file mode 100644 index 0000000000000000000000000000000000000000..89ab23c685888d86956a1c2b798447a9e79c8a47 --- /dev/null +++ b/deep_speech_2/train.py @@ -0,0 +1,208 @@
+"""
+    Trainer for a simplified version of Baidu DeepSpeech2 model.
+"""
+
+import paddle.v2 as paddle
+import distutils.util
+import argparse
+import gzip
+import time
+import sys
+from model import deep_speech2
+from audio_data_utils import DataGenerator
+import numpy as np
+import os
+
+#TODO: add WER metric
+
+parser = argparse.ArgumentParser(
+    description='Simplified version of DeepSpeech2 trainer.')
+parser.add_argument(
+    "--batch_size", default=32, type=int, help="Minibatch size.")
+parser.add_argument(
+    "--num_passes",
+    default=20,
+    type=int,
+    help="Training pass number. (default: %(default)s)")
+parser.add_argument(
+    "--num_conv_layers",
+    default=2,
+    type=int,
+    help="Convolution layer number. (default: %(default)s)")
+parser.add_argument(
+    "--num_rnn_layers",
+    default=3,
+    type=int,
+    help="RNN layer number. (default: %(default)s)")
+parser.add_argument(
+    "--rnn_layer_size",
+    default=512,
+    type=int,
+    help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--adam_learning_rate",
+    default=5e-4,
+    type=float,
+    help="Learning rate for ADAM Optimizer. (default: %(default)s)")
+parser.add_argument(
+    "--use_gpu",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Use gpu or not. (default: %(default)s)")
+parser.add_argument(
+    "--use_sortagrad",
+    default=False,
+    type=distutils.util.strtobool,
+    help="Use sortagrad or not. (default: %(default)s)")
+parser.add_argument(
+    "--trainer_count",
+    default=4,
+    type=int,
+    help="Trainer number. (default: %(default)s)")
+parser.add_argument(
+    "--normalizer_manifest_path",
+    default='data/manifest.libri.train-clean-100',
+    type=str,
+    help="Manifest path for normalizer. (default: %(default)s)")
+parser.add_argument(
+    "--train_manifest_path",
+    default='data/manifest.libri.train-clean-100',
+    type=str,
+    help="Manifest path for training. (default: %(default)s)")
+parser.add_argument(
+    "--dev_manifest_path",
+    default='data/manifest.libri.dev-clean',
+    type=str,
+    help="Manifest path for validation. (default: %(default)s)")
+parser.add_argument(
+    "--vocab_filepath",
+    default='data/eng_vocab.txt',
+    type=str,
+    help="Vocabulary filepath. (default: %(default)s)")
+parser.add_argument(
+    "--init_model_path",
+    default=None,
+    type=str,
+    help="If set to None, the training will start from scratch. "
+    "Otherwise, the training will resume from "
+    "the existing model of this path. (default: %(default)s)")
args = parser.parse_args()
+
+
+def train():
+    """
+    DeepSpeech2 training.
+    """
+    # initialize data generator
+    data_generator = DataGenerator(
+        vocab_filepath=args.vocab_filepath,
+        normalizer_manifest_path=args.normalizer_manifest_path,
+        normalizer_num_samples=200,
+        max_duration=20.0,
+        min_duration=0.0,
+        stride_ms=10,
+        window_ms=20)
+
+    # create network config
+    dict_size = data_generator.vocabulary_size()
+    audio_data = paddle.layer.data(
+        name="audio_spectrogram",
+        height=161,
+        width=2000,
+        type=paddle.data_type.dense_vector(322000))
+    text_data = paddle.layer.data(
+        name="transcript_text",
+        type=paddle.data_type.integer_value_sequence(dict_size))
+    cost = deep_speech2(
+        audio_data=audio_data,
+        text_data=text_data,
+        dict_size=dict_size,
+        num_conv_layers=args.num_conv_layers,
+        num_rnn_layers=args.num_rnn_layers,
+        rnn_size=args.rnn_layer_size,
+        is_inference=False)
+
+    # create/load parameters and optimizer
+    if args.init_model_path is None:
+        parameters = paddle.parameters.create(cost)
+    else:
+        if not os.path.isfile(args.init_model_path):
+            raise IOError("Invalid model path: %s" % args.init_model_path)
+        parameters = paddle.parameters.Parameters.from_tar(
+            gzip.open(args.init_model_path))
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400)
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+
+    # prepare data reader
+    train_batch_reader_sortagrad = data_generator.batch_reader_creator(
+        manifest_path=args.train_manifest_path,
+        batch_size=args.batch_size,
+        padding_to=2000,
+        flatten=True,
+        sort_by_duration=True,
+        shuffle=False)
+    train_batch_reader_nosortagrad = data_generator.batch_reader_creator(
+        manifest_path=args.train_manifest_path,
+        batch_size=args.batch_size,
+        padding_to=2000,
+        flatten=True,
+        sort_by_duration=False,
+        shuffle=True)
+    test_batch_reader = data_generator.batch_reader_creator(
+        manifest_path=args.dev_manifest_path,
+        batch_size=args.batch_size,
+        padding_to=2000,
+        flatten=True,
+        sort_by_duration=False,
+        shuffle=False)
+    feeding = data_generator.data_name_feeding()
+
+    # create event handler
+    def event_handler(event):
+        global start_time, cost_sum, cost_counter
+        if isinstance(event, paddle.event.EndIteration):
+            cost_sum += event.cost
+            cost_counter += 1
+            if event.batch_id % 50 == 0:
+                print "\nPass: %d, Batch: %d, TrainCost: %f" % (
+                    event.pass_id, event.batch_id, cost_sum / cost_counter)
+                cost_sum, cost_counter = 0.0, 0
+                with gzip.open("params.tar.gz", 'w') as f:
+                    parameters.to_tar(f)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.BeginPass):
+            start_time = time.time()
+            cost_sum, cost_counter = 0.0, 0
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=test_batch_reader, feeding=feeding)
+            print "\n------- Time: %d sec, Pass: %d, ValidationCost: %s" % (
+                time.time() - start_time, event.pass_id, result.cost)
+
+    # run train
+    # first pass with sortagrad
+    if args.use_sortagrad:
+        trainer.train(
+            reader=train_batch_reader_sortagrad,
+            event_handler=event_handler,
+            num_passes=1,
feeding=feeding) + args.num_passes -= 1 + # other passes without sortagrad + trainer.train( + reader=train_batch_reader_nosortagrad, + event_handler=event_handler, + num_passes=args.num_passes, + feeding=feeding) + + +def main(): + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) + train() + + +if __name__ == '__main__': + main() diff --git a/ltr/index.html b/ltr/index.html new file mode 100644 index 0000000000000000000000000000000000000000..cce283e33d1865f862578ae17e1dc54a54e9ba72 --- /dev/null +++ b/ltr/index.html @@ -0,0 +1,418 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + +
diff --git a/nce_cost/README.md b/nce_cost/README.md index a0990367ef8b03c70c29d285e22ef85907e1d0b7..fce8bdaf80501e5bed650e93efc6c438284031c9 100644 --- a/nce_cost/README.md +++ b/nce_cost/README.md @@ -1 +1,115 @@ -TBD
+# Accelerating Word Embedding Training with Noise-Contrastive Estimation
+## Background
+In natural language processing, a word is usually represented by a feature vector, but producing word vectors that accurately capture semantics is a hard problem; details can be found in the [word embedding chapter](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md), where the authors train word embeddings with a Neural Probabilistic Language Model (NPLM). Although NPLM achieves excellent accuracy, its training is still far too slow compared with traditional N-gram statistical models \[[3](#references)\]. Two algorithms are commonly used to attack this problem: one is hierarchical-sigmoid \[[2](#references)\], the other is Noise-Contrastive Estimation (NCE) \[[1](#references)\]. To overcome the problem, this article adopts NCE, using the training of an NPLM as the running example.
+
+## NCE Overview
+NCE is a fast method for estimating discrete distributions. Applied to our problem: training an NPLM is computationally expensive because the softmax must evaluate an exponential term for every class, i.e. for every word in the dictionary, and on ordinary corpora the dictionary is very large \[[3](#references)\], which makes the whole training process time-consuming. Compared with the widely used hierarchical-sigmoid method \[[2](#references)\], NCE does not build a complicated binary tree to construct its objective; it uses comparatively simple random negative sampling instead, which greatly improves computational efficiency.
+
+Assume the context $h$ is given and its word distribution is $P^h(w)$; data sampled from it serve as positive examples, while data sampled from a noise distribution $P_n(w)$ serve as negative examples. Any suitable noise distribution may be chosen; the default is an unbiased uniform distribution. Assuming further that there are $k$ noise samples per data sample, the probability that a sample was drawn from the data is \[[1](#references)\]:
+
+$$P^h(D=1|w,\theta)=\frac { P_\theta^h(w) }{ P^h_\theta(w)+kP_n(w) } =\sigma (\Delta s_\theta(w,h))$$
+
+where $\Delta s_\theta(w,h)=s_\theta(w,h)-\log (kP_n(w))$, and $s_\theta(w,h)$ is the model's score for generating word $w$ in context $h$. The objective is to raise the probability of the positive examples while lowering that of the negative ones \[[1](#references)\]:
+
+$$J^h(\theta )=E_{ P_d^h }\left[ \log { P^h(D=1|w,\theta ) } \right] +kE_{ P_n }\left[ \log P^h (D=0|w,\theta ) \right]$$
+$$\qquad =E_{ P_d^h }\left[ \log { \sigma (\Delta s_\theta(w,h)) } \right] +kE_{ P_n }\left[ \log (1-\sigma (\Delta s_\theta(w,h))) \right]$$
+
+In short, NCE sets up a logistic regression that classifies positive against negative examples: for each sample, its own target word label is the positive example and $k$ other word labels are sampled as negatives, so only the probabilities over these $k+1$ labels need to be computed. Compared with the original softmax classification, which computes a score for every class and then normalizes to obtain probabilities, this saves a large amount of computation. (An illustrative NumPy sketch of this objective is given in the appendix at the end of this document.)
+
+## Experimental Data
+This article trains the language model on the Penn Treebank (PTB) dataset ([Tomas Mikolov's preprocessed version](http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz)). PaddlePaddle provides the [paddle.dataset.imikolov](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/imikolov.py) interface for convenient access to this data; if the data is not found, it is downloaded automatically and its integrity verified. The interface also preprocesses the data with a sliding window of size 5 for later use (the appendix shows a snippet that prints a few of these samples). The corpus is English, with 42068 training sentences and 3761 test sentences.
+
+## Network Architecture
+The detailed structure of the N-gram neural probabilistic language model is shown in Figure 1:
+

+<p align="center">
+<img src="images/network_conf.png"/><br/>
+Figure 1. Network configuration
+</p>
+
+As Figure 1 shows, the model consists of the following parts:
+
+1. **Input layer**: a PTB sample consists of raw English words; each word is mapped to its id in the dictionary, and this unique id distinguishes one word from another.
+
+2. **Word embedding layer**: compared with the raw id representation, word embeddings capture the semantic relations between words much better. A trainable embedding matrix maps each id to a word embedding of fixed dimension. After training, the semantic similarity between two words can be measured by the distance between their embeddings: the more similar the semantics, the smaller the distance.
+
+3. **Concatenation layer**: the word embeddings are concatenated end to end into one long vector, which is convenient for the processing in the following fully connected layer.
+
+4. **Fully connected hidden layer**: the long vector from the previous layer is fed into a neural network with one hidden layer, which outputs a feature vector. The fully connected hidden layer strengthens the network's learning capacity.
+
+5. **NCE layer**: during training, the NCE Layer provided by PaddlePaddle can be used directly.
+
+
+## Training
+Run ``` python train.py ``` to start training. On the first run, the program checks whether the PTB dataset is present in the user's cache folder and downloads it automatically if it is not. During training, model information (mainly the training cost) is printed every 1000 iterations; at the end of each pass, the cost on the test set is computed and the latest model snapshot is saved. PaddlePaddle ships a ready-made NCE Layer, but several of its parameters have to be designed for the scenario at hand; the following tuning guidelines can be used for reference:
+
+
+| Parameter | Role | Notes |
+|:------ |:-------| :--------|
+| param\_attr / bias\_attr | Sets the parameter names. | Named parameters make it easy to share the network's parameters in the prediction stage later; the details are described in the next section. |
+| num\_neg\_samples | Controls the number of sampled negative examples. | This controls the ratio of positive to negative samples; its valid range is [1, dictionary size - 1]. More negative samples make the whole model slower to train but raise its accuracy. |
+| neg\_distribution | The distribution from which negative labels are sampled; a uniform distribution by default. | This lets you weight the individual classes when sampling negatives. For example, if the positive example is "sunny" and you want the negative example "flood" to be distinguished more strongly during training, you can increase the sampling weight of the "flood" class. |
+| act | The activation function to use. | By the derivation of NCE, the sigmoid function should be used here. |
+
+
+The code is as follows:
+
+```python
+cost = paddle.layer.nce(
+    input=hidden_layer,
+    label=next_word,
+    num_classes=dict_size,
+    param_attr=paddle.attr.Param(name='nce_w'),
+    bias_attr=paddle.attr.Param(name='nce_b'),
+    act=paddle.activation.Sigmoid(),
+    num_neg_samples=25,
+    neg_distribution=None)
+```
+
+
+## Prediction
+Run ` python infer.py ` to predict. The program first loads the latest model, then predicts batch by batch and prints the results. Because the computation logic of training and prediction differs, the prediction stage has to share the logistic-regression parameters learned inside the NCE Layer during training, so an inference layer whose parameters are the pre-trained ones must be written.
+
+To implement the inference layer, first obtain the parameter values via `paddle.attr.Param`, then use `paddle.layer.trans_full_matrix_projection` to right-multiply the hidden-layer output vector `hidden_layer`; PaddlePaddle automatically looks up parameters with the same names in the model. The multiplication and summation produce a class score vector, which is passed through a softmax for normalization (summing to 1), yielding the final class probability distribution.
+
+The code is as follows:
+
+```python
+with paddle.layer.mixed(
+        size=dict_size,
+        act=paddle.activation.Softmax(),
+        bias_attr=paddle.attr.Param(name='nce_b')) as prediction:
+    prediction += paddle.layer.trans_full_matrix_projection(
+        input=hidden_layer, param_attr=paddle.attr.Param(name='nce_w'))
+```
+
+The prediction output has the following form:
+
+```
+--------------------------
+No.68 Input: ' <unk> for possible
+Ground Truth Output: <unk>
+Predict Output: <unk>
+
+--------------------------
+No.69 Input: <unk> for possible <unk>
+Ground Truth Output: on
+Predict Output: <unk>
+
+--------------------------
+No.70 Input: for possible <unk> on
+Ground Truth Output: the
+Predict Output: the
+
+```
+
+Each group separated by a dashed line is one prediction: the first line after the dashes gives the index of the test sample together with its 4 input words, the next line is the ground-truth label, and the last line is the predicted label (`<unk>` marks a word outside the dictionary).
+
+## References
+1. Mnih A, Kavukcuoglu K. [Learning word embeddings efficiently with noise-contrastive estimation](https://papers.nips.cc/paper/5165-learning-word-embeddings-efficiently-with-noise-contrastive-estimation.pdf)[C]//Advances in neural information processing systems. 2013: 2265-2273.
+
+2. Morin, F., & Bengio, Y. (2005, January). [Hierarchical Probabilistic Neural Network Language Model](http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf). In Aistats (Vol. 5, pp. 246-252).
+
+3. Mnih A, Teh Y W. [A Fast and Simple Algorithm for Training Neural Probabilistic Language Models](http://xueshu.baidu.com/s?wd=paperuri%3A%280735b97df93976efb333ac8c266a1eb2%29&filter=sc_long_sign&tn=SE_xueshusource_2kduw22v&sc_vurl=http%3A%2F%2Farxiv.org%2Fabs%2F1206.6426&ie=utf-8&sc_us=5770715420073315630)[J]. Computer Science, 2012:1751-1758.
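+
+## Appendix: Illustrative Snippets
+
+As promised in the NCE overview, here is a small numerical sketch of the objective $J^h(\theta)$. It is illustrative only: plain NumPy with made-up scores, a uniform noise distribution, and hypothetical function and variable names, standing in for what the PaddlePaddle NCE Layer computes internally.
+
+```python
+import numpy as np
+
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+
+def nce_objective(score_pos, scores_neg, noise_prob, k):
+    # Delta s(w, h) = s(w, h) - log(k * P_n(w))
+    delta_pos = score_pos - np.log(k * noise_prob)
+    delta_neg = scores_neg - np.log(k * noise_prob)
+    # log P(D=1|w) for the data sample, plus the sum of
+    # log P(D=0|w) over the k noise samples
+    return np.log(sigmoid(delta_pos)) + np.sum(np.log(1.0 - sigmoid(delta_neg)))
+
+
+# toy setting: dictionary of 1000 words, uniform noise, k = 25 noise samples
+k, dict_size = 25, 1000
+rng = np.random.RandomState(0)
+print nce_objective(
+    score_pos=2.0,               # model score of the true next word
+    scores_neg=rng.randn(k),     # model scores of the sampled noise words
+    noise_prob=1.0 / dict_size,  # uniform P_n(w)
+    k=k)
+```
+
+Maximizing this quantity pushes $\sigma(\Delta s_\theta(w,h))$ towards 1 for the observed word and towards 0 for the $k$ sampled words, which is exactly the binary logistic regression described in the overview.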
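+
+For the Experimental Data section, the following snippet prints a few PTB samples; it only reuses the `paddle.dataset.imikolov` calls that already appear in `train.py` and `infer.py` of this directory:
+
+```python
+import paddle.v2 as paddle
+
+word_dict = paddle.dataset.imikolov.build_dict()
+idx_word_dict = dict((v, k) for k, v in word_dict.items())
+
+# with a window size of 5, each sample is 4 context word ids plus 1 target id
+for i, sample in enumerate(paddle.dataset.imikolov.train(word_dict, 5)()):
+    context, target = sample[:4], sample[4]
+    print ' '.join([idx_word_dict[w] for w in context]) + ' -> ' + \
+        idx_word_dict[target]
+    if i == 2:  # show only the first three 5-grams
+        break
+```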
diff --git a/nce_cost/images/network_conf.png b/nce_cost/images/network_conf.png new file mode 100644 index 0000000000000000000000000000000000000000..749f8a365db1e1c18d829a460de7c45b27892d19 Binary files /dev/null and b/nce_cost/images/network_conf.png differ
diff --git a/nce_cost/infer.py b/nce_cost/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..53e3aef45fc02ac008caa7102836ac47915be1fc --- /dev/null +++ b/nce_cost/infer.py @@ -0,0 +1,70 @@
+# -*- encoding:utf-8 -*-
+import numpy as np
+import glob
+import gzip
+import paddle.v2 as paddle
+from nce_conf import network_conf
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+
+    prediction_layer = network_conf(
+        is_train=False,
+        hidden_size=128,
+        embedding_size=512,
+        dict_size=dict_size)
+
+    # load the latest saved model snapshot
+    models_list = glob.glob('./models/*')
+    models_list = sorted(models_list)
+
+    with gzip.open(models_list[-1], 'r') as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+
+    idx_word_dict = dict((v, k) for k, v in word_dict.items())
+
+    infer_data = []
+    infer_data_label = []
+    for item in paddle.dataset.imikolov.test(word_dict, 5)():
+        infer_data.append((item[:4]))
+        infer_data_label.append(item[4])
+        # Choose 100 samples from the test set to show how to infer.
+        if len(infer_data_label) == 100:
+            break
+
+    feeding = {
+        'firstw': 0,
+        'secondw': 1,
+        'thirdw': 2,
+        'fourthw': 3,
+        'fifthw': 4
+    }
+
+    predictions = paddle.infer(
+        output_layer=prediction_layer,
+        parameters=parameters,
+        input=infer_data,
+        feeding=feeding,
+        field=['value'])
+
+    for i, (prob, data,
+            label) in enumerate(zip(predictions, infer_data, infer_data_label)):
+        print '--------------------------'
+        print "No.%d Input: " % (i + 1) + \
+            idx_word_dict[data[0]] + ' ' + \
+            idx_word_dict[data[1]] + ' ' + \
+            idx_word_dict[data[2]] + ' ' + \
+            idx_word_dict[data[3]]
+        print 'Ground Truth Output: ' + idx_word_dict[label]
+        print 'Predict Output: ' + idx_word_dict[prob.argsort(
+            kind='heapsort', axis=0)[-1]]
+        print
+
+
+if __name__ == '__main__':
+    main()
diff --git a/nce_cost/nce_conf.py b/nce_cost/nce_conf.py new file mode 100644 index 0000000000000000000000000000000000000000..962a9ccc80906bc2272245d0e297142397ffb024 --- /dev/null +++ b/nce_cost/nce_conf.py @@ -0,0 +1,61 @@
+# -*- encoding:utf-8 -*-
+import math
+import paddle.v2 as paddle
+
+
+def network_conf(hidden_size, embedding_size, dict_size, is_train):
+
+    first_word = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    second_word = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    third_word = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourth_word = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    next_word = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    embed_param_attr = paddle.attr.Param(
+        name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
+    first_embedding = paddle.layer.embedding(
+        input=first_word, size=embedding_size, param_attr=embed_param_attr)
+    second_embedding = paddle.layer.embedding(
+        input=second_word, size=embedding_size, param_attr=embed_param_attr)
+    third_embedding = paddle.layer.embedding(
+        input=third_word, size=embedding_size, param_attr=embed_param_attr)
+    fourth_embedding = paddle.layer.embedding(
+        input=fourth_word, size=embedding_size, param_attr=embed_param_attr)
+
+    context_embedding = paddle.layer.concat(input=[
+        first_embedding, second_embedding, third_embedding, fourth_embedding
+    ])
+
+    hidden_layer = paddle.layer.fc(
+        input=context_embedding,
+        size=hidden_size,
+        act=paddle.activation.Tanh(),
+        bias_attr=paddle.attr.Param(learning_rate=1),
+        param_attr=paddle.attr.Param(
+            initial_std=1. / math.sqrt(embedding_size * 8), learning_rate=1))
+
+    if is_train:
+        cost = paddle.layer.nce(
+            input=hidden_layer,
+            label=next_word,
+            num_classes=dict_size,
+            param_attr=paddle.attr.Param(name='nce_w'),
+            bias_attr=paddle.attr.Param(name='nce_b'),
+            act=paddle.activation.Sigmoid(),
+            num_neg_samples=25,
+            neg_distribution=None)
+        return cost
+    else:
+        with paddle.layer.mixed(
+                size=dict_size,
+                act=paddle.activation.Softmax(),
+                bias_attr=paddle.attr.Param(name='nce_b')) as prediction:
+            prediction += paddle.layer.trans_full_matrix_projection(
+                input=hidden_layer, param_attr=paddle.attr.Param(name='nce_w'))
+
+        return prediction
diff --git a/nce_cost/train.py b/nce_cost/train.py new file mode 100644 index 0000000000000000000000000000000000000000..a8b437c1dd9bfc89fd03598b9a4201693c3074d7 --- /dev/null +++ b/nce_cost/train.py @@ -0,0 +1,52 @@
+# -*- encoding:utf-8 -*-
+import os
+import gzip
+
+import paddle.v2 as paddle
+from nce_conf import network_conf
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+
+    cost = network_conf(
+        is_train=True, hidden_size=128, embedding_size=512, dict_size=dict_size)
+
+    parameters = paddle.parameters.create(cost)
+    adam_optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
+    trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1000 == 0:
+                print "Pass %d, Batch %d, Cost %f" % (
+                    event.pass_id, event.batch_id, event.cost)
+
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(
+                paddle.batch(paddle.dataset.imikolov.test(word_dict, 5), 64))
+            print "Pass %d, Test Cost %f" % (event.pass_id, result.cost)
+
+            # make sure the snapshot directory exists before saving
+            if not os.path.isdir("./models"):
+                os.mkdir("./models")
+            model_name = "./models/model_pass_%05d.tar.gz" % event.pass_id
+            print "Save model into %s ..." % model_name
+            with gzip.open(model_name, 'w') as f:
+                parameters.to_tar(f)
+
+    feeding = {
+        'firstw': 0,
+        'secondw': 1,
+        'thirdw': 2,
+        'fourthw': 3,
+        'fifthw': 4
+    }
+
+    trainer.train(
+        paddle.batch(paddle.dataset.imikolov.train(word_dict, 5), 64),
+        num_passes=1000,
+        event_handler=event_handler,
+        feeding=feeding)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/nmt_without_attention/index.html b/nmt_without_attention/index.html new file mode 100644 index 0000000000000000000000000000000000000000..d749ff5722aa4144743fdca45f2ac0418c9db0b3 --- /dev/null +++ b/nmt_without_attention/index.html @@ -0,0 +1,417 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/sequence_tagging_for_ner/index.html b/sequence_tagging_for_ner/index.html new file mode 100644 index 0000000000000000000000000000000000000000..b7c6c8994abdbcd80ff7347960d984e5528311a1 --- /dev/null +++ b/sequence_tagging_for_ner/index.html @@ -0,0 +1,314 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/text_classification/index.html b/text_classification/index.html new file mode 100644 index 0000000000000000000000000000000000000000..3ee660d8471269bfebf2444fb7c4a97deb550561 --- /dev/null +++ b/text_classification/index.html @@ -0,0 +1,302 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/word_embedding/index.html b/word_embedding/index.html new file mode 100644 index 0000000000000000000000000000000000000000..83f6809d669d9ec6e0dd002f414ba8247068e270 --- /dev/null +++ b/word_embedding/index.html @@ -0,0 +1,227 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + +