diff --git a/.clang_format.hook b/.clang_format.hook
index 9db4fe4550c44fdb60e48818841d99ba5a081f46..40d70f56cf97f7b7f18bb255dae73ab1d542f12a 100755
--- a/.clang_format.hook
+++ b/.clang_format.hook
@@ -1,5 +1,15 @@
-#!/bin/bash
+#!/usr/bin/env bash
+set -e
 
-# clang-format hook without version check
+readonly VERSION="3.8"
+
+version=$(clang-format -version)
+
+if ! [[ $version == *"$VERSION"* ]]; then
+    echo "clang-format version check failed."
+    echo "a version containing '$VERSION' is required, but got '$version'"
+    echo "please install the required version and add a soft link to it in a directory on '\$PATH'"
+    exit 1
+fi
 
 clang-format $@
diff --git a/deep_speech_2/cloud/pcloud_submit.sh b/deep_speech_2/cloud/pcloud_submit.sh
index 378a7c6e624624af2d3fd004ff41154204a21334..99e458db96b819019628a26f05b3597ea951aeea 100644
--- a/deep_speech_2/cloud/pcloud_submit.sh
+++ b/deep_speech_2/cloud/pcloud_submit.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 TRAIN_MANIFEST="cloud/cloud_manifests/cloud.manifest.train"
 DEV_MANIFEST="cloud/cloud_manifests/cloud.manifest.dev"
diff --git a/deep_speech_2/cloud/pcloud_train.sh b/deep_speech_2/cloud/pcloud_train.sh
index 804f606a2bf604d2d36b599533b2af03ecfd7bbc..d0c47dece91c43d0cbfde1f6eb2dcc96fce36391 100644
--- a/deep_speech_2/cloud/pcloud_train.sh
+++ b/deep_speech_2/cloud/pcloud_train.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 TRAIN_MANIFEST=$1
 DEV_MANIFEST=$2
diff --git a/deep_speech_2/cloud/pcloud_upload_data.sh b/deep_speech_2/cloud/pcloud_upload_data.sh
index 4ef235ef7da57e5e1f611ddad8b7000528ab46cc..71bb4af19b3b30f6efc31cb9b60f4f3b330b46b9 100644
--- a/deep_speech_2/cloud/pcloud_upload_data.sh
+++ b/deep_speech_2/cloud/pcloud_upload_data.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 mkdir cloud_manifests
diff --git a/deep_speech_2/deploy/demo_server.py b/deep_speech_2/deploy/demo_server.py
index a7157001cf8ecf766329910350a51bea0f1c5275..7c5584191e73c3326943266ddaa59e369a284c88 100644
--- a/deep_speech_2/deploy/demo_server.py
+++ b/deep_speech_2/deploy/demo_server.py
@@ -100,7 +100,7 @@ class AsrRequestHandler(SocketServer.BaseRequestHandler):
         finish_time = time.time()
         print("Response Time: %f, Transcript: %s" %
               (finish_time - start_time, transcript))
-        self.request.sendall(transcript)
+        self.request.sendall(transcript.encode('utf-8'))
 
     def _write_to_file(self, data):
         # prepare save dir and filename
diff --git a/deep_speech_2/examples/librispeech/run_data.sh b/deep_speech_2/examples/librispeech/run_data.sh
index f65aa233b7868b587e1411a87c9e0a8141a94c91..bdd5abb5891c8af566ed889287248c2f207e59ba 100644
--- a/deep_speech_2/examples/librispeech/run_data.sh
+++ b/deep_speech_2/examples/librispeech/run_data.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/librispeech/run_infer.sh b/deep_speech_2/examples/librispeech/run_infer.sh
index 6b790502a536e144d3add3f1f187d3f5e7282888..eb812440be0b106cfaee07e7a5e78310999f9845 100644
--- a/deep_speech_2/examples/librispeech/run_infer.sh
+++ b/deep_speech_2/examples/librispeech/run_infer.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/librispeech/run_infer_golden.sh b/deep_speech_2/examples/librispeech/run_infer_golden.sh
index 679bd1bf8a3bffe2c96c27558c105a519a3dbf7d..eeccfdebbc5ab9b964a7b4eb506a7bda0f221ac3 100644
--- a/deep_speech_2/examples/librispeech/run_infer_golden.sh
+++ b/deep_speech_2/examples/librispeech/run_infer_golden.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/librispeech/run_test.sh b/deep_speech_2/examples/librispeech/run_test.sh
index 9709234abcf8cb321f38da85183319ccb23d7ae6..7ef06ba9fd17318f153b6048bb59f05eaf16a076 100644
--- a/deep_speech_2/examples/librispeech/run_test.sh
+++ b/deep_speech_2/examples/librispeech/run_test.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/librispeech/run_test_golden.sh b/deep_speech_2/examples/librispeech/run_test_golden.sh
index a505cdc79b92156b429b5adc64ab33f7279e6acc..86fe15306acce21268bf7e26b9d34aee9a31901d 100644
--- a/deep_speech_2/examples/librispeech/run_test_golden.sh
+++ b/deep_speech_2/examples/librispeech/run_test_golden.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/librispeech/run_train.sh b/deep_speech_2/examples/librispeech/run_train.sh
index 07575dde1240c9491061941bbd9388bb2ab03432..9aa5e0d163b97e10d7b442c97ad786717a1637d1 100644
--- a/deep_speech_2/examples/librispeech/run_train.sh
+++ b/deep_speech_2/examples/librispeech/run_train.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/librispeech/run_tune.sh b/deep_speech_2/examples/librispeech/run_tune.sh
index 05c024becab92cf76648b08f1a886733fdb31635..abc28d36630e4e5465250331c2d336999744d571 100644
--- a/deep_speech_2/examples/librispeech/run_tune.sh
+++ b/deep_speech_2/examples/librispeech/run_tune.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/mandarin/run_demo_client.sh b/deep_speech_2/examples/mandarin/run_demo_client.sh
index dfde20f8800ffd9bb71ade7c100709adfb814861..bf8e545147233283738af467f4320759b8ac2d75 100644
--- a/deep_speech_2/examples/mandarin/run_demo_client.sh
+++ b/deep_speech_2/examples/mandarin/run_demo_client.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/mandarin/run_demo_server.sh b/deep_speech_2/examples/mandarin/run_demo_server.sh
index 703184a6be354e186bb9ff5fa7ceb03c082b7ca0..b0d4bc7f1179fdcb90e3ceef57ec346ba2b9d558 100644
--- a/deep_speech_2/examples/mandarin/run_demo_server.sh
+++ b/deep_speech_2/examples/mandarin/run_demo_server.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 # TODO: replace the model with a mandarin model
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/tiny/run_data.sh b/deep_speech_2/examples/tiny/run_data.sh
index 46266daaf66224393a0477c73bd3805330b77692..a98dab21439d3479c18710f8cb7b01ba67b2ca8f 100644
--- a/deep_speech_2/examples/tiny/run_data.sh
+++ b/deep_speech_2/examples/tiny/run_data.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/tiny/run_infer.sh b/deep_speech_2/examples/tiny/run_infer.sh
index 1d33bfbba2668d2f9cf5cb2518a5dcb26a34b3c0..dafc99d9c5828e8084cfc4a74a92395a0be09167 100644
--- a/deep_speech_2/examples/tiny/run_infer.sh
+++ b/deep_speech_2/examples/tiny/run_infer.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/tiny/run_infer_golden.sh b/deep_speech_2/examples/tiny/run_infer_golden.sh
index 32e9d8623fb53db8b1dc0f8129167c8d53d265d8..66360a6917d35140f9ad55190003c468ab511add 100644
--- a/deep_speech_2/examples/tiny/run_infer_golden.sh
+++ b/deep_speech_2/examples/tiny/run_infer_golden.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/tiny/run_test.sh b/deep_speech_2/examples/tiny/run_test.sh
index f9c3cc11ce1257bbef10211796757ce877b81db7..70cf4bfe2e4fc3f23b7ca96e2b161e4ff35443d9 100644
--- a/deep_speech_2/examples/tiny/run_test.sh
+++ b/deep_speech_2/examples/tiny/run_test.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/tiny/run_test_golden.sh b/deep_speech_2/examples/tiny/run_test_golden.sh
index 080c3c0622d62169d63f0e1f1bf3d9ceb7d24da0..e188c81b3ffecccd6331be2a4016b21de3ccc572 100644
--- a/deep_speech_2/examples/tiny/run_test_golden.sh
+++ b/deep_speech_2/examples/tiny/run_test_golden.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/tiny/run_train.sh b/deep_speech_2/examples/tiny/run_train.sh
index 74d82712e6e2e812da0b22552654d1ef7eadffbb..3c2b8a1e01ed9b67ec1fe420d3a7a74af8c73f4a 100644
--- a/deep_speech_2/examples/tiny/run_train.sh
+++ b/deep_speech_2/examples/tiny/run_train.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/examples/tiny/run_tune.sh b/deep_speech_2/examples/tiny/run_tune.sh
index 360c11d596db6c01ad76ab2c81a8aa10776f7cc4..926e9f8d5aecda7924c8a098d4a33f9c1c77a3dd 100644
--- a/deep_speech_2/examples/tiny/run_tune.sh
+++ b/deep_speech_2/examples/tiny/run_tune.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 pushd ../.. > /dev/null
diff --git a/deep_speech_2/model_utils/model.py b/deep_speech_2/model_utils/model.py
index cf146f8ce988c528b8b61127562327e11aadff6b..09ee3c7615df6e87cd7f3fee9ae0ccd8bf9a9e3c 100644
--- a/deep_speech_2/model_utils/model.py
+++ b/deep_speech_2/model_utils/model.py
@@ -7,6 +7,7 @@ import sys
 import os
 import time
 import gzip
+from distutils.dir_util import mkpath
 import paddle.v2 as paddle
 from model_utils.lm_scorer import LmScorer
 from model_utils.decoder import ctc_greedy_decoder, ctc_beam_search_decoder
@@ -79,7 +80,7 @@ class DeepSpeech2Model(object):
         """
         # prepare model output directory
         if not os.path.exists(output_model_dir):
-            os.mkdir(output_model_dir)
+            mkpath(output_model_dir)
 
         # prepare optimizer and trainer
         optimizer = paddle.optimizer.Adam(
diff --git a/deep_speech_2/models/aishell/download_model.sh b/deep_speech_2/models/aishell/download_model.sh
index 4368ee55af8c062c2ac5d7e1bcc56d086a186887..77fc84b5322d2301ae8bc8caabed7c27dd5f932d 100644
--- a/deep_speech_2/models/aishell/download_model.sh
+++ b/deep_speech_2/models/aishell/download_model.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 source ../../utils/utility.sh
diff --git a/deep_speech_2/models/librispeech/download_model.sh b/deep_speech_2/models/librispeech/download_model.sh
index b5fcd7d8c133ea27d1f10d90b8d09e15821a220e..336502de87d77459063d1eaec8060a22a040b469 100644
--- a/deep_speech_2/models/librispeech/download_model.sh
+++ b/deep_speech_2/models/librispeech/download_model.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 source ../../utils/utility.sh
diff --git a/deep_speech_2/models/lm/download_lm_ch.sh b/deep_speech_2/models/lm/download_lm_ch.sh
index 7f1c47a27641cb07e4ab638b2949e667abcc473d..46bfe9329949fb0b9d579e09c8b77bc68c73776a 100644
--- a/deep_speech_2/models/lm/download_lm_ch.sh
+++ b/deep_speech_2/models/lm/download_lm_ch.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 source ../../utils/utility.sh
diff --git a/deep_speech_2/models/lm/download_lm_en.sh b/deep_speech_2/models/lm/download_lm_en.sh
index e967e25dc4c383276a1c8c93b3124081e80ad57b..fbfe647e9ece114f09fffee26aa427c489c9ee35 100644
--- a/deep_speech_2/models/lm/download_lm_en.sh
+++ b/deep_speech_2/models/lm/download_lm_en.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/bash
+#! /usr/bin/env bash
 
 source ../../utils/utility.sh
diff --git a/deep_speech_2/setup.sh b/deep_speech_2/setup.sh
index 6c8a709941ae94124149482f1886bf445c170af8..15c6e1e25ef19cca02a80d80af05ceeb55658d09 100644
--- a/deep_speech_2/setup.sh
+++ b/deep_speech_2/setup.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#! /usr/bin/env bash
 
 # install python dependencies
 if [ -f "requirements.txt" ]; then
diff --git a/dssm/README.cn.md b/dssm/README.cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..b65c11df7d00f34b8378c92371858ca383827a1d
--- /dev/null
+++ b/dssm/README.cn.md
@@ -0,0 +1,502 @@
+# Deep Structured Semantic Models (DSSM)
+DSSM uses a DNN to learn low-dimensional representations of text in a continuous semantic space and to model the semantic similarity between two sentences.
+This example shows how to implement a generic DSSM model with PaddlePaddle for modeling the semantic similarity between two strings.
+The implementation accepts a generic data format, so the model can be used in real-world scenarios simply by plugging in your own data.
+
+## Background
+DSSM \[[1](#参考文献)\] is a classic semantic model proposed by Microsoft Research in 2013 for learning the semantic distance between two pieces of text.
+More broadly, it also applies to scenarios such as:
+
+1. CTR prediction, measuring how relevant a user query is to a set of candidate web pages (documents).
+2. Text relevance, measuring the semantic relatedness of two strings.
+3. Recommendation, measuring the relatedness between a user and a recommended item.
+
+DSSM has grown into a framework that naturally models the distance between two records.
+For text relevance, cosine similarity can express the semantic distance;
+for ranking search-engine results, a rank loss can be attached on top of DSSM to train a ranking model.
+
+## Model overview
+In the original paper \[[1](#参考文献)\], DSSM measures the latent semantic relationship between a user query and a set of documents. The model structure is shown below.
+
+Figure 1. The original DSSM architecture
+
+The underlying idea is to **use a DNN to map high-dimensional sparse feature vectors into continuous vectors in a low-dimensional space (the red boxes in the figure)** and **to measure the semantic relevance between the query and each candidate document with cosine similarity at the top layer**.
+
+For the loss at the very top, the original model uses negative sampling similar to Word2Vec: for each query, one positive document $D+$ and four negative documents $D-$ are drawn, a conditional probability is computed over the whole set, and the log-likelihood serves as the loss. This is the $P(D_1|Q)$-like structure in Figure 1; see the original paper for details.
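+
+For reference, this posterior is a softmax over smoothed cosine similarities, and training maximizes the likelihood of the clicked documents. Roughly (with $\gamma$ a smoothing factor and $\boldsymbol{D}$ the set containing $D+$ and the sampled negatives):
+
+$$P(D|Q)=\frac{\exp\left(\gamma \cos(Q,D)\right)}{\sum_{D'\in \boldsymbol{D}}\exp\left(\gamma \cos(Q,D')\right)},\qquad \mathcal{L}=-\log\prod_{(Q,D^{+})}P(D^{+}|Q)$$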
+
+As the model was later refined, its structure was simplified \[[3](#参考文献)\] and evolved into:
+
+Figure 2. The generalized DSSM structure
+
+The blank boxes in the figure can be filled with any model, such as fully connected (FC) layers, a CNN, or an RNN; the structure is dedicated to measuring the semantic distance between two items (for example, two strings).
+
+In practice, DSSM serves as a basic building block that is combined with different loss functions to solve a concrete task, for example:
+
+- In learning to rank, attaching a pairwise rank loss to the structure in Figure 2 yields a ranking model.
+- In CTR prediction, treating click/no-click as 0/1 binary classification and adding a cross-entropy loss yields a classification model.
+- When a string needs a score, cosine similarity can be used directly, yielding a regression model.
+
+This example aims at a fairly general, application-oriented solution. The supported task types are:
+
+- Classification
+- Regression with values in [-1, 1]
+- Pairwise-Rank
+
+For producing the low-dimensional semantic vectors, three architectures are supported:
+
+- FC, multi-layer fully connected layers
+- CNN, convolutional neural network
+- RNN, recurrent neural network
+
+## Model implementation
+The DSSM model splits into three parts: the left DNN, the right DNN, and the loss function on top.
+For complex tasks the two DNNs may differ; in the original paper the left and right sides learn semantic vectors for the query and the documents respectively, and since their data differ, customizing each DNN accordingly is recommended.
+
+For simplicity and generality, this example uses the same structure on both sides, so there are only three options: FC, CNN, and RNN.
+
+Three loss functions are supported: classification, regression, and ranking.
+For the regression and ranking losses, the match between the two sides is measured with cosine similarity;
+for classification, the distribution over classes is computed with softmax.
+
+Much of the above is covered in detail in other tutorials, for example:
+
+- Using a CNN or FC layers for text feature extraction: [text classification](https://github.com/PaddlePaddle/models/blob/develop/text_classification/README.md#模型详解)
+- RNN/GRU: [Machine Translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md#gated-recurrent-unit-gru)
+- Pairwise Rank, i.e. learning to rank: [learn to rank](https://github.com/PaddlePaddle/models/blob/develop/ltr/README.md)
+
+These principles are not repeated here; the rest of this document focuses on implementing the structures above with PaddlePaddle.
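+
+As a quick preview, the sketch below is illustrative only: it shows how the three task types map onto PaddlePaddle cost layers, with `left_vec`, `right_vec`, `source_vec`, `label`, and `class_num` standing in for the layers and settings that the rest of this document builds (the real wiring lives in `network_conf.py`).
+
+```python
+import paddle.v2 as paddle
+
+
+def make_classification_cost(left_vec, right_vec, label, class_num):
+    # concatenate both semantic vectors and classify with a softmax FC layer
+    feats = paddle.layer.concat([left_vec, right_vec])
+    prediction = paddle.layer.fc(
+        input=feats, size=class_num, act=paddle.activation.Softmax())
+    return paddle.layer.classification_cost(input=prediction, label=label)
+
+
+def make_regression_cost(left_vec, right_vec, label):
+    # cosine similarity lies in [-1, 1] and is regressed against the label
+    score = paddle.layer.cos_sim(left_vec, right_vec)
+    return paddle.layer.mse_cost(score, label)
+
+
+def make_rank_cost(source_vec, left_vec, right_vec, label):
+    # score one source against two targets; learn which one should rank higher
+    left_score = paddle.layer.cos_sim(source_vec, left_vec)
+    right_score = paddle.layer.cos_sim(source_vec, right_vec)
+    return paddle.layer.rank_cost(left_score, right_score, label=label)
+```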
+
+As shown in Figure 3, the regression and classification models share a very similar structure:
+
+Figure 3. DSSM for REGRESSION or CLASSIFICATION
+
+The key components are the word embeddings, the two low-dimensional vector learners `(1)` and `(2)` in the figure (each can be implemented with RNN, CNN, or FC), and the loss function at the top.
+
+The Pairwise Rank structure is a bit more involved: as in Figure 4, it combines two copies of the structure in Figure 3 and adds the corresponding loss:
+
+- The overall idea is to score the left and right targets with the same source, giving `(a)` and `(b)`; the learning objective is the ordering between (a) and (b).
+- `(a)` and `(b)` have the structure of Figure 3 and each scores one source-target pair.
+- `(1)` and `(2)` actually share the same structure and represent the same source; the figure draws them twice only for clarity.
+
+Figure 4. DSSM for Pairwise Rank
+
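+For reference, the `paddle.layer.rank_cost` layer used in the rank model below follows a RankNet-style pairwise logistic loss: with $s_{left}$ and $s_{right}$ the two cosine scores and $\tilde{P}\in\{0,1\}$ the label (1 when the left target should rank first), the cost is roughly
+
+$$o = s_{left}-s_{right},\qquad C = -\tilde{P}\,o+\log\left(1+e^{o}\right)$$
+
+Minimizing $C$ pushes $s_{left}$ above $s_{right}$ whenever the label is 1.
+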
+The following sections walk through the implementation of each part; all of the code lives in `./network_conf.py`.
+
+### Creating the word embedding table
+
+```python
+def create_embedding(self, input, prefix=''):
+    '''
+    Create an embedding table whose name has a `prefix`.
+    '''
+    logger.info("create embedding table [%s] which dimention is %d" %
+                (prefix, self.dnn_dims[0]))
+    emb = paddle.layer.embedding(
+        input=input,
+        size=self.dnn_dims[0],
+        param_attr=ParamAttr(name='%s_emb.w' % prefix))
+    return emb
+```
+
+Because the input to the embedding table is the list of word IDs of a sentence, the embedding table outputs a sequence of word vectors.
+
+### CNN implementation
+
+```python
+def create_cnn(self, emb, prefix=''):
+    '''
+    A multi-layer CNN.
+
+    @emb: paddle.layer
+        output of the embedding layer
+    @prefix: str
+        prefix of layers' names, used to share parameters between more than one `cnn` parts.
+    '''
+
+    def create_conv(context_len, hidden_size, prefix):
+        key = "%s_%d_%d" % (prefix, context_len, hidden_size)
+        conv = paddle.networks.sequence_conv_pool(
+            input=emb,
+            context_len=context_len,
+            hidden_size=hidden_size,
+            # set parameter attr for parameter sharing
+            context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
+            fc_param_attr=ParamAttr(name=key + '_fc.w'),
+            fc_bias_attr=ParamAttr(name=key + '_fc.b'),
+            pool_bias_attr=ParamAttr(name=key + '_pool.b'))
+        return conv
+
+    logger.info('create a sequence_conv_pool which context width is 3')
+    conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
+    logger.info('create a sequence_conv_pool which context width is 4')
+    conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
+    return conv_3, conv_4
+```
+
+The CNN takes the sequence of word vectors produced by the embedding table, captures the key information of the original sentence through convolution and pooling, and outputs a semantic vector (which can be regarded as a sentence vector).
+
+In this implementation, the sentence vectors learned by CNNs with window sizes 3 and 4 are summed element-wise to obtain the final sentence vector.
+
+### RNN implementation
+
+RNNs are well suited to learning from variable-length sequences, and using an RNN to encode a sentence is close to standard practice in natural language processing.
+
+```python
+def create_rnn(self, emb, prefix=''):
+    '''
+    A GRU sentence vector learner.
+    '''
+    gru = paddle.layer.gru_memory(input=emb,)
+    sent_vec = paddle.layer.last_seq(gru)
+    return sent_vec
+```
+
+### FC implementation
+
+```python
+def create_fc(self, emb, prefix=''):
+    '''
+    A multi-layer fully connected neural networks.
+
+    @emb: paddle.layer
+        output of the embedding layer
+    @prefix: str
+        prefix of layers' names, used to share parameters between more than one `fc` parts.
+    '''
+    _input_layer = paddle.layer.pooling(
+        input=emb, pooling_type=paddle.pooling.Max())
+    fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
+    return fc
+```
+
+To build the FC variant, `paddle.layer.pooling` first max-pools the word vector sequence into a single fixed-dimension vector that represents the whole sentence; max pooling reduces the influence of sentence length on the sentence representation.
+
+### Multi-layer DNN implementation
+After the CNN/RNN/FC part produces a semantic vector, additional fully connected layers can be stacked on top to form a deeper DNN.
+
+```python
+def create_dnn(self, sent_vec, prefix):
+    # if more than three layers exists, a fc layer will be added.
+    if len(self.dnn_dims) > 1:
+        _input_layer = sent_vec
+        for id, dim in enumerate(self.dnn_dims[1:]):
+            name = "%s_fc_%d_%d" % (prefix, id, dim)
+            logger.info("create fc layer [%s] which dimention is %d" %
+                        (name, dim))
+            fc = paddle.layer.fc(
+                input=_input_layer,
+                size=dim,
+                name=name,
+                act=paddle.activation.Tanh(),
+                param_attr=ParamAttr(name='%s.w' % name),
+                bias_attr=ParamAttr(name='%s.b' % name),
+            )
+            _input_layer = fc
+    return _input_layer
+```
+
+### Classification or regression
+The classification and regression structures are similar enough that a single function can build both:
+
+```python
+def _build_classification_or_regression_model(self, is_classification):
+    '''
+    Build a classification/regression model, and the cost is returned.
+
+    A Classification has 3 inputs:
+    - source sentence
+    - target sentence
+    - classification label
+
+    '''
+    # prepare inputs.
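+    # Three data layers are declared below: the source word-id sequence,
+    # the target word-id sequence, and a label; for classification the
+    # label is an integer class id, for regression a dense real value.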
+    assert self.class_num
+
+    source = paddle.layer.data(
+        name='source_input',
+        type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
+    target = paddle.layer.data(
+        name='target_input',
+        type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
+    label = paddle.layer.data(
+        name='label_input',
+        type=paddle.data_type.integer_value(self.class_num)
+        if is_classification else paddle.data_type.dense_input)
+
+    prefixs = '_ _'.split(
+    ) if self.share_semantic_generator else 'left right'.split()
+    embed_prefixs = '_ _'.split(
+    ) if self.share_embed else 'left right'.split()
+
+    word_vecs = []
+    for id, input in enumerate([source, target]):
+        x = self.create_embedding(input, prefix=embed_prefixs[id])
+        word_vecs.append(x)
+
+    semantics = []
+    for id, input in enumerate(word_vecs):
+        x = self.model_arch_creater(input, prefix=prefixs[id])
+        semantics.append(x)
+
+    concated_vector = paddle.layer.concat(semantics)
+    prediction = paddle.layer.fc(
+        input=concated_vector,
+        size=self.class_num,
+        act=paddle.activation.Softmax())
+    cost = paddle.layer.classification_cost(
+        input=prediction,
+        label=label) if is_classification else paddle.layer.mse_cost(
+            prediction, label)
+    return cost, prediction, label
+```
+
+### Pairwise Rank implementation
+Pairwise Rank reuses the DNN structure above: the same source is scored against two targets by similarity; the prediction is 1 if the left target scores higher, and 0 otherwise.
+
+```python
+def _build_rank_model(self):
+    '''
+    Build a pairwise rank model, and the cost is returned.
+
+    A pairwise rank model has 4 inputs:
+    - source sentence
+    - left_target sentence
+    - right_target sentence
+    - label, 1 if left_target should be sorted in front of right_target, otherwise 0.
+    '''
+    source = paddle.layer.data(
+        name='source_input',
+        type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
+    left_target = paddle.layer.data(
+        name='left_target_input',
+        type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
+    right_target = paddle.layer.data(
+        name='right_target_input',
+        type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
+    label = paddle.layer.data(
+        name='label_input', type=paddle.data_type.integer_value(1))
+
+    prefixs = '_ _ _'.split(
+    ) if self.share_semantic_generator else 'source left right'.split()
+    embed_prefixs = '_ _'.split(
+    ) if self.share_embed else 'source target target'.split()
+
+    word_vecs = []
+    for id, input in enumerate([source, left_target, right_target]):
+        x = self.create_embedding(input, prefix=embed_prefixs[id])
+        word_vecs.append(x)
+
+    semantics = []
+    for id, input in enumerate(word_vecs):
+        x = self.model_arch_creater(input, prefix=prefixs[id])
+        semantics.append(x)
+
+    # cossim score of source and left_target
+    left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
+    # cossim score of source and right target
+    right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
+
+    # rank cost
+    cost = paddle.layer.rank_cost(left_score, right_score, label=label)
+    # prediction = left_score - right_score
+    # but this operator is not supported currently.
+    # so AUC will not be used.
+    return cost, None, None
+```
+
+## Data format
+Simple sample data is provided under `./data`.
+
+### Regression data format
+```
+# 3 fields each line:
+# - source's word ids
+# - target's word ids
+# - target
+<source's word ids> \t <target's word ids> \t <target>
+```
+
+For example:
+
+```
+3 6 10 \t 6 8 33 \t 0.7
+6 0 \t 6 9 330 \t 0.03
+```
+
+### Classification data format
+```
+# 3 fields each line:
+# - source's word ids
+# - target's word ids
+# - target
+<source's word ids> \t <target's word ids> \t <target>