From 4f3aebce1a6effb5ee697c36a7ffadd622e466cc Mon Sep 17 00:00:00 2001
From: guosheng 
Date: Thu, 28 May 2020 04:38:59 +0000
Subject: [PATCH] add sim_net model

---
 examples/similarity_net/README.md             | 199 ++++
 examples/similarity_net/config.py             |  60 +++
 examples/similarity_net/download.py           | 154 +++++++
 examples/similarity_net/download_data.sh      |   5 +
 examples/similarity_net/nets/bow.py           | 115 +++++
 examples/similarity_net/nets/cnn.py           | 115 +++++
 .../similarity_net/nets/losses/hinge_loss.py  |  39 ++
 .../similarity_net/nets/losses/log_loss.py    |  32 ++
 .../nets/losses/softmax_cross_entropy_loss.py |  31 ++
 examples/similarity_net/reader.py             | 280 ++++++++++++
 examples/similarity_net/run.sh                | 101 +++++
 examples/similarity_net/run_classifier.py     | 426 ++++++++++++++++++
 examples/similarity_net/utils.py              | 244 ++++++++++
 13 files changed, 1801 insertions(+)
 create mode 100644 examples/similarity_net/README.md
 create mode 100644 examples/similarity_net/config.py
 create mode 100644 examples/similarity_net/download.py
 create mode 100644 examples/similarity_net/download_data.sh
 create mode 100644 examples/similarity_net/nets/bow.py
 create mode 100644 examples/similarity_net/nets/cnn.py
 create mode 100644 examples/similarity_net/nets/losses/hinge_loss.py
 create mode 100644 examples/similarity_net/nets/losses/log_loss.py
 create mode 100644 examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py
 create mode 100644 examples/similarity_net/reader.py
 create mode 100644 examples/similarity_net/run.sh
 create mode 100644 examples/similarity_net/run_classifier.py
 create mode 100644 examples/similarity_net/utils.py

diff --git a/examples/similarity_net/README.md b/examples/similarity_net/README.md
new file mode 100644
index 0000000..4f7270b
--- /dev/null
+++ b/examples/similarity_net/README.md
@@ -0,0 +1,199 @@
+# Short Text Semantic Matching
+## Introduction
+### Task description
+SimilarityNet (SimNet) is a framework for computing the semantic similarity of short texts: given two pieces of user-provided text, it returns a similarity score. SimNet is widely used across Baidu products. It covers core network structures such as BOW, CNN, RNN and MMDNN, provides a training and prediction framework for semantic similarity, and suits scenarios such as information retrieval, news recommendation and intelligent customer service, helping solve semantic matching problems.
+
+### Results
+Using Baidu's large-scale search data, we trained a SimNet-BOW-Pairwise semantic matching model. In several real FAQ question-answering scenarios it improves AUC by more than 5% over literal, surface-form similarity methods. We evaluated it on Baidu's in-house test sets (covering chit-chat, customer service and other data); the results are shown below.
+
+
+| Model | Zhidao | ECOM | QQSIM | UNICOM |
+|:-----------:|:-------------:|:-------------:|:-------------:|:-------------:|
+| | AUC | AUC | AUC | positive/negative order ratio |
+|BOW_Pairwise|0.6815|0.7331|0.7638|1.5565|
+
+
+#### Test sets
+| Dataset | Source | Domain |
+|:-----------:|:-------------:|:-------------:|
+|Zhidao | Baidu Zhidao questions | general |
+|ECOM | commercial queries | finance |
+|QQSIM | chit-chat dialogue | general |
+|UNICOM | China Unicom customer service | customer service |
+## Quick start
+#### Dependencies
+
+This project depends on PaddlePaddle Fluid 1.7; see the [installation guide](http://www.paddlepaddle.org/#quick-start).
+
+
+#### Getting the code
+Clone the repository to your local machine:
+```shell
+git clone https://github.com/PaddlePaddle/models.git
+
+cd models/dygraph/similarity_net
+```
+#### Data preparation
+Download the preprocessed data. After the command finishes, the data directory contains sample training data, sample test data and the corresponding term-to-id dictionary (term2id.dict).
+
+```shell
+sh download_data.sh
+```
+or
+```
+python download.py dataset
+```
+#### Pretrained model
+We release a ```pairwise``` model trained on large-scale data (using the BOW network). Download it with the command below; it is saved under ```./model_files/simnet_bow_pairwise_pretrained_model/```.
+```shell
+sh download_pretrained_model.sh
+```
+or
+
+```
+python download.py model
+```
+
+#### Evaluation
+We release our in-house test sets: Zhidao, ECOM, QQSIM and UNICOM. With the pretrained model above, enter the evaluate directory and run the commands below to get the evaluation result on each test set.
+```shell
+sh evaluate_ecom.sh
+sh evaluate_qqsim.sh
+sh evaluate_zhidao.sh
+sh evaluate_unicom.sh
+```
+You can also point TEST_DATA_PATH in ./run.sh at your own test set and evaluate it with:
+```shell
+sh run.sh eval
+```
+
+#### Inference
+With the pretrained model above, run the command below to predict similarity scores and save the results locally.
+```shell
+sh run.sh infer
+```
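+The snippet below is a minimal, hypothetical post-processing sketch and is not part of this project: it assumes the default `INFER_RESULT_PATH=./infer_result` from `run.sh` and the `query1 \t query2 \t score` line layout written by `run_classifier.py`, and simply lists the highest-scoring pairs.
+
+```python
+import io
+
+# Each inference output line is assumed to be: query1 \t query2 \t score
+pairs = []
+with io.open("./infer_result", "r", encoding="utf8") as f:
+    for line in f:
+        fields = line.rstrip("\n").split("\t")
+        if len(fields) != 3:
+            continue  # skip malformed lines
+        query1, query2, score = fields[0], fields[1], float(fields[2])
+        pairs.append((score, query1, query2))
+
+# print the 10 most similar pairs
+for score, q1, q2 in sorted(pairs, reverse=True)[:10]:
+    print("%.4f\t%s\t%s" % (score, q1, q2))
+```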
+#### Training and validation
+You can build a training set and a development set from the sample data, then run the command below to train the model and validate it on the development set.
+```shell
+sh run.sh train
+```
+You can also set INIT_CHECKPOINT inside the train() function of ./run.sh to load a trained model and warm-start training from it.
+## Advanced usage
+
+### Task definition and modeling
+
+Traditional text matching techniques, such as the vector space model (VSM) and BM25 used in information retrieval, mainly address lexical-level similarity; in practice their effectiveness suffers from polysemy and differences in sentence structure. SimNet keeps the implicit continuous vector representation of semantics, but models semantic matching end-to-end in a deep learning framework and unifies the ```point-wise``` and ```pair-wise``` supervised learning modes in a single framework. In real applications, massive user click logs can be turned into large-scale weakly labeled data; its first deployment on web search already showed great power and brought a clear improvement in relevance.
+
+### Model overview
+
+The overall SimNet structure is shown below:
+
+*(figure: SimNet network structure)*
+
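+As a rough illustration of the two supervision modes mentioned above, the NumPy sketch below (not part of SimNet itself, using made-up similarity scores) contrasts a pairwise hinge-style loss over (positive, negative) score pairs with a pointwise cross-entropy loss over 0/1 labels:
+
+```python
+import numpy as np
+
+def pairwise_hinge(pos_sim, neg_sim, margin=0.1):
+    # penalize triples where the negative title is not at least
+    # `margin` below the positive title in similarity
+    return np.maximum(0.0, neg_sim - pos_sim + margin).mean()
+
+def pointwise_cross_entropy(probs, labels):
+    # probs: per-pair softmax over {dissimilar, similar}; labels: 0 or 1
+    return -np.log(probs[np.arange(len(labels)), labels]).mean()
+
+pos_sim = np.array([0.82, 0.40])   # sim(query, positive title)
+neg_sim = np.array([0.35, 0.38])   # sim(query, negative title)
+print(pairwise_hinge(pos_sim, neg_sim))
+
+probs = np.array([[0.2, 0.8], [0.7, 0.3]])
+labels = np.array([1, 0])
+print(pointwise_cross_entropy(probs, labels))
+```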
+
+### Data format
+
+There are two training modes: ```pairwise``` and ```pointwise```.
+
+#### pairwise mode:
+Training set format: query \t pos_query \t neg_query.
+query, pos_query and neg_query are space-tokenized Chinese text separated by tab characters ('\t'); pos_query is a positive example similar to query, and neg_query is a randomly sampled negative example dissimilar to query; files are UTF-8 encoded.
+```
+现在 安卓模拟器 哪个 好 用	电脑 安卓模拟器 哪个 更好	电信 手机 可以 用 腾讯 大王 卡 吗 ？
+土豆 一亩地 能 收 多少 斤	一亩 地土豆 产 多少 斤	一亩 地 用 多少 斤 土豆 种子
+```
+
+Development and test set format: query1 \t query2 \t label.
+
+query1 and query2 are space-tokenized Chinese text; label is 0 or 1, where 1 means query1 and query2 are similar and 0 means they are not; query1, query2 and label are separated by tab characters ('\t'); files are UTF-8 encoded.
+```
+现在 安卓模拟器 哪个 好 用	电脑 安卓模拟器 哪个 更好	1
+为什么 头发 掉 得 很厉害	我 头发 为什么 掉 得 厉害	1
+常喝 薏米 水 有 副 作用 吗	女生 可以 长期 喝 薏米 水养生 么	0
+长 的 清新 是 什么 意思	小 清新 的 意思 是 什么	0
+```
+
+#### pointwise mode:
+
+The training, development and test sets share the same format: query1 and query2 are space-tokenized Chinese text; label is 0 or 1, where 1 means query1 and query2 are similar and 0 means they are not; query1, query2 and label are separated by tab characters ('\t'); files are UTF-8 encoded.
+```
+现在 安卓模拟器 哪个 好 用	电脑 安卓模拟器 哪个 更好	1
+为什么 头发 掉 得 很厉害	我 头发 为什么 掉 得 厉害	1
+常喝 薏米 水 有 副 作用 吗	女生 可以 长期 喝 薏米 水养生 么	0
+长 的 清新 是 什么 意思	小 清新 的 意思 是 什么	0
+```
+
+#### infer data:
+
+```pairwise``` and ```pointwise``` share the same infer data format: query1 \t query2.
+
+query1 and query2 are space-tokenized Chinese text.
+```
+怎么 调理 湿热 体质 ？	湿热 体质 怎样 调理 啊
+搞笑 电影	美国 搞笑 的 美国 电影
+```
+
+__Note__: the project also ships a word-segmentation preprocessing script (under the preprocess directory), used as follows:
+
+```shell
+python tokenizer.py --test_data_dir ./test.txt.utf8 --batch_size 1 > test.txt.utf8.seg
+```
+test.txt.utf8 is the UTF-8 file to segment, with one text item per line; the segmented result is written to test.txt.utf8.seg.
+
+### Code structure
+```text
+.
+├── run_classifier.py: main entry of the project, wrapping training, prediction and evaluation
+├── config.py: configuration class of the models; reads the model type and its hyper-parameters
+├── reader.py: functions for reading the data
+├── utils.py: other common utility functions
+├── config: configuration files for the different models
+├── download.py: script for downloading the data and pretrained models
+├── nets: dygraph-based network structures
+```
+
+### How to train
+```shell
+python run_classifier.py \
+    --task_name ${TASK_NAME} \
+    --use_cuda false \                       # whether to use GPU
+    --do_train True \                        # whether to train
+    --do_valid True \                        # whether to evaluate the dev set during training
+    --do_test True \                         # whether to evaluate the test set
+    --do_infer False \                       # whether to run inference
+    --batch_size 128 \                       # batch size
+    --train_data_dir ${TRAIN_DATA_PATH} \    # path to the training set
+    --valid_data_dir ${VALID_DATA_PATH} \    # path to the dev set
+    --test_data_dir ${TEST_DATA_PATH} \      # path to the test set
+    --infer_data_dir ${INFER_DATA_PATH} \    # path to the data to run inference on
+    --output_dir ${CKPT_PATH} \              # directory where models are saved
+    --config_path ${CONFIG_PATH} \           # path to the config file
+    --vocab_path ${VOCAB_PATH} \             # path to the vocabulary
+    --epoch 10 \                             # number of epochs
+    --save_steps 1000 \                      # save the model every save_steps steps
+    --validation_steps 100 \                 # evaluate the dev set every validation_steps steps
+    --task_mode ${TASK_MODE} \               # training mode, pairwise or pointwise, matching the config file
+    --compute_accuracy False \               # whether to compute accuracy
+    --lamda 0.91 \                           # threshold for computing accuracy in pairwise mode
+    --init_checkpoint ""                     # path of a pretrained model to load
+```
+### How to build your own model
+You can build a custom model for your own needs as follows:
+
+i. Define your own network
+
+Define your model under ```./nets/```;
+
+ii. Update the model configuration
+
+Create a configuration file for your model following the files under ```config```.
+
+Keep the ```net```, ```loss```, ```optimizer```, ```task_mode``` and ```model_path``` fields. ```net``` holds the parameters of your custom model; ```task_mode``` is the training mode, ```pairwise``` or ```pointwise```, and must be consistent with the ```--task_mode``` option of the training command; ```model_path``` is the directory where models are saved; fill in ```loss``` and ```optimizer``` as your model requires, following the other files under ```config```.
+
+
+iii. Train the model by running the training, evaluation and inference scripts as described above.
+
+## Others
+### Contributing
+If you can fix an issue or add a new feature, we welcome your pull requests. Accepted PRs are scored by the quality and difficulty of the contribution (0-5, higher is better); once you accumulate 10 points you can contact us for an interview opportunity or a recommendation letter.
diff --git a/examples/similarity_net/config.py b/examples/similarity_net/config.py
new file mode 100644
index 0000000..bfd3260
--- /dev/null
+++ b/examples/similarity_net/config.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" +SimNet config +""" + +import six +import json +import io + + +class SimNetConfig(object): + """ + simnet Config + """ + + def __init__(self, args): + self.task_mode = args.task_mode + self.config_path = args.config_path + self._config_dict = self._parse(args.config_path) + + def _parse(self, config_path): + try: + with io.open(config_path) as json_file: + config_dict = json.load(json_file) + except Exception: + raise IOError("Error in parsing simnet model config file '%s'" % + config_path) + + else: + if config_dict["task_mode"] != self.task_mode: + raise ValueError( + "the config '{}' does not match the task_mode '{}'".format( + self.config_path, self.task_mode)) + return config_dict + + def __getitem__(self, key): + return self._config_dict[key] + + def __setitem__(self, key, value): + self._config_dict[key] = value + + def print_config(self): + """ + Print Config + """ + for arg, value in sorted(six.iteritems(self._config_dict)): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') diff --git a/examples/similarity_net/download.py b/examples/similarity_net/download.py new file mode 100644 index 0000000..93b69f6 --- /dev/null +++ b/examples/similarity_net/download.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Download script, download dataset and pretrain models. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import io +import os +import sys +import time +import hashlib +import tarfile +import requests + + +def usage(): + desc = ( + "\nDownload datasets and pretrained models for SimilarityNet task.\n" + "Usage:\n" + " 1. python download.py dataset\n" + " 2. 
python download.py model\n") + print(desc) + + +def md5file(fname): + hash_md5 = hashlib.md5() + with io.open(fname, "rb") as fin: + for chunk in iter(lambda: fin.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + +def extract(fname, dir_path): + """ + Extract tar.gz file + """ + try: + tar = tarfile.open(fname, "r:gz") + file_names = tar.getnames() + for file_name in file_names: + tar.extract(file_name, dir_path) + print(file_name) + tar.close() + except Exception as e: + raise e + + +def download(url, filename, md5sum): + """ + Download file and check md5 + """ + retry = 0 + retry_limit = 3 + chunk_size = 4096 + + while not (os.path.exists(filename) and md5file(filename) == md5sum): + if retry < retry_limit: + retry += 1 + else: + raise RuntimeError( + "Cannot download dataset ({0}) with retry {1} times.".format( + url, retry_limit)) + try: + start = time.time() + size = 0 + res = requests.get(url, stream=True) + filesize = int(res.headers['content-length']) + if res.status_code == 200: + print("[Filesize]: %0.2f MB" % (filesize / 1024 / 1024)) + # save by chunk + with io.open(filename, "wb") as fout: + for chunk in res.iter_content(chunk_size=chunk_size): + if chunk: + fout.write(chunk) + size += len(chunk) + pr = '>' * int(size * 50 / filesize) + print( + '\r[Process ]: %s%.2f%%' % + (pr, float(size / filesize * 100)), + end='') + end = time.time() + print("\n[CostTime]: %.2f s" % (end - start)) + except Exception as e: + print(e) + + +def download_dataset(dir_path): + BASE_URL = "https://baidu-nlp.bj.bcebos.com/" + DATASET_NAME = "simnet_dataset-1.0.0.tar.gz" + DATASET_MD5 = "ec65b313bc237150ef536a8d26f3c73b" + file_path = os.path.join(dir_path, DATASET_NAME) + url = BASE_URL + DATASET_NAME + + if not os.path.exists(dir_path): + os.makedirs(dir_path) + # download dataset + print("Downloading dataset: %s" % url) + download(url, file_path, DATASET_MD5) + # extract dataset + print("Extracting dataset: %s" % file_path) + extract(file_path, dir_path) + os.remove(file_path) + + +def download_model(dir_path): + MODELS = {} + BASE_URL = "https://baidu-nlp.bj.bcebos.com/" + CNN_NAME = "simnet_bow_pairwise_dygraph.tar.gz" + CNN_MD5 = "30012af0ca8cdf0c613d8f56884f0f48" + MODELS[CNN_NAME] = CNN_MD5 + + if not os.path.exists(dir_path): + os.makedirs(dir_path) + + for model in MODELS: + url = BASE_URL + model + model_path = os.path.join(dir_path, model) + print("Downloading model: %s" % url) + # download model + download(url, model_path, MODELS[model]) + # extract model.tar.gz + print("Extracting model: %s" % model_path) + extract(model_path, dir_path) + os.remove(model_path) + + +if __name__ == '__main__': + if len(sys.argv) != 2: + usage() + sys.exit(1) + + if sys.argv[1] == "dataset": + pwd = os.path.join(os.path.dirname(__file__), './') + download_dataset(pwd) + elif sys.argv[1] == "model": + pwd = os.path.join(os.path.dirname(__file__), './model_files') + download_model(pwd) + else: + usage() diff --git a/examples/similarity_net/download_data.sh b/examples/similarity_net/download_data.sh new file mode 100644 index 0000000..ea1aaf9 --- /dev/null +++ b/examples/similarity_net/download_data.sh @@ -0,0 +1,5 @@ +#get data +wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz +tar xzf simnet_dataset-1.0.0.tar.gz +rm simnet_dataset-1.0.0.tar.gz + diff --git a/examples/similarity_net/nets/bow.py b/examples/similarity_net/nets/bow.py new file mode 100644 index 0000000..bbe9b14 --- /dev/null +++ b/examples/similarity_net/nets/bow.py @@ -0,0 
+1,115 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +bow class +""" +import paddle.fluid as fluid +from paddle.fluid.dygraph import Linear, Layer, Embedding +from paddle.incubate.hapi.model import Model + + +#1. define BOWEncoder +class BOWEncoder(Layer): + """ + simple BOWEncoder for simnet + """ + + def __init__(self, dict_size, bow_dim, seq_len, emb_dim, padding_idx): + super(BOWEncoder, self).__init__() + self.dict_size = dict_size + self.bow_dim = bow_dim + self.seq_len = seq_len + self.emb_dim = emb_dim + self.padding_idx = padding_idx + self.emb_layer = Embedding( + size=[self.dict_size, self.emb_dim], + is_sparse=True, + padding_idx=self.padding_idx, + param_attr=fluid.ParamAttr( + name='emb', initializer=fluid.initializer.Xavier())) + + def forward(self, input): + emb = self.emb_layer(input) + emb_reshape = fluid.layers.reshape( + emb, shape=[-1, self.seq_len, self.bow_dim]) + bow_emb = fluid.layers.reduce_sum(emb_reshape, dim=1) + return bow_emb + + +class Pair_BOWModel(Model): + """ + classify model + """ + + def __init__(self, conf_dict): + super(Pair_BOWModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.bow_dim = conf_dict["net"]["bow_dim"] + self.seq_len = conf_dict["seq_len"] + self.padding_idx = None + + self.emb_layer = BOWEncoder(self.dict_size, self.bow_dim, self.seq_len, + self.emb_dim, self.padding_idx) + self.bow_layer = Linear( + input_dim=self.bow_dim, output_dim=self.bow_dim) + + def forward(self, left, pos_right, neg_right): + bow_left = self.emb_layer(left) + pos_bow_right = self.emb_layer(pos_right) + neg_bow_right = self.emb_layer(neg_right) + left_soft = fluid.layers.softsign(bow_left) + pos_right_soft = fluid.layers.softsign(pos_bow_right) + neg_right_soft = fluid.layers.softsign(neg_bow_right) + + left_bow = self.bow_layer(left_soft) + pos_right_bow = self.bow_layer(pos_right_soft) + neg_right_bow = self.bow_layer(neg_right_soft) + pos_pred = fluid.layers.cos_sim(left_bow, pos_right_bow) + neg_pred = fluid.layers.cos_sim(left_bow, neg_right_bow) + return pos_pred, neg_pred + + +class Point_BOWModel(Model): + """ + classify model + """ + + def __init__(self, conf_dict): + super(Point_BOWModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.bow_dim = conf_dict["net"]["bow_dim"] + self.seq_len = conf_dict["seq_len"] + self.padding_idx = None + + self.emb_layer = BOWEncoder(self.dict_size, self.bow_dim, self.seq_len, + self.emb_dim, self.padding_idx) + self.bow_layer_po = Linear( + input_dim=self.bow_dim * 2, output_dim=self.bow_dim) + self.softmax_layer = Linear( + input_dim=self.bow_dim, output_dim=2, act='softmax') + + def forward(self, left, right): + bow_left = self.emb_layer(left) + bow_right = self.emb_layer(right) + left_soft = fluid.layers.softsign(bow_left) + right_soft 
= fluid.layers.softsign(bow_right) + + concat = fluid.layers.concat([left_soft, right_soft], axis=1) + concat_fc = self.bow_layer_po(concat) + pred = self.softmax_layer(concat_fc) + return pred diff --git a/examples/similarity_net/nets/cnn.py b/examples/similarity_net/nets/cnn.py new file mode 100644 index 0000000..97dbb2f --- /dev/null +++ b/examples/similarity_net/nets/cnn.py @@ -0,0 +1,115 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +cnn class +""" +import paddle.fluid as fluid +from paddle.fluid.dygraph import Linear, Layer, Conv2D, Pool2D +from paddle.incubate.hapi.model import Model +from paddle.incubate.hapi.text.text import CNNEncoder + + +class Pair_CNNModel(Model): + """ + classify model + """ + + def __init__(self, conf_dict): + super(Pair_CNNModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.filter_size = conf_dict["net"]["filter_size"] + self.num_filters = conf_dict["net"]["num_filters"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.seq_len = conf_dict["seq_len"] + self.padding_idx = None + #layers + self.encoder_layer = CNNEncoder( + num_channels=1, + num_filters=self.num_filters, + filter_size=self.filter_size, + pool_size=1, + layer_num=1, + act='relu') + self.fc_layer = Linear( + input_dim=self.num_filters * self.seq_len, + output_dim=self.hidden_dim) + self.fc_layer_po = Linear( + input_dim=self.num_filters * self.seq_len * 2, + output_dim=self.hidden_dim) + self.softmax_layer = Linear( + input_dim=self.hidden_dim, output_dim=2, act='softmax') + + def forward(self, left, pos_right, neg_right): + left = fluid.layers.reshape( + left, shape=[-1, self.seq_len, self.hidden_dim]) + pos_right = fluid.layers.reshape( + pos_right, shape=[-1, self.seq_len, self.hidden_dim]) + neg_right = fluid.layers.reshape( + neg_right, shape=[-1, self.seq_len, self.hidden_dim]) + left_cnn = self.encoder_layer(left) + pos_right_cnn = self.encoder_layer(pos_right) + neg_right_cnn = self.encoder_layer(neg_right) + left_fc = self.fc_layer(left_cnn) + pos_right_fc = self.fc_layer(pos_right_cnn) + neg_right_fc = self.fc_layer(neg_right_cnn) + pos_pred = fluid.layers.cos_sim(left_fc, pos_right_fc) + neg_pred = fluid.layers.cos_sim(left_fc, neg_right_fc) + return pos_pred, neg_pred + + +class Point_CNNModel(Model): + """ + classify model + """ + + def __init__(self, conf_dict): + super(Point_CNNModel, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = conf_dict["net"]["emb_dim"] + self.filter_size = conf_dict["net"]["filter_size"] + self.num_filters = conf_dict["net"]["num_filters"] + self.hidden_dim = conf_dict["net"]["hidden_dim"] + self.seq_len = conf_dict["seq_len"] + self.padding_idx = None + #layers + self.encoder_layer = CNNEncoder( + num_channels=1, + num_filters=self.num_filters, + filter_size=self.filter_size, + pool_size=1, + 
layer_num=1, + act='relu') + self.fc_layer = Linear( + input_dim=self.num_filters * self.seq_len, + output_dim=self.hidden_dim) + self.fc_layer_po = Linear( + input_dim=self.num_filters * self.seq_len * 2, + output_dim=self.hidden_dim) + self.softmax_layer = Linear( + input_dim=self.hidden_dim, output_dim=2, act='softmax') + + def forward(self, left, right): + left = fluid.layers.reshape( + left, shape=[-1, self.seq_len, self.hidden_dim]) + right = fluid.layers.reshape( + right, shape=[-1, self.seq_len, self.hidden_dim]) + left_cnn = self.encoder_layer(left) + right_cnn = self.encoder_layer(right) + concat = fluid.layers.concat([left_cnn, right_cnn], axis=1) + concat_fc = self.fc_layer_po(concat) + pred = self.softmax_layer(concat_fc) + return pred diff --git a/examples/similarity_net/nets/losses/hinge_loss.py b/examples/similarity_net/nets/losses/hinge_loss.py new file mode 100644 index 0000000..6081f8f --- /dev/null +++ b/examples/similarity_net/nets/losses/hinge_loss.py @@ -0,0 +1,39 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +hinge loss +""" + +import sys +sys.path.append("../") +import paddle.fluid as fluid +from paddle.incubate.hapi.model import Loss + + +class HingeLoss(Loss): + def __init__(self, conf_dict): + super(HingeLoss, self).__init__() + self.margin = conf_dict["loss"]["margin"] + + def forward(self, outputs, labels=None): + pos, neg = outputs + loss = fluid.layers.fill_constant_batch_size_like(neg, neg.shape, + "float32", 0.0) + loss_margin = fluid.layers.fill_constant_batch_size_like( + neg, neg.shape, "float32", self.margin) + sub = fluid.layers.elementwise_sub(neg, pos) + add = fluid.layers.elementwise_add(sub, loss_margin) + loss_max = fluid.layers.elementwise_max(loss, add) + loss_last = fluid.layers.reduce_mean(loss_max) + return loss_last diff --git a/examples/similarity_net/nets/losses/log_loss.py b/examples/similarity_net/nets/losses/log_loss.py new file mode 100644 index 0000000..a11d1e4 --- /dev/null +++ b/examples/similarity_net/nets/losses/log_loss.py @@ -0,0 +1,32 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
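+# Pairwise logistic-style loss: sigmoid(neg - pos) over the two similarity
+# scores, so the value approaches 1 when the negative title scores above the
+# positive one and approaches 0 when the positive wins by a wide margin.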
+
+"""
+log loss
+"""
+
+import sys
+sys.path.append("../")
+import paddle.fluid as fluid
+from paddle.incubate.hapi.model import Loss
+
+
+class LogLoss(Loss):
+    def __init__(self, conf_dict):
+        super(LogLoss, self).__init__()
+
+    def forward(self, outputs, labels=None):
+        pos, neg = outputs
+        loss = fluid.layers.sigmoid(neg - pos)
+        avg_loss = fluid.layers.reduce_mean(loss)
+        return avg_loss
diff --git a/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py b/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py
new file mode 100644
index 0000000..ec19c5a
--- /dev/null
+++ b/examples/similarity_net/nets/losses/softmax_cross_entropy_loss.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+softmax loss
+"""
+
+import sys
+sys.path.append("../")
+import paddle.fluid as fluid
+from paddle.incubate.hapi.model import Loss
+
+
+class SoftmxCrossEntropyLoss(Loss):
+    def __init__(self, conf_dict):
+        super(SoftmxCrossEntropyLoss, self).__init__()
+
+    def forward(self, input, label):
+        cost = fluid.layers.cross_entropy(input=input, label=label)
+        avg_cost = fluid.layers.reduce_mean(cost)
+        return avg_cost
diff --git a/examples/similarity_net/reader.py b/examples/similarity_net/reader.py
new file mode 100644
index 0000000..d61a83c
--- /dev/null
+++ b/examples/similarity_net/reader.py
@@ -0,0 +1,280 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
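+# Input files handled by SimNetProcessor (UTF-8, tab-separated columns,
+# tokens separated by single spaces):
+#   pairwise train:              query \t pos_title \t neg_title
+#   pairwise valid/test:         query \t title \t label (0/1)
+#   pointwise train/valid/test:  query \t title \t label (0/1)
+#   infer (both modes):          query \t title
+# Tokens are looked up in the vocab; sequences are padded or truncated to
+# args.seq_len.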
+""" +SimNet reader +""" + +import logging +import numpy as np +import io + + +class SimNetProcessor(object): + def __init__(self, args, vocab): + self.args = args + # load vocab + self.vocab = vocab + self.valid_label = np.array([]) + self.test_label = np.array([]) + + self.seq_len = args.seq_len + + def padding_text(self, x): + if len(x) < self.seq_len: + x += [0] * (self.seq_len - len(x)) + if len(x) > self.seq_len: + x = x[0:self.seq_len] + return x + + def get_reader(self, mode, epoch=0): + """ + Get Reader + """ + + def reader_with_pairwise(): + """ + Reader with Pairwise + """ + if mode == "valid": + with io.open( + self.args.valid_data_dir, "r", + encoding="utf8") as file: + for line in file: + query, title, label = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + + label = [1 if int(label) == 1 else 0] + if len(query) == 0: + query = [0] + if len(title) == 0: + title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + label = self.padding_text(label) + + yield [query, title, label] + elif mode == "test": + with io.open( + self.args.test_data_dir, "r", encoding="utf8") as file: + for line in file: + query, title, label = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + + label = [1 if int(label) == 1 else 0] + if len(query) == 0: + query = [0] + if len(title) == 0: + title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + label = self.padding_text(label) + + yield [query, title, label] + else: + for idx in range(epoch): + with io.open( + self.args.train_data_dir, "r", + encoding="utf8") as file: + for line in file: + query, pos_title, neg_title = line.strip().split( + "\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + pos_title = [ + self.vocab[word] + for word in pos_title.split(" ") + if word in self.vocab + ] + neg_title = [ + self.vocab[word] + for word in neg_title.split(" ") + if word in self.vocab + ] + if len(query) == 0: + query = [0] + if len(pos_title) == 0: + pos_title = [0] + if len(neg_title) == 0: + neg_title = [0] + + query = self.padding_text(query) + pos_title = self.padding_text(pos_title) + neg_title = self.padding_text(neg_title) + + yield [query, pos_title, neg_title] + + def reader_with_pointwise(): + """ + Reader with Pointwise + """ + if mode == "valid": + with io.open( + self.args.valid_data_dir, "r", + encoding="utf8") as file: + for line in file: + query, title, label = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + if len(query) == 0: + query = [0] + if len(title) == 0: + title = [0] + if len(label) == 0: + label = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + label = int(label) + + yield [query, title, label] + elif mode == "test": + with io.open( + self.args.test_data_dir, "r", encoding="utf8") as file: + for line in file: + query, title, label = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + if len(query) == 
0: + query = [0] + if len(title) == 0: + title = [0] + if len(label) == 0: + label = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + lebel = int(label) + + yield [query, title, label] + else: + for idx in range(epoch): + with io.open( + self.args.train_data_dir, "r", + encoding="utf8") as file: + for line in file: + query, title, label = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + + if len(query) == 0: + query = [0] + if len(title) == 0: + title = [0] + if len(label) == 0: + label = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + label = int(label) + + yield [query, title, label] + + if self.args.task_mode == "pairwise": + return reader_with_pairwise + else: + return reader_with_pointwise + + def get_infer_reader(self): + """ + get infer reader + """ + with io.open(self.args.infer_data_dir, "r", encoding="utf8") as file: + for line in file: + query, title = line.strip().split("\t") + query = [ + self.vocab[word] for word in query.split(" ") + if word in self.vocab + ] + title = [ + self.vocab[word] for word in title.split(" ") + if word in self.vocab + ] + if len(query) == 0: + query = [0] + if len(title) == 0: + title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + + yield [query, title] + + def get_infer_pairdata(self): + """ + get infer data + """ + with io.open(self.args.infer_data_dir, "r", encoding="utf8") as file: + for line in file: + query, title = line.strip().split("\t") + yield line.strip() + + def get_valid_label(self): + """ + get valid data label + """ + if self.valid_label.size == 0: + labels = [] + with io.open(self.args.valid_data_dir, "r", encoding="utf8") as f: + for line in f: + labels.append([int(line.strip().split("\t")[-1])]) + self.valid_label = np.array(labels) + return self.valid_label + + def get_test_label(self): + """ + get test data label + """ + if self.test_label.size == 0: + labels = [] + with io.open(self.args.test_data_dir, "r", encoding="utf8") as f: + for line in f: + labels.append([int(line.strip().split("\t")[-1])]) + self.test_label = np.array(labels) + return self.test_label diff --git a/examples/similarity_net/run.sh b/examples/similarity_net/run.sh new file mode 100644 index 0000000..69c4486 --- /dev/null +++ b/examples/similarity_net/run.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +export FLAGS_enable_parallel_graph=1 +export FLAGS_sync_nccl_allreduce=1 +export CUDA_VISIBLE_DEVICES=3 +export FLAGS_fraction_of_gpu_memory_to_use=0.95 +TASK_NAME='simnet' +TRAIN_DATA_PATH=./data/train_pairwise_data +VALID_DATA_PATH=./data/test_pairwise_data +TEST_DATA_PATH=./data/test_pairwise_data +INFER_DATA_PATH=./data/infer_data +VOCAB_PATH=./data/term2id.dict +CKPT_PATH=./model_files +TEST_RESULT_PATH=./test_result +INFER_RESULT_PATH=./infer_result +TASK_MODE='pairwise' +CONFIG_PATH=./config/bow_pairwise.json + +INIT_CHECKPOINT=./model_files/bow_pairwise/200 + + + +# run_train +train() { + python run_classifier.py \ + --task_name ${TASK_NAME} \ + --use_cuda False \ + --do_train True \ + --do_valid True \ + --do_infer False \ + --batch_size 128 \ + --train_data_dir ${TRAIN_DATA_PATH} \ + --valid_data_dir ${VALID_DATA_PATH} \ + --test_data_dir ${TEST_DATA_PATH} \ + --infer_data_dir ${INFER_DATA_PATH} \ + --output_dir ${CKPT_PATH} \ + --config_path ${CONFIG_PATH} \ + --vocab_path ${VOCAB_PATH} \ + --epoch 40 \ + 
--save_steps 2000 \ + --validation_steps 200 \ + --compute_accuracy False \ + --lamda 0.958 \ + --task_mode ${TASK_MODE}\ + --init_checkpoint "" +} +#run_evaluate +evaluate() { + python run_classifier.py \ + --task_name ${TASK_NAME} \ + --use_cuda false \ + --do_test True \ + --verbose_result True \ + --batch_size 128 \ + --test_data_dir ${TEST_DATA_PATH} \ + --test_result_path ${TEST_RESULT_PATH} \ + --config_path ${CONFIG_PATH} \ + --vocab_path ${VOCAB_PATH} \ + --task_mode ${TASK_MODE} \ + --compute_accuracy False \ + --lamda 0.958 \ + --init_checkpoint ${INIT_CHECKPOINT} +} +# run_infer +infer() { + python run_classifier.py \ + --task_name ${TASK_NAME} \ + --use_cuda false \ + --do_infer True \ + --batch_size 128 \ + --infer_data_dir ${INFER_DATA_PATH} \ + --infer_result_path ${INFER_RESULT_PATH} \ + --config_path ${CONFIG_PATH} \ + --vocab_path ${VOCAB_PATH} \ + --task_mode ${TASK_MODE} \ + --init_checkpoint ${INIT_CHECKPOINT} +} + +main() { + local cmd=${1:-help} + case "${cmd}" in + train) + train "$@"; + ;; + eval) + evaluate "$@"; + ;; + infer) + infer "$@"; + ;; + help) + echo "Usage: ${BASH_SOURCE} {train|eval|infer}"; + return 0; + ;; + *) + echo "Unsupport commend [${cmd}]"; + echo "Usage: ${BASH_SOURCE} {train|eval|infer}"; + return 1; + ;; + esac +} +main "$@" \ No newline at end of file diff --git a/examples/similarity_net/run_classifier.py b/examples/similarity_net/run_classifier.py new file mode 100644 index 0000000..e5f120b --- /dev/null +++ b/examples/similarity_net/run_classifier.py @@ -0,0 +1,426 @@ +# -*- encoding: utf-8 -*- +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
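+# Entry points: train(), test() and infer(). Each builds the network and loss
+# classes named in the json config via utils.import_class, wraps the network
+# in a hapi Model, and feeds it from reader.SimNetProcessor through DataLoader
+# generators; --task_mode selects the pairwise (query, pos_title, neg_title)
+# or pointwise (query, title, label) branch.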
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import six +import io +import warnings +import argparse +import multiprocessing + +import paddle +import paddle.fluid as fluid +from paddle.fluid.io import DataLoader +from functools import partial, reduce +import numpy as np +import reader +import config +from utils import load_vocab, import_class, get_accuracy, ArgConfig, print_arguments + +from paddle.incubate.hapi.metrics import Accuracy +from paddle.incubate.hapi.model import set_device, Model, Input, Loss, CrossEntropy + + +def train(conf_dict, args): + device = set_device("cpu") + fluid.enable_dygraph(device) + + # load auc method + metric = fluid.metrics.Auc(name="auc") + + def valid_and_test(pred_list, process, mode): + """ + return auc and acc + """ + pred_list = np.vstack(pred_list) + if mode == "test": + label_list = process.get_test_label() + elif mode == "valid": + label_list = process.get_valid_label() + if args.task_mode == "pairwise": + pred_list = (pred_list + 1) / 2 + pred_list = np.hstack( + (np.ones_like(pred_list) - pred_list, pred_list)) + metric.reset() + metric.update(pred_list, label_list) + auc = metric.eval() + if args.compute_accuracy: + acc = get_accuracy(pred_list, label_list, args.task_mode, + args.lamda) + return auc, acc + else: + return auc + + # loading vocabulary + vocab = load_vocab(args.vocab_path) + # get vocab size + conf_dict['dict_size'] = len(vocab) + conf_dict['seq_len'] = args.seq_len + # Load network structure dynamically + model = import_class("./nets", conf_dict["net"]["module_name"], + conf_dict["net"]["class_name"])(conf_dict) + loss = import_class("./nets/losses", conf_dict["loss"]["module_name"], + conf_dict["loss"]["class_name"])(conf_dict) + # Load Optimization method + learning_rate = conf_dict["optimizer"]["learning_rate"] + optimizer_name = conf_dict["optimizer"]["class_name"] + if optimizer_name == 'SGDOptimizer': + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate, parameter_list=model.parameters()) + elif optimizer_name == 'AdamOptimizer': + beta1 = conf_dict["optimizer"]["beta1"] + beta2 = conf_dict["optimizer"]["beta2"] + epsilon = conf_dict["optimizer"]["epsilon"] + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + parameter_list=model.parameters()) + + global_step = 0 + valid_step = 0 + losses = [] + + # define dataloader + simnet_process = reader.SimNetProcessor(args, vocab) + train_pyreader = DataLoader.from_generator( + capacity=16, return_list=True, use_double_buffer=True) + get_train_examples = simnet_process.get_reader("train", epoch=args.epoch) + train_pyreader.set_sample_list_generator( + fluid.io.batch( + get_train_examples, batch_size=args.batch_size), + places=device) + if args.do_valid: + valid_pyreader = DataLoader.from_generator( + capacity=16, return_list=True, use_double_buffer=True) + get_valid_examples = simnet_process.get_reader("valid") + valid_pyreader.set_sample_list_generator( + fluid.io.batch( + get_valid_examples, batch_size=args.batch_size), + places=device) + pred_list = [] + + if args.task_mode == "pairwise": + inputs = [ + Input( + [None, 1], 'int64', name='input_left'), Input( + [None, 1], 'int64', name='pos_right'), Input( + [None, 1], 'int64', name='neg_right') + ] + + model.prepare( + inputs=inputs, + optimizer=optimizer, + loss_function=loss, + device=device) + + for left, pos_right, neg_right in train_pyreader(): + input_left = 
fluid.layers.reshape(left, shape=[-1, 1]) + pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1]) + + final_loss = model.train_batch([input_left, pos_right, neg_right]) + print("train_steps: %d, train_loss: %f" % + (global_step, final_loss[0][0])) + losses.append(np.mean(final_loss)) + global_step += 1 + + if args.do_valid and global_step % args.validation_steps == 0: + for left, pos_right, neg_right in valid_pyreader(): + input_left = fluid.layers.reshape(left, shape=[-1, 1]) + pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1]) + + result, _ = model.test_batch( + [input_left, pos_right, neg_right]) + pred_list += list(result) + valid_step += 1 + + valid_result = valid_and_test(pred_list, simnet_process, + "valid") + if args.compute_accuracy: + valid_auc, valid_acc = valid_result + print( + "valid_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" + % (global_step, valid_auc, valid_acc, np.mean(losses))) + else: + valid_auc = valid_result + print("valid_steps: %d, valid_auc: %f, valid_loss: %f" % + (global_step, valid_auc, np.mean(losses))) + + if global_step % args.save_steps == 0: + model_save_dir = os.path.join(args.output_dir, + conf_dict["model_path"]) + model_path = os.path.join(model_save_dir, str(global_step)) + + if not os.path.exists(model_save_dir): + os.makedirs(model_save_dir) + model.save(model_path) + + else: + inputs = [ + Input( + [None, 1], 'int64', name='left'), Input( + [None, 1], 'int64', name='right') + ] + label = [Input([None, 1], 'int64', name='neg_right')] + + model.prepare( + inputs=inputs, + optimizer=optimizer, + loss_function=loss, + device=device) + + for left, right, label in train_pyreader(): + left = fluid.layers.reshape(left, shape=[-1, 1]) + right = fluid.layers.reshape(right, shape=[-1, 1]) + label = fluid.layers.reshape(label, shape=[-1, 1]) + + final_loss = model.train_batch([left, right], [label]) + print("train_steps: %d, train_loss: %f" % + (global_step, final_loss[0][0])) + losses.append(np.mean(final_loss)) + global_step += 1 + + if args.do_valid and global_step % args.validation_steps == 0: + for left, right, label in valid_pyreader(): + valid_left = fluid.layers.reshape(left, shape=[-1, 1]) + valid_right = fluid.layers.reshape(right, shape=[-1, 1]) + valid_label = fluid.layers.reshape(label, shape=[-1, 1]) + + result, _ = model.test_batch( + [valid_left, valid_right, valid_right]) + pred_list += list(result) + valid_step += 1 + + valid_result = valid_and_test(pred_list, simnet_process, + "valid") + if args.compute_accuracy: + valid_auc, valid_acc = valid_result + print( + "valid_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" + % (global_step, valid_auc, valid_acc, np.mean(losses))) + else: + valid_auc = valid_result + print("valid_steps: %d, valid_auc: %f, valid_loss: %f" % + (global_step, valid_auc, np.mean(losses))) + + if global_step % args.save_steps == 0: + model_save_dir = os.path.join(args.output_dir, + conf_dict["model_path"]) + model_path = os.path.join(model_save_dir, str(global_step)) + + if not os.path.exists(model_save_dir): + os.makedirs(model_save_dir) + model.save(model_path) + + +def test(conf_dict, args): + device = set_device("cpu") + fluid.enable_dygraph(device) + + metric = fluid.metrics.Auc(name="auc") + + def valid_and_test(pred_list, process, mode): + """ + return auc and acc + """ + pred_list = np.vstack(pred_list) + if mode == "test": + label_list = 
process.get_test_label() + elif mode == "valid": + label_list = process.get_valid_label() + if args.task_mode == "pairwise": + pred_list = (pred_list + 1) / 2 + pred_list = np.hstack( + (np.ones_like(pred_list) - pred_list, pred_list)) + metric.reset() + metric.update(pred_list, label_list) + auc = metric.eval() + if args.compute_accuracy: + acc = get_accuracy(pred_list, label_list, args.task_mode, + args.lamda) + return auc, acc + else: + return auc + + # loading vocabulary + vocab = load_vocab(args.vocab_path) + # get vocab size + conf_dict['dict_size'] = len(vocab) + conf_dict['seq_len'] = args.seq_len + # Load network structure dynamically + model = import_class("./nets", conf_dict["net"]["module_name"], + conf_dict["net"]["class_name"])(conf_dict) + model.load(args.init_checkpoint) + + simnet_process = reader.SimNetProcessor(args, vocab) + test_pyreader = DataLoader.from_generator( + capacity=16, return_list=True, use_double_buffer=True) + get_test_examples = simnet_process.get_reader("test") + test_pyreader.set_sample_list_generator( + fluid.io.batch( + get_test_examples, batch_size=args.batch_size), + places=device) + + pred_list = [] + test_step = 0 + + if args.task_mode == "pairwise": + inputs = [ + Input( + [None, 1], 'int64', name='input_left'), Input( + [None, 1], 'int64', name='pos_right'), Input( + [None, 1], 'int64', name='pos_right') + ] + + model.prepare(inputs=inputs, device=device) + + for left, pos_right, neg_right in test_pyreader(): + input_left = fluid.layers.reshape(left, shape=[-1, 1]) + pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + neg_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + + final_pred, _ = model.test_batch( + [input_left, pos_right, neg_right]) + pred_list += list(final_pred) + test_step += 1 + + test_result = valid_and_test(pred_list, simnet_process, "test") + if args.compute_accuracy: + test_auc, test_acc = test_result + print("test_steps: %d, test_auc: %f, test_acc: %f" % + (test_step, test_auc, test_acc)) + else: + test_auc = test_result + print("test_steps: %d, test_auc: %f" % (test_step, test_auc)) + + else: + inputs = [ + Input( + [None, 1], 'int64', name='left'), Input( + [None, 1], 'int64', name='right') + ] + + model.prepare(inputs=inputs, device=device) + + for left, right, label in test_pyreader(): + left = fluid.layers.reshape(left, shape=[-1, 1]) + right = fluid.layers.reshape(right, shape=[-1, 1]) + label = fluid.layers.reshape(label, shape=[-1, 1]) + + final_pred = model.test_batch([left, right]) + print(final_pred) + pred_list += list(final_pred) + test_step += 1 + + test_result = valid_and_test(pred_list, simnet_process, "test") + if args.compute_accuracy: + test_auc, test_acc = test_result + print("test_steps: %d, test_auc: %f, test_acc: %f" % + (test_step, test_auc, test_acc)) + else: + test_auc = test_result + print("test_steps: %d, test_auc: %f" % (test_step, test_auc)) + + +def infer(conf_dict, args): + device = set_device("cpu") + fluid.enable_dygraph(device) + + # loading vocabulary + vocab = load_vocab(args.vocab_path) + # get vocab size + conf_dict['dict_size'] = len(vocab) + conf_dict['seq_len'] = args.seq_len + # Load network structure dynamically + model = import_class("./nets", conf_dict["net"]["module_name"], + conf_dict["net"]["class_name"])(conf_dict) + model.load(args.init_checkpoint) + + simnet_process = reader.SimNetProcessor(args, vocab) + get_infer_examples = simnet_process.get_infer_reader + infer_pyreader = DataLoader.from_generator( + capacity=16, return_list=True, use_double_buffer=True) 
+ infer_pyreader.set_sample_list_generator( + fluid.io.batch( + get_infer_examples, batch_size=args.batch_size), + places=device) + pred_list = [] + + if args.task_mode == "pairwise": + inputs = [ + Input( + [None, 1], 'int64', name='input_left'), Input( + [None, 1], 'int64', name='pos_right') + ] + + model.prepare(inputs=inputs, device=device) + + for left, pos_right in infer_pyreader(): + input_left = fluid.layers.reshape(left, shape=[-1, 1]) + pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + neg_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + + final_pred, _ = model.test_batch( + [input_left, pos_right, neg_right]) + pred_list += list( + map(lambda item: str((item[0] + 1) / 2), final_pred)) + print(pred_list) + + else: + inputs = [ + Input( + [None, 1], 'int64', name='left'), Input( + [None, 1], 'int64', name='right') + ] + + model.prepare(inputs=inputs, device=device) + + for left, right in infer_pyreader(): + left = fluid.layers.reshape(left, shape=[-1, 1]) + right = fluid.layers.reshape(right, shape=[-1, 1]) + # label = fluid.layers.reshape(label, shape=[-1, 1]) + + final_pred = model.test_batch([left, right]) + print(final_pred) + pred_list += list( + map(lambda item: str((item[0] + 1) / 2), final_pred)) + + with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file: + for _data, _pred in zip(simnet_process.get_infer_data(), + int(pred_list)): + infer_file.write(_data + "\t" + _pred + "\n") + + +if __name__ == '__main__': + args = ArgConfig() + args = args.build_conf() + print_arguments(args) + conf_dict = config.SimNetConfig(args) + + if args.do_train: + train(conf_dict, args) + elif args.do_test: + test(conf_dict, args) + elif args.do_infer: + infer(conf_dict, args) + else: + raise ValueError("one of do_train and do_infer must be True") diff --git a/examples/similarity_net/utils.py b/examples/similarity_net/utils.py new file mode 100644 index 0000000..e62b999 --- /dev/null +++ b/examples/similarity_net/utils.py @@ -0,0 +1,244 @@ +# -*- encoding:utf-8 -*- +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +SimNet utilities. 
+""" +import argparse +import time +import sys +import re +import os +import six +import numpy as np +import paddle.fluid as fluid +import io +import pickle +import warnings +from functools import partial +from hapi.configure import ArgumentGroup, str2bool +""" +******functions for file processing****** +""" + + +def load_vocab(file_path): + """ + load the given vocabulary + """ + vocab = {} + f = io.open(file_path, "r", encoding="utf8") + for line in f: + items = line.strip("\n").split("\t") + if items[0] not in vocab: + vocab[items[0]] = int(items[1]) + vocab[""] = 0 + return vocab + + +def get_result_file(args): + """ + Get Result File + Args: + conf_dict: Input path config + samples_file_path: Data path of real training + predictions_file_path: Prediction results path + Returns: + result_file: merge sample and predict result + + """ + with io.open(args.test_data_dir, "r", encoding="utf8") as test_file: + with io.open( + "predictions.txt", "r", encoding="utf8") as predictions_file: + with io.open( + args.test_result_path, "w", + encoding="utf8") as test_result_file: + test_datas = [line.strip("\n") for line in test_file] + predictions = [line.strip("\n") for line in predictions_file] + for test_data, prediction in zip(test_datas, predictions): + test_result_file.write(test_data + "\t" + prediction + + "\n") + os.remove("predictions.txt") + + +def import_class(module_path, module_name, class_name): + """ + Load class dynamically + Args: + module_path: The current path of the module + module_name: The module name + class_name: The name of class in the import module + Return: + Return the attribute value of the class object + """ + if module_path: + sys.path.append(module_path) + module = __import__(module_name) + return getattr(module, class_name) + + +""" +******functions for string processing****** +""" + + +def pattern_match(pattern, line): + """ + Check whether a string is matched + Args: + pattern: mathing pattern + line : input string + Returns: + True/False + """ + if re.match(pattern, line): + return True + else: + return False + + +""" +******functions for parameter processing****** +""" + + +def print_progress(task_name, percentage, style=0): + """ + Print progress bar + Args: + task_name: The name of the current task + percentage: Current progress + style: Progress bar form + """ + styles = ['#', '█'] + mark = styles[style] * percentage + mark += ' ' * (100 - percentage) + status = '%d%%' % percentage if percentage < 100 else 'Finished' + sys.stdout.write('%+20s [%s] %s\r' % (task_name, mark, status)) + sys.stdout.flush() + time.sleep(0.002) + + +class ArgConfig(object): + def __init__(self): + parser = argparse.ArgumentParser() + + model_g = ArgumentGroup(parser, "model", + "model configuration and paths.") + model_g.add_arg("config_path", str, None, + "Path to the json file for EmoTect model config.") + model_g.add_arg("init_checkpoint", str, None, + "Init checkpoint to resume training from.") + model_g.add_arg("output_dir", str, None, + "Directory path to save checkpoints") + model_g.add_arg("task_mode", str, None, + "task mode: pairwise or pointwise") + + train_g = ArgumentGroup(parser, "training", "training options.") + train_g.add_arg("epoch", int, 10, "Number of epoches for training.") + train_g.add_arg("save_steps", int, 20, + "The steps interval to save checkpoints.") + train_g.add_arg("validation_steps", int, 100, + "The steps interval to evaluate model performance.") + + infer_g = ArgumentGroup(parser, "inferring", "inferring related") + 
infer_g.add_arg("test_result_path", str, "test_result", + "Directory path to test result.") + infer_g.add_arg("infer_result_path", str, "infer_result.txt", + "Directory path to infer result.") + + data_g = ArgumentGroup( + parser, "data", + "Data paths, vocab paths and data processing options") + data_g.add_arg("train_data_dir", str, None, + "Directory path to training data.") + data_g.add_arg("valid_data_dir", str, None, + "Directory path to valid data.") + data_g.add_arg("test_data_dir", str, None, + "Directory path to testing data.") + data_g.add_arg("infer_data_dir", str, None, + "Directory path to infer data.") + data_g.add_arg("vocab_path", str, None, "Vocabulary path.") + data_g.add_arg("batch_size", int, 32, + "Total examples' number in batch for training.") + data_g.add_arg("seq_len", int, 32, "The length of each sentence.") + + run_type_g = ArgumentGroup(parser, "run_type", "running type options.") + run_type_g.add_arg("use_cuda", bool, False, + "If set, use GPU for training.") + run_type_g.add_arg( + "task_name", str, None, + "The name of task to perform sentiment classification.") + run_type_g.add_arg("do_train", bool, False, + "Whether to perform training.") + run_type_g.add_arg("do_valid", bool, False, "Whether to perform dev.") + #run_type_g.add_arg("do_test", bool, False, "Whether to perform testing.") + run_type_g.add_arg("do_infer", bool, False, + "Whether to perform inference.") + run_type_g.add_arg("compute_accuracy", bool, False, + "Whether to compute accuracy.") + run_type_g.add_arg( + "lamda", float, 0.91, + "When task_mode is pairwise, lamda is the threshold for calculating the accuracy." + ) + + custom_g = ArgumentGroup(parser, "customize", "customized options.") + self.custom_g = custom_g + + #parser.add_argument('--enable_ce',action='store_true',help='If set, run the task with continuous evaluation logs.') + + self.parser = parser + + def add_arg(self, name, dtype, default, descrip): + self.custom_g.add_arg(name, dtype, default, descrip) + + def build_conf(self): + return self.parser.parse_args() + + +def print_arguments(args): + """ + Print Arguments + """ + print('----------- Configuration Arguments -----------') + for arg, value in sorted(six.iteritems(vars(args))): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def get_softmax(preds): + """ + compute sotfmax + """ + _exp = np.exp(preds) + return _exp / np.sum(_exp, axis=1, keepdims=True) + + +def get_sigmoid(preds): + """ + compute sigmoid + """ + return 1 / (1 + np.exp(-preds)) + + +def get_accuracy(preds, labels, mode, lamda=0.958): + """ + compute accuracy + """ + if mode == "pairwise": + preds = np.array(list(map(lambda x: 1 if x[1] >= lamda else 0, preds))) + else: + preds = np.array(list(map(lambda x: np.argmax(x), preds))) + labels = np.squeeze(labels) + return np.mean(preds[0:len(labels)] == labels) -- GitLab