diff --git a/PaddleNLP/similarity_net/run_classifier.py b/PaddleNLP/similarity_net/run_classifier.py index 944bb1117bde232cdb7b6631428376832a0937ad..9fbb338490afad0a0d438f86c24613464dc6dac8 100644 --- a/PaddleNLP/similarity_net/run_classifier.py +++ b/PaddleNLP/similarity_net/run_classifier.py @@ -532,4 +532,4 @@ if __name__ == "__main__": infer(conf_dict, args) else: raise ValueError( - "one of do_train and do_test and do_infer must be True") + "one of do_train and do_test and do_infer must be True") \ No newline at end of file diff --git a/dygraph/similarity_net/README.md b/dygraph/similarity_net/README.md index b9c147b5134589679f49763676cf4313bb28ef5d..0d75e04a7c8cf9f44f7e27f6e8138c54c3518505 100644 --- a/dygraph/similarity_net/README.md +++ b/dygraph/similarity_net/README.md @@ -1,18 +1,17 @@ # 短文本语义匹配 ## 简介 ### 任务说明 -短文本语义匹配(SimilarityNet, SimNet)是一个计算短文本相似度的框架,可以根据用户输入的两个文本,计算出相似度得分。SimNet框架在百度各产品上广泛应用,主要包括BOW、CNN、RNN、MMDNN等核心网络结构形式,提供语义相似度计算训练和预测框架,适用于信息检索、新闻推荐、智能客服等多个应用场景,帮助企业解决语义匹配问题。可通过[AI开放平台-短文本相似度](https://ai.baidu.com/tech/nlp_basic/simnet)线上体验。 - -同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/124373) +短文本语义匹配(SimilarityNet, SimNet)是一个计算短文本相似度的框架,可以根据用户输入的两个文本,计算出相似度得分。SimNet框架在百度各产品上广泛应用,主要包括BOW、CNN、RNN、MMDNN等核心网络结构形式,提供语义相似度计算训练和预测框架,适用于信息检索、新闻推荐、智能客服等多个应用场景,帮助企业解决语义匹配问题。 ### 效果说明 -基于百度海量搜索数据,我们训练了一个SimNet-BOW-Pairwise语义匹配模型,在一些真实的FAQ问答场景中,该模型效果比基于字面的相似度方法AUC提升5%以上,我们基于百度自建测试集(包含聊天、客服等数据集)和语义匹配数据集(LCQMC)进行评测,效果如下表所示。LCQMC数据集以Accuracy为评测指标,而pairwise模型的输出为相似度,因此我们采用0.958作为分类阈值,相比于基线模型中网络结构同等复杂的CBOW模型(准确率为0.737),我们模型的准确率为0.7532。 +基于百度海量搜索数据,我们训练了一个SimNet-BOW-Pairwise语义匹配模型,在一些真实的FAQ问答场景中,该模型效果比基于字面的相似度方法AUC提升5%以上,我们基于百度自建测试集(包含聊天、客服等数据集)进行评测,效果如下表所示。 | 模型 | 百度知道 | ECOM |QQSIM | UNICOM | |:-----------:|:-------------:|:-------------:|:-------------:|:-------------:| | | AUC | AUC | AUC|正逆序比| -|BOW_Pairwise|0.6767|0.7329|0.7650|1.5630| +|BOW_Pairwise|0.6815|0.7331|0.7638|1.5566| + #### 测试集说明 | 数据集 | 来源 | 垂类 | 
|:-----------:|:-------------:|:-------------:| @@ -22,17 +21,21 @@ |UNICOM|联通客服|客服| ## 快速开始 #### 版本依赖 -本项目依赖于 Paddlepaddle Fluid 1.6,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。 + +本项目依赖于 Paddlepaddle Fluid 1.7,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。 + python版本依赖python 2.7 #### 安装代码 克隆工具集代码库到本地 ```shell git clone https://github.com/PaddlePaddle/models.git -cd models/PaddleNLP/similarity_net + +cd models/dygraph/similarity_net ``` #### 数据准备 -下载经过预处理的数据,运行命令后,data目录下会存在训练集数据示例、集数据示例、测试集数据示例,以及对应词索引字典(term2id.dict)。 +下载经过预处理的数据,运行命令后,data目录下会存在训练集数据示例、测试集数据示例,以及对应词索引字典(term2id.dict)。 + ```shell sh download_data.sh ``` @@ -46,6 +49,7 @@ python download.py dataset sh download_pretrained_model.sh ``` 或者 + ``` python download.py model ``` @@ -146,6 +150,7 @@ python tokenizer.py --test_data_dir ./test.txt.utf8 --batch_size 1 > test.txt.ut ├── utils.py:定义了其他常用的功能函数 ├── Config: 定义多种模型的配置文件 ├── download.py: 下载数据及预训练模型脚本 +├── nets: 基于动态图的网络结构 ``` ### 如何训练 @@ -178,7 +183,7 @@ python run_classifier.py \ i. 定义自己的网络结构 -用户可以在```./models/```下定义自己的模型; +用户可以在```./nets/```下定义自己的模型; ii. 
更改模型配置 diff --git a/dygraph/similarity_net/download.py b/dygraph/similarity_net/download.py index 52b53def4959e17443b0e64e918c0ffce1fd71e4..e2140887fcf7a7423415fd8b6e4ce7e001e57d29 100644 --- a/dygraph/similarity_net/download.py +++ b/dygraph/similarity_net/download.py @@ -67,6 +67,7 @@ def download(url, filename, md5sum): retry = 0 retry_limit = 3 chunk_size = 4096 + while not (os.path.exists(filename) and md5file(filename) == md5sum): if retry < retry_limit: retry += 1 @@ -115,8 +116,8 @@ def download_dataset(dir_path): def download_model(dir_path): MODELS = {} BASE_URL = "https://baidu-nlp.bj.bcebos.com/" - CNN_NAME = "simnet_bow-pairwise-1.0.0.tar.gz" - CNN_MD5 = "199a3f3af31558edcc71c3b54ea5e129" + CNN_NAME = "simnet_bow_pairwise_dygraph.tar.gz" + CNN_MD5 = "30012af0ca8cdf0c613d8f56884f0f48" MODELS[CNN_NAME] = CNN_MD5 if not os.path.exists(dir_path): diff --git a/dygraph/similarity_net/download_pretrained_model.sh b/dygraph/similarity_net/download_pretrained_model.sh index 287e8dcc189f5b7960f7fa2be90f9f6ab1524e84..691aff8997e8f251c0205866b91421b0133f3832 100644 --- a/dygraph/similarity_net/download_pretrained_model.sh +++ b/dygraph/similarity_net/download_pretrained_model.sh @@ -2,9 +2,10 @@ model_files_path="./model_files" #get pretrained_bow_pairwise_model -wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_bow-pairwise-1.0.0.tar.gz + +wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_bow_pairwise_dygraph.tar.gz if [ ! 
-d $model_files_path ]; then mkdir $model_files_path fi -tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C $model_files_path -rm simnet_bow-pairwise-1.0.0.tar.gz \ No newline at end of file +tar xzf simnet_bow_pairwise_dygraph.tar.gz -C $model_files_path +rm simnet_bow_pairwise_dygraph.tar.gz diff --git a/dygraph/similarity_net/evaluate/evaluate_ecom.sh b/dygraph/similarity_net/evaluate/evaluate_ecom.sh index 4a00efabae4f6bf07c8a7e73c3d214d1c8a4276f..371c9258eb0c0acfe8904b6195b0bb2f06983f47 100644 --- a/dygraph/similarity_net/evaluate/evaluate_ecom.sh +++ b/dygraph/similarity_net/evaluate/evaluate_ecom.sh @@ -10,7 +10,8 @@ CKPT_PATH=./model_files TEST_RESULT_PATH=./evaluate/ecom_test_result TASK_MODE='pairwise' CONFIG_PATH=./config/bow_pairwise.json -INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/ +INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/bow_pairwise + cd .. python ./run_classifier.py \ diff --git a/dygraph/similarity_net/evaluate/evaluate_qqsim.sh b/dygraph/similarity_net/evaluate/evaluate_qqsim.sh index fa8bdcc034af4c77a199e52af1e0d23471fefeec..383e451b46f89d1b97a85d5c2fe9d3d47fed05be 100644 --- a/dygraph/similarity_net/evaluate/evaluate_qqsim.sh +++ b/dygraph/similarity_net/evaluate/evaluate_qqsim.sh @@ -10,7 +10,8 @@ CKPT_PATH=./model_files TEST_RESULT_PATH=./evaluate/qqsim_test_result TASK_MODE='pairwise' CONFIG_PATH=./config/bow_pairwise.json -INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/ +INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/bow_pairwise + cd .. 
python ./run_classifier.py \ diff --git a/dygraph/similarity_net/evaluate/evaluate_unicom.sh b/dygraph/similarity_net/evaluate/evaluate_unicom.sh index a93aaa4bc2c7a75dfad1e4f9d1f59ae9711947f1..ec8352c018524d54f88489c44620ea110b6936a5 100644 --- a/dygraph/similarity_net/evaluate/evaluate_unicom.sh +++ b/dygraph/similarity_net/evaluate/evaluate_unicom.sh @@ -10,7 +10,8 @@ CKPT_PATH=./model_files INFER_RESULT_PATH=./evaluate/unicom_infer_result TASK_MODE='pairwise' CONFIG_PATH=./config/bow_pairwise.json -INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/ +INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/bow_pairwise + python unicom_split.py cd .. diff --git a/dygraph/similarity_net/evaluate/evaluate_zhidao.sh b/dygraph/similarity_net/evaluate/evaluate_zhidao.sh index 9e6346104c8bbb8540f39e58a404eb566f43b0ed..951b55d55671f54b3550e7a54f07d4413952a2cd 100644 --- a/dygraph/similarity_net/evaluate/evaluate_zhidao.sh +++ b/dygraph/similarity_net/evaluate/evaluate_zhidao.sh @@ -10,7 +10,8 @@ CKPT_PATH=./model_files TEST_RESULT_PATH=./evaluate/zhidao_test_result TASK_MODE='pairwise' CONFIG_PATH=./config/bow_pairwise.json -INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/ +INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/bow_pairwise + cd .. 
python ./run_classifier.py \ diff --git a/dygraph/similarity_net/nets/bow.py b/dygraph/similarity_net/nets/bow.py index d2897419d7b4756f054b02af6c8cfadd4a33ae42..b85b98e4b8be3d978d1951aa9a53bea6ba8a8310 100644 --- a/dygraph/similarity_net/nets/bow.py +++ b/dygraph/similarity_net/nets/bow.py @@ -18,9 +18,10 @@ bow class import paddle_layers as layers from paddle import fluid from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph import Layer, Embedding +from paddle.fluid.dygraph import Layer, Embedding, Linear import paddle.fluid.param_attr as attr uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x) + class BOW(Layer): """ BOW @@ -35,10 +36,11 @@ class BOW(Layer): self.task_mode = conf_dict["task_mode"] self.emb_dim = conf_dict["net"]["emb_dim"] self.bow_dim = conf_dict["net"]["bow_dim"] - self.seq_len = 5 + self.seq_len = conf_dict["seq_len"] self.emb_layer = layers.EmbeddingLayer(self.dict_size, self.emb_dim, "emb").ops() - self.bow_layer = layers.FCLayer(self.bow_dim, None, "fc").ops() + self.bow_layer = Linear(self.bow_dim, self.bow_dim) self.softmax_layer = layers.FCLayer(2, "softmax", "cos_sim").ops() + def forward(self, left, right): """ @@ -46,7 +48,6 @@ class BOW(Layer): """ # embedding layer - left_emb = self.emb_layer(left) right_emb = self.emb_layer(right) left_emb = fluid.layers.reshape( diff --git a/dygraph/similarity_net/nets/cnn.py b/dygraph/similarity_net/nets/cnn.py index 8e7951ac74a55cef465580216357625632b5c038..d3f7fc87759e019ab059450bcafb3c6a0a871000 100644 --- a/dygraph/similarity_net/nets/cnn.py +++ b/dygraph/similarity_net/nets/cnn.py @@ -35,7 +35,7 @@ class CNN(Layer): self.filter_size = conf_dict["net"]["filter_size"] self.num_filters = conf_dict["net"]["num_filters"] self.hidden_dim = conf_dict["net"]["hidden_dim"] - self.seq_len = 5 + self.seq_len = conf_dict["seq_len"] self.channels = 1 # layers diff --git a/dygraph/similarity_net/nets/gru.py b/dygraph/similarity_net/nets/gru.py 
index eb7e1bd18ddc0d6751293738d55eb5ebfa82bdfd..b505a3a0cd70c82b703b9f3d898fb30c2bec0175 100644 --- a/dygraph/similarity_net/nets/gru.py +++ b/dygraph/similarity_net/nets/gru.py @@ -43,7 +43,7 @@ class GRU(Layer): self.fc_layer = layers.FCLayer(self.hidden_dim, None, "fc").ops() self.proj_layer = Linear(input_dim = self.hidden_dim, output_dim=self.gru_dim*3) self.softmax_layer = layers.FCLayer(2, "softmax", "cos_sim").ops() - self.seq_len=5 + self.seq_len=conf_dict["seq_len"] def forward(self, left, right): """ diff --git a/dygraph/similarity_net/nets/lstm.py b/dygraph/similarity_net/nets/lstm.py index c099625d17dffe8566f49b31d2a6e097e49296cc..9f53928a00015f013e68f546834e8c23ad3d7735 100644 --- a/dygraph/similarity_net/nets/lstm.py +++ b/dygraph/similarity_net/nets/lstm.py @@ -38,7 +38,7 @@ class LSTM(Layer): self.fc_layer = layers.FCLayer(self.hidden_dim, None, "fc").ops() self.softmax_layer = layers.FCLayer(2, "softmax", "cos_sim").ops() self.proj_layer = Linear(input_dim = self.hidden_dim, output_dim=self.lstm_dim*4) - self.seq_len = 5 + self.seq_len = conf_dict["seq_len"] def forward(self, left, right): diff --git a/dygraph/similarity_net/nets/mm_dnn.py b/dygraph/similarity_net/nets/mm_dnn.py index ce1679a2a3b75e7cc598be9ff49d1383996bf176..0601b55ee40b373865657387336f101c8f78225d 100644 --- a/dygraph/similarity_net/nets/mm_dnn.py +++ b/dygraph/similarity_net/nets/mm_dnn.py @@ -42,10 +42,11 @@ class MMDNN(Layer): self.dpool_size1 = int(config['net']['dpool_size_left']) self.dpool_size2 = int(config['net']['dpool_size_right']) self.hidden_size = int(config['net']['hidden_size']) - - self.seq_len1 = 5 + self.seq_len = int(conf_dict["seq_len"]) + self.seq_len1 = self.seq_len #int(config['max_len_left']) - self.seq_len2 = 5 #int(config['max_len_right']) + self.seq_len2 = self.seq_len + #int(config['max_len_right']) self.task_mode = config['task_mode'] self.zero_pad = True self.scale = False @@ -130,14 +131,14 @@ class MMDNN(Layer): right_seq_encoder = 
fluid.layers.concat([right_lstm, right_reverse], axis=1) pad_value = fluid.layers.assign(input=np.array([0]).astype("float32")) - - left_seq_encoder = fluid.layers.reshape(left_seq_encoder, shape=[left_seq_encoder.shape[0]/5,5,-1]) - right_seq_encoder = fluid.layers.reshape(right_seq_encoder, shape=[right_seq_encoder.shape[0]/5,5,-1]) + left_seq_encoder = fluid.layers.reshape(left_seq_encoder, shape=[int(left_seq_encoder.shape[0]/self.seq_len),self.seq_len,-1]) + right_seq_encoder = fluid.layers.reshape(right_seq_encoder, shape=[int(right_seq_encoder.shape[0]/self.seq_len),self.seq_len,-1]) cross = fluid.layers.matmul( left_seq_encoder, right_seq_encoder, transpose_y=True) - left_lens=to_variable(np.array([5])) - right_lens=to_variable(np.array([5])) + left_lens=to_variable(np.array([self.seq_len])) + right_lens=to_variable(np.array([self.seq_len])) + if self.match_mask: mask1 = fluid.layers.sequence_mask( @@ -157,7 +158,8 @@ class MMDNN(Layer): if mask is not None: cross_mask = fluid.layers.stack(x=[mask] * self.kernel_size, axis=0) cross_mask = fluid.layers.stack(x=[cross] * conv.shape[1], axis=1) - conv = cross_mask * conv + (1 - cross_mask) * (-2**5 + 1) + conv = cross_mask * conv + (1 - cross_mask) * (-2**self.seq_len + 1) + pool = self.pool_layer(conv) conv_pool_relu = fluid.layers.relu(pool) diff --git a/dygraph/similarity_net/nets/paddle_layers.py b/dygraph/similarity_net/nets/paddle_layers.py index cf9f2e399eb9350aaf460b4d5d303f467fbea74e..8c32076bdc5e0f28507d74bd08b024fe95bbf30d 100644 --- a/dygraph/similarity_net/nets/paddle_layers.py +++ b/dygraph/similarity_net/nets/paddle_layers.py @@ -20,7 +20,7 @@ import inspect import six import sys from functools import partial - +from functools import reduce import numpy as np import paddle import paddle.fluid as fluid @@ -1047,37 +1047,3 @@ class BasicGRUUnit(Layer): return new_hidden - -###### DELETE - -# @contextlib.contextmanager -# def eager_guard(is_eager): -# if is_eager: -# with fluid.dygraph.guard(): -# 
yield -# else: -# yield - -# # print(flatten(np.random.rand(2,8,8))) -# random_seed = 123 -# np.random.seed(random_seed) -# # print np.random.rand(2, 8) -# batch_size = 2 -# seq_len = 8 -# hidden_size = 8 -# vocab_size, embed_dim, num_layers, hidden_size = 100, 8, 2, 8 -# import torch - - -# with eager_guard(False): -# fluid.default_main_program().random_seed = random_seed -# fluid.default_startup_program().random_seed = random_seed -# lstm_cell = BasicLSTMUnit(hidden_size=8, input_size=8) -# lstm = RNN(cell=lstm_cell, time_major=True) -# #print lstm(inputs=to_variable(np.random.rand(2, 8, 8).astype("float32")))[0].numpy() -# executor.run(fluid.default_startup_program()) -# x = fluid.data(name="x", shape=[None, None, 8], dtype="float32") -# out, _ = lstm(x) -# out = executor.run(feed={"x": np.random.rand(2, 8, 8).astype("float32")}, fetch_list=[out.name])[0] -# print np.array(out) - diff --git a/dygraph/similarity_net/reader.py b/dygraph/similarity_net/reader.py index c38c8c6df3b3e7f79884cc000b92cba924794bc5..cda39b8e4055ff5646d933deae75843e60ac833b 100644 --- a/dygraph/similarity_net/reader.py +++ b/dygraph/similarity_net/reader.py @@ -28,6 +28,16 @@ class SimNetProcessor(object): self.valid_label = np.array([]) self.test_label = np.array([]) + self.seq_len = args.seq_len + + def padding_text(self, x): + if len(x) < self.seq_len: + x += [0]*(self.seq_len-len(x)) + if len(x) > self.seq_len: + x = x[0:self.seq_len] + return x + + def get_reader(self, mode, epoch=0): """ Get Reader @@ -60,6 +70,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title] elif mode == "test": with io.open(self.args.test_data_dir, "r", encoding="utf8") as file: @@ -83,8 +97,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] - # query = np.array([x.reshape(-1,1) for x in query]).astype('int64') - # title = np.array([x.reshape(-1,1) for x in 
title]).astype('int64') + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title] else: for idx in range(epoch): @@ -115,7 +131,11 @@ class SimNetProcessor(object): pos_title = [0] if len(neg_title) == 0: neg_title = [0] - + + query = self.padding_text(query) + pos_title = self.padding_text(pos_title) + neg_title = self.padding_text(neg_title) + yield [query, pos_title, neg_title] def reader_with_pointwise(): @@ -145,6 +165,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title] elif mode == "test": with io.open(self.args.test_data_dir, "r", encoding="utf8") as file: @@ -168,6 +192,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title] else: for idx in range(epoch): @@ -194,6 +222,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title, label] if self.args.task_mode == "pairwise": @@ -223,6 +255,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title] def get_infer_data(self): diff --git a/dygraph/similarity_net/run.sh b/dygraph/similarity_net/run.sh index 318f012ec895d6870022e0cdb64ce1d363b78a53..657505a56e0fc31e6ade5c959845a275ac809bfb 100644 --- a/dygraph/similarity_net/run.sh +++ b/dygraph/similarity_net/run.sh @@ -14,7 +14,9 @@ TEST_RESULT_PATH=./test_result INFER_RESULT_PATH=./infer_result TASK_MODE='pairwise' CONFIG_PATH=./config/bow_pairwise.json -INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/ + +INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/bow_pairwise + # run_train diff --git 
a/dygraph/similarity_net/run_classifier.py b/dygraph/similarity_net/run_classifier.py index 6da678f73c2c9d809df54756c5a5e04a9b918e38..b790927ec42ddeb01c77c3cc613d6e2e68eb02f5 100644 --- a/dygraph/similarity_net/run_classifier.py +++ b/dygraph/similarity_net/run_classifier.py @@ -96,6 +96,8 @@ def train(conf_dict, args): vocab = utils.load_vocab(args.vocab_path) # get vocab size conf_dict['dict_size'] = len(vocab) + conf_dict['seq_len'] = args.seq_len + # Load network structure dynamically net = utils.import_class("./nets", conf_dict["net"]["module_name"], @@ -302,6 +304,7 @@ def test(conf_dict, args): else: place = fluid.CPUPlace() with fluid.dygraph.guard(place): + vocab = utils.load_vocab(args.vocab_path) simnet_process = reader.SimNetProcessor(args, vocab) test_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=False) @@ -310,7 +313,9 @@ def test(conf_dict, args): paddle.batch(get_test_examples, batch_size=args.batch_size), place) - conf_dict['dict_size'] = len(vocab) + + conf_dict['dict_size'] = len(vocab) + conf_dict['seq_len'] = args.seq_len net = utils.import_class("./nets", conf_dict["net"]["module_name"], @@ -329,18 +334,22 @@ def test(conf_dict, args): left_feat, pos_score = net(left, pos_right) pred = pos_score # pred_list += list(pred.numpy()) - pred_list += list(map(lambda item: float(item[0]), pred.numpy()[0])) + + pred_list += list(map(lambda item: float(item[0]), pred.numpy())) predictions_file.write(u"\n".join( - map(lambda item: str((item[0] + 1) / 2), pred.numpy()[0])) + "\n") + map(lambda item: str((item[0] + 1) / 2), pred.numpy())) + "\n") + else: for left, right in test_pyreader(): left = fluid.layers.reshape(left, shape=[-1, 1]) right = fluid.layers.reshape(right, shape=[-1, 1]) left_feat, pred = net(left, right) # pred_list += list(pred.numpy()) - pred_list += list(map(lambda item: float(item[0]), pred.numpy()[0])) + + pred_list += list(map(lambda item: float(item[0]), pred.numpy())) 
predictions_file.write(u"\n".join( - map(lambda item: str(np.argmax(item)), pred.numpy()[0])) + "\n") + map(lambda item: str(np.argmax(item)), pred.numpy())) + "\n") + if args.task_mode == "pairwise": pred_list = np.array(pred_list).reshape((-1, 1)) @@ -377,45 +386,48 @@ def infer(conf_dict, args): place = fluid.CPUPlace() - vocab = utils.load_vocab(args.vocab_path) - simnet_process = reader.SimNetProcessor(args, vocab) - get_infer_examples = simnet_process.get_infer_reader - infer_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=False) - infer_pyreader.decorate_sample_list_generator( - paddle.batch(get_infer_examples, batch_size=args.batch_size), - place) - - conf_dict['dict_size'] = len(vocab) - - net = utils.import_class("./nets", - conf_dict["net"]["module_name"], - conf_dict["net"]["class_name"])(conf_dict) - model, _ = fluid.dygraph.load_dygraph(args.init_checkpoint) - net.set_dict(model) - pred_list = [] - if args.task_mode == "pairwise": - for left, pos_right in infer_pyreader(): - left = fluid.layers.reshape(left, shape=[-1, 1]) - pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) - - left_feat, pos_score = net(left, pos_right) - pred = pos_score - preds_list += list( - map(lambda item: str((item[0] + 1) / 2), pred.numpy()[0])) + with fluid.dygraph.guard(place): + vocab = utils.load_vocab(args.vocab_path) + simnet_process = reader.SimNetProcessor(args, vocab) + get_infer_examples = simnet_process.get_infer_reader + infer_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=False) + infer_pyreader.decorate_sample_list_generator( + paddle.batch(get_infer_examples, batch_size=args.batch_size), + place) - else: - for left, right in infer_pyreader(): - left = fluid.layers.reshape(left, shape=[-1, 1]) - pos_right = fluid.layers.reshape(right, shape=[-1, 1]) - left_feat, pred = net(left, right) - preds_list += map(lambda item: str(np.argmax(item)), pred.numpy()[0]) - - - with 
io.open(args.infer_result_path, "w", encoding="utf8") as infer_file: - for _data, _pred in zip(simnet_process.get_infer_data(), preds_list): - infer_file.write(_data + "\t" + _pred + "\n") - logging.info("infer result saved in %s" % - os.path.join(os.getcwd(), args.infer_result_path)) + conf_dict['dict_size'] = len(vocab) + conf_dict['seq_len'] = args.seq_len + + net = utils.import_class("./nets", + conf_dict["net"]["module_name"], + conf_dict["net"]["class_name"])(conf_dict) + model, _ = fluid.dygraph.load_dygraph(args.init_checkpoint) + net.set_dict(model) + + pred_list = [] + if args.task_mode == "pairwise": + for left, pos_right in infer_pyreader(): + left = fluid.layers.reshape(left, shape=[-1, 1]) + pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + + left_feat, pos_score = net(left, pos_right) + pred = pos_score + pred_list += list( + map(lambda item: str((item[0] + 1) / 2), pred.numpy())) + + else: + for left, right in infer_pyreader(): + left = fluid.layers.reshape(left, shape=[-1, 1]) + pos_right = fluid.layers.reshape(right, shape=[-1, 1]) + left_feat, pred = net(left, right) + pred_list += map(lambda item: str(np.argmax(item)), pred.numpy()) + + + with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file: + for _data, _pred in zip(simnet_process.get_infer_data(), pred_list): + infer_file.write(_data + "\t" + _pred + "\n") + logging.info("infer result saved in %s" % + os.path.join(os.getcwd(), args.infer_result_path)) def get_cards(): diff --git a/dygraph/similarity_net/utils.py b/dygraph/similarity_net/utils.py index 76fcc3630941cf02817b797302b5f229f5539308..afb87b59717440ca1a41c4aebe9a1a4d29d5b57c 100644 --- a/dygraph/similarity_net/utils.py +++ b/dygraph/similarity_net/utils.py @@ -175,6 +175,7 @@ class ArgConfig(object): model_g.add_arg("output_dir", str, None, "Directory path to save checkpoints") model_g.add_arg("task_mode", str, None, "task mode: pairwise or pointwise") + train_g = ArgumentGroup(parser, "training", 
"training options.") train_g.add_arg("epoch", int, 10, "Number of epoches for training.") train_g.add_arg("save_steps", int, 200, "The steps interval to save checkpoints.") @@ -193,6 +194,8 @@ class ArgConfig(object): data_g.add_arg("infer_data_dir", str, None, "Directory path to infer data.") data_g.add_arg("vocab_path", str, None, "Vocabulary path.") data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training.") + data_g.add_arg("seq_len", int, 32, "The length of each sentence.") + run_type_g = ArgumentGroup(parser, "run_type", "running type options.") run_type_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")