From e17aa9446ae198efd124e2976431f4f40fb67787 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=82=96?= Date: Fri, 21 Feb 2020 22:49:06 +0800 Subject: [PATCH] fix bugs of dygraph/similarity_net (#4322) * Update README.md (#4267) * test=develop (#4269) * 3d use new api (#4275) * PointNet++ and PointRCNN use new API * Update Readme of Dygraph BERT (#4277) Fix some typos. * Update run_classifier_multi_gpu.sh (#4279) remove the CUDA_VISIBLE_DEVICES * Update README.md (#4280) * add similarity_net dygraph * fix similarity_net dygraph * fix bugs of dygraph/similarity_net Co-authored-by: pkpk Co-authored-by: Kaipeng Deng --- PaddleNLP/similarity_net/run_classifier.py | 2 +- dygraph/similarity_net/README.md | 23 +++-- dygraph/similarity_net/download.py | 5 +- .../download_pretrained_model.sh | 7 +- .../similarity_net/evaluate/evaluate_ecom.sh | 3 +- .../similarity_net/evaluate/evaluate_qqsim.sh | 3 +- .../evaluate/evaluate_unicom.sh | 3 +- .../evaluate/evaluate_zhidao.sh | 3 +- dygraph/similarity_net/nets/bow.py | 9 +- dygraph/similarity_net/nets/cnn.py | 2 +- dygraph/similarity_net/nets/gru.py | 2 +- dygraph/similarity_net/nets/lstm.py | 2 +- dygraph/similarity_net/nets/mm_dnn.py | 20 ++-- dygraph/similarity_net/nets/paddle_layers.py | 36 +------ dygraph/similarity_net/reader.py | 42 +++++++- dygraph/similarity_net/run.sh | 4 +- dygraph/similarity_net/run_classifier.py | 98 +++++++++++-------- dygraph/similarity_net/utils.py | 3 + 18 files changed, 150 insertions(+), 117 deletions(-) diff --git a/PaddleNLP/similarity_net/run_classifier.py b/PaddleNLP/similarity_net/run_classifier.py index 944bb111..9fbb3384 100644 --- a/PaddleNLP/similarity_net/run_classifier.py +++ b/PaddleNLP/similarity_net/run_classifier.py @@ -532,4 +532,4 @@ if __name__ == "__main__": infer(conf_dict, args) else: raise ValueError( - "one of do_train and do_test and do_infer must be True") + "one of do_train and do_test and do_infer must be True") \ No newline at end of file diff --git 
a/dygraph/similarity_net/README.md b/dygraph/similarity_net/README.md index b9c147b5..0d75e04a 100644 --- a/dygraph/similarity_net/README.md +++ b/dygraph/similarity_net/README.md @@ -1,18 +1,17 @@ # 短文本语义匹配 ## 简介 ### 任务说明 -短文本语义匹配(SimilarityNet, SimNet)是一个计算短文本相似度的框架,可以根据用户输入的两个文本,计算出相似度得分。SimNet框架在百度各产品上广泛应用,主要包括BOW、CNN、RNN、MMDNN等核心网络结构形式,提供语义相似度计算训练和预测框架,适用于信息检索、新闻推荐、智能客服等多个应用场景,帮助企业解决语义匹配问题。可通过[AI开放平台-短文本相似度](https://ai.baidu.com/tech/nlp_basic/simnet)线上体验。 - -同时推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/124373) +短文本语义匹配(SimilarityNet, SimNet)是一个计算短文本相似度的框架,可以根据用户输入的两个文本,计算出相似度得分。SimNet框架在百度各产品上广泛应用,主要包括BOW、CNN、RNN、MMDNN等核心网络结构形式,提供语义相似度计算训练和预测框架,适用于信息检索、新闻推荐、智能客服等多个应用场景,帮助企业解决语义匹配问题。 ### 效果说明 -基于百度海量搜索数据,我们训练了一个SimNet-BOW-Pairwise语义匹配模型,在一些真实的FAQ问答场景中,该模型效果比基于字面的相似度方法AUC提升5%以上,我们基于百度自建测试集(包含聊天、客服等数据集)和语义匹配数据集(LCQMC)进行评测,效果如下表所示。LCQMC数据集以Accuracy为评测指标,而pairwise模型的输出为相似度,因此我们采用0.958作为分类阈值,相比于基线模型中网络结构同等复杂的CBOW模型(准确率为0.737),我们模型的准确率为0.7532。 +基于百度海量搜索数据,我们训练了一个SimNet-BOW-Pairwise语义匹配模型,在一些真实的FAQ问答场景中,该模型效果比基于字面的相似度方法AUC提升5%以上,我们基于百度自建测试集(包含聊天、客服等数据集)进行评测,效果如下表所示。 | 模型 | 百度知道 | ECOM |QQSIM | UNICOM | |:-----------:|:-------------:|:-------------:|:-------------:|:-------------:| | | AUC | AUC | AUC|正逆序比| -|BOW_Pairwise|0.6767|0.7329|0.7650|1.5630| +|BOW_Pairwise|0.6815|0.7331|0.7638|1.5566| + #### 测试集说明 | 数据集 | 来源 | 垂类 | |:-----------:|:-------------:|:-------------:| @@ -22,17 +21,21 @@ |UNICOM|联通客服|客服| ## 快速开始 #### 版本依赖 -本项目依赖于 Paddlepaddle Fluid 1.6,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。 + +本项目依赖于 Paddlepaddle Fluid 1.7,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。 + python版本依赖python 2.7 #### 安装代码 克隆工具集代码库到本地 ```shell git clone https://github.com/PaddlePaddle/models.git -cd models/PaddleNLP/similarity_net + +cd models/dygraph/similarity_net ``` #### 数据准备 -下载经过预处理的数据,运行命令后,data目录下会存在训练集数据示例、集数据示例、测试集数据示例,以及对应词索引字典(term2id.dict)。 +下载经过预处理的数据,运行命令后,data目录下会存在训练集数据示例、测试集数据示例,以及对应词索引字典(term2id.dict)。 
+ ```shell sh download_data.sh ``` @@ -46,6 +49,7 @@ python download.py dataset sh download_pretrained_model.sh ``` 或者 + ``` python download.py model ``` @@ -146,6 +150,7 @@ python tokenizer.py --test_data_dir ./test.txt.utf8 --batch_size 1 > test.txt.ut ├── utils.py:定义了其他常用的功能函数 ├── Config: 定义多种模型的配置文件 ├── download.py: 下载数据及预训练模型脚本 +├── nets: 基于动态图的网络结构 ``` ### 如何训练 @@ -178,7 +183,7 @@ python run_classifier.py \ i. 定义自己的网络结构 -用户可以在```./models/```下定义自己的模型; +用户可以在```./nets/```下定义自己的模型; ii. 更改模型配置 diff --git a/dygraph/similarity_net/download.py b/dygraph/similarity_net/download.py index 52b53def..e2140887 100644 --- a/dygraph/similarity_net/download.py +++ b/dygraph/similarity_net/download.py @@ -67,6 +67,7 @@ def download(url, filename, md5sum): retry = 0 retry_limit = 3 chunk_size = 4096 + while not (os.path.exists(filename) and md5file(filename) == md5sum): if retry < retry_limit: retry += 1 @@ -115,8 +116,8 @@ def download_dataset(dir_path): def download_model(dir_path): MODELS = {} BASE_URL = "https://baidu-nlp.bj.bcebos.com/" - CNN_NAME = "simnet_bow-pairwise-1.0.0.tar.gz" - CNN_MD5 = "199a3f3af31558edcc71c3b54ea5e129" + CNN_NAME = "simnet_bow_pairwise_dygraph.tar.gz" + CNN_MD5 = "30012af0ca8cdf0c613d8f56884f0f48" MODELS[CNN_NAME] = CNN_MD5 if not os.path.exists(dir_path): diff --git a/dygraph/similarity_net/download_pretrained_model.sh b/dygraph/similarity_net/download_pretrained_model.sh index 287e8dcc..691aff89 100644 --- a/dygraph/similarity_net/download_pretrained_model.sh +++ b/dygraph/similarity_net/download_pretrained_model.sh @@ -2,9 +2,10 @@ model_files_path="./model_files" #get pretrained_bow_pairwise_model -wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_bow-pairwise-1.0.0.tar.gz + +wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_bow_pairwise_dygraph.tar.gz if [ ! 
-d $model_files_path ]; then mkdir $model_files_path fi -tar xzf simnet_bow-pairwise-1.0.0.tar.gz -C $model_files_path -rm simnet_bow-pairwise-1.0.0.tar.gz \ No newline at end of file +tar xzf simnet_bow_pairwise_dygraph.tar.gz -C $model_files_path +rm simnet_bow_pairwise_dygraph.tar.gz diff --git a/dygraph/similarity_net/evaluate/evaluate_ecom.sh b/dygraph/similarity_net/evaluate/evaluate_ecom.sh index 4a00efab..371c9258 100644 --- a/dygraph/similarity_net/evaluate/evaluate_ecom.sh +++ b/dygraph/similarity_net/evaluate/evaluate_ecom.sh @@ -10,7 +10,8 @@ CKPT_PATH=./model_files TEST_RESULT_PATH=./evaluate/ecom_test_result TASK_MODE='pairwise' CONFIG_PATH=./config/bow_pairwise.json -INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/ +INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/bow_pairwise + cd .. python ./run_classifier.py \ diff --git a/dygraph/similarity_net/evaluate/evaluate_qqsim.sh b/dygraph/similarity_net/evaluate/evaluate_qqsim.sh index fa8bdcc0..383e451b 100644 --- a/dygraph/similarity_net/evaluate/evaluate_qqsim.sh +++ b/dygraph/similarity_net/evaluate/evaluate_qqsim.sh @@ -10,7 +10,8 @@ CKPT_PATH=./model_files TEST_RESULT_PATH=./evaluate/qqsim_test_result TASK_MODE='pairwise' CONFIG_PATH=./config/bow_pairwise.json -INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/ +INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/bow_pairwise + cd .. 
python ./run_classifier.py \ diff --git a/dygraph/similarity_net/evaluate/evaluate_unicom.sh b/dygraph/similarity_net/evaluate/evaluate_unicom.sh index a93aaa4b..ec8352c0 100644 --- a/dygraph/similarity_net/evaluate/evaluate_unicom.sh +++ b/dygraph/similarity_net/evaluate/evaluate_unicom.sh @@ -10,7 +10,8 @@ CKPT_PATH=./model_files INFER_RESULT_PATH=./evaluate/unicom_infer_result TASK_MODE='pairwise' CONFIG_PATH=./config/bow_pairwise.json -INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/ +INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/bow_pairwise + python unicom_split.py cd .. diff --git a/dygraph/similarity_net/evaluate/evaluate_zhidao.sh b/dygraph/similarity_net/evaluate/evaluate_zhidao.sh index 9e634610..951b55d5 100644 --- a/dygraph/similarity_net/evaluate/evaluate_zhidao.sh +++ b/dygraph/similarity_net/evaluate/evaluate_zhidao.sh @@ -10,7 +10,8 @@ CKPT_PATH=./model_files TEST_RESULT_PATH=./evaluate/zhidao_test_result TASK_MODE='pairwise' CONFIG_PATH=./config/bow_pairwise.json -INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/ +INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/bow_pairwise + cd .. 
python ./run_classifier.py \ diff --git a/dygraph/similarity_net/nets/bow.py b/dygraph/similarity_net/nets/bow.py index d2897419..b85b98e4 100644 --- a/dygraph/similarity_net/nets/bow.py +++ b/dygraph/similarity_net/nets/bow.py @@ -18,9 +18,10 @@ bow class import paddle_layers as layers from paddle import fluid from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph import Layer, Embedding +from paddle.fluid.dygraph import Layer, Embedding, Linear import paddle.fluid.param_attr as attr uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x) + class BOW(Layer): """ BOW @@ -35,10 +36,11 @@ class BOW(Layer): self.task_mode = conf_dict["task_mode"] self.emb_dim = conf_dict["net"]["emb_dim"] self.bow_dim = conf_dict["net"]["bow_dim"] - self.seq_len = 5 + self.seq_len = conf_dict["seq_len"] self.emb_layer = layers.EmbeddingLayer(self.dict_size, self.emb_dim, "emb").ops() - self.bow_layer = layers.FCLayer(self.bow_dim, None, "fc").ops() + self.bow_layer = Linear(self.bow_dim, self.bow_dim) self.softmax_layer = layers.FCLayer(2, "softmax", "cos_sim").ops() + def forward(self, left, right): """ @@ -46,7 +48,6 @@ class BOW(Layer): """ # embedding layer - left_emb = self.emb_layer(left) right_emb = self.emb_layer(right) left_emb = fluid.layers.reshape( diff --git a/dygraph/similarity_net/nets/cnn.py b/dygraph/similarity_net/nets/cnn.py index 8e7951ac..d3f7fc87 100644 --- a/dygraph/similarity_net/nets/cnn.py +++ b/dygraph/similarity_net/nets/cnn.py @@ -35,7 +35,7 @@ class CNN(Layer): self.filter_size = conf_dict["net"]["filter_size"] self.num_filters = conf_dict["net"]["num_filters"] self.hidden_dim = conf_dict["net"]["hidden_dim"] - self.seq_len = 5 + self.seq_len = conf_dict["seq_len"] self.channels = 1 # layers diff --git a/dygraph/similarity_net/nets/gru.py b/dygraph/similarity_net/nets/gru.py index eb7e1bd1..b505a3a0 100644 --- a/dygraph/similarity_net/nets/gru.py +++ b/dygraph/similarity_net/nets/gru.py @@ -43,7 +43,7 @@ 
class GRU(Layer): self.fc_layer = layers.FCLayer(self.hidden_dim, None, "fc").ops() self.proj_layer = Linear(input_dim = self.hidden_dim, output_dim=self.gru_dim*3) self.softmax_layer = layers.FCLayer(2, "softmax", "cos_sim").ops() - self.seq_len=5 + self.seq_len=conf_dict["seq_len"] def forward(self, left, right): """ diff --git a/dygraph/similarity_net/nets/lstm.py b/dygraph/similarity_net/nets/lstm.py index c099625d..9f53928a 100644 --- a/dygraph/similarity_net/nets/lstm.py +++ b/dygraph/similarity_net/nets/lstm.py @@ -38,7 +38,7 @@ class LSTM(Layer): self.fc_layer = layers.FCLayer(self.hidden_dim, None, "fc").ops() self.softmax_layer = layers.FCLayer(2, "softmax", "cos_sim").ops() self.proj_layer = Linear(input_dim = self.hidden_dim, output_dim=self.lstm_dim*4) - self.seq_len = 5 + self.seq_len = conf_dict["seq_len"] def forward(self, left, right): diff --git a/dygraph/similarity_net/nets/mm_dnn.py b/dygraph/similarity_net/nets/mm_dnn.py index ce1679a2..0601b55e 100644 --- a/dygraph/similarity_net/nets/mm_dnn.py +++ b/dygraph/similarity_net/nets/mm_dnn.py @@ -42,10 +42,11 @@ class MMDNN(Layer): self.dpool_size1 = int(config['net']['dpool_size_left']) self.dpool_size2 = int(config['net']['dpool_size_right']) self.hidden_size = int(config['net']['hidden_size']) - - self.seq_len1 = 5 + self.seq_len = int(conf_dict["seq_len"]) + self.seq_len1 = self.seq_len #int(config['max_len_left']) - self.seq_len2 = 5 #int(config['max_len_right']) + self.seq_len2 = self.seq_len + #int(config['max_len_right']) self.task_mode = config['task_mode'] self.zero_pad = True self.scale = False @@ -130,14 +131,14 @@ class MMDNN(Layer): right_seq_encoder = fluid.layers.concat([right_lstm, right_reverse], axis=1) pad_value = fluid.layers.assign(input=np.array([0]).astype("float32")) - - left_seq_encoder = fluid.layers.reshape(left_seq_encoder, shape=[left_seq_encoder.shape[0]/5,5,-1]) - right_seq_encoder = fluid.layers.reshape(right_seq_encoder, shape=[right_seq_encoder.shape[0]/5,5,-1]) + 
left_seq_encoder = fluid.layers.reshape(left_seq_encoder, shape=[int(left_seq_encoder.shape[0]/self.seq_len),self.seq_len,-1]) + right_seq_encoder = fluid.layers.reshape(right_seq_encoder, shape=[int(right_seq_encoder.shape[0]/self.seq_len),self.seq_len,-1]) cross = fluid.layers.matmul( left_seq_encoder, right_seq_encoder, transpose_y=True) - left_lens=to_variable(np.array([5])) - right_lens=to_variable(np.array([5])) + left_lens=to_variable(np.array([self.seq_len])) + right_lens=to_variable(np.array([self.seq_len])) + if self.match_mask: mask1 = fluid.layers.sequence_mask( @@ -157,7 +158,8 @@ class MMDNN(Layer): if mask is not None: cross_mask = fluid.layers.stack(x=[mask] * self.kernel_size, axis=0) cross_mask = fluid.layers.stack(x=[cross] * conv.shape[1], axis=1) - conv = cross_mask * conv + (1 - cross_mask) * (-2**5 + 1) + conv = cross_mask * conv + (1 - cross_mask) * (-2**self.seq_len + 1) + pool = self.pool_layer(conv) conv_pool_relu = fluid.layers.relu(pool) diff --git a/dygraph/similarity_net/nets/paddle_layers.py b/dygraph/similarity_net/nets/paddle_layers.py index cf9f2e39..8c32076b 100644 --- a/dygraph/similarity_net/nets/paddle_layers.py +++ b/dygraph/similarity_net/nets/paddle_layers.py @@ -20,7 +20,7 @@ import inspect import six import sys from functools import partial - +from functools import reduce import numpy as np import paddle import paddle.fluid as fluid @@ -1047,37 +1047,3 @@ class BasicGRUUnit(Layer): return new_hidden - -###### DELETE - -# @contextlib.contextmanager -# def eager_guard(is_eager): -# if is_eager: -# with fluid.dygraph.guard(): -# yield -# else: -# yield - -# # print(flatten(np.random.rand(2,8,8))) -# random_seed = 123 -# np.random.seed(random_seed) -# # print np.random.rand(2, 8) -# batch_size = 2 -# seq_len = 8 -# hidden_size = 8 -# vocab_size, embed_dim, num_layers, hidden_size = 100, 8, 2, 8 -# import torch - - -# with eager_guard(False): -# fluid.default_main_program().random_seed = random_seed -# 
fluid.default_startup_program().random_seed = random_seed -# lstm_cell = BasicLSTMUnit(hidden_size=8, input_size=8) -# lstm = RNN(cell=lstm_cell, time_major=True) -# #print lstm(inputs=to_variable(np.random.rand(2, 8, 8).astype("float32")))[0].numpy() -# executor.run(fluid.default_startup_program()) -# x = fluid.data(name="x", shape=[None, None, 8], dtype="float32") -# out, _ = lstm(x) -# out = executor.run(feed={"x": np.random.rand(2, 8, 8).astype("float32")}, fetch_list=[out.name])[0] -# print np.array(out) - diff --git a/dygraph/similarity_net/reader.py b/dygraph/similarity_net/reader.py index c38c8c6d..cda39b8e 100644 --- a/dygraph/similarity_net/reader.py +++ b/dygraph/similarity_net/reader.py @@ -28,6 +28,16 @@ class SimNetProcessor(object): self.valid_label = np.array([]) self.test_label = np.array([]) + self.seq_len = args.seq_len + + def padding_text(self, x): + if len(x) < self.seq_len: + x += [0]*(self.seq_len-len(x)) + if len(x) > self.seq_len: + x = x[0:self.seq_len] + return x + + def get_reader(self, mode, epoch=0): """ Get Reader @@ -60,6 +70,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title] elif mode == "test": with io.open(self.args.test_data_dir, "r", encoding="utf8") as file: @@ -83,8 +97,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] - # query = np.array([x.reshape(-1,1) for x in query]).astype('int64') - # title = np.array([x.reshape(-1,1) for x in title]).astype('int64') + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title] else: for idx in range(epoch): @@ -115,7 +131,11 @@ class SimNetProcessor(object): pos_title = [0] if len(neg_title) == 0: neg_title = [0] - + + query = self.padding_text(query) + pos_title = self.padding_text(pos_title) + neg_title = self.padding_text(neg_title) + yield [query, pos_title, neg_title] def 
reader_with_pointwise(): @@ -145,6 +165,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title] elif mode == "test": with io.open(self.args.test_data_dir, "r", encoding="utf8") as file: @@ -168,6 +192,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title] else: for idx in range(epoch): @@ -194,6 +222,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title, label] if self.args.task_mode == "pairwise": @@ -223,6 +255,10 @@ class SimNetProcessor(object): query = [0] if len(title) == 0: title = [0] + + query = self.padding_text(query) + title = self.padding_text(title) + yield [query, title] def get_infer_data(self): diff --git a/dygraph/similarity_net/run.sh b/dygraph/similarity_net/run.sh index 318f012e..657505a5 100644 --- a/dygraph/similarity_net/run.sh +++ b/dygraph/similarity_net/run.sh @@ -14,7 +14,9 @@ TEST_RESULT_PATH=./test_result INFER_RESULT_PATH=./infer_result TASK_MODE='pairwise' CONFIG_PATH=./config/bow_pairwise.json -INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/ + +INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/bow_pairwise + # run_train diff --git a/dygraph/similarity_net/run_classifier.py b/dygraph/similarity_net/run_classifier.py index 6da678f7..b790927e 100644 --- a/dygraph/similarity_net/run_classifier.py +++ b/dygraph/similarity_net/run_classifier.py @@ -96,6 +96,8 @@ def train(conf_dict, args): vocab = utils.load_vocab(args.vocab_path) # get vocab size conf_dict['dict_size'] = len(vocab) + conf_dict['seq_len'] = args.seq_len + # Load network structure dynamically net = utils.import_class("./nets", conf_dict["net"]["module_name"], @@ -302,6 +304,7 @@ 
def test(conf_dict, args): else: place = fluid.CPUPlace() with fluid.dygraph.guard(place): + vocab = utils.load_vocab(args.vocab_path) simnet_process = reader.SimNetProcessor(args, vocab) test_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=False) @@ -310,7 +313,9 @@ def test(conf_dict, args): paddle.batch(get_test_examples, batch_size=args.batch_size), place) - conf_dict['dict_size'] = len(vocab) + + conf_dict['dict_size'] = len(vocab) + conf_dict['seq_len'] = args.seq_len net = utils.import_class("./nets", conf_dict["net"]["module_name"], @@ -329,18 +334,22 @@ def test(conf_dict, args): left_feat, pos_score = net(left, pos_right) pred = pos_score # pred_list += list(pred.numpy()) - pred_list += list(map(lambda item: float(item[0]), pred.numpy()[0])) + + pred_list += list(map(lambda item: float(item[0]), pred.numpy())) predictions_file.write(u"\n".join( - map(lambda item: str((item[0] + 1) / 2), pred.numpy()[0])) + "\n") + map(lambda item: str((item[0] + 1) / 2), pred.numpy())) + "\n") + else: for left, right in test_pyreader(): left = fluid.layers.reshape(left, shape=[-1, 1]) right = fluid.layers.reshape(right, shape=[-1, 1]) left_feat, pred = net(left, right) # pred_list += list(pred.numpy()) - pred_list += list(map(lambda item: float(item[0]), pred.numpy()[0])) + + pred_list += list(map(lambda item: float(item[0]), pred.numpy())) predictions_file.write(u"\n".join( - map(lambda item: str(np.argmax(item)), pred.numpy()[0])) + "\n") + map(lambda item: str(np.argmax(item)), pred.numpy())) + "\n") + if args.task_mode == "pairwise": pred_list = np.array(pred_list).reshape((-1, 1)) @@ -377,45 +386,48 @@ def infer(conf_dict, args): place = fluid.CPUPlace() - vocab = utils.load_vocab(args.vocab_path) - simnet_process = reader.SimNetProcessor(args, vocab) - get_infer_examples = simnet_process.get_infer_reader - infer_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=False) - 
infer_pyreader.decorate_sample_list_generator( - paddle.batch(get_infer_examples, batch_size=args.batch_size), - place) - - conf_dict['dict_size'] = len(vocab) - - net = utils.import_class("./nets", - conf_dict["net"]["module_name"], - conf_dict["net"]["class_name"])(conf_dict) - model, _ = fluid.dygraph.load_dygraph(args.init_checkpoint) - net.set_dict(model) - pred_list = [] - if args.task_mode == "pairwise": - for left, pos_right in infer_pyreader(): - left = fluid.layers.reshape(left, shape=[-1, 1]) - pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) - - left_feat, pos_score = net(left, pos_right) - pred = pos_score - preds_list += list( - map(lambda item: str((item[0] + 1) / 2), pred.numpy()[0])) + with fluid.dygraph.guard(place): + vocab = utils.load_vocab(args.vocab_path) + simnet_process = reader.SimNetProcessor(args, vocab) + get_infer_examples = simnet_process.get_infer_reader + infer_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=False) + infer_pyreader.decorate_sample_list_generator( + paddle.batch(get_infer_examples, batch_size=args.batch_size), + place) - else: - for left, right in infer_pyreader(): - left = fluid.layers.reshape(left, shape=[-1, 1]) - pos_right = fluid.layers.reshape(right, shape=[-1, 1]) - left_feat, pred = net(left, right) - preds_list += map(lambda item: str(np.argmax(item)), pred.numpy()[0]) - - - with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file: - for _data, _pred in zip(simnet_process.get_infer_data(), preds_list): - infer_file.write(_data + "\t" + _pred + "\n") - logging.info("infer result saved in %s" % - os.path.join(os.getcwd(), args.infer_result_path)) + conf_dict['dict_size'] = len(vocab) + conf_dict['seq_len'] = args.seq_len + + net = utils.import_class("./nets", + conf_dict["net"]["module_name"], + conf_dict["net"]["class_name"])(conf_dict) + model, _ = fluid.dygraph.load_dygraph(args.init_checkpoint) + net.set_dict(model) + + pred_list = [] + if 
args.task_mode == "pairwise": + for left, pos_right in infer_pyreader(): + left = fluid.layers.reshape(left, shape=[-1, 1]) + pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) + + left_feat, pos_score = net(left, pos_right) + pred = pos_score + pred_list += list( + map(lambda item: str((item[0] + 1) / 2), pred.numpy())) + + else: + for left, right in infer_pyreader(): + left = fluid.layers.reshape(left, shape=[-1, 1]) + pos_right = fluid.layers.reshape(right, shape=[-1, 1]) + left_feat, pred = net(left, right) + pred_list += map(lambda item: str(np.argmax(item)), pred.numpy()) + + + with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file: + for _data, _pred in zip(simnet_process.get_infer_data(), pred_list): + infer_file.write(_data + "\t" + _pred + "\n") + logging.info("infer result saved in %s" % + os.path.join(os.getcwd(), args.infer_result_path)) def get_cards(): diff --git a/dygraph/similarity_net/utils.py b/dygraph/similarity_net/utils.py index 76fcc363..afb87b59 100644 --- a/dygraph/similarity_net/utils.py +++ b/dygraph/similarity_net/utils.py @@ -175,6 +175,7 @@ class ArgConfig(object): model_g.add_arg("output_dir", str, None, "Directory path to save checkpoints") model_g.add_arg("task_mode", str, None, "task mode: pairwise or pointwise") + train_g = ArgumentGroup(parser, "training", "training options.") train_g.add_arg("epoch", int, 10, "Number of epoches for training.") train_g.add_arg("save_steps", int, 200, "The steps interval to save checkpoints.") @@ -193,6 +194,8 @@ class ArgConfig(object): data_g.add_arg("infer_data_dir", str, None, "Directory path to infer data.") data_g.add_arg("vocab_path", str, None, "Vocabulary path.") data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training.") + data_g.add_arg("seq_len", int, 32, "The length of each sentence.") + run_type_g = ArgumentGroup(parser, "run_type", "running type options.") run_type_g.add_arg("use_cuda", bool, False, "If set, use GPU for 
training.") -- GitLab