diff --git a/PaddleNLP/similarity_net/README.md b/PaddleNLP/similarity_net/README.md
index 90208707cd051ab418a8b7f9f3e534e09b1a6b05..4976eff140557214025c8afc93f43e8d9ee6d0ce 100644
--- a/PaddleNLP/similarity_net/README.md
+++ b/PaddleNLP/similarity_net/README.md
@@ -12,7 +12,8 @@
 | 模型 | 百度知道 | ECOM |QQSIM | UNICOM |
 |:-----------:|:-------------:|:-------------:|:-------------:|:-------------:|
 | | AUC | AUC | AUC|正逆序比|
-|BOW_Pairwise|0.6767|0.7329|0.7650|1.5630|
+|BOW_Pairwise(WordSeg)|0.6767|0.7329|0.7650|1.5630|
+|BOW_Pairwise(Jieba)|0.6658|0.7351|0.8431|1.5331|
 #### 测试集说明
 | 数据集 | 来源 | 垂类 |
 |:-----------:|:-------------:|:-------------:|
@@ -51,7 +52,10 @@
 python download.py model
 ```
 #### 评估
-我们公开了自建的测试集,包括百度知道、ECOM、QQSIM、UNICOM四个数据集,基于上面的预训练模型,用户可以进入evaluate目录下依次执行下列命令获取测试集评估结果。
+我们公开了自建的测试集,包括百度知道、ECOM、QQSIM、UNICOM 四个数据集,基于上面的预训练模型,用户可以进入 evaluate 目录下依次执行下列命令获取测试集评估结果。
+
+我们在以下评估脚本中以 Jieba 切词作为示例。如果您需要自定义切词模块,只需要在 [`tokenization.py`](tokenization.py) 中参考 `JiebaTokenizer` 实现自定义的切词类,并且在 `evaluate_*.sh` 评估脚本中配置环境变量 `TOKENIZER=${YOUR_TOKENIZER_NAME}` 即可。如果 `TOKENIZER` 环境变量为空,则默认输入数据是切词后的数据(示例给出的数据是百度切词工具 WordSeg 切词后的数据)。
+
 ```shell
 sh evaluate_ecom.sh
 sh evaluate_qqsim.sh
diff --git a/PaddleNLP/similarity_net/download.py b/PaddleNLP/similarity_net/download.py
index 52b53def4959e17443b0e64e918c0ffce1fd71e4..d93b7e8ad3a2a67d96587354f5231ce306ab3563 100644
--- a/PaddleNLP/similarity_net/download.py
+++ b/PaddleNLP/similarity_net/download.py
@@ -96,8 +96,8 @@ def download(url, filename, md5sum):
 
 def download_dataset(dir_path):
     BASE_URL = "https://baidu-nlp.bj.bcebos.com/"
-    DATASET_NAME = "simnet_dataset-1.0.0.tar.gz"
-    DATASET_MD5 = "ec65b313bc237150ef536a8d26f3c73b"
+    DATASET_NAME = "simnet_dataset-1.0.1.tar.gz"
+    DATASET_MD5 = "4a381770178721b539e7cf0f91a8777d"
     file_path = os.path.join(dir_path, DATASET_NAME)
     url = BASE_URL + DATASET_NAME
 
diff --git a/PaddleNLP/similarity_net/download_data.sh b/PaddleNLP/similarity_net/download_data.sh
index ea1aaf9cb984e3e93a7b36e0c7db1357af1a19c2..47f0bf93caf12bfc8e38a70fba175237c33f304b 100644
--- a/PaddleNLP/similarity_net/download_data.sh
+++ b/PaddleNLP/similarity_net/download_data.sh
@@ -1,5 +1,4 @@
 #get data
-wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
-tar xzf simnet_dataset-1.0.0.tar.gz
-rm simnet_dataset-1.0.0.tar.gz
-
+wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.1.tar.gz
+tar xzf simnet_dataset-1.0.1.tar.gz
+rm simnet_dataset-1.0.1.tar.gz
diff --git a/PaddleNLP/similarity_net/evaluate/evaluate_ecom.sh b/PaddleNLP/similarity_net/evaluate/evaluate_ecom.sh
index 4a00efabae4f6bf07c8a7e73c3d214d1c8a4276f..3423afa7757a4c5f0c57acf09ed54de8f7eaa98d 100644
--- a/PaddleNLP/similarity_net/evaluate/evaluate_ecom.sh
+++ b/PaddleNLP/similarity_net/evaluate/evaluate_ecom.sh
@@ -4,13 +4,21 @@
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=3
 export FLAGS_fraction_of_gpu_memory_to_use=0.95
 TASK_NAME='simnet'
-TEST_DATA_PATH=./data/ecom
 VOCAB_PATH=./data/term2id.dict
 CKPT_PATH=./model_files
 TEST_RESULT_PATH=./evaluate/ecom_test_result
 TASK_MODE='pairwise'
 CONFIG_PATH=./config/bow_pairwise.json
 INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/
+
+# use JiebaTokenizer to evaluate
+TOKENIZER="JiebaTokenizer"
+TEST_DATA_PATH=./data/ecom_raw
+
+# use tokenized data by WordSeg to evaluate
+#TOKENIZER=""
+#TEST_DATA_PATH=./data/ecom
+
 cd ..
 python ./run_classifier.py \
@@ -23,5 +31,6 @@ python ./run_classifier.py \
     --test_result_path ${TEST_RESULT_PATH} \
     --config_path ${CONFIG_PATH} \
     --vocab_path ${VOCAB_PATH} \
+    --tokenizer ${TOKENIZER:-""} \
     --task_mode ${TASK_MODE} \
     --init_checkpoint ${INIT_CHECKPOINT}
diff --git a/PaddleNLP/similarity_net/evaluate/evaluate_qqsim.sh b/PaddleNLP/similarity_net/evaluate/evaluate_qqsim.sh
index fa8bdcc034af4c77a199e52af1e0d23471fefeec..3cc66d12b47774a9d532f55074c1af5ad047b313 100644
--- a/PaddleNLP/similarity_net/evaluate/evaluate_qqsim.sh
+++ b/PaddleNLP/similarity_net/evaluate/evaluate_qqsim.sh
@@ -4,13 +4,21 @@
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=3
 export FLAGS_fraction_of_gpu_memory_to_use=0.95
 TASK_NAME='simnet'
-TEST_DATA_PATH=./data/qqsim
 VOCAB_PATH=./data/term2id.dict
 CKPT_PATH=./model_files
 TEST_RESULT_PATH=./evaluate/qqsim_test_result
 TASK_MODE='pairwise'
 CONFIG_PATH=./config/bow_pairwise.json
 INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/
+
+# use JiebaTokenizer to evaluate
+TOKENIZER="JiebaTokenizer"
+TEST_DATA_PATH=./data/qqsim_raw
+
+# use tokenized data by WordSeg to evaluate
+#TOKENIZER=""
+#TEST_DATA_PATH=./data/qqsim
+
 cd ..
 python ./run_classifier.py \
@@ -23,5 +31,6 @@ python ./run_classifier.py \
     --test_result_path ${TEST_RESULT_PATH} \
     --config_path ${CONFIG_PATH} \
     --vocab_path ${VOCAB_PATH} \
+    --tokenizer ${TOKENIZER:-""} \
     --task_mode ${TASK_MODE} \
     --init_checkpoint ${INIT_CHECKPOINT}
diff --git a/PaddleNLP/similarity_net/evaluate/evaluate_unicom.sh b/PaddleNLP/similarity_net/evaluate/evaluate_unicom.sh
index a93aaa4bc2c7a75dfad1e4f9d1f59ae9711947f1..3aca0942de26a9daf79075ad97515d3ed9454555 100644
--- a/PaddleNLP/similarity_net/evaluate/evaluate_unicom.sh
+++ b/PaddleNLP/similarity_net/evaluate/evaluate_unicom.sh
@@ -4,7 +4,6 @@ export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=3
 export FLAGS_fraction_of_gpu_memory_to_use=0.95
 TASK_NAME='simnet'
-INFER_DATA_PATH=./evaluate/unicom_infer
 VOCAB_PATH=./data/term2id.dict
 CKPT_PATH=./model_files
 INFER_RESULT_PATH=./evaluate/unicom_infer_result
@@ -12,6 +11,14 @@
 TASK_MODE='pairwise'
 CONFIG_PATH=./config/bow_pairwise.json
 INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/
+# use JiebaTokenizer to evaluate
+TOKENIZER="JiebaTokenizer"
+INFER_DATA_PATH=./data/unicom_infer_raw
+
+# use tokenized data by WordSeg to evaluate
+#TOKENIZER=""
+#INFER_DATA_PATH=./evaluate/unicom_infer
+
 python unicom_split.py
 cd ..
 python ./run_classifier.py \
@@ -23,8 +30,8 @@ python ./run_classifier.py \
     --infer_result_path ${INFER_RESULT_PATH} \
     --config_path ${CONFIG_PATH} \
     --vocab_path ${VOCAB_PATH} \
+    --tokenizer ${TOKENIZER:-""} \
     --task_mode ${TASK_MODE} \
     --init_checkpoint ${INIT_CHECKPOINT}
 cd evaluate
 python unicom_compute_pos_neg.py
-
diff --git a/PaddleNLP/similarity_net/evaluate/evaluate_zhidao.sh b/PaddleNLP/similarity_net/evaluate/evaluate_zhidao.sh
index 9e6346104c8bbb8540f39e58a404eb566f43b0ed..0a33c892c9e11740a6eaea59028ab24c44ee8bc6 100644
--- a/PaddleNLP/similarity_net/evaluate/evaluate_zhidao.sh
+++ b/PaddleNLP/similarity_net/evaluate/evaluate_zhidao.sh
@@ -4,13 +4,21 @@
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=3
 export FLAGS_fraction_of_gpu_memory_to_use=0.95
 TASK_NAME='simnet'
-TEST_DATA_PATH=./data/zhidao
 VOCAB_PATH=./data/term2id.dict
 CKPT_PATH=./model_files
 TEST_RESULT_PATH=./evaluate/zhidao_test_result
 TASK_MODE='pairwise'
 CONFIG_PATH=./config/bow_pairwise.json
 INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/
+
+# use JiebaTokenizer to evaluate
+TOKENIZER="JiebaTokenizer"
+TEST_DATA_PATH=./data/zhidao_raw
+
+# use tokenized data by WordSeg to evaluate
+#TOKENIZER=""
+#TEST_DATA_PATH=./data/zhidao
+
 cd ..
 python ./run_classifier.py \
@@ -23,5 +31,6 @@ python ./run_classifier.py \
     --test_result_path ${TEST_RESULT_PATH} \
     --config_path ${CONFIG_PATH} \
     --vocab_path ${VOCAB_PATH} \
+    --tokenizer ${TOKENIZER:-""} \
     --task_mode ${TASK_MODE} \
     --init_checkpoint ${INIT_CHECKPOINT}
diff --git a/PaddleNLP/similarity_net/reader.py b/PaddleNLP/similarity_net/reader.py
index c9a68a8d8317f6842e58d6a9f79aec0b4d617cc2..36aca1bbea8be33025331f9e6422c3a76fdef437 100644
--- a/PaddleNLP/similarity_net/reader.py
+++ b/PaddleNLP/similarity_net/reader.py
@@ -19,6 +19,7 @@
 import logging
 import numpy as np
 import io
+import tokenization
 
 class SimNetProcessor(object):
     def __init__(self, args, vocab):
@@ -27,6 +28,10 @@
         self.vocab = vocab
         self.valid_label = np.array([])
         self.test_label = np.array([])
+        if args.tokenizer:
+            self.tokenizer = getattr(tokenization, args.tokenizer)()
+        else:
+            self.tokenizer = None
 
     def get_reader(self, mode, epoch=0):
         """
@@ -48,6 +53,12 @@
                             logging.warning(
                                 "line not match format in test file")
                             continue
+
+                        # tokenize
+                        if self.tokenizer:
+                            query = self.tokenizer.tokenize(query)
+                            title = self.tokenizer.tokenize(title)
+
                         query = [
                             self.vocab[word] for word in query.split(" ")
                             if word in self.vocab
@@ -71,6 +82,12 @@
                             logging.warning(
                                 "line not match format in test file")
                             continue
+
+                        # tokenize
+                        if self.tokenizer:
+                            query = self.tokenizer.tokenize(query)
+                            title = self.tokenizer.tokenize(title)
+
                         query = [
                             self.vocab[word] for word in query.split(" ")
                             if word in self.vocab
@@ -95,6 +112,12 @@
                             logging.warning(
                                 "line not match format in test file")
                             continue
+                        # tokenize
+                        if self.tokenizer:
+                            query = self.tokenizer.tokenize(query)
+                            pos_title = self.tokenizer.tokenize(pos_title)
+                            neg_title = self.tokenizer.tokenize(neg_title)
+
                         query = [
                             self.vocab[word] for word in query.split(" ")
                             if word in self.vocab
@@ -130,6 +153,12 @@
                             logging.warning(
                                 "line not match format in test file")
                             continue
+
+                        # tokenize
+                        if self.tokenizer:
+                            query = self.tokenizer.tokenize(query)
+                            title = self.tokenizer.tokenize(title)
+
                         query = [
                             self.vocab[word] for word in query.split(" ")
                             if word in self.vocab
@@ -153,6 +182,12 @@ class SimNetProcessor(object):
                             logging.warning(
                                 "line not match format in test file")
                             continue
+
+                        # tokenize
+                        if self.tokenizer:
+                            query = self.tokenizer.tokenize(query)
+                            title = self.tokenizer.tokenize(title)
+
                         query = [
                             self.vocab[word] for word in query.split(" ")
                             if word in self.vocab
@@ -178,6 +213,12 @@ class SimNetProcessor(object):
                             logging.warning(
                                 "line not match format in test file")
                             continue
+
+                        # tokenize
+                        if self.tokenizer:
+                            query = self.tokenizer.tokenize(query)
+                            title = self.tokenizer.tokenize(title)
+
                         query = [
                             self.vocab[word] for word in query.split(" ")
                             if word in self.vocab
@@ -208,6 +249,10 @@ class SimNetProcessor(object):
             if len(query) == 0 or len(title) == 0:
                 logging.warning("line not match format in test file")
                 continue
+            # tokenize
+            if self.tokenizer:
+                query = self.tokenizer.tokenize(query)
+                title = self.tokenizer.tokenize(title)
             query = [
                 self.vocab[word] for word in query.split(" ")
                 if word in self.vocab
diff --git a/PaddleNLP/similarity_net/tokenization.py b/PaddleNLP/similarity_net/tokenization.py
new file mode 100644
index 0000000000000000000000000000000000000000..168f6bbe96c2f39abfe691cf5cca8805c0203e64
--- /dev/null
+++ b/PaddleNLP/similarity_net/tokenization.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import jieba
+
+class JiebaTokenizer(object):
+    """Runs end-to-end tokenization."""
+
+    def __init__(self):
+        # Todo:
+        pass
+
+    def tokenize(self, text):
+        split_tokens = jieba.cut(text)
+        split_tokens = " ".join([word for word in split_tokens])
+        return split_tokens
diff --git a/PaddleNLP/similarity_net/utils.py b/PaddleNLP/similarity_net/utils.py
index ee8a281265dcab007eedcb4a288f5d4a64bb336c..0bcf74173d07752d86fb16d396756a504159d022 100644
--- a/PaddleNLP/similarity_net/utils.py
+++ b/PaddleNLP/similarity_net/utils.py
@@ -214,6 +214,7 @@ class ArgConfig(object):
         data_g.add_arg("infer_data_dir", str, None,
                        "Directory path to infer data.")
         data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
+        data_g.add_arg("tokenizer", str, None, "Tokenizer class name defined in tokenization.py; if empty, input data is assumed to be already tokenized.")
         data_g.add_arg("batch_size", int, 32,
                        "Total examples' number in batch for training.")
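Note (not part of the patch): the README change above tells users to add their own tokenizer class to `tokenization.py` next to `JiebaTokenizer` and select it through the `TOKENIZER` environment variable in `evaluate_*.sh`. The sketch below is a minimal, hypothetical illustration of that extension point, based only on how `reader.py` resolves the class with `getattr(tokenization, args.tokenizer)()` and calls `tokenize(text)` expecting a space-separated string back; the class name `CharTokenizer` and its character-level splitting are invented for the example and are not shipped by this patch.

```python
# Hypothetical user-defined tokenizer to be placed in tokenization.py.
# It only needs to mirror JiebaTokenizer's interface: a no-argument
# constructor and a tokenize(text) method that returns tokens joined
# by single spaces, which reader.py then splits on.
class CharTokenizer(object):
    """Splits raw text into single characters (illustrative example only)."""

    def tokenize(self, text):
        # Assumes `text` is a unicode string; drop existing whitespace
        # and rejoin the remaining characters with single spaces.
        return " ".join(ch for ch in text if not ch.isspace())
```

To try such a class, set `TOKENIZER="CharTokenizer"` (and point `TEST_DATA_PATH`/`INFER_DATA_PATH` at raw, unsegmented data) in the corresponding `evaluate_*.sh`. Leaving `TOKENIZER` empty makes `--tokenizer ${TOKENIZER:-""}` pass an empty string, so the reader falls back to treating the input as already segmented.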