Unverified commit 131a3156 authored by tianxin, committed by GitHub

add JiebaTokenizer demo (#4747)

Parent 365fe58a
......@@ -12,7 +12,8 @@
| Model | Baidu Zhidao | ECOM | QQSIM | UNICOM |
|:-----------:|:-------------:|:-------------:|:-------------:|:-------------:|
| | AUC | AUC | AUC | Pos/Neg Order Ratio |
|BOW_Pairwise|0.6767|0.7329|0.7650|1.5630|
|BOW_Pairwise(WordSeg)|0.6767|0.7329|0.7650|1.5630|
|BOW_Pairwise(Jieba)|0.6658|0.7351|0.8431|1.5331|
#### Test Set Description
| Dataset | Source | Vertical |
|:-----------:|:-------------:|:-------------:|
......@@ -51,7 +52,10 @@ python download.py model
```
#### Evaluation
We have released our in-house test sets, covering four datasets: Baidu Zhidao, ECOM, QQSIM, and UNICOM. Based on the pretrained model above, you can enter the evaluate directory and run the following commands in turn to obtain evaluation results on each test set.
The evaluation scripts below use Jieba word segmentation as an example. If you need a custom segmentation module, implement your own tokenizer class in [`tokenization.py`](tokenization.py), using `JiebaTokenizer` as a reference, and set the environment variable `TOKENIZER=${YOUR_TOKENIZER_NAME}` in the `evaluate_*.sh` scripts. If the `TOKENIZER` environment variable is empty, the input data is assumed to be already tokenized (the sample data provided was segmented with Baidu's WordSeg tool).
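For illustration, here is a minimal sketch of such a custom tokenizer class (the name `WhitespaceTokenizer` is hypothetical, not part of this repository). It only needs to expose a no-argument constructor and a `tokenize` method that returns a space-separated string, matching the `JiebaTokenizer` interface:

```python
# tokenization.py -- hypothetical example of a user-defined tokenizer class.
# It follows the same interface as JiebaTokenizer: a no-argument __init__ and
# a tokenize(text) method returning a space-separated token string.
class WhitespaceTokenizer(object):
    """Example custom tokenizer; splits on whitespace as a placeholder."""

    def __init__(self):
        pass

    def tokenize(self, text):
        # Replace this with your own word-segmentation logic.
        return " ".join(text.split())
```

After adding the class to `tokenization.py`, setting `TOKENIZER=WhitespaceTokenizer` in the corresponding `evaluate_*.sh` script would make the evaluation use it.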
```shell
sh evaluate_ecom.sh
sh evaluate_qqsim.sh
......
......@@ -96,8 +96,8 @@ def download(url, filename, md5sum):
def download_dataset(dir_path):
BASE_URL = "https://baidu-nlp.bj.bcebos.com/"
DATASET_NAME = "simnet_dataset-1.0.0.tar.gz"
DATASET_MD5 = "ec65b313bc237150ef536a8d26f3c73b"
DATASET_NAME = "simnet_dataset-1.0.1.tar.gz"
DATASET_MD5 = "4a381770178721b539e7cf0f91a8777d"
file_path = os.path.join(dir_path, DATASET_NAME)
url = BASE_URL + DATASET_NAME
......
#get data
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.1.tar.gz
tar xzf simnet_dataset-1.0.1.tar.gz
rm simnet_dataset-1.0.1.tar.gz
......@@ -4,13 +4,21 @@ export FLAGS_sync_nccl_allreduce=1
export CUDA_VISIBLE_DEVICES=3
export FLAGS_fraction_of_gpu_memory_to_use=0.95
TASK_NAME='simnet'
TEST_DATA_PATH=./data/ecom
VOCAB_PATH=./data/term2id.dict
CKPT_PATH=./model_files
TEST_RESULT_PATH=./evaluate/ecom_test_result
TASK_MODE='pairwise'
CONFIG_PATH=./config/bow_pairwise.json
INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/
# use JiebaTokenizer to evaluate
TOKENIZER="JiebaTokenizer"
TEST_DATA_PATH=./data/ecom_raw
# use data pre-tokenized by WordSeg to evaluate
#TOKENIZER=""
#TEST_DATA_PATH=./data/ecom
cd ..
python ./run_classifier.py \
......@@ -23,5 +31,6 @@ python ./run_classifier.py \
--test_result_path ${TEST_RESULT_PATH} \
--config_path ${CONFIG_PATH} \
--vocab_path ${VOCAB_PATH} \
--tokenizer ${TOKENIZER:-""} \
--task_mode ${TASK_MODE} \
--init_checkpoint ${INIT_CHECKPOINT}
......@@ -4,13 +4,21 @@ export FLAGS_sync_nccl_allreduce=1
export CUDA_VISIBLE_DEVICES=3
export FLAGS_fraction_of_gpu_memory_to_use=0.95
TASK_NAME='simnet'
TEST_DATA_PATH=./data/qqsim
VOCAB_PATH=./data/term2id.dict
CKPT_PATH=./model_files
TEST_RESULT_PATH=./evaluate/qqsim_test_result
TASK_MODE='pairwise'
CONFIG_PATH=./config/bow_pairwise.json
INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/
# use JiebaTokenizer to evaluate
TOKENIZER="JiebaTokenizer"
TEST_DATA_PATH=./data/qqsim_raw
# use data pre-tokenized by WordSeg to evaluate
#TOKENIZER=""
#TEST_DATA_PATH=./data/qqsim
cd ..
python ./run_classifier.py \
......@@ -23,5 +31,6 @@ python ./run_classifier.py \
--test_result_path ${TEST_RESULT_PATH} \
--config_path ${CONFIG_PATH} \
--vocab_path ${VOCAB_PATH} \
--tokenizer ${TOKENIZER:-""} \
--task_mode ${TASK_MODE} \
--init_checkpoint ${INIT_CHECKPOINT}
......@@ -4,7 +4,6 @@ export FLAGS_sync_nccl_allreduce=1
export CUDA_VISIBLE_DEVICES=3
export FLAGS_fraction_of_gpu_memory_to_use=0.95
TASK_NAME='simnet'
INFER_DATA_PATH=./evaluate/unicom_infer
VOCAB_PATH=./data/term2id.dict
CKPT_PATH=./model_files
INFER_RESULT_PATH=./evaluate/unicom_infer_result
......@@ -12,6 +11,14 @@ TASK_MODE='pairwise'
CONFIG_PATH=./config/bow_pairwise.json
INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/
# use JiebaTokenizer to evaluate
TOKENIZER="JiebaTokenizer"
INFER_DATA_PATH=./data/unicom_infer_raw
# use data pre-tokenized by WordSeg to evaluate
#TOKENIZER=""
#INFER_DATA_PATH=./evaluate/unicom_infer
python unicom_split.py
cd ..
python ./run_classifier.py \
......@@ -23,8 +30,8 @@ python ./run_classifier.py \
--infer_result_path ${INFER_RESULT_PATH} \
--config_path ${CONFIG_PATH} \
--vocab_path ${VOCAB_PATH} \
--tokenizer ${TOKENIZER:-""} \
--task_mode ${TASK_MODE} \
--init_checkpoint ${INIT_CHECKPOINT}
cd evaluate
python unicom_compute_pos_neg.py
......@@ -4,13 +4,21 @@ export FLAGS_sync_nccl_allreduce=1
export CUDA_VISIBLE_DEVICES=3
export FLAGS_fraction_of_gpu_memory_to_use=0.95
TASK_NAME='simnet'
TEST_DATA_PATH=./data/zhidao
VOCAB_PATH=./data/term2id.dict
CKPT_PATH=./model_files
TEST_RESULT_PATH=./evaluate/zhidao_test_result
TASK_MODE='pairwise'
CONFIG_PATH=./config/bow_pairwise.json
INIT_CHECKPOINT=./model_files/simnet_bow_pairwise_pretrained_model/
# use JiebaTokenizer to evaluate
TOKENIZER="JiebaTokenizer"
TEST_DATA_PATH=./data/zhidao_raw
# use data pre-tokenized by WordSeg to evaluate
#TOKENIZER=""
#TEST_DATA_PATH=./data/zhidao
cd ..
python ./run_classifier.py \
......@@ -23,5 +31,6 @@ python ./run_classifier.py \
--test_result_path ${TEST_RESULT_PATH} \
--config_path ${CONFIG_PATH} \
--vocab_path ${VOCAB_PATH} \
--tokenizer ${TOKENIZER:-""} \
--task_mode ${TASK_MODE} \
--init_checkpoint ${INIT_CHECKPOINT}
......@@ -19,6 +19,7 @@ import logging
import numpy as np
import io
import tokenization
class SimNetProcessor(object):
def __init__(self, args, vocab):
......@@ -27,6 +28,10 @@ class SimNetProcessor(object):
self.vocab = vocab
self.valid_label = np.array([])
self.test_label = np.array([])
if args.tokenizer:
self.tokenizer = getattr(tokenization, args.tokenizer)()
else:
self.tokenizer = None
def get_reader(self, mode, epoch=0):
"""
......@@ -48,6 +53,12 @@ class SimNetProcessor(object):
logging.warning(
"line not match format in test file")
continue
# tokenize
if self.tokenizer:
query = self.tokenizer.tokenize(query)
title = self.tokenizer.tokenize(title)
query = [
self.vocab[word] for word in query.split(" ")
if word in self.vocab
......@@ -71,6 +82,12 @@ class SimNetProcessor(object):
logging.warning(
"line not match format in test file")
continue
# tokenize
if self.tokenizer:
query = self.tokenizer.tokenize(query)
title = self.tokenizer.tokenize(title)
query = [
self.vocab[word] for word in query.split(" ")
if word in self.vocab
......@@ -95,6 +112,12 @@ class SimNetProcessor(object):
logging.warning(
"line not match format in test file")
continue
# tokenize
if self.tokenizer:
query = self.tokenizer.tokenize(query)
pos_title = self.tokenizer.tokenize(pos_title)
neg_title = self.tokenizer.tokenize(neg_title)
query = [
self.vocab[word] for word in query.split(" ")
if word in self.vocab
......@@ -130,6 +153,12 @@ class SimNetProcessor(object):
logging.warning(
"line not match format in test file")
continue
# tokenize
if self.tokenizer:
query = self.tokenizer.tokenize(query)
title = self.tokenizer.tokenize(title)
query = [
self.vocab[word] for word in query.split(" ")
if word in self.vocab
......@@ -153,6 +182,12 @@ class SimNetProcessor(object):
logging.warning(
"line not match format in test file")
continue
# tokenize
if self.tokenizer:
query = self.tokenizer.tokenize(query)
title = self.tokenizer.tokenize(title)
query = [
self.vocab[word] for word in query.split(" ")
if word in self.vocab
......@@ -178,6 +213,12 @@ class SimNetProcessor(object):
logging.warning(
"line not match format in test file")
continue
# tokenize
if self.tokenizer:
query = self.tokenizer.tokenize(query)
title = self.tokenizer.tokenize(title)
query = [
self.vocab[word] for word in query.split(" ")
if word in self.vocab
......@@ -208,6 +249,10 @@ class SimNetProcessor(object):
if len(query) == 0 or len(title) == 0:
logging.warning("line not match format in test file")
continue
# tokenize
if self.tokenizer:
query = self.tokenizer.tokenize(query)
title = self.tokenizer.tokenize(title)
query = [
self.vocab[word] for word in query.split(" ")
if word in self.vocab
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import jieba
class JiebaTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self):
# Todo:
pass
def tokenize(self, text):
# Segment the text with jieba and return a space-separated token string.
split_tokens = " ".join(jieba.cut(text))
return split_tokens
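A hedged usage sketch (not part of this commit) showing how the tokenizer name passed via `--tokenizer` (or the `TOKENIZER` environment variable) is expected to be resolved, mirroring the `getattr` call added to `SimNetProcessor`:

```python
import tokenization

# Resolve the tokenizer class by name, as SimNetProcessor does with args.tokenizer.
tokenizer = getattr(tokenization, "JiebaTokenizer")()
# tokenize() returns a space-separated string of jieba-segmented tokens.
print(tokenizer.tokenize("百度知道是一个问答社区"))
```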
......@@ -214,6 +214,7 @@ class ArgConfig(object):
data_g.add_arg("infer_data_dir", str, None,
"Directory path to infer data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("tokenizer", str, None, "Whether or not use user defined tokenizer")
data_g.add_arg("batch_size", int, 32,
"Total examples' number in batch for training.")
......