未验证 提交 e9627028 编写于 作者: T tangwei12 提交者: GitHub

Merge branch 'master' into gru4rec

......@@ -21,7 +21,7 @@ from paddlerec.core.utils import envs
from paddlerec.core.utils import dataloader_instance
from paddlerec.core.reader import SlotReader
from paddlerec.core.trainer import EngineMode
from paddlerec.core.utils.util import split_files
from paddlerec.core.utils.util import split_files, check_filelist
__all__ = ["DatasetBase", "DataLoader", "QueueDataset"]
......@@ -68,6 +68,8 @@ class DataLoader(DatasetBase):
reader_ins = SlotReader(context["config_yaml"])
if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
dataloader.set_sample_list_generator(reader)
elif hasattr(reader_ins, 'batch_tensor_creator'):
dataloader.set_batch_generator(reader)
else:
dataloader.set_sample_generator(reader, batch_size)
return dataloader
......@@ -119,10 +121,15 @@ class QueueDataset(DatasetBase):
dataset.set_pipe_command(pipe_cmd)
train_data_path = envs.get_global_env(name + "data_path")
file_list = [
os.path.join(train_data_path, x)
for x in os.listdir(train_data_path)
]
hidden_file_list, file_list = check_filelist(
hidden_file_list=[],
data_file_list=[],
train_data_path=train_data_path)
if (hidden_file_list is not None):
print(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}".
format(hidden_file_list))
file_list.sort()
need_split_files = False
if context["engine"] == EngineMode.LOCAL_CLUSTER:
......
......@@ -19,7 +19,7 @@ from paddlerec.core.utils.envs import get_global_env
from paddlerec.core.utils.envs import get_runtime_environ
from paddlerec.core.reader import SlotReader
from paddlerec.core.trainer import EngineMode
from paddlerec.core.utils.util import split_files
from paddlerec.core.utils.util import split_files, check_filelist
def dataloader_by_name(readerclass,
......@@ -38,7 +38,13 @@ def dataloader_by_name(readerclass,
assert package_base is not None
data_path = os.path.join(package_base, data_path.split("::")[1])
files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
hidden_file_list, files = check_filelist(
hidden_file_list=[], data_file_list=[], train_data_path=data_path)
if (hidden_file_list is not None):
print(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}".
format(hidden_file_list))
files.sort()
need_split_files = False
......@@ -54,8 +60,6 @@ def dataloader_by_name(readerclass,
files = split_files(files, context["fleet"].worker_index(),
context["fleet"].worker_num())
print("file_list : {}".format(files))
reader = reader_class(yaml_file)
reader.init()
......@@ -79,6 +83,10 @@ def dataloader_by_name(readerclass,
if hasattr(reader, 'generate_batch_from_trainfiles'):
return gen_batch_reader()
if hasattr(reader, "batch_tensor_creator"):
return reader.batch_tensor_creator(gen_reader)
return gen_reader
......@@ -92,7 +100,13 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
assert package_base is not None
data_path = os.path.join(package_base, data_path.split("::")[1])
files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
hidden_file_list, files = check_filelist(
hidden_file_list=[], data_file_list=[], train_data_path=data_path)
if (hidden_file_list is not None):
print(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}".
format(hidden_file_list))
files.sort()
need_split_files = False
......@@ -156,7 +170,13 @@ def slotdataloader(readerclass, train, yaml_file, context):
assert package_base is not None
data_path = os.path.join(package_base, data_path.split("::")[1])
files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
hidden_file_list, files = check_filelist(
hidden_file_list=[], data_file_list=[], train_data_path=data_path)
if (hidden_file_list is not None):
print(
"Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}".
format(hidden_file_list))
files.sort()
need_split_files = False
......
......@@ -201,6 +201,28 @@ def split_files(files, trainer_id, trainers):
return trainer_files[trainer_id]
def check_filelist(hidden_file_list, data_file_list, train_data_path):
    """Recursively sort the entries under a data directory into two lists.

    Entries whose name starts with '.' are treated as hidden. Hidden files
    and hidden directories are appended to ``hidden_file_list`` (hidden
    directories are not descended into). Every other file, at any depth, is
    appended to ``data_file_list``.

    Args:
        hidden_file_list: list that receives the paths of hidden entries.
        data_file_list: list that receives the paths of regular data files.
        train_data_path: directory to scan.

    Returns:
        The tuple ``(hidden_file_list, data_file_list)``; these are the same
        list objects that were passed in, extended in place. If
        ``train_data_path`` cannot be listed, both lists are returned
        unchanged, so callers can always unpack the result.
    """
    # Only the first tuple produced by os.walk (the directory itself) is
    # used; sub-directories are handled by the explicit recursion below so
    # that every path is joined against the directory that really holds it.
    # os.walk produces nothing when the directory cannot be listed.
    top_level = next(os.walk(train_data_path), None)
    if top_level is None:
        return hidden_file_list, data_file_list

    _, dirs, files = top_level

    for file_name in files:
        file_path = os.path.join(train_data_path, file_name)
        if file_name.startswith('.'):
            hidden_file_list.append(file_path)
        else:
            data_file_list.append(file_path)

    for dirs_name in dirs:
        dirs_path = os.path.join(train_data_path, dirs_name)
        if dirs_name.startswith('.'):
            hidden_file_list.append(dirs_path)
        else:
            check_filelist(hidden_file_list, data_file_list, dirs_path)

    return hidden_file_list, data_file_list
class CostPrinter(object):
"""
For count cost time && print cost log
......
......@@ -16,9 +16,14 @@ workspace: "models/contentunderstanding/classification"
dataset:
- name: data1
batch_size: 5
batch_size: 10
type: DataLoader
data_path: "{workspace}/data/train_data"
data_path: "{workspace}/data/train"
data_converter: "{workspace}/reader.py"
- name: dataset_infer
batch_size: 2
type: DataLoader
data_path: "{workspace}/data/test"
data_converter: "{workspace}/reader.py"
hyper_parameters:
......@@ -26,23 +31,47 @@ hyper_parameters:
class: Adagrad
learning_rate: 0.001
is_sparse: False
dict_dim: 33257
max_len: 100
cnn_dim: 128
cnn_filter_size1: 1
cnn_filter_size2: 2
cnn_filter_size3: 3
emb_dim: 128
hid_dim: 96
class_dim: 2
mode: runner1
mode: [train_runner,infer_runner]
runner:
- name: runner1
- name: train_runner
class: train
epochs: 10
epochs: 16
device: cpu
save_checkpoint_interval: 2
save_inference_interval: 4
save_checkpoint_interval: 1
save_inference_interval: 1
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
init_model_path: ""
print_interval: 10
phases: phase_train
- name: infer_runner
class: infer
# device to run training or infer
device: cpu
print_interval: 1
init_model_path: "increment/14" # load model path
phases: phase_infer
phase:
- name: phase1
- name: phase_train
model: "{workspace}/model.py"
dataset_name: data1
thread_num: 1
- name: phase_infer
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_infer # select dataset by name
thread_num: 1
# encoding=utf-8
import os
import sys
def build_word_dict(word_file="word_dict.txt"):
    """Load a vocabulary file into a word -> id mapping.

    The file holds one word per line. Ids are assigned from the 1-based line
    number, which leaves id 0 free for unknown words and padding. If a word
    appears more than once, the id of its last occurrence wins.

    Args:
        word_file: path of the vocabulary file; defaults to "word_dict.txt"
            in the current working directory.

    Returns:
        dict mapping each whitespace-stripped word to its integer id.
    """
    # The context manager closes the file even if reading raises.
    with open(word_file, "r") as f:
        return {word.strip(): idx for idx, word in enumerate(f, start=1)}
def build_token_data(word_dict, txt_file, token_file, max_text_size=100,
                     min_text_size=5):
    """Convert a tab-separated text/label file into fixed-length id rows.

    Each input line is expected to be ``<space separated words>\\t<label>``.
    Words are mapped through ``word_dict``; unknown words become "0". Rows
    are truncated or right-padded with "0" to exactly ``max_text_size``
    tokens, and written as
    ``<tokens...> <original length capped at max_text_size> <label>``.

    Lines with fewer than ``min_text_size`` tokens are dropped. Lines that
    have no tab-separated label field (for example a blank trailing line)
    are skipped instead of raising IndexError.

    Args:
        word_dict: mapping from word to integer id.
        txt_file: path of the input .tsv file.
        token_file: path of the output file (overwritten).
        max_text_size: number of token columns in every output row.
        min_text_size: minimum token count for a line to be kept.
    """
    written = 0
    # Both handles are released even if a line fails to process.
    with open(txt_file, "r") as fin, open(token_file, "w") as fout:
        for raw_line in fin:
            fields = raw_line.rstrip("\r\n").split("\t")
            if len(fields) < 2:
                # No label column: nothing meaningful to emit.
                continue
            text, label = fields[0], fields[1]

            tokens = [str(word_dict.get(word, 0)) for word in text.split(" ")]
            seg_len = len(tokens)
            if seg_len < min_text_size:
                continue

            if seg_len >= max_text_size:
                tokens = tokens[:max_text_size]
                seg_len = max_text_size
            else:
                tokens = tokens + ["0"] * (max_text_size - seg_len)

            fout.write(" ".join(tokens) + " " + str(seg_len) + " " + label +
                       "\n")

            # Progress is counted in written rows, not in input lines.
            written += 1
            if written % 100 == 0:
                print(str(written) + " lines OK")
# Tokenize every dataset split with the shared vocabulary, in the order
# test, dev, train. Each "<split>.tsv" is converted into "<split>.txt".
word_dict = build_word_dict()
for split_name in ("test", "dev", "train"):
    txt_file = split_name + ".tsv"
    token_file = split_name + ".txt"
    build_token_data(word_dict, txt_file, token_file)
5681 17044 4352 7574 16576 3574 32952 12211 18835 28961 15320 2019 21675 30604 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 1
9054 31881 4449 12211 12488 5975 3574 28592 2547 2547 14132 3574 24908 5975 24285 10010 3574 31872 20925 9886 12211 26530 3567 30818 19640 22506 28312 19887 12211 28212 8576 3574 28592 12306 14132 539 33049 9039 14160 113 3567 19675 5511 2111 623 12068 12211 3574 18416 12068 19680 12211 30781 21946 1525 9886 3574 28109 31201 3567 25710 30503 30781 12068 19887 12211 22052 3574 2050 5402 10217 31201 1525 9698 14160 19887 3574 26209 24908 539 33049 9039 32949 8890 29693 3566 3566 11053 30781 26853 3567 3567 0 0 0 0 0 0 0 0 92 0
19640 32771 31526 16576 13354 3574 5087 30781 7902 19037 12211 0 3574 4756 15048 11063 0 15019 16576 2019 29812 2276 22804 13275 2019 24599 12211 30294 6983 26606 1467 3574 18448 8052 16576 23091 32440 11034 16576 3574 1470 6983 1346 31382 13354 3574 11711 10074 28587 5030 19058 16576 2019 16497 6890 12223 30035 6983 1112 18448 30837 11280 24599 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 64 0
7513 19838 3562 32737 15474 3562 1887 15474 0 0 18835 19813 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 1
30325 3574 30788 12211 25843 11533 30150 8937 11309 8690 12211 14166 2200 3574 15802 0 20424 14166 25336 113 16576 11533 24294 12211 26301 16576 3574 28592 16191 12211 8690 13743 0 517 12211 0 0 23958 3574 31019 19680 13841 15337 12211 23958 30781 28630 3574 8690 12700 11280 12211 23958 24908 20409 7481 8052 6094 4002 30245 3574 1526 9904 27032 31347 24006 12211 14166 0 9910 24908 12211 0 2019 25469 17293 27438 29774 13757 24908 22301 28505 25450 12211 14039 3574 28801 4621 4879 3574 623 9904 23958 14166 18417 4895 113 11114 2018 113 100 1
113 16576 17947 28955 12211 24253 3574 22068 30167 12211 14039 30818 28640 7801 2019 7985 30167 5402 6805 0 12211 27645 33067 30151 3574 11110 12211 10710 4549 22708 4308 24908 25975 12211 26957 0 2019 17942 25575 227 19641 1525 13129 113 15492 23224 3574 21163 15565 23273 29004 12452 13233 27573 12211 12046 2019 302 19367 16576 27914 0 0 113 12211 28035 0 13743 13330 24390 12466 1525 12537 3574 18131 2019 9315 25720 27416 2276 15038 18162 10024 28955 3574 10097 18162 26594 12211 21949 3574 30788 12133 26362 1779 27386 21017 14295 1525 454 100 1
33022 4169 19038 25096 3574 19185 113 25010 0 0 10511 17460 28972 6574 3574 1409 0 10010 3574 33022 129 16186 10511 17460 15182 3574 20235 10511 17460 11226 27150 13166 3562 18835 19038 5391 3574 22195 8052 28892 31948 10960 3574 13367 29338 15048 11030 22185 18621 28776 5205 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 52 0
23439 330 0 0 29655 12211 3574 4211 3574 19650 19640 13757 3562 0 0 8990 330 0 0 18920 12211 31924 6688 31857 15364 3574 19641 30781 18416 28952 9209 12211 118 10710 16912 3562 0 0 27771 330 0 0 10126 30325 3574 15374 4348 0 6356 28420 24193 29526 12211 10523 21872 3571 24383 1580 3574 17536 1525 14745 21674 10710 4952 14871 3574 14590 20306 7695 0 32718 3562 0 0 13260 330 0 0 5847 30325 3574 25951 26995 21163 22787 15535 20889 3574 27914 5391 130 2276 15243 6356 0 16576 3562 0 0 100 1
24908 32568 24044 28952 16576 27914 28955 3574 14160 13543 16582 5536 2019 11711 3527 19675 12211 15474 3574 0 14160 31857 30927 2019 18416 9231 12486 12211 20374 3574 1111 30173 19058 3574 31857 31825 3574 30170 15501 21070 2019 31383 19640 5004 3574 31858 12211 6408 2733 8034 24870 12730 12211 16401 2019 18416 19640 9072 18416 12211 2313 12211 20374 3574 18416 2313 25575 19315 31383 20374 20161 24160 3574 11711 3527 3574 31383 20374 31857 28378 2019 1296 5402 23273 16576 2019 16497 28952 2019 9512 15038 5536 3574 11711 10486 15168 19641 21994 0 2019 100 1
0 7902 5402 29107 16576 15535 15535 15535 0 19634 21017 12211 26505 14160 15129 0 15535 15535 15535 26211 4002 9749 23360 16576 15535 15535 15535 26040 15535 15535 15535 15535 11698 32986 19641 0 22421 15535 15535 15535 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 40 0
28955 17755 3574 1735 18232 19262 12992 12230 3574 18416 30781 7388 19680 19643 16576 12211 3574 28952 9209 3574 16572 22360 2019 19680 19643 6414 12211 2011 27666 2012 3574 13757 32205 3574 14754 11280 12211 22186 7628 1827 17413 3574 19641 30781 31383 12211 4853 2019 33140 113 6047 6414 3310 31383 3574 4654 22360 6580 26147 12211 18696 2019 12306 6414 20539 3574 12680 22360 18624 8051 29384 1146 2019 18046 33188 16582 29384 12211 17311 13222 3574 18416 7453 28961 8014 3574 11711 18416 28961 17658 3574 29384 30781 19893 19643 15073 12211 32171 12211 2019 100 0
28955 12211 30964 14590 28961 4412 29183 29493 6393 17111 29183 11670 12211 19636 23233 28961 4412 29183 25469 1112 16603 14590 16720 28961 9749 32365 23958 12211 33245 1525 11271 29183 29607 4694 8052 12068 32247 26813 29183 12229 6856 3674 330 30326 972 32948 29183 18416 28961 20161 1120 19641 30054 28955 330 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 55 0
28587 26594 16393 14439 20100 8452 12211 11738 3574 20288 2276 2770 9051 29266 3574 27097 12211 0 14648 7902 5827 4308 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 1
19083 3561 20034 30173 8356 3574 18416 18016 6154 13757 30827 23410 4879 5213 3566 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 1
28587 14745 2018 1580 3574 19636 9052 14160 19683 16576 0 0 6007 5361 26370 5391 785 3574 0 17010 28587 27857 19048 20558 9051 3574 6007 0 0 22897 18323 1447 2019 0 0 32391 17536 24961 19048 9749 18448 3574 24283 6356 7648 26789 2019 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 47 0
24908 18920 1400 665 16167 12211 17293 3574 13518 28952 8393 23504 3574 31266 12211 30781 4477 2019 4654 18896 4289 13841 4822 3574 24908 27376 15243 18416 8052 20077 17493 17317 3574 14842 16949 3574 12081 28961 2276 0 14399 20158 14398 16335 12211 3699 7697 6318 69 2019 11924 8053 27376 12211 14039 3574 21210 23273 3574 1732 30818 17942 22561 3083 2019 17268 12700 28892 9108 16576 26203 19037 23872 3574 14988 31773 3574 33140 1725 24908 0 8053 8052 13841 3574 25944 0 2019 4032 5025 13841 19185 12211 14039 3574 665 0 12211 4822 6988 100 1
29728 31619 6149 5402 113 7317 11738 3574 31482 11924 16576 17657 6541 9761 3574 31224 5402 21141 3574 6356 16191 19640 14451 26154 7192 16076 3567 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 27 0
29302 11364 19059 13652 12211 3574 7898 30781 6356 7961 14954 21752 7340 2019 29302 11401 8328 3574 20384 20034 1460 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 21 0
4592 12211 31382 11030 3574 7961 6356 136 11714 31881 31478 3574 7957 11533 17413 3574 18835 14451 14550 11533 389 3574 14444 20444 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24 1
18416 24908 0 5233 22185 12211 29183 18956 30781 9668 8904 15168 18416 16108 29183 18416 29123 4351 28845 11709 11731 30486 21200 3574 4351 32986 8052 13757 11711 16497 25138 18448 3006 30326 20837 6356 16060 11231 13757 18448 11731 29173 3576 18835 27924 11711 11533 11225 3574 17386 15934 7288 0 26216 12211 1542 3574 24908 12511 18416 16060 11231 32842 18448 11731 29173 3574 18956 9668 31387 755 32986 18416 28972 18855 30781 18448 3006 30326 20837 30781 8052 13757 15048 18448 11731 29173 12211 3574 19640 18584 18416 32986 25710 18416 2276 29173 12211 22052 24908 100 0
12 27 13 0 25 52 89 20 39 4 9 1
78 10 61 58 29 79 85 16 46 41 9 1
81 77 44 4 5 57 43 97 42 89 6 0
7 77 86 3 98 89 56 24 7 59 9 1
65 89 99 27 65 98 16 89 42 0 3 0
66 14 48 38 66 5 56 89 98 19 4 1
78 7 10 20 77 16 37 43 59 23 6 1
84 95 28 35 0 82 55 19 13 81 7 0
34 32 98 37 43 51 6 38 20 40 9 0
75 36 13 51 70 24 62 90 32 91 7 1
13 5 49 21 57 21 67 85 74 14 1 0
68 13 86 16 52 50 23 11 65 99 1 1
15 20 75 55 15 90 54 54 15 91 9 0
44 56 15 88 57 3 62 53 89 57 8 1
23 8 40 25 60 33 8 69 44 88 7 1
63 94 5 43 23 70 31 67 21 55 6 0
44 11 64 92 10 37 30 84 19 71 5 1
89 18 71 13 16 58 47 60 77 87 7 1
13 48 56 39 98 53 32 93 13 91 7 0
56 78 67 68 27 11 77 48 45 10 1 1
52 12 14 5 2 8 3 36 33 59 6 0
86 42 91 81 2 9 21 0 44 7 9 1
96 27 82 55 81 30 91 41 91 58 2 1
97 69 76 47 80 62 23 30 87 22 7 1
42 56 25 47 42 18 80 53 15 57 7 0
34 73 75 88 61 79 40 74 87 87 6 1
7 91 9 24 42 60 76 31 10 13 4 0
21 1 46 59 61 54 99 54 89 55 5 1
67 21 1 29 88 5 3 85 39 22 5 1
90 99 7 8 17 77 73 3 32 10 5 0
30 44 26 32 37 74 90 71 42 29 9 1
79 68 3 24 21 37 35 3 76 23 6 1
3 66 7 4 2 88 94 64 47 81 6 1
10 48 16 49 96 93 61 97 84 39 3 1
73 28 67 59 89 92 17 24 52 71 3 1
98 4 35 62 91 2 78 51 72 93 1 1
37 42 96 10 48 49 84 45 59 47 5 1
13 24 7 49 63 78 29 75 45 92 7 1
1 6 95 23 38 34 85 94 33 47 6 1
99 63 65 39 72 73 91 20 16 45 9 0
35 8 81 24 62 0 95 0 52 46 4 1
58 66 88 42 86 94 91 8 18 92 7 0
12 62 56 43 99 31 63 80 11 7 4 1
22 36 1 39 69 20 56 75 17 15 7 0
25 97 62 50 99 98 32 2 98 75 7 1
7 59 98 68 62 19 28 28 60 27 7 0
39 63 43 45 43 11 40 81 4 25 6 0
81 95 27 84 71 45 87 65 40 50 1 0
82 21 69 55 71 92 52 65 90 16 3 0
24 6 5 22 36 34 66 71 3 52 2 0
5 14 66 71 49 10 52 81 32 14 1 0
8 94 52 23 60 27 43 19 89 91 9 0
26 14 36 37 28 94 46 96 11 80 8 1
89 19 77 66 48 75 62 58 90 81 8 1
25 43 95 21 25 81 39 79 9 74 9 0
25 2 64 27 67 36 59 68 99 66 5 1
13 46 41 55 89 93 79 83 32 52 6 0
49 77 57 9 91 49 86 50 32 5 2 0
94 7 53 54 70 69 5 51 59 91 5 1
24 72 94 13 17 12 2 67 0 89 6 1
70 38 19 27 38 87 72 41 98 84 6 1
89 76 82 4 69 64 97 77 88 58 9 0
67 41 99 1 80 38 96 24 67 59 3 1
42 83 50 19 97 99 99 50 46 76 8 1
43 99 63 40 93 15 3 57 11 0 1 0
16 65 31 43 89 37 98 63 29 69 8 1
39 5 65 45 12 82 46 87 82 93 8 0
34 69 82 13 4 20 92 58 46 83 2 1
46 79 87 57 87 23 72 95 37 88 8 0
41 72 81 71 60 15 32 1 9 97 3 0
84 98 15 78 39 82 89 74 46 32 9 0
16 18 92 80 50 44 98 45 15 41 3 1
74 78 81 40 17 65 38 21 27 9 1 0
14 69 68 50 57 11 62 2 89 54 6 0
70 29 79 29 44 56 33 27 25 4 3 1
44 20 87 67 65 41 93 37 99 78 1 1
93 57 87 11 33 40 21 3 47 87 9 1
8 3 24 49 99 48 40 22 99 41 2 0
19 90 9 83 93 22 36 96 44 73 7 1
4 73 2 88 79 90 32 48 45 12 5 0
24 58 34 67 85 62 84 48 14 79 5 1
54 69 19 18 59 78 84 48 61 46 4 0
72 69 95 26 30 74 49 30 95 61 8 0
73 29 46 39 48 30 97 63 89 34 9 1
51 32 44 22 70 69 91 81 74 52 3 0
99 66 89 71 31 42 5 40 21 12 6 0
58 26 59 56 91 49 79 57 57 74 6 1
30 36 59 74 6 30 17 1 99 38 4 0
43 48 77 86 67 25 38 36 3 91 4 1
67 24 51 34 37 8 98 76 84 13 1 1
73 47 88 15 32 99 67 26 28 89 3 1
91 66 11 86 5 12 15 43 79 89 1 1
15 60 43 58 61 0 62 32 98 29 9 0
80 36 78 42 70 52 2 10 42 41 6 1
36 16 46 34 96 39 8 21 86 54 5 1
80 72 13 1 28 49 73 90 81 34 1 0
73 64 86 9 94 49 44 38 47 64 2 0
69 90 69 36 60 45 39 7 41 72 8 0
31 86 54 82 81 77 93 99 68 63 1 1
95 76 97 36 40 12 4 95 59 64 4 1
88 20 64 40 27 11 96 40 41 73 6 0
28 72 70 43 34 54 98 43 29 63 5 0
78 72 4 47 47 38 73 8 65 40 3 1
91 64 51 93 8 78 53 15 42 32 4 0
34 36 45 9 16 0 51 40 90 29 2 1
80 93 65 80 11 19 26 61 29 8 4 0
94 11 60 36 58 98 43 90 64 1 1 0
42 54 89 86 80 72 81 48 19 67 5 0
81 25 30 60 59 20 75 38 75 29 6 0
84 16 48 28 23 20 53 13 32 90 1 0
58 31 77 68 27 88 51 97 70 93 8 1
63 67 85 6 35 22 28 65 8 7 3 0
54 75 93 58 98 9 15 37 61 38 6 1
56 24 50 62 63 47 9 4 58 30 8 1
64 91 32 68 50 90 51 86 52 6 1 1
55 50 46 41 28 1 11 39 75 9 1 0
23 27 98 73 25 7 89 48 7 44 4 1
86 98 68 1 74 46 15 92 59 25 9 1
95 86 72 13 33 60 62 83 96 84 1 0
9 58 37 50 57 16 78 0 21 80 2 0
82 94 74 42 3 60 61 93 34 22 3 1
16 97 97 14 47 50 90 35 9 58 5 0
70 94 82 42 85 88 59 58 6 68 9 0
14 58 24 44 8 29 12 18 26 80 7 0
22 23 7 82 39 28 96 92 23 40 5 1
40 31 72 94 20 81 89 4 42 1 5 0
57 63 71 41 28 2 39 67 90 54 6 0
9 74 4 41 11 31 15 21 44 32 6 1
31 28 66 66 61 78 72 80 82 88 3 1
79 18 1 59 35 62 0 72 78 97 7 0
14 19 30 63 38 37 12 15 54 15 6 1
54 91 37 79 60 35 55 62 94 84 7 1
10 55 78 96 45 55 35 56 54 70 6 1
23 46 15 93 66 11 32 45 74 25 4 0
51 55 9 9 88 59 21 66 87 12 1 1
90 22 38 66 12 9 30 48 55 85 1 1
39 23 82 29 57 76 79 56 3 19 2 0
7 72 76 15 90 23 40 40 33 39 4 1
60 64 34 11 18 18 38 39 53 37 1 1
85 72 51 47 83 90 32 96 78 23 9 1
85 51 96 31 83 70 57 65 15 0 6 0
41 11 56 94 40 6 62 86 68 83 7 0
34 82 44 30 2 2 94 62 41 27 6 1
54 86 50 83 76 65 0 87 80 70 7 0
97 50 65 78 2 90 28 5 12 56 5 1
34 19 68 93 11 9 14 87 22 70 9 0
63 77 27 20 20 37 65 51 29 29 9 1
22 79 98 57 56 97 43 49 4 80 4 1
6 4 35 54 4 36 1 79 85 35 6 0
12 55 68 61 91 43 49 5 93 27 8 0
64 22 69 16 63 20 28 60 13 35 7 1
9 19 60 89 62 29 47 33 6 13 4 0
14 15 39 86 47 75 7 70 57 60 6 1
90 63 12 43 28 46 39 97 83 42 6 0
49 3 3 64 59 46 30 13 61 10 2 0
79 47 29 47 54 38 50 66 18 63 5 1
98 67 1 22 66 32 91 77 63 33 3 0
72 22 10 27 28 44 29 66 71 1 7 0
20 52 19 23 9 38 1 93 83 73 5 0
88 57 22 64 93 66 20 90 78 2 7 1
90 86 41 28 14 25 86 73 7 21 4 0
63 91 0 29 2 78 86 76 9 20 4 1
3 57 91 37 21 85 80 99 18 79 1 1
69 95 36 6 85 47 83 83 61 52 4 0
72 4 34 16 59 78 56 70 27 44 9 1
58 42 6 53 21 7 83 38 86 66 5 0
22 86 22 21 86 22 83 38 62 19 4 0
14 63 20 53 98 76 10 22 35 76 9 1
16 88 13 66 37 33 11 40 61 97 2 1
60 9 98 35 51 11 98 73 67 26 6 1
25 48 87 93 58 58 15 9 23 13 7 1
61 47 47 36 97 22 63 35 9 38 5 1
94 49 41 38 0 81 59 39 13 65 3 0
88 82 71 96 76 16 57 24 72 36 5 1
28 46 8 95 94 86 63 1 42 63 6 0
12 95 29 66 64 77 19 26 73 53 4 0
19 5 52 34 13 62 6 4 25 58 5 0
18 39 39 56 73 29 5 15 13 82 1 1
50 66 99 67 76 25 43 12 24 67 9 0
74 56 61 97 23 63 22 63 6 83 2 1
10 96 13 49 43 20 58 19 99 58 7 1
2 95 31 4 99 91 27 90 85 32 3 0
41 23 20 71 41 75 75 35 16 12 3 1
21 33 87 57 19 27 94 36 80 10 6 0
8 0 25 74 14 61 86 8 42 82 9 0
23 33 91 19 84 99 95 92 29 31 8 0
94 94 5 6 98 23 37 65 14 25 6 1
42 16 39 32 2 20 86 81 90 91 8 0
72 39 20 63 88 52 65 81 77 96 4 0
48 73 65 75 89 36 75 36 11 35 8 0
79 74 3 29 63 20 76 46 8 82 5 0
7 46 38 77 79 92 71 98 30 35 6 0
44 69 93 31 22 68 91 70 32 86 5 0
45 38 77 87 64 44 69 19 28 82 9 0
93 63 92 84 22 44 51 94 4 99 9 0
77 10 49 29 59 55 44 7 95 39 2 0
10 85 99 9 91 29 64 14 50 24 6 1
74 4 21 12 77 36 71 51 50 31 9 1
66 76 28 18 23 49 33 31 6 44 1 1
92 50 90 64 95 58 93 4 78 88 6 1
69 79 76 47 46 26 30 40 33 58 8 1
97 12 87 82 6 18 57 49 49 58 1 1
70 79 55 86 29 88 55 39 17 74 5 1
65 51 45 62 54 17 59 12 29 79 5 0
5 63 82 51 54 97 54 36 57 46 3 0
74 77 52 10 12 9 34 95 2 0 5 0
50 20 22 89 50 70 55 98 80 50 1 0
61 80 7 3 78 36 44 37 90 18 9 0
81 13 55 57 88 81 66 55 18 34 2 1
52 30 54 70 28 56 48 82 67 20 8 1
0 41 15 63 27 90 12 16 56 79 3 0
69 89 54 1 93 10 15 2 25 59 8 0
74 99 17 93 96 82 38 77 98 85 4 0
8 59 17 92 60 21 59 76 55 73 2 1
53 56 79 19 29 94 86 96 62 39 3 1
23 44 25 63 41 94 65 10 8 40 9 1
7 18 80 43 20 70 14 59 72 17 9 0
84 97 79 14 37 64 23 68 8 24 2 0
63 94 98 77 8 62 10 77 63 56 4 0
8 63 74 34 49 22 52 54 44 93 3 0
94 48 92 58 82 48 53 34 96 25 2 0
33 15 3 95 48 93 9 69 44 77 7 1
69 72 80 77 64 24 52 21 36 49 2 0
59 34 54 66 60 19 76 79 16 70 5 1
8 83 9 91 67 79 31 20 31 88 2 0
64 95 46 95 78 63 4 60 66 63 7 1
10 39 78 45 36 4 89 94 68 75 7 0
81 52 70 11 48 15 40 63 29 14 8 1
94 49 30 14 53 12 53 42 77 82 8 1
40 88 46 20 54 84 76 15 2 73 2 1
71 50 79 54 17 58 30 16 17 99 1 1
74 79 74 61 61 36 28 39 89 36 6 0
53 45 45 23 51 32 93 26 10 8 3 0
1 97 6 67 88 20 41 63 49 6 8 0
3 64 41 19 41 80 75 71 69 90 8 0
31 90 38 93 52 0 38 86 41 68 9 1
50 94 53 9 73 59 94 7 24 57 3 0
87 11 4 62 96 7 0 59 46 11 6 1
77 67 56 88 45 62 10 51 86 27 6 1
62 62 59 99 83 84 79 97 56 37 5 0
19 55 0 37 44 44 2 7 54 50 5 1
23 60 11 83 6 48 20 77 54 31 6 0
27 53 52 30 3 70 57 38 47 96 5 0
75 14 5 83 72 46 47 64 14 12 7 0
29 95 36 63 59 49 38 44 13 15 2 1
38 3 70 89 2 94 89 74 33 6 8 1
28 56 49 43 83 34 7 63 36 13 7 0
25 90 23 85 50 65 36 10 64 38 5 0
35 94 48 38 99 71 42 39 61 75 8 1
28 73 34 22 51 8 52 98 74 19 8 1
12 40 65 12 7 96 73 65 12 90 5 0
42 42 48 16 80 14 48 29 29 45 5 0
58 20 4 0 69 99 15 4 16 4 1 1
93 30 90 5 23 63 25 30 99 32 7 1
91 23 20 26 84 78 58 76 58 90 5 1
33 2 36 59 55 9 79 34 92 57 9 0
80 63 84 73 22 40 70 94 59 34 5 0
49 95 50 32 90 22 18 66 46 32 2 0
47 72 3 94 33 78 87 43 11 67 5 0
76 44 86 81 95 48 79 46 11 65 8 1
59 51 97 75 17 5 40 59 32 62 6 0
41 13 58 7 54 84 8 84 27 55 1 0
24 80 44 26 86 99 68 80 81 22 9 0
12 45 16 44 66 76 33 53 3 20 9 0
22 3 79 6 32 38 75 66 15 25 9 1
51 48 26 53 33 26 18 74 9 39 5 1
35 67 89 91 29 81 23 52 19 11 6 0
64 50 43 1 43 49 19 20 84 19 8 0
34 4 9 77 24 61 55 82 42 76 9 0
37 84 94 33 67 60 3 95 78 8 9 0
82 10 54 12 47 23 78 97 6 51 5 0
70 40 38 47 5 38 83 70 37 90 2 0
42 21 62 27 43 47 82 80 88 49 4 0
68 68 67 12 38 13 32 30 93 27 3 1
5 44 98 28 5 81 20 56 10 34 9 1
40 46 11 33 73 62 68 70 66 85 4 0
9 46 11 84 6 31 18 89 66 32 1 1
6 78 44 98 77 29 69 39 62 78 1 0
47 90 18 0 3 8 12 20 51 75 4 1
21 29 74 19 12 29 41 22 63 47 8 1
22 59 64 62 18 89 19 92 87 8 8 0
6 21 24 58 14 53 18 93 62 15 8 0
20 33 88 25 37 52 1 72 74 11 2 0
90 49 28 53 28 80 22 81 0 46 9 0
87 31 51 27 15 31 68 93 5 4 7 1
21 72 60 2 24 79 22 24 77 61 9 0
20 4 6 40 28 14 16 78 58 99 7 1
80 35 98 20 91 35 47 29 3 19 2 1
57 21 24 61 60 39 83 34 53 2 2 0
74 86 78 78 18 44 20 94 85 71 4 1
27 48 44 92 10 18 74 54 25 85 2 0
74 77 28 75 74 91 69 36 95 68 7 0
32 84 17 18 55 79 59 57 21 69 2 1
69 77 40 98 83 40 4 66 39 83 1 1
63 24 32 39 75 92 81 49 2 51 5 1
35 40 84 71 3 16 82 91 44 52 8 0
21 78 66 4 57 27 21 89 4 34 7 1
94 18 57 49 88 26 29 76 56 67 6 0
14 91 71 30 5 36 28 74 16 73 3 1
93 36 43 46 77 44 59 19 56 84 3 0
11 16 2 67 11 96 20 91 20 59 2 1
72 79 26 99 90 71 56 46 35 99 3 0
29 87 20 40 13 14 14 40 61 27 6 0
41 64 28 51 56 52 87 67 37 91 6 1
33 14 5 30 99 54 27 80 54 55 4 1
60 44 73 91 71 53 54 95 59 81 6 0
69 33 11 83 4 53 34 39 43 84 1 0
73 31 19 4 50 20 66 73 94 88 4 0
30 49 41 76 5 21 88 69 76 3 2 0
18 50 27 76 67 38 87 16 52 87 5 1
33 36 80 8 43 82 89 76 37 3 5 0
98 21 61 24 58 13 9 85 56 74 1 1
84 27 50 96 9 56 30 31 85 65 1 1
65 74 40 2 8 40 18 57 30 38 1 1
76 44 64 6 10 32 84 70 74 24 1 1
14 29 59 34 27 8 0 37 27 68 3 0
6 47 5 77 15 41 93 49 59 83 4 1
39 88 43 89 32 98 82 0 5 12 9 0
78 79 30 26 58 6 9 58 37 65 8 1
25 28 66 41 70 87 76 62 29 39 7 1
......@@ -20,28 +20,32 @@ from paddlerec.core.model import ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
self.dict_dim = 100
self.max_len = 10
self.cnn_dim = 32
self.cnn_filter_size = 128
self.emb_dim = 8
self.hid_dim = 128
self.class_dim = 2
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
False)
self.dict_dim = envs.get_global_env("hyper_parameters.dict_dim")
self.max_len = envs.get_global_env("hyper_parameters.max_len")
self.cnn_dim = envs.get_global_env("hyper_parameters.cnn_dim")
self.cnn_filter_size1 = envs.get_global_env(
"hyper_parameters.cnn_filter_size1")
self.cnn_filter_size2 = envs.get_global_env(
"hyper_parameters.cnn_filter_size2")
self.cnn_filter_size3 = envs.get_global_env(
"hyper_parameters.cnn_filter_size3")
self.emb_dim = envs.get_global_env("hyper_parameters.emb_dim")
self.hid_dim = envs.get_global_env("hyper_parameters.hid_dim")
self.class_dim = envs.get_global_env("hyper_parameters.class_dim")
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse")
def input_data(self, is_infer=False, **kwargs):
data = fluid.data(
name="input", shape=[None, self.max_len], dtype='int64')
label = fluid.data(name="label", shape=[None, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
return [data, label, seq_len]
label = fluid.data(name="label", shape=[None, 1], dtype='int64')
return [data, seq_len, label]
def net(self, input, is_infer=False):
""" network definition """
data = input[0]
label = input[1]
seq_len = input[2]
seq_len = input[1]
label = input[2]
# embedding layer
emb = fluid.embedding(
......@@ -50,15 +54,31 @@ class Model(ModelBase):
is_sparse=self.is_sparse)
emb = fluid.layers.sequence_unpad(emb, length=seq_len)
# convolution layer
conv = fluid.nets.sequence_conv_pool(
conv1 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=self.cnn_dim,
filter_size=self.cnn_filter_size1,
act="tanh",
pool_type="max")
conv2 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=self.cnn_dim,
filter_size=self.cnn_filter_size,
filter_size=self.cnn_filter_size2,
act="tanh",
pool_type="max")
conv3 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=self.cnn_dim,
filter_size=self.cnn_filter_size3,
act="tanh",
pool_type="max")
convs_out = fluid.layers.concat(input=[conv1, conv2, conv3], axis=1)
# full connect layer
fc_1 = fluid.layers.fc(input=[conv], size=self.hid_dim)
fc_1 = fluid.layers.fc(input=convs_out, size=self.hid_dim, act="tanh")
# softmax layer
prediction = fluid.layers.fc(input=[fc_1],
size=self.class_dim,
......@@ -70,5 +90,7 @@ class Model(ModelBase):
self._cost = avg_cost
if is_infer:
self._infer_results["acc"] = acc
self._infer_results["loss"] = avg_cost
else:
self._metrics["acc"] = acc
self._metrics["loss"] = avg_cost
......@@ -23,9 +23,10 @@ class Reader(ReaderBase):
def _process_line(self, l):
l = l.strip().split()
data = l[0:10]
seq_len = l[10:11]
label = l[11:]
data = l[0:100]
seq_len = l[100:101]
label = l[101:]
return data, label, seq_len
def generate_sample(self, line):
......@@ -37,6 +38,6 @@ class Reader(ReaderBase):
data = [int(i) for i in data]
label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len]
yield [('data', data), ('label', label), ('seq_len', seq_len)]
yield [('data', data), ('seq_len', seq_len), ('label', label)]
return data_iter
# classification文本分类模型
以下是本例的简要目录结构及说明:
```
├── data #样例数据
├── train
├── train.txt #训练数据样例
├── test
├── test.txt #测试数据样例
├── preprocess.py #数据处理程序
├── __init__.py
├── README.md #文档
├── model.py #模型文件
├── config.yaml #配置文件
├── reader.py #读取程序
```
注:在阅读该示例前,建议您先了解以下内容:
[paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## 内容
- [模型简介](#模型简介)
- [数据准备](#数据准备)
- [运行环境](#运行环境)
- [快速开始](#快速开始)
- [效果复现](#效果复现)
- [进阶使用](#进阶使用)
- [FAQ](#FAQ)
## 模型简介
TextCNN网络是2014年提出的用来做文本分类的卷积神经网络,由于其结构简单、效果好,在文本分类、推荐等NLP领域应用广泛。对于文本分类问题,常见的方法无非就是抽取文本的特征。然后再基于抽取的特征训练一个分类器。 然而研究证明,TextCnn在文本分类问题上有着更加卓越的表现。从直观上理解,TextCNN通过一维卷积来获取句子中n-gram的特征表示。TextCNN对文本浅层特征的抽取能力很强,在短文本领域专注于意图分类时效果很好,应用广泛,且速度较快。
Yoon Kim在论文[EMNLP 2014][Convolutional neural networks for sentence classification](https://www.aclweb.org/anthology/D14-1181.pdf)提出了TextCNN并给出基本的结构。将卷积神经网络CNN应用到文本分类任务,利用多个不同size的kernel来提取句子中的关键信息(类似于多窗口大小的ngram),从而能够更好地捕捉局部相关性。模型的主体结构如图所示:
<p align="center">
<img align="center" src="../../../doc/imgs/cnn-ckim2014.png">
</p>
## 数据准备
情感倾向分析(Sentiment Classification,简称Senta)针对带有主观描述的中文文本,可自动判断该文本的情感极性类别并给出相应的置信度。情感类型分为积极、消极。情感倾向分析能够帮助企业理解用户消费习惯、分析热点话题和危机舆情监控,为企业提供有利的决策支持。
情感是人类的一种高级智能行为,为了识别文本的情感倾向,需要深入的语义建模。另外,不同领域(如餐饮、体育)在情感的表达各不相同,因而需要有大规模覆盖各个领域的数据进行模型训练。为此,我们通过基于深度学习的语义模型和大规模数据挖掘解决上述两个问题。效果上,我们基于开源情感倾向分类数据集ChnSentiCorp进行评测,模型在测试集上的准确率如表所示:
| 模型 | dev | test |
| :------ | :------ | :------ |
| TextCNN | 90.75% | 92.19% |
您可以直接执行以下命令下载我们分词完毕后的数据集,文件解压之后,senta_data目录下会存在训练数据(train.tsv)、开发集数据(dev.tsv)、测试集数据(test.tsv)以及对应的词典(word_dict.txt):
```
wget https://baidu-nlp.bj.bcebos.com/sentiment_classification-dataset-1.0.0.tar.gz
tar -zxvf sentiment_classification-dataset-1.0.0.tar.gz
```
数据格式为一句中文的评价语句,和一个代表情感信息的标签。两者之间用制表符(\t)分隔,中文的评价语句已经分词,词之间用空格分隔。
```
15.4寸 笔记本 的 键盘 确实 爽 , 基本 跟 台式机 差不多 了 , 蛮 喜欢 数字 小 键盘 , 输 数字 特 方便 , 样子 也 很 美观 , 做工 也 相当 不错 1
跟 心灵 鸡汤 没 什么 本质 区别 嘛 , 至少 我 不 喜欢 这样 读 经典 , 把 经典 都 解读 成 这样 有点 去 中国 化 的 味道 了 0
```
## 运行环境
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## 快速开始
本文提供了样例数据可以供您快速体验,在paddlerec目录下直接执行下面的命令即可启动训练:
```
python -m paddlerec.run -m models/contentunderstanding/classification/config.yaml
```
## 效果复现
为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。
1. 确认您当前所在目录为PaddleRec/models/contentunderstanding/classification
2. 下载并解压数据集,命令如下:
```
wget https://baidu-nlp.bj.bcebos.com/sentiment_classification-dataset-1.0.0.tar.gz
tar -zxvf sentiment_classification-dataset-1.0.0.tar.gz
```
3. 本文提供了快速将数据集中的汉字数据处理为可训练格式数据的脚本,您在解压数据集后,将preprocess.py复制到senta_data目录中并执行,即可将数据集中提供的dev.tsv,test.tsv,train.tsv转化为可直接训练的dev.txt,test.txt,train.txt。
```
cp ./data/preprocess.py ./senta_data/
cd senta_data/
python preprocess.py
```
4. 创建存放训练集和测试集的目录,将数据放入目录中。
```
mkdir train
mv train.txt train
mkdir test
mv dev.txt test
cd ..
```
5. 打开文件config.yaml,更改其中的参数
将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径)
将data1下的batch_size值从10改为128
将data1下的data_path改为:{workspace}/senta_data/train
将dataset_infer下的batch_size值从2改为256
将dataset_infer下的data_path改为:{workspace}/senta_data/test
6. 执行命令,开始训练:
```
python -m paddlerec.run -m ./config.yaml
```
7. 运行结果:
```
PaddleRec: Runner infer_runner Begin
Executor Mode: infer
processor_register begin
Running SingleInstance.
Running SingleNetwork.
Running SingleInferStartup.
Running SingleInferRunner.
load persistables from increment/14
batch: 1, acc: [0.91796875], loss: [0.2287855]
batch: 2, acc: [0.91796875], loss: [0.22827303]
batch: 3, acc: [0.90234375], loss: [0.27907994]
```
## 进阶使用
## FAQ
......@@ -22,12 +22,12 @@
| 模型 | 简介 | 论文 |
| :------------------: | :--------------------: | :---------: |
| TagSpace | 标签推荐 | [EMNLP 2014][TagSpace: Semantic Embeddings from Hashtags](https://research.fb.com/publications/tagspace-semantic-embeddings-from-hashtags/) |
| TagSpace | 标签推荐 | [EMNLP 2014][TagSpace: Semantic Embeddings from Hashtags](https://www.aclweb.org/anthology/D14-1194.pdf) |
| Classification | 文本分类 | [EMNLP 2014][Convolutional neural networks for sentence classification](https://www.aclweb.org/anthology/D14-1181.pdf) |
下面是每个模型的简介(注:图片引用自链接中的论文)
[TagSpace模型](https://research.fb.com/publications/tagspace-semantic-embeddings-from-hashtags)
[TagSpace模型](https://www.aclweb.org/anthology/D14-1194.pdf)
<p align="center">
<img align="center" src="../../doc/imgs/tagspace.png">
</p>
......@@ -37,89 +37,173 @@
<img align="center" src="../../doc/imgs/cnn-ckim2014.png">
</p>
##使用教程(快速开始)
## 使用教程(快速开始)
```
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
cd PaddleRec
python -m paddlerec.run -m models/contentunderstanding/tagspace/config.yaml
python -m paddlerec.run -m models/contentunderstanding/classification/config.yaml
```
## 使用教程(复现论文)
###注意
### 注意
为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果请使用以下提供的脚本下载对应数据集以及数据预处理。
### 数据处理
**(1)TagSpace**
### 数据处理
[数据地址](https://github.com/mhjabreel/CharCNN/tree/master/data/) , [备份数据地址](https://paddle-tagspace.bj.bcebos.com/data.tar)
数据格式如下
```
"3","Wall St. Bears Claw Back Into the Black (Reuters)","Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
```
数据解压后,将文本数据转为paddle数据,先将数据放到训练数据目录和测试数据目录
本文提供了快速将数据集中的文本数据处理为可训练格式数据的脚本,您在解压数据集后,将原始数据存放在raw_big_train_data和raw_big_test_data两个目录下,并在python3环境下运行我们提供的text2paddle.py文件,即可生成可以直接用于训练的数据目录test_big_data和train_big_data。命令如下:
```
mkdir raw_big_train_data
mkdir raw_big_test_data
mv train.csv raw_big_train_data
mv test.csv raw_big_test_data
python3 text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt
```
运行脚本text2paddle.py 生成paddle输入格式
运行后的data目录:
```
big_vocab_tag.txt #标签词汇数
big_vocab_text.txt #文本词汇数
data.tar #数据集
raw_big_train_data #数据集中原始的训练集
raw_big_test_data #数据集中原始的测试集
train_data #样例训练集
test_data #样例测试集
train_big_data #数据集经处理后的训练集
test_big_data #数据集经处理后的测试集
text2paddle.py #预处理文件
```
处理完成的数据格式如下:
```
python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt
2,27 7062 8390 456 407 8 11589 3166 4 7278 31046 33 3898 2897 426 1
2,27 9493 836 355 20871 300 81 19 3 4125 9 449 462 13832 6 16570 1380 2874 5 0 797 236 19 3688 2106 14 8615 7 209 304 4 0 123 1
2,27 12754 637 106 3839 1532 66 0 379 6 0 1246 9 307 33 161 2 8100 36 0 350 123 101 74 181 0 6657 4 0 1222 17195 1
```
### 训练
退回tagspace目录中,打开文件config.yaml,更改其中的参数
将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径)
将dataset下sample_1的batch_size值从10改为128
将dataset下sample_1的data_path改为:{workspace}/data/train_big_data
将dataset下inferdata的batch_size值从10改为500
将dataset下inferdata的data_path改为:{workspace}/data/test_big_data
执行命令,开始训练:
```
cd models/contentunderstanding/tagspace
python -m paddlerec.run -m ./config.yaml # 自定义修改超参后,指定配置文件,使用自定义配置
python -m paddlerec.run -m ./config.yaml
```
### 预测
在跑完训练后,模型会开始在验证集上预测。
运行结果:
```
PaddleRec: Runner infer_runner Begin
Executor Mode: infer
processor_register begin
Running SingleInstance.
Running SingleNetwork.
Running SingleInferStartup.
Running SingleInferRunner.
load persistables from increment/9
batch: 1, acc: [0.91], loss: [0.02495437]
batch: 2, acc: [0.936], loss: [0.01941476]
batch: 3, acc: [0.918], loss: [0.02116447]
batch: 4, acc: [0.916], loss: [0.0219945]
batch: 5, acc: [0.902], loss: [0.02242816]
batch: 6, acc: [0.9], loss: [0.02421589]
batch: 7, acc: [0.9], loss: [0.026441]
batch: 8, acc: [0.934], loss: [0.01797657]
batch: 9, acc: [0.932], loss: [0.01687362]
batch: 10, acc: [0.926], loss: [0.02047823]
batch: 11, acc: [0.918], loss: [0.01998716]
batch: 12, acc: [0.898], loss: [0.0229556]
batch: 13, acc: [0.928], loss: [0.01736144]
batch: 14, acc: [0.93], loss: [0.01911209]
```
# 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径
# 修改对应模型的config.yaml,mode配置infer_runner
# 示例: mode: train_runner -> mode: infer_runner
# infer_runner中 class配置为 class: infer
# 修改phase阶段为infer的配置,参照config注释
# 修改完config.yaml后 执行:
python -m paddlerec.run -m ./config.yaml
**(2)Classification**
### 数据处理
情感倾向分析(Sentiment Classification,简称Senta)针对带有主观描述的中文文本,可自动判断该文本的情感极性类别并给出相应的置信度。情感类型分为积极、消极。情感倾向分析能够帮助企业理解用户消费习惯、分析热点话题和危机舆情监控,为企业提供有利的决策支持。
情感是人类的一种高级智能行为,为了识别文本的情感倾向,需要深入的语义建模。另外,不同领域(如餐饮、体育)在情感的表达各不相同,因而需要有大规模覆盖各个领域的数据进行模型训练。为此,我们通过基于深度学习的语义模型和大规模数据挖掘解决上述两个问题。效果上,我们基于开源情感倾向分类数据集ChnSentiCorp进行评测。
您可以直接执行以下命令下载我们分词完毕后的数据集,文件解压之后,senta_data目录下会存在训练数据(train.tsv)、开发集数据(dev.tsv)、测试集数据(test.tsv)以及对应的词典(word_dict.txt):
```
wget https://baidu-nlp.bj.bcebos.com/sentiment_classification-dataset-1.0.0.tar.gz
tar -zxvf sentiment_classification-dataset-1.0.0.tar.gz
```
**(2)Classification**
数据格式为一句中文的评价语句,和一个代表情感信息的标签。两者之间用制表符(\t)分隔,中文的评价语句已经分词,词之间用空格分隔。
### 训练
```
cd models/contentunderstanding/classification
python -m paddlerec.run -m ./config.yaml # 自定义修改超参后,指定配置文件,使用自定义配置
15.4寸 笔记本 的 键盘 确实 爽 , 基本 跟 台式机 差不多 了 , 蛮 喜欢 数字 小 键盘 , 输 数字 特 方便 , 样子 也 很 美观 , 做工 也 相当 不错 1
跟 心灵 鸡汤 没 什么 本质 区别 嘛 , 至少 我 不 喜欢 这样 读 经典 , 把 经典 都 解读 成 这样 有点 去 中国 化 的 味道 了 0
```
本文提供了快速将数据集中的汉字数据处理为可训练格式数据的脚本,您在解压数据集后,将preprocess.py复制到senta_data目录中并执行,即可将数据集中提供的dev.tsv,test.tsv,train.tsv转化为可直接训练的dev.txt,test.txt,train.txt。
```
cp ./data/preprocess.py ./senta_data/
cd senta_data/
python preprocess.py
```
### 预测
### 训练
创建存放训练集和测试集的目录,将数据放入目录中。
```
# 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径
# 修改对应模型的config.yaml,mode配置infer_runner
# 示例: mode: train_runner -> mode: infer_runner
# infer_runner中 class配置为 class: infer
# 修改phase阶段为infer的配置,参照config注释
mkdir train
mv train.txt train
mkdir test
mv dev.txt test
cd ..
```
打开文件config.yaml,更改其中的参数
将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径)
将data1下的batch_size值从10改为128
将data1下的data_path改为:{workspace}/senta_data/train
将dataset_infer下的batch_size值从2改为256
将dataset_infer下的data_path改为:{workspace}/senta_data/test
# 修改完config.yaml后 执行:
执行命令,开始训练:
```
python -m paddlerec.run -m ./config.yaml
```
### 预测
在跑完训练后,模型会开始在验证集上预测。
运行结果:
```
PaddleRec: Runner infer_runner Begin
Executor Mode: infer
processor_register begin
Running SingleInstance.
Running SingleNetwork.
Running SingleInferStartup.
Running SingleInferRunner.
load persistables from increment/14
batch: 1, acc: [0.91796875], loss: [0.2287855]
batch: 2, acc: [0.91796875], loss: [0.22827303]
batch: 3, acc: [0.90234375], loss: [0.27907994]
```
## 效果对比
### 模型效果 (测试)
| 数据集 | 模型 | loss | auc | acc | mae |
| :------------------: | :--------------------: | :---------: |:---------: | :---------: |:---------: |
| ag news dataset | TagSpace | -- | -- | -- | -- |
| -- | Classification | -- | -- | -- | -- |
| 数据集 | 模型 | loss | acc |
| :------------------: | :--------------------: | :---------: |:---------: |
| ag news dataset | TagSpace | 0.0198 | 0.9177 |
| ChnSentiCorp | Classification | 0.2282 | 0.9127 |
......@@ -16,16 +16,21 @@ workspace: "models/contentunderstanding/tagspace"
dataset:
- name: sample_1
type: QueueDataset
batch_size: 5
type: DataLoader
batch_size: 10
data_path: "{workspace}/data/train_data"
data_converter: "{workspace}/reader.py"
- name: inferdata
type: DataLoader
batch_size: 10
data_path: "{workspace}/data/test_data"
data_converter: "{workspace}/reader.py"
hyper_parameters:
optimizer:
class: Adagrad
learning_rate: 0.001
vocab_text_size: 11447
vocab_text_size: 75378
vocab_tag_size: 4
emb_dim: 10
hid_dim: 1000
......@@ -34,22 +39,34 @@ hyper_parameters:
neg_size: 3
num_devices: 1
mode: runner1
mode: [runner1,infer_runner]
runner:
- name: runner1
class: train
epochs: 10
device: cpu
save_checkpoint_interval: 2
save_inference_interval: 4
save_checkpoint_interval: 1
save_inference_interval: 1
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
phases: phase1
- name: infer_runner
class: infer
# device to run training or infer
device: cpu
print_interval: 1
init_model_path: "increment/9" # load model path
phases: phase_infer
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: sample_1
thread_num: 1
- name: phase_infer
model: "{workspace}/model.py"
dataset_name: inferdata
thread_num: 1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import six
import collections
import os
import csv
import re
import sys
# Python 2 only: reload ``sys`` and switch the interpreter-wide default string
# encoding to UTF-8 before any corpus text is processed.
if six.PY2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
def word_count(column_num, input_file, word_freq=None):
    """Tally token frequencies for one column of a CSV stream.

    Every row that ``csv.reader`` yields from ``input_file`` contributes the
    tokens obtained by stripping the ``column_num`` field and splitting it on
    runs of non-word characters.

    Args:
        column_num: zero-based index of the CSV column to tokenize.
        input_file: iterable of CSV text lines, e.g. an open text file.
        word_freq: optional mapping that is updated in place; a fresh
            ``defaultdict(int)`` is created when it is omitted.

    Returns:
        The token -> count mapping (the very object passed as ``word_freq``
        when one was supplied).
    """
    counts = collections.defaultdict(int) if word_freq is None else word_freq
    splitter = re.compile(r'\W+')
    for record in csv.reader(input_file):
        # A field that starts or ends with punctuation yields an empty-string
        # token here; it is counted like any other token.
        tokens = splitter.split(record[column_num].strip())
        for token in tokens:
            counts[token] += 1
    return counts
def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
    """Build a token -> zero-based ID dictionary from two CSV directories.

    Every file found directly inside ``train_dir`` and ``test_dir`` is opened
    as UTF-8 text and fed to ``word_count`` for column ``column_num``. Tokens
    are then ranked by descending frequency, ties broken by the token itself
    in ascending order, and numbered 0, 1, 2, ... in that order.

    Args:
        column_num: zero-based CSV column whose tokens are counted.
        min_word_freq: a token is kept only if it occurs strictly more than
            this many times.
        train_dir: directory holding the training CSV files.
        test_dir: directory holding the test CSV files.

    Returns:
        dict mapping each kept token to its ID. An empty dict is returned
        when no token qualifies (the previous tuple-unpacking implementation
        raised ValueError in that case).
    """
    word_freq = collections.defaultdict(int)
    # Training files are counted before test files.
    for corpus_dir in (train_dir, test_dir):
        for file_name in os.listdir(corpus_dir):
            file_path = os.path.join(corpus_dir, file_name)
            with open(file_path, "r", encoding='utf-8') as f:
                word_freq = word_count(column_num, f, word_freq)

    kept = [item for item in word_freq.items() if item[1] > min_word_freq]
    # Most frequent first; equal counts fall back to the token's own order so
    # the resulting IDs are deterministic.
    kept.sort(key=lambda item: (-item[1], item[0]))
    return {word: idx for idx, (word, _) in enumerate(kept)}
def _write_paddle_dir(text_idx, tag_idx, input_dir, output_dir):
    """Convert every CSV file in ``input_dir`` to a same-named file in ``output_dir``."""
    # List the inputs before creating the output directory, so an output
    # directory located inside ``input_dir`` is never picked up as an input.
    files = os.listdir(input_dir)
    if not os.path.exists(output_dir):
        # makedirs (not mkdir) so a nested output path also works.
        os.makedirs(output_dir)
    for fi in files:
        with open(os.path.join(input_dir, fi), "r", encoding='utf-8') as f, \
                open(os.path.join(output_dir, fi), "w", encoding='utf-8') as wf:
            for row in csv.reader(f):
                # Column 0 is the tag; only its first token is used.
                tag_raw = re.split(r'\W+', row[0].strip())
                pos_index = tag_idx.get(tag_raw[0])
                # Column 2 is the text body.
                text_raw = re.split(r'\W+', row[2].strip())
                ids = [text_idx.get(w) for w in text_raw]
                # Each ID is followed by one space, so the line keeps a
                # trailing space before the newline.
                wf.write(str(pos_index) + "," +
                         "".join(str(w) + " " for w in ids) + "\n")


def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
                 output_test_dir):
    """Rewrite the raw train/test CSV files as ID sequences.

    For every row, one output line ``<tag id>,<id> <id> ... `` is written,
    where the tag ID is looked up in ``tag_idx`` from the first token of
    column 0 and the remaining IDs are looked up in ``text_idx`` from the
    tokens of column 2. A token missing from its dictionary is written as the
    string ``None`` (``dict.get`` returns None).

    Args:
        text_idx: dict mapping text tokens to integer IDs.
        tag_idx: dict mapping tag tokens to integer IDs.
        train_dir: directory of raw training CSV files.
        test_dir: directory of raw test CSV files.
        output_train_dir: directory receiving the converted training files;
            created (including missing parents) if it does not exist.
        output_test_dir: directory receiving the converted test files;
            created (including missing parents) if it does not exist.
    """
    _write_paddle_dir(text_idx, tag_idx, train_dir, output_train_dir)
    _write_paddle_dir(text_idx, tag_idx, test_dir, output_test_dir)
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
                output_vocab_text, output_vocab_tag):
    """Build both vocabularies, record their sizes, and convert the corpus.

    The text vocabulary comes from CSV column 2 and the tag vocabulary from
    column 0, both built over ``train_dir`` and ``test_dir`` with a minimum
    frequency of 0. Only the number of entries of each vocabulary is written
    (one line) to ``output_vocab_text`` / ``output_vocab_tag``. Finally the
    raw files are converted into ``output_train_dir`` / ``output_test_dir``.
    """
    print("start constuct word dict")
    vocabularies = []
    # (CSV column, file receiving the vocabulary size): text first, then tag.
    for column, size_path in ((2, output_vocab_text), (0, output_vocab_tag)):
        vocab = build_dict(column, 0, train_dir, test_dir)
        with open(size_path, "w", encoding='utf-8') as size_file:
            size_file.write(str(len(vocab)) + "\n")
        vocabularies.append(vocab)
    print("construct word dict done\n")

    vocab_text, vocab_tag = vocabularies
    write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir,
                 output_test_dir)
# Script entry: six positional command-line arguments are required, in the
# order read below; a missing one raises IndexError.
train_dir = sys.argv[1]  # directory of raw training CSV files
test_dir = sys.argv[2]  # directory of raw test CSV files
output_train_dir = sys.argv[3]  # directory for the converted training files
output_test_dir = sys.argv[4]  # directory for the converted test files
output_vocab_text = sys.argv[5]  # file receiving the text vocabulary size
output_vocab_tag = sys.argv[6]  # file receiving the tag vocabulary size
# Runs unconditionally, i.e. also when this module is imported.
text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
            output_vocab_text, output_vocab_tag)
......@@ -16,7 +16,6 @@ import paddle.fluid as fluid
import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.tensor as tensor
import paddle.fluid.layers.control_flow as cf
from paddlerec.core.model import ModelBase
from paddlerec.core.utils import envs
......@@ -98,14 +97,19 @@ class Model(ModelBase):
tensor.fill_constant_batch_size_like(
input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
loss_part2)
avg_cost = nn.mean(loss_part3)
avg_cost = fluid.layers.mean(loss_part3)
less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
label_ones = fluid.layers.fill_constant_batch_size_like(
input=cos_neg, dtype='float32', shape=[-1, 1], value=1.0)
correct = nn.reduce_sum(less)
total = fluid.layers.reduce_sum(label_ones)
acc = fluid.layers.elementwise_div(correct, total)
self._cost = avg_cost
if is_infer:
self._infer_results["correct"] = correct
self._infer_results["cos_pos"] = cos_pos
self._infer_results["acc"] = acc
self._infer_results["loss"] = self._cost
else:
self._metrics["correct"] = correct
self._metrics["cos_pos"] = cos_pos
self._metrics["acc"] = acc
self._metrics["loss"] = self._cost
# tagspace文本分类模型
以下是本例的简要目录结构及说明:
```
├── data #样例数据
├── train_data
├── small_train.csv #训练数据样例
├── test_data
├── small_test.csv #测试数据样例
├── text2paddle.py #数据处理程序
├── __init__.py
├── README.md #文档
├── model.py #模型文件
├── config.yaml #配置文件
├── reader.py #读取程序
```
注:在阅读该示例前,建议您先了解以下内容:
[paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## 内容
- [模型简介](#模型简介)
- [数据准备](#数据准备)
- [运行环境](#运行环境)
- [快速开始](#快速开始)
- [效果复现](#效果复现)
- [进阶使用](#进阶使用)
- [FAQ](#FAQ)
## 模型简介
tagspace模型是一种对文本打标签的方法,它主要学习从短文到相关主题标签的映射。论文中主要利用CNN做doc向量, 然后优化 f(w,t+),f(w,t-)的距离作为目标函数,得到了 t(标签)和doc在一个特征空间的向量表达,这样就可以找 doc的hashtags了。
论文[TAGSPACE: Semantic Embeddings from Hashtags](https://www.aclweb.org/anthology/D14-1194.pdf)中的网络结构如图所示,一层输入层,一个卷积层,一个pooling层以及最后一个全连接层进行降维。
<p align="center">
<img align="center" src="../../../doc/imgs/tagspace.png">
</p>
## 数据准备
[数据地址](https://github.com/mhjabreel/CharCNN/tree/master/data/) , [备份数据地址](https://paddle-tagspace.bj.bcebos.com/data.tar)
数据格式如下:
```
"3","Wall St. Bears Claw Back Into the Black (Reuters)","Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
```
## 运行环境
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## 快速开始
本文提供了样例数据可以供您快速体验,在paddlerec目录下直接执行下面的命令即可启动训练:
```
python -m paddlerec.run -m models/contentunderstanding/tagspace/config.yaml
```
## 效果复现
为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。
1. 确认您当前所在目录为PaddleRec/models/contentunderstanding/tagspace
2. 在data目录下载并解压数据集,命令如下:
```
cd data
wget https://paddle-tagspace.bj.bcebos.com/data.tar
tar -xvf data.tar
```
3. 本文提供了快速将数据集中的文本数据处理为可训练格式数据的脚本,您在解压数据集后,将原始数据存放在raw_big_train_data和raw_big_test_data两个目录下,并在python3环境下运行我们提供的text2paddle.py文件,即可生成可以直接用于训练的数据目录test_big_data和train_big_data。命令如下:
```
mkdir raw_big_train_data
mkdir raw_big_test_data
mv train.csv raw_big_train_data
mv test.csv raw_big_test_data
python3 text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt
```
运行后的data目录:
```
big_vocab_tag.txt #标签词汇数
big_vocab_text.txt #文本词汇数
data.tar #数据集
raw_big_train_data #数据集中原始的训练集
raw_big_test_data #数据集中原始的测试集
train_data #样例训练集
test_data #样例测试集
train_big_data #数据集经处理后的训练集
test_big_data #数据集经处理后的测试集
text2paddle.py #预处理文件
```
处理完成的数据格式如下:
```
2,27 7062 8390 456 407 8 11589 3166 4 7278 31046 33 3898 2897 426 1
2,27 9493 836 355 20871 300 81 19 3 4125 9 449 462 13832 6 16570 1380 2874 5 0 797 236 19 3688 2106 14 8615 7 209 304 4 0 123 1
2,27 12754 637 106 3839 1532 66 0 379 6 0 1246 9 307 33 161 2 8100 36 0 350 123 101 74 181 0 6657 4 0 1222 17195 1
```
4. 退回tagspace目录中,打开文件config.yaml,更改其中的参数
将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径)
将dataset下sample_1的batch_size值从10改为128
将dataset下sample_1的data_path改为:{workspace}/data/train_big_data
将dataset下inferdata的batch_size值从10改为500
将dataset下inferdata的data_path改为:{workspace}/data/test_big_data
5. 执行命令,开始训练:
```
python -m paddlerec.run -m ./config.yaml
```
6. 运行结果:
```
PaddleRec: Runner infer_runner Begin
Executor Mode: infer
processor_register begin
Running SingleInstance.
Running SingleNetwork.
Running SingleInferStartup.
Running SingleInferRunner.
load persistables from increment/9
batch: 1, acc: [0.91], loss: [0.02495437]
batch: 2, acc: [0.936], loss: [0.01941476]
batch: 3, acc: [0.918], loss: [0.02116447]
batch: 4, acc: [0.916], loss: [0.0219945]
batch: 5, acc: [0.902], loss: [0.02242816]
batch: 6, acc: [0.9], loss: [0.02421589]
batch: 7, acc: [0.9], loss: [0.026441]
batch: 8, acc: [0.934], loss: [0.01797657]
batch: 9, acc: [0.932], loss: [0.01687362]
batch: 10, acc: [0.926], loss: [0.02047823]
batch: 11, acc: [0.918], loss: [0.01998716]
batch: 12, acc: [0.898], loss: [0.0229556]
batch: 13, acc: [0.928], loss: [0.01736144]
batch: 14, acc: [0.93], loss: [0.01911209]
```
## 进阶使用
## FAQ
......@@ -17,50 +17,52 @@ workspace: "models/match/dssm"
dataset:
- name: dataset_train
batch_size: 4
type: QueueDataset
batch_size: 8
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/synthetic_reader.py"
- name: dataset_infer
batch_size: 1
type: QueueDataset
data_path: "{workspace}/data/train"
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
data_converter: "{workspace}/synthetic_evaluate_reader.py"
hyper_parameters:
optimizer:
class: sgd
learning_rate: 0.01
learning_rate: 0.001
strategy: async
trigram_d: 1000
neg_num: 4
trigram_d: 1439
neg_num: 1
fc_sizes: [300, 300, 128]
fc_acts: ['tanh', 'tanh', 'tanh']
mode: train_runner
mode: [train_runner,infer_runner]
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: train
# num of epochs
epochs: 4
epochs: 3
# device to run training or infer
device: cpu
save_checkpoint_interval: 2 # save model interval of epochs
save_inference_interval: 4 # save inference
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: ["query", "doc_pos"] # feed vars of save inference
save_inference_fetch_varnames: ["cos_sim_0.tmp_0"] # fetch vars of save inference
init_model_path: "" # load model path
print_interval: 2
phases: phase1
- name: infer_runner
class: infer
# device to run training or infer
device: cpu
print_interval: 1
init_model_path: "increment/2" # load model path
phases: phase2
# runner will run all the phase in each epoch
phase:
......@@ -68,7 +70,7 @@ phase:
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
- name: phase2
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_infer # select dataset by name
thread_num: 1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#encoding=utf-8
import os
import sys
import numpy as np
import random
# Read the raw "zhidao" corpus once. Each line is split on TAB into fields;
# the code below uses field 0 (query), field 1 (document) and field 2 (label).
# The context manager closes the file even if reading fails.
with open("./zhidao", "r") as f:
    lines = [line.strip().split("\t") for line in f.readlines()]

# Build the vocabulary: every distinct space-separated token of the query and
# document fields gets the next free ID. IDs start at 1, so 0 is never used.
word_dict = {}
for line in lines:
    text = line[0].split(" ") + line[1].split(" ")
    for word in text:
        if word not in word_dict:
            word_dict[word] = len(word_dict) + 1
# Group the documents by query:
#   neg_dict: query -> documents labelled "0" (negative examples)
#   pos_dict: query -> documents labelled "1" (positive examples)
# Rows carrying any other label are ignored.
neg_dict = {}
pos_dict = {}
for fields in lines:
    if fields[2] == "0":
        neg_dict.setdefault(fields[0], []).append(fields[1])
    elif fields[2] == "1":
        pos_dict.setdefault(fields[0], []).append(fields[1])
# Split the queries that have at least one positive document: after a
# shuffle, the first 90 go to training and the remainder to testing.
query_list = list(pos_dict.keys())
random.shuffle(query_list)
train_query, test_query = query_list[:90], query_list[90:]

# Training set: one [query, positive, negative] triple for every
# positive/negative combination; queries without negatives contribute nothing.
train_set = [
    [query, pos, neg]
    for query in train_query if query in neg_dict
    for pos in pos_dict[query]
    for neg in neg_dict[query]
]
random.shuffle(train_set)

# Test set: [query, document, label] rows -- all positives of a query (label 1)
# followed by all of its negatives (label 0), if it has any.
test_set = []
for query in test_query:
    test_set.extend([query, pos, 1] for pos in pos_dict[query])
    test_set.extend([query, neg, 0] for neg in neg_dict.get(query, []))
random.shuffle(test_set)
def _bag_of_words(sentence):
    """Return the comma-joined 0/1 vocabulary vector of a space-separated sentence."""
    # One slot per vocabulary ID plus slot 0, which stays 0 because IDs
    # start at 1.
    vector = [0] * (len(word_dict) + 1)
    for word in sentence.strip().split(" "):
        vector[word_dict[word]] = 1
    return ','.join(str(x) for x in vector)


# Training file: query, positive and negative bag-of-words vectors on one
# line, separated by TABs. The context manager closes the file on errors too.
with open("train.txt", "w") as f:
    for line in train_set:
        f.write(_bag_of_words(line[0]) + "\t" + _bag_of_words(line[1]) +
                "\t" + _bag_of_words(line[2]) + "\n")

# Test file: query and document vectors only; the matching label goes to the
# same line number of label.txt.
with open("test.txt", "w") as f, open("label.txt", "w") as fa:
    for line in test_set:
        f.write(_bag_of_words(line[0]) + "\t" + _bag_of_words(line[1]) + "\n")
        fa.write(str(line[2]) + "\n")
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -73,6 +73,7 @@ class Model(ModelBase):
query_fc = fc(inputs[0], self.hidden_layers, self.hidden_acts,
['query_l1', 'query_l2', 'query_l3'])
doc_pos_fc = fc(inputs[1], self.hidden_layers, self.hidden_acts,
['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
......@@ -93,7 +94,7 @@ class Model(ModelBase):
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(
prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
prob, axes=[0, 1], starts=[0, 0], ends=[8, 1])
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
avg_cost = fluid.layers.mean(x=loss)
self._cost = avg_cost
......
# DSSM文本匹配模型
以下是本例的简要目录结构及说明:
```
├── data #样例数据
├── train
├── train.txt #训练数据样例
├── test
├── test.txt #测试数据样例
├── preprocess.py #数据处理程序
├── __init__.py
├── README.md #文档
├── model.py #模型文件
├── config.yaml #配置文件
├── synthetic_reader.py #读取训练集的程序
├── synthetic_evaluate_reader.py #读取测试集的程序
├── transform.py #将数据整理成合适的格式方便计算指标
├── run.sh #全量数据集中的训练脚本,从训练到预测并计算指标
```
注:在阅读该示例前,建议您先了解以下内容:
[paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## 内容
- [模型简介](#模型简介)
- [数据准备](#数据准备)
- [运行环境](#运行环境)
- [快速开始](#快速开始)
- [效果复现](#效果复现)
- [进阶使用](#进阶使用)
- [FAQ](#FAQ)
## 模型简介
DSSM是Deep Structured Semantic Model的缩写,即我们通常说的基于深度网络的语义模型,其核心思想是将query和doc映射到共同维度的语义空间中,通过最大化query和doc语义向量之间的余弦相似度,从而训练得到隐含语义模型,达到检索的目的。DSSM有很广泛的应用,比如:搜索引擎检索,广告相关性,问答系统,机器翻译等。
DSSM 的输入采用 BOW(Bag of words)的方式,相当于把字向量的位置信息抛弃了,整个句子里的词都放在一个袋子里了。将一个句子用这种方式转化为一个向量输入DNN中。
Query 和 Doc 的语义相似性可以用这两个向量的 cosine 距离表示,然后通过softmax 函数选出与Query语义最相似的样本 Doc 。
模型的具体细节可以阅读论文[DSSM](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf):
<p align="center">
<img align="center" src="../../../doc/imgs/dssm.png">
</p>
## 数据准备
我们公开了自建的测试集,包括百度知道、ECOM、QQSIM、UNICOM 四个数据集。这里我们选取百度知道数据集来进行训练。执行以下命令可以获取上述数据集。
```
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
```
## 运行环境
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## 快速开始
本文提供了样例数据可以供您快速体验,在paddlerec目录下执行下面的命令即可快速启动训练:
```
python -m paddlerec.run -m models/match/dssm/config.yaml
```
输出结果示例:
```
PaddleRec: Runner train_runner Begin
Executor Mode: train
processor_register begin
Running SingleInstance.
Running SingleNetwork.
file_list : ['models/match/dssm/data/train/train.txt']
Running SingleStartup.
Running SingleRunner.
!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
CPU_NUM indicates that how many CPUPlace are used in the current task.
And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
!!! The default number of CPU_NUM=1.
I0821 06:56:26.224299 31061 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
I0821 06:56:26.231163 31061 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
I0821 06:56:26.237023 31061 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
I0821 06:56:26.240788 31061 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
batch: 2, LOSS: [4.538238]
batch: 4, LOSS: [4.16424]
batch: 6, LOSS: [3.8121371]
batch: 8, LOSS: [3.4250507]
batch: 10, LOSS: [3.2285979]
batch: 12, LOSS: [3.2116117]
batch: 14, LOSS: [3.1406002]
epoch 0 done, use time: 0.357971906662, global metrics: LOSS=[3.0968776]
batch: 2, LOSS: [2.6843479]
batch: 4, LOSS: [2.546976]
batch: 6, LOSS: [2.4103594]
batch: 8, LOSS: [2.301374]
batch: 10, LOSS: [2.264183]
batch: 12, LOSS: [2.315862]
batch: 14, LOSS: [2.3409634]
epoch 1 done, use time: 0.22123003006, global metrics: LOSS=[2.344321]
batch: 2, LOSS: [2.0882485]
batch: 4, LOSS: [2.006743]
batch: 6, LOSS: [1.9231766]
batch: 8, LOSS: [1.8850241]
batch: 10, LOSS: [1.8829436]
batch: 12, LOSS: [1.9336565]
batch: 14, LOSS: [1.9784685]
epoch 2 done, use time: 0.212922096252, global metrics: LOSS=[1.9934461]
PaddleRec Finish
```
## 效果复现
为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。
1. 确认您当前所在目录为PaddleRec/models/match/dssm
2. 在data目录下载并解压数据集,命令如下:
```
cd data
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
```
3. 本文提供了快速将数据集中的汉字数据处理为可训练格式数据的脚本,您在解压数据集后,可以看见目录中存在一个名为zhidao的文件。然后可以在python3环境下运行我们提供的preprocess.py文件,即可生成可以直接用于训练的数据文件test.txt,train.txt和label.txt。将其放入train和test目录下以备训练时调用。命令如下:
```
mv data/zhidao ./
rm -rf data
python3 preprocess.py
rm -f ./train/train.txt
mv train.txt ./train
rm -f ./test/test.txt
mv test.txt test
cd ..
```
经过预处理的格式:
训练集为三个稀疏的BOW方式的向量:query,pos,neg
测试集为两个稀疏的BOW方式的向量:query,pos
label.txt中对应的测试集中的标签
4. 退回dssm目录中,打开文件config.yaml,更改其中的参数
将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径)
将dataset_train中的batch_size从8改为128
将文件model.py中的 hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[8, 1])
改为hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[128, 1])。当您需要改变batch_size的时候,ends中第一个参数也需要随之变化。
5. 执行脚本,开始训练.脚本会运行python -m paddlerec.run -m ./config.yaml启动训练,并将结果输出到result文件中。然后启动transform.py整合数据,最后计算出正逆序指标:
```
sh run.sh
```
输出结果示例:
```
................run.................
!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
CPU_NUM indicates that how many CPUPlace are used in the current task.
And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
!!! The default number of CPU_NUM=1.
I0821 07:16:04.512531 32200 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
I0821 07:16:04.515708 32200 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
I0821 07:16:04.518872 32200 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
I0821 07:16:04.520995 32200 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
75
pnr: 2.25581395349
query_num: 11
pair_num: 184 184
equal_num: 44
正序率: 0.692857142857
97 43
```
6. 提醒:因为采取较小的数据集进行训练和测试,得到指标的浮动程度会比较大。如果得到的指标不合预期,可以多次执行步骤5,即可获得合理的指标。
## 进阶使用
## FAQ
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash
echo "................run................."
# Train and infer, capturing everything paddlerec prints.
python -m paddlerec.run -m ./config.yaml >result1.txt
# Keep only the log lines that carry the predicted query/doc similarity.
grep -i "query_doc_sim" ./result1.txt >./result2.txt
# Drop the last matching line before handing the scores to transform.py.
sed '$d' result2.txt >result.txt
rm -f result1.txt
rm -f result2.txt
# transform.py joins each score with its query and label into pair.txt.
python transform.py
# Sort by query (field 1), then by score in descending numeric order (field 2).
# The tab is produced with printf because $'\t' is a bash extension that a
# plain POSIX sh (e.g. dash) does not expand, and the README runs `sh run.sh`.
TAB="$(printf '\t')"
sort -t "$TAB" -k1,1 -k 2nr,2 pair.txt >result.txt
rm -f pair.txt
python ../../../tools/cal_pos_neg.py result.txt
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
import sklearn.metrics
# Join the predicted similarities with their queries and labels.
#
# Inputs : ./data/label.txt, ./result.txt, ./data/test/test.txt
# Output : pair.txt with one "<query>\t<similarity>\t<label>" line per score.

# Labels, one per line.  The first line is skipped, exactly as it is for the
# query file below, so the two lists stay aligned with each other.
# NOTE(review): confirm that skipping the first line also lines up with the
# scores in result.txt.
label = []
filename = './data/label.txt'
with open(filename, "r") as f:
    f.readline()
    num = 0
    for line in f:
        num = num + 1
        label.append(line.strip())
print(num)

# Similarity scores: the value after ":" in the second comma-separated field,
# with the surrounding spaces and square brackets removed.
filename = './result.txt'
sim = []
with open(filename) as f:
    for line in f:
        fields = line.strip().split(",")
        score = fields[1].split(":")[1].strip(" ")
        score = score.strip("[")
        score = score.strip("]")
        sim.append(float(score))

# Queries: the first tab-separated column of the test file (first line skipped).
filename = './data/test/test.txt'
query = []
with open(filename, "r") as f:
    f.readline()
    for line in f:
        query.append(line.strip().split("\t")[0])

# One output row per score; indexing (not zip) keeps the original IndexError
# if there are fewer queries or labels than scores.
filename = 'pair.txt'
with open(filename, "w") as f:
    for i, score in enumerate(sim):
        f.write(str(query[i]) + "\t" + str(score) + "\t" + str(label[i]) + "\n")
# match-pyramid文本匹配模型
## 介绍
以下是本例的简要目录结构及说明:
```
├── data #样例数据
├── process.py #数据处理脚本
├── relation.test.fold1.txt #评估计算指标时用到的关系文件
├── train
├── train.txt #训练数据样例
├── test
├── test.txt #测试数据样例
├── __init__.py
├── README.md #文档
├── model.py #模型文件
├── config.yaml #配置文件
├── data_process.sh #数据下载和处理脚本
├── eval.py #计算指标的评估程序
├── run.sh #一键运行程序
├── test_reader.py #测试集读取程序
├── train_reader.py #训练集读取程序
```
注:在阅读该示例前,建议您先了解以下内容:
[paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## 内容
- [模型简介](#模型简介)
- [数据准备](#数据准备)
- [运行环境](#运行环境)
- [快速开始](#快速开始)
- [论文复现](#论文复现)
- [进阶使用](#进阶使用)
- [FAQ](#FAQ)
## 模型简介
在许多自然语言处理任务中,匹配两个文本是一个基本问题。一种有效的方法是从单词,短语和句子中提取有意义的匹配模式以产生匹配分数。受卷积神经网络在图像识别中的成功启发,神经元可以根据提取的基本视觉模式(例如定向的边角和边角)捕获许多复杂的模式,所以我们尝试将文本匹配建模为图像识别问题。本模型对齐原作者庞亮开源的tensorflow代码:https://github.com/pl8787/MatchPyramid-TensorFlow/blob/master/model/model_mp.py, 实现了下述论文中提出的Match-Pyramid模型:
```text
......@@ -19,8 +55,23 @@
3.关系文件:关系文件被用来存储两个句子之间的关系,如query 和document之间的关系。例如:relation.train.fold1.txt, relation.test.fold1.txt
4.嵌入层文件:我们将预训练的词向量存储在嵌入文件中。例如:embed_wiki-pdc_d50_norm
## 数据下载和预处理
本文提供了数据集的下载以及一键生成训练和测试数据的预处理脚本,您可以直接一键运行:bash data_process.sh
## 运行环境
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## 快速开始
本文提供了样例数据可以供您快速体验,在paddlerec目录下直接执行下面的命令即可启动训练:
```
python -m paddlerec.run -m models/match/match-pyramid/config.yaml
```
## 论文复现
1. 确认您当前所在目录为PaddleRec/models/match/match-pyramid
2. 本文提供了原数据集的下载以及一键生成训练和测试数据的预处理脚本,您可以直接一键运行:bash data_process.sh
执行该脚本,会从国内源的服务器上下载Letor07数据集,删除掉data文件夹中原有的relation.test.fold1.txt和relation.train.fold1.txt,并将完整的数据集解压到data文件夹。随后运行 process.py 将全量训练数据放置于`./data/train`,全量测试数据放置于`./data/test`。并生成用于初始化embedding层的embedding.npy文件
执行该脚本的理想输出为:
```
......@@ -69,9 +120,11 @@ data/embed_wiki-pdc_d50_norm
[./data/relation.test.fold1.txt]
Instance size: 13652
```
3. 打开文件config.yaml,更改其中的参数
将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径)
## 一键训练并测试评估
本文提供了一键执行训练,测试和评估的脚本,您可以直接一键运行:bash run.sh
4. 随后,您直接一键运行:bash run.sh 即可得到复现的论文效果
执行该脚本后,会执行python -m paddlerec.run -m ./config.yaml 命令开始训练并测试模型,将测试的结果保存到result.txt文件,最后通过执行eval.py进行评估得到数据的map指标
执行该脚本的理想输出为:
```
......@@ -79,16 +132,7 @@ data/embed_wiki-pdc_d50_norm
13651
336
('map=', 0.420878322843591)
```
## 每个文件的作用
paddlerec可以:
通过config.yaml规定模型的参数
通过model.py规定模型的组网
使用train_reader.py读取训练集中的数据
使用test_reader.py读取测试集中的数据。
本文额外提供:
data_process.sh用来一键处理数据
run.sh用来一键启动训练,直接得出测试结果
eval.py通过保存的测试结果,计算map指标
如需详细了解paddlerec的使用方法请参考https://github.com/PaddlePaddle/PaddleRec/blob/master/README_CN.md 页面下方的教程。
```
## 进阶使用
## FAQ
#!/bin/bash
echo "................run................."
# Train and infer; keep the full log so the predictions can be extracted.
python -m paddlerec.run -m ./config.yaml >result1.txt
# Extract the prediction lines.  A preceding `grep -A1 "prediction"` wrote to
# the same result.txt and was overwritten by this command, so it was dropped.
grep -i "prediction" ./result1.txt >./result.txt
rm -f result1.txt
# eval.py evaluates the saved predictions (the README reports a MAP value).
python eval.py
......@@ -18,12 +18,12 @@ workspace: "models/match/multiview-simnet"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 2
batch_size: 128
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
sparse_slots: "1 2 3"
sparse_slots: "0 1 2"
- name: dataset_infer # name
batch_size: 2
batch_size: 1
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
sparse_slots: "1 2"
......@@ -34,17 +34,17 @@ hyper_parameters:
class: Adam
learning_rate: 0.0001
strategy: async
query_encoder: "bow"
title_encoder: "bow"
query_encoder: "gru"
title_encoder: "gru"
query_encode_dim: 128
title_encode_dim: 128
sparse_feature_dim: 1000001
sparse_feature_dim: 1439
embedding_dim: 128
hidden_size: 128
margin: 0.1
# select runner by name
mode: train_runner
mode: [train_runner,infer_runner]
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
......@@ -62,12 +62,14 @@ runner:
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
print_interval: 1
phases: phase1
- name: infer_runner
class: infer
# device to run training or infer
device: cpu
print_interval: 1
init_model_path: "increment/0" # load model path
init_model_path: "increment/1" # load model path
phases: phase2
# runner will run all the phase in each epoch
phase:
......@@ -75,7 +77,7 @@ phase:
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
- name: phase2
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_infer # select dataset by name
thread_num: 1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#encoding=utf-8
import os
import sys
import numpy as np
import random
# Convert the raw "zhidao" corpus into slot-encoded training and test files.
#
# Every input line is "<query>\t<title>\t<label>"; query and title are
# space-separated words and label is "1" for a positive pair, "0" for a
# negative one.  Outputs (written to the current directory):
#   train.txt      "0:<id> ... 1:<id> ... 2:<id> ..."  (query, pos, neg)
#   test.txt       "0:<id> ... 1:<id> ..."             (query, title)
#   label.txt      label of each test.txt line
#   testquery.txt  comma-joined query word ids of each test.txt line

# Number of (shuffled) queries that go to the training set; the rest are test.
TRAIN_QUERY_NUM = 90

# Read and parse the corpus once.
with open("./zhidao", "r") as f:
    lines = [line.strip().split("\t") for line in f]

# Build the vocabulary: ids start at 1 and follow first-appearance order.
word_dict = {}
for line in lines:
    for word in line[0].split(" ") + line[1].split(" "):
        if word not in word_dict:
            word_dict[word] = len(word_dict) + 1

# Group titles by query: negatives (label "0") and positives (label "1").
neg_dict = {}
pos_dict = {}
for line in lines:
    if line[2] == "0":
        neg_dict.setdefault(line[0], []).append(line[1])
    elif line[2] == "1":
        pos_dict.setdefault(line[0], []).append(line[1])

# Split the queries that have at least one positive into train and test.
query_list = list(pos_dict.keys())
random.shuffle(query_list)
train_query = query_list[:TRAIN_QUERY_NUM]
test_query = query_list[TRAIN_QUERY_NUM:]

# Training set: every (query, pos, neg) combination.
train_set = []
for query in train_query:
    # A query without negatives cannot form a triple.
    if query not in neg_dict:
        continue
    for pos in pos_dict[query]:
        for neg in neg_dict[query]:
            train_set.append([query, pos, neg])
random.shuffle(train_set)

# Test set: each title of a query with its 1/0 label.
test_set = []
for query in test_query:
    for pos in pos_dict[query]:
        test_set.append([query, pos, 1])
    if query not in neg_dict:
        continue
    for neg in neg_dict[query]:
        test_set.append([query, neg, 0])
random.shuffle(test_set)


def _word_ids(text):
    """Return the vocabulary ids of the space-separated words in text."""
    return [word_dict[word] for word in text.strip().split(" ")]


def _encode(text, slot):
    """Return text as space-joined "<slot>:<id>" tokens."""
    return ' '.join([str(slot) + ":" + str(x) for x in _word_ids(text)])


# Write the training triples: slot 0 = query, 1 = positive, 2 = negative.
with open("train.txt", "w") as f:
    for query, pos, neg in train_set:
        f.write(
            _encode(query, 0) + " " + _encode(pos, 1) + " " + _encode(neg, 2)
            + "\n")

# Write the test pairs together with their labels and raw query ids.
with open("test.txt", "w") as f:
    with open("label.txt", "w") as fa:
        with open("testquery.txt", "w") as fb:
            for query, pos, label in test_set:
                query_ids = _word_ids(query)
                f.write(' '.join(["0:" + str(x) for x in query_ids]) + " " +
                        _encode(pos, 1) + "\n")
                fa.write(str(label) + "\n")
                fb.write(','.join([str(x) for x in query_ids]) + "\n")
224289:0 126379:0 284519:0 549329:0 750666:0 393772:0 586898:0 736887:0 48785:0 906517:0 229162:1 483485:1 739835:1 29957:1 694497:1 997508:1 556876:1 717791:1 232176:1 430356:1
366182:0 82062:0 708883:0 949128:0 798964:0 639103:0 409033:0 79301:0 405607:0 342616:0 61552:1 560547:1 3760:1 754734:1 98496:1 472427:1 979596:1 750283:1 492028:1 801383:1
969571:0 405187:0 756217:0 563640:0 572168:0 881952:0 446260:0 692177:0 994140:0 485393:0 509081:1 297377:1 465399:1 934708:1 430949:1 135651:1 484531:1 385306:1 463957:1 996004:1
436320:0 423131:0 963969:0 78345:0 879550:0 458203:0 684397:0 956202:0 989802:0 526101:0 852446:1 182545:1 625656:1 674856:1 422648:1 74100:1 48372:1 850830:1 336087:1 178251:1
242683:0 118677:0 20731:0 970617:0 355890:0 739613:0 926695:0 963639:0 201043:0 611907:0 115309:1 310984:1 615584:1 638886:1 575934:1 889389:1 974807:1 570987:1 532482:1 911925:1
954007:0 122623:0 168195:0 348901:0 217880:0 84759:0 925763:0 436382:0 573742:0 942921:0 553377:1 835046:1 137907:1 933870:1 766585:1 48483:1 543079:1 889467:1 521705:1 906676:1
798690:0 617323:0 553266:0 232924:0 159461:0 404822:0 52992:0 364854:0 913876:0 547974:0 559472:1 748595:1 71793:1 357331:1 606888:1 477051:1 291481:1 89363:1 503881:1 423029:1
228207:0 785250:0 661149:0 803304:0 478781:0 495202:0 804509:0 273065:0 26123:0 810840:0 801871:1 146772:1 421009:1 752344:1 946358:1 531668:1 5771:1 191294:1 627329:1 434664:1
984628:0 762075:0 505288:0 48519:0 72492:0 26568:0 684085:0 613095:0 781547:0 895829:0 280541:1 903234:1 708065:1 386658:1 331060:1 3693:1 279760:1 459579:1 423552:1 962594:1
674172:0 39271:0 646093:0 757969:0 553251:0 734960:0 967186:0 856940:0 617246:0 376452:0 113050:1 472707:1 975057:1 865095:1 155824:1 389921:1 205520:1 513667:1 163588:1 953463:1
0:908 0:159 0:909 0:910 0:109 1:911 1:159 1:909 1:910 1:109
0:210 0:10 0:211 0:14 0:212 1:211 1:210 1:32 1:148 1:212 1:48 1:65 1:65 1:211 1:210 1:33 1:213 1:214 1:48
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:79 1:80 1:81 1:13 1:78 1:1 1:692 1:144 1:85 1:48
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:75 1:83 1:78 1:86 1:270 1:85 1:48
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:78 1:79 1:80 1:235 1:144 1:236 1:169 1:237 1:138 1:48 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:227 1:228 1:13 1:75 1:229 1:80 1:81 1:4 1:78 1:14 1:39
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:113 1:68 1:21 1:22
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:176 1:113 1:68 1:1357
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:1105 1:732 1:729 1:509 1:510 1:75 1:68 1:730 1:16 1:731 1:22
0:155 0:837 0:838 0:839 1:155 1:838 1:1296
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:113 1:68 1:21
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:83 1:100 1:79 1:81 1:4 1:86 1:82 1:94 1:84 1:85 1:48 1:22
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:113 1:68 1:114 1:21 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:167 1:168 1:13 1:80 1:81 1:144 1:82 1:169 1:170 1:171 1:172 1:148 1:173 1:174
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:449 1:450 1:80 1:10 1:451 1:13 1:452 1:453 1:6 1:85 1:168 1:81 1:4 1:78 1:22 1:22
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:230 1:113 1:68 1:114 1:13 1:144 1:113 1:68 1:114
0:155 0:837 0:838 0:839 1:1371 1:155 1:578 1:838 1:21 1:839 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:86 1:172 1:83 1:170 1:138 1:48
0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:698 1:67 1:96 1:696
0:222 0:223 0:224 0:225 0:67 0:96 1:624 1:1238 1:222 1:223 1:224 1:67 1:96
0:210 0:10 0:211 0:14 0:212 1:211 1:614 1:214 1:86 1:82 1:48 1:65 1:65 1:155 1:212
0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:1406 1:1407
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:1222 1:116 1:113 1:68 1:22
0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:695 1:96 1:696 1:1128
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:233 1:1350 1:4 1:1074 1:113 1:68 1:21 1:70 1:22
0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:223 1:224 1:419 1:96 1:1054 1:1055
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:167 1:79 1:80 1:81 1:86 1:82 1:83 1:84 1:138 1:48 1:22
0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:223 1:224 1:67 1:96
0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:226 1:223 1:224 1:67 1:96
0:210 0:10 0:211 0:14 0:212 1:210 1:211 1:32 1:4 1:474 1:637
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:1 1:1211 1:178 1:78 1:13 1:79 1:80 1:81 1:14 1:85 1:22
0:421 0:456 0:153 0:152 0:159 0:457 1:421 1:920 1:456 1:153 1:152 1:14 1:921
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:144 1:113 1:68 1:115
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1106 1:78 1:1107 1:13 1:170 1:1108 1:13 1:1109 1:75 1:79 1:80 1:81 1:13 1:177 1:85 1:577 1:78 1:32 1:170 1:86 1:82 1:48 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:135 1:78 1:91 1:79 1:78 1:136 1:81 1:4 1:137 1:86 1:82 1:83 1:84 1:138 1:48
0:421 0:456 0:153 0:152 0:159 0:457 1:153 1:421 1:456 1:152 1:475 1:68 1:476
0:155 0:837 0:838 0:839 1:155 1:838 1:839
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:576 1:168 1:80 1:81 1:13 1:86 1:80 1:83 1:170 1:48 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1070 1:78 1:33 1:67 1:79 1:121 1:80 1:81 1:276 1:162 1:1071 1:1072 1:103 1:13 1:167 1:1073 1:164 1:86 1:8 1:83 1:170 1:6 1:138 1:48 1:22
0:222 0:223 0:224 0:225 0:67 0:96 1:421 1:936 1:223 1:4 1:937 1:224 1:67 1:96 1:22
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:110 1:144 1:113 1:68 1:1155 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:78 1:80 1:81 1:13 1:86 1:82 1:1280 1:4 1:170 1:138 1:48
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:4 1:144 1:8 1:169 1:84 1:171 1:172 1:48
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:170 1:65 1:65 1:168 1:138 1:80 1:1212 1:81 1:65 1:65 1:13 1:65 1:65 1:452 1:172 1:538 1:6 1:80 1:173
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:1105 1:732 1:729 1:509 1:510 1:75 1:68 1:730 1:13 1:75 1:68 1:734 1:48 1:22 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1070 1:78 1:618 1:81 1:14 1:39 1:86 1:82 1:83 1:170 1:138 1:48
0:1026 0:1027 0:1028 0:1029 0:1030 0:1031 0:75 0:480 1:1027 1:75 1:480 1:1029 1:4 1:1031 1:65 1:65 1:1032 1:1033 1:1034 1:1029 1:1031 1:1250
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:747 1:748 1:729 1:75 1:68 1:730 1:16 1:734
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:65 1:65 1:87 1:82 1:83 1:84 1:80
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:86 1:1039 1:85 1:168 1:81 1:4 1:78 1:48 1:22
0:1026 0:1027 0:1028 0:1029 0:1030 0:1031 0:75 0:480 1:1032 1:1033 1:4 1:1034 1:1031 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:94 1:84 1:79 1:85 1:617 1:4 1:78 1:13 1:87 1:618 1:81
0:908 0:159 0:909 0:910 0:109 1:911 1:14 1:922 1:910 1:109 1:877
0:1335 0:409 0:1336 0:10 1:1335 1:409 1:1336 1:10
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:110 1:4 1:111 1:112 1:113 1:68 1:1074
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1137 1:100 1:895 1:80 1:81 1:13 1:86 1:82 1:83 1:84 1:6 1:138 1:48 1:22
0:908 0:159 0:909 0:910 0:109 1:908 1:14 1:1311 1:910 1:109 1:877
0:421 0:456 0:153 0:152 0:159 0:457 1:421 1:153 1:456 1:152 1:14 1:457
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:233 1:234 1:4 1:111 1:112 1:113 1:68 1:114 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1186 1:78 1:13 1:79 1:81 1:79 1:1187 1:86 1:82 1:83 1:84 1:6 1:80 1:48 1:22
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:13 1:113 1:68 1:115 1:769 1:548 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:168 1:80 1:81 1:65 1:65 1:86 1:8 1:83 1:84 1:80 1:48
0:210 0:10 0:211 0:14 0:212 1:211 1:427 1:32 1:614 1:212 1:14 1:39
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:110 1:4 1:111 1:112 1:113 1:68 1:114
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:86 1:113 1:480 1:1283 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:958 1:170 1:450 1:121 1:80 1:10 1:1428 1:13 1:1429 1:85 1:79 1:81 1:4 1:78 1:13 1:33 1:1251 1:4 1:160 1:137
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:28 1:170 1:439 1:1165 1:1166 1:13 1:133 1:85 1:94 1:168 1:80 1:81 1:4 1:78 1:48 1:22 1:22
0:222 0:223 0:224 0:225 0:67 0:96 1:421 1:422 1:223 1:224 1:67 1:96
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:112 1:113 1:68 1:22 1:148 1:112 1:113 1:68 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:587 1:170 1:80 1:10 1:774 1:10 1:13 1:57 1:51 1:86 1:85 1:94 1:168 1:81 1:4 1:78 1:22
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:951 1:442 1:4 1:111 1:13 1:112 1:113 1:480 1:114 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:772 1:84 1:144 1:85 1:168 1:80 1:81 1:4 1:78 1:48 1:22
0:210 0:10 0:211 0:14 0:212 1:210 1:10 1:211 1:14 1:212
0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:1378 1:223 1:224 1:67 1:96
0:155 0:837 0:838 0:839 1:49 1:14 1:838 1:839
0:210 0:10 0:211 0:14 0:212 1:148 1:472 1:473 1:211 1:13 1:210 1:32 1:155 1:474
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:28 1:78 1:80 1:81 1:137 1:1112 1:84 1:450 1:1113 1:81 1:137 1:86 1:85 1:81 1:4 1:78 1:48 1:22
0:908 0:159 0:909 0:910 0:109 1:911 1:912 1:909 1:910 1:109
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:807 1:78 1:169 1:81 1:94 1:170 1:144 1:80 1:48 1:22
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:732 1:729 1:75 1:68 1:730 1:16 1:734 1:22
0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:104 1:421 1:86 1:695 1:96 1:696 1:9
0:155 0:837 0:838 0:839 1:1052 1:205 1:155 1:838 1:839 1:70
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:732 1:729 1:509 1:510 1:75 1:68 1:730 1:16 1:734 1:22
0:210 0:10 0:211 0:14 0:212 1:211 1:65 1:65 1:14 1:212 1:65 1:65 1:14 1:1349
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:882 1:113 1:68 1:21
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:1079 1:732 1:729 1:75 1:68 1:730 1:16 1:734 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:958 1:84 1:959 1:80 1:577 1:14 1:39 1:13 1:79 1:78 1:80 1:81 1:86 1:82 1:169 1:84 1:960 1:48
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:1074 1:113 1:68
0:210 0:10 0:211 0:14 0:212 1:211 1:210 1:10 1:14 1:212 1:211
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:116 1:113 1:68 1:800 1:173
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:168 1:80 1:81 1:4 1:78 1:13 1:423 1:424 1:235 1:4 1:84 1:138 1:48
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:78 1:121 1:80 1:81 1:86 1:82 1:83 1:170 1:138 1:48 1:22
0:421 0:456 0:153 0:152 0:159 0:457 1:222 1:39 1:456 1:153 1:152 1:475 1:495 1:737 1:1076 1:102 1:1077 1:1078
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:112 1:113 1:68 1:114
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:113 1:68 1:115 1:116 1:22
0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:86 1:695 1:96 1:696 1:9 1:65 1:65 1:104 1:86 1:695 1:96 1:696 1:1128
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:113 1:68 1:114 1:86 1:75 1:110
0:421 0:456 0:153 0:152 0:159 0:457 1:421 1:1227 1:456 1:152 1:14 1:457
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:86 1:82 1:118 1:170 1:138 1:48
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:78 1:80 1:81 1:86 1:8 1:1087 1:84 1:80 1:48
0:1026 0:1027 0:1028 0:1029 0:1030 0:1031 0:75 0:480 1:1391 1:1392 1:13 1:1393 1:1032 1:1033 1:189 1:4 1:629 1:1034 1:1031 1:48
0:908 0:159 0:909 0:910 0:109 1:908 1:30 1:155 1:922 1:910 1:109 1:877 1:22
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:729 1:732 1:733 1:10 1:120 1:75 1:68 1:730 1:16 1:734
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:94 1:78 1:80 1:81 1:65 1:65 1:58 1:94 1:84 1:85 1:206 1:14 1:85 1:22
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
......@@ -40,8 +39,8 @@ class Reader(ReaderBase):
padding = 0
output = [(slot, []) for slot in self.all_slots]
for elem in elements:
feasign, slot = elem.split(':')
if not self._all_slots_dict.has_key(slot):
slot, feasign = elem.split(':')
if slot not in self._all_slots_dict:
continue
self._all_slots_dict[slot][0] = True
index = self._all_slots_dict[slot][1]
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
class Dataset:
    """Empty base class for the dataset generators in this file."""

    def __init__(self):
        pass


class SyntheticDataset(Dataset):
    """Generator of random "<feasign>:<slot>" samples.

    Slots are numbered consecutively: query slots first, then the positive
    title slots, then (training samples only) the negative title slots.
    """

    def __init__(self,
                 sparse_feature_dim,
                 query_slot_num,
                 title_slot_num,
                 dataset_size=10000):
        # Each slot receives this many randomly drawn ids.
        self.ids_per_slot = 10
        self.sparse_feature_dim = sparse_feature_dim
        self.query_slot_num = query_slot_num
        self.title_slot_num = title_slot_num
        self.dataset_size = dataset_size

    def _reader_creator(self, is_train):
        """Return a generator function yielding ``dataset_size`` samples.

        Each sample is a flat list of "<feasign>:<slot>" strings; negative
        title slots are appended only when ``is_train`` is true.
        """

        def slot_tokens(slot_id):
            # Draw ids_per_slot ids in [0, sparse_feature_dim - 1] and tag
            # each of them with the slot number.
            tokens = []
            for _ in range(self.ids_per_slot):
                feasign = random.randint(0, self.sparse_feature_dim - 1)
                tokens.append(str(feasign) + ':' + str(slot_id))
            return tokens

        def reader():
            for _ in range(self.dataset_size):
                sample = []
                for q in range(self.query_slot_num):
                    sample.extend(slot_tokens(q))
                for t in range(self.title_slot_num):
                    sample.extend(slot_tokens(t + self.query_slot_num))
                if is_train:
                    neg_offset = self.query_slot_num + self.title_slot_num
                    for t in range(self.title_slot_num):
                        sample.extend(slot_tokens(t + neg_offset))
                yield sample

        return reader

    def train(self):
        """Reader whose samples include negative title slots."""
        return self._reader_creator(True)

    def valid(self):
        """Reader with the same sample layout as ``train``."""
        return self._reader_creator(True)

    def test(self):
        """Reader whose samples have query and positive title slots only."""
        return self._reader_creator(False)
if __name__ == '__main__':
    # Small single-slot configuration used to emit the sample data files.
    sparse_feature_dim = 1000001
    query_slots = 1
    title_slots = 1
    dataset_size = 10
    dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots,
                               dataset_size)
    # Both readers are created before any file is opened; each sample becomes
    # one space-joined line of its output file.
    outputs = (
        ("data/train/train.txt", dataset.train()),
        ("data/test/test.txt", dataset.test()),
    )
    for path, reader in outputs:
        with open(path, 'w') as fout:
            for sample in reader():
                fout.write(' '.join(sample) + "\n")
......@@ -43,8 +43,8 @@ class Reader(ReaderBase):
padding = 0
output = [(slot, []) for slot in self.all_slots]
for elem in elements:
feasign, slot = elem.split(':')
if not self._all_slots_dict.has_key(slot):
slot, feasign = elem.split(':')
if slot not in self._all_slots_dict:
continue
self._all_slots_dict[slot][0] = True
index = self._all_slots_dict[slot][1]
......
# multiview-simnet文本匹配模型
以下是本例的简要目录结构及说明:
```
├── data #样例数据
├── train
├── train.txt #训练数据样例
├── test
├── test.txt #测试数据样例
├── preprocess.py #数据处理程序
├── __init__.py
├── README.md #文档
├── model.py #模型文件
├── config.yaml #配置文件
├── run.sh #运行脚本,在效果复现时使用
├── transform.py #整理格式准备计算指标的程序
├── reader.py #读者需要自定义数据集时供读者参考
├── evaluate_reader.py #读者需要自定义数据集时供读者参考
```
注:在阅读该示例前,建议您先了解以下内容:
[paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## 内容
- [模型简介](#模型简介)
- [数据准备](#数据准备)
- [运行环境](#运行环境)
- [快速开始](#快速开始)
- [效果复现](#效果复现)
- [进阶使用](#进阶使用)
- [FAQ](#FAQ)
## 模型简介
在个性化推荐场景中,推荐系统给用户提供的项目(Item)列表通常是通过个性化的匹配模型计算出来的。在现实世界中,一个用户可能有很多个视角的特征,比如用户Id,年龄,项目的点击历史等。一个项目,举例来说,新闻资讯,也会有多种视角的特征比如新闻标题,新闻类别等。多视角Simnet模型是可以融合用户以及推荐项目的多个视角的特征并进行个性化匹配学习的一体化模型。 多视角Simnet模型包括多个编码器模块,每个编码器被用在不同的特征视角上。当前,项目中提供Bag-of-Embedding编码器,Temporal-Convolutional编码器,和Gated-Recurrent-Unit编码器。我们会逐渐加入稀疏特征场景下比较实用的编码器到这个项目中。模型的训练方法,当前采用的是Pairwise ranking模式进行训练,即针对一对具有关联的User-Item组合,随机使用一个Item作为负例进行排序学习。
模型的具体细节可以阅读论文[MultiView-Simnet](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/frp1159-songA.pdf):
<p align="center">
<img align="center" src="../../../doc/imgs/multiview-simnet.png">
<p>
## 数据准备
我们公开了自建的测试集,包括百度知道、ECOM、QQSIM、UNICOM 四个数据集。这里我们选取百度知道数据集来进行训练。执行以下命令可以获取上述数据集。
```
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
```
数据格式为一个标识句子的slot,后跟一个句子中词的token。两者形成{slot:token}的形式标识一个词:
```
0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:1144 1:217 1:206 1:9 1:3 1:207 1:10 1:398 1:2 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:951 1:952 1:206 1:9 1:3 1:207 1:10 1:398 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
```
## 运行环境
PaddlePaddle>=1.7.2
python 2.7
PaddleRec >=0.1
os : linux
## 快速开始
本文提供了样例数据可以供您快速体验,在paddlerec目录下直接执行下面的命令即可启动训练:
```
python -m paddlerec.run -m models/match/multiview-simnet/config.yaml
```
## 效果复现
为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。
1. 确认您当前所在目录为PaddleRec/models/match/multiview-simnet
2. 在data目录下载并解压数据集,命令如下:
```
cd data
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm -f simnet_dataset-1.0.0.tar.gz
mv data/zhidao ./
rm -rf data
```
3. 本文提供了快速将数据集中的汉字数据处理为可训练格式数据的脚本。您在解压数据集后,可以看见目录中存在一个名为zhidao的文件。然后您可以在python3环境下运行我们提供的preprocess.py文件,即可生成可以直接用于训练的数据文件test.txt,train.txt,label.txt和testquery.txt。将train.txt和test.txt分别放入train和test目录下以备训练时调用。命令如下:
```
python3 preprocess.py
rm -f ./train/train.txt
mv train.txt ./train
rm -f ./test/test.txt
mv test.txt ./test
cd ..
```
4. 退回multiview-simnet目录中,打开文件config.yaml,更改其中的参数
将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径)
5. 执行脚本,开始训练.脚本会运行python -m paddlerec.run -m ./config.yaml启动训练,并将结果输出到result文件中。然后启动格式整理程序transform,最后计算正逆序比:
```
sh run.sh
```
运行结果大致如下:
```
................run.................
!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
CPU_NUM indicates that how many CPUPlace are used in the current task.
And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
!!! The default number of CPU_NUM=1.
I0821 14:24:57.255358 7888 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
I0821 14:24:57.259166 7888 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
I0821 14:24:57.262634 7888 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
I0821 14:24:57.264791 7888 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
103
pnr: 1.17674418605
query_num: 11
pair_num: 468 468
equal_num: 0
正序率: 0.540598290598
253 215
```
6. 提醒:因为采取较小的数据集进行训练和测试,得到指标的浮动程度会比较大。如果得到的指标不合预期,可以多次执行步骤5,即可获得合理的指标。
## 进阶使用
## FAQ
#! /bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -14,11 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
echo "begin to prepare data"
mkdir -p data/train
mkdir -p data/test
python generate_synthetic_data.py
#!/bin/bash
echo "................run................."
# Train and infer, capturing everything paddlerec prints.
python -m paddlerec.run -m ./config.yaml >result1.txt
# Keep only the log lines that carry the predicted query/title similarity.
grep -i "query_pt_sim" ./result1.txt >./result2.txt
# Drop the last matching line before handing the scores to transform.py.
sed '$d' result2.txt >result.txt
rm -f result1.txt
rm -f result2.txt
# transform.py joins each score with its query and label into pair.txt.
python transform.py
# Sort by query (field 1), then by score in descending numeric order (field 2).
# The tab is produced with printf because $'\t' is a bash extension that a
# plain POSIX sh (e.g. dash) does not expand, and the README runs `sh run.sh`.
TAB="$(printf '\t')"
sort -t "$TAB" -k1,1 -k 2nr,2 pair.txt >result.txt
rm -f pair.txt
python ../../../tools/cal_pos_neg.py result.txt
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
def _read_lines_after_header(path):
    """Return the stripped lines of *path*, skipping its first line."""
    with open(path, "r") as f:
        f.readline()  # the first line is discarded, not treated as data
        return [line.strip() for line in f.readlines()]


# Labels, one per line after the header line; their count is printed.
label = _read_lines_after_header('./data/label.txt')
num = len(label)
print(num)

# Similarity scores parsed from the filtered run log. For each line: take the
# second comma-separated field, then the part after its first ':', then strip
# surrounding spaces and square brackets before converting to float.
sim = []
with open('./result.txt') as f:
    for line in f:
        fields = line.strip().split(",")
        value = fields[1].split(":")[1].strip(" ")
        value = value.strip("[")
        value = value.strip("]")
        sim.append(float(value))

# Queries, one per line after the header line.
query = _read_lines_after_header('./data/testquery.txt')

# Emit "query<TAB>similarity<TAB>label" rows. Indexing (rather than zip) is
# kept on purpose so a length mismatch still fails loudly instead of silently
# truncating the output.
with open('pair.txt', "w") as f:
    for i, score in enumerate(sim):
        f.write(str(query[i]) + "\t" + str(score) + "\t" + str(label[i]) +
                "\n")
# 匹配模型库
## 简介
我们提供了常见的匹配任务中使用的模型算法的PaddleRec实现, 单机训练&预测效果指标以及分布式训练&预测性能指标等。实现的模型包括 [DSSM](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/match/dssm)[MultiView-Simnet](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/match/multiview-simnet)
我们提供了常见的匹配任务中使用的模型算法的PaddleRec实现, 单机训练&预测效果指标以及分布式训练&预测性能指标等。实现的模型包括 [DSSM](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/match/dssm)、[MultiView-Simnet](http://gitlab.baidu.com/tangwei12/paddlerec/tree/develop/models/match/multiview-simnet)、[match-pyramid](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/match/match-pyramid)。
模型算法库在持续添加中,欢迎关注。
......@@ -18,6 +18,8 @@
| :------------------: | :--------------------: | :---------: |
| DSSM | Deep Structured Semantic Models | [CIKM 2013][Learning Deep Structured Semantic Models for Web Search using Clickthrough Data](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf) |
| MultiView-Simnet | Multi-view Simnet for Personalized recommendation | [WWW 2015][A Multi-View Deep Learning Approach for Cross Domain User Modeling in Recommendation Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/frp1159-songA.pdf) |
| match-pyramid | Text Matching as Image Recognition | [arXiv W2016][Text Matching as Image Recognition](https://arxiv.org/pdf/1602.06359.pdf) |
下面是每个模型的简介(注:图片引用自链接中的论文)
......@@ -31,24 +33,26 @@
<img align="center" src="../../doc/imgs/multiview-simnet.png">
<p>
## 使用教程(快速开始)
### 训练
```shell
git clone https://github.com/PaddlePaddle/PaddleRec.git paddle-rec
cd paddle-rec
[match-pyramid](https://arxiv.org/pdf/1602.06359.pdf):
<p align="center">
<img align="center" src="../../doc/imgs/match-pyramid.png">
<p>
## 使用教程(快速开始)
### 训练&预测
每个模型都提供了样例数据可以供您快速体验,在paddlerec目录下直接执行下面的命令即可启动训练:
```
python -m paddlerec.run -m models/match/dssm/config.yaml # dssm
python -m paddlerec.run -m models/match/multiview-simnet/config.yaml # multiview-simnet
python -m paddlerec.run -m models/match/match-pyramid/config.yaml # match-pyramid
```
### 效果复现
每个模型下的readme中都有详细的效果复现的教程,您可以进入模型的目录中详细查看
### 预测
```shell
# 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径
# 修改对应模型的config.yaml,mode配置infer_runner
# 示例: mode: train_runner -> mode: infer_runner
# infer_runner中 class配置为 class: infer
# 修改phase阶段为infer的配置,参照config注释
### 模型效果 (测试)
# 修改完config.yaml后 执行:
python -m paddlerec.run -m ./config.yaml # 以dssm为例
```
| 数据集 | 模型 | 正逆序比 | map |
| :------------------: | :--------------------: | :---------: |:---------: |
| zhidao | DSSM | 2.25 | -- |
| Letor07 | match-pyramid | -- | 0.42 |
| zhidao | multiview-simnet | 1.72 | -- |
# MMOE
以下是本例的简要目录结构及说明:
```
├── data # 文档
├── train #训练数据
├── train_data.txt
├── test #测试数据
├── test_data.txt
├── run.sh
├── data_preparation.py
├── __init__.py
├── config.yaml #配置文件
├── census_reader.py #数据读取文件
├── model.py #模型文件
```
注:在阅读该示例前,建议您先了解以下内容:
[paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## 内容
- [模型简介](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#模型简介)
- [数据准备](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#数据准备)
- [运行环境](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#运行环境)
- [快速开始](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#快速开始)
- [论文复现](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#论文复现)
- [进阶使用](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#进阶使用)
- [FAQ](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#FAQ)
## 模型简介
多任务模型通过学习不同任务的联系和差异,可提高每个任务的学习效率和质量。多任务学习的框架广泛采用shared-bottom的结构,不同任务间共用底部的隐层。这种结构本质上可以减少过拟合的风险,但是效果上可能受到任务差异和数据分布带来的影响。 论文[《Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts》](https://www.kdd.org/kdd2018/accepted-papers/view/modeling-task-relationships-in-multi-task-learning-with-multi-gate-mixture-)中提出了一个Multi-gate Mixture-of-Experts(MMOE)的多任务学习结构。MMOE模型刻画了任务相关性,基于共享表示来学习特定任务的函数,避免了明显增加参数的缺点。
我们在Paddlepaddle定义MMOE的网络结构,在开源数据集Census-income Data上验证模型效果,两个任务的auc分别为:
1.income
> max_mmoe_test_auc_income:0.94937
>
> mean_mmoe_test_auc_income:0.94465
2.marital
> max_mmoe_test_auc_marital:0.99419
>
> mean_mmoe_test_auc_marital:0.99324
若进行精度验证,请参考[论文复现](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#论文复现)部分。
本项目支持功能
# MMOE
以下是本例的简要目录结构及说明:
```
├── data # 文档
├── train #训练数据
├── train_data.txt
├── test #测试数据
├── test_data.txt
├── run.sh
├── data_preparation.py
├── __init__.py
├── config.yaml #配置文件
├── census_reader.py #数据读取文件
├── model.py #模型文件
```
注:在阅读该示例前,建议您先了解以下内容:
[paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## 内容
- [模型简介](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#模型简介)
- [数据准备](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#数据准备)
- [运行环境](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#运行环境)
- [快速开始](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#快速开始)
- [论文复现](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#论文复现)
- [进阶使用](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#进阶使用)
- [FAQ](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#FAQ)
## 模型简介
多任务模型通过学习不同任务的联系和差异,可提高每个任务的学习效率和质量。多任务学习的框架广泛采用shared-bottom的结构,不同任务间共用底部的隐层。这种结构本质上可以减少过拟合的风险,但是效果上可能受到任务差异和数据分布带来的影响。 论文[《Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts》](https://www.kdd.org/kdd2018/accepted-papers/view/modeling-task-relationships-in-multi-task-learning-with-multi-gate-mixture-)中提出了一个Multi-gate Mixture-of-Experts(MMOE)的多任务学习结构。MMOE模型刻画了任务相关性,基于共享表示来学习特定任务的函数,避免了明显增加参数的缺点。
我们在Paddlepaddle定义MMOE的网络结构,在开源数据集Census-income Data上验证模型效果。
若进行精度验证,请参考[论文复现](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/multitask/mmoe#论文复现)部分。
本项目支持功能
训练:单机CPU、单机单卡GPU、单机多卡GPU、本地模拟参数服务器训练、增量训练,配置请参考 [启动训练](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/train.md)
预测:单机CPU、单机单卡GPU ;配置请参考[PaddleRec 离线预测](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/predict.md)
## 数据准备
数据地址: [Census-income Data](https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census.tar.gz )
数据地址: [Census-income Data](https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census.tar.gz )
生成的格式以逗号为分割点
```
0,0,73,0,0,0,0,1700.09,0,0
```
完整的大数据参考论文复现部分。
数据解压后, 在run.sh脚本文件中添加文件的路径,并运行脚本。
## 运行环境
```sh
mkdir train_data
mkdir test_data
mkdir data
train_path="data/census-income.data"
test_path="data/census-income.test"
train_data_path="train_data/"
test_data_path="test_data/"
pip install -r requirements.txt
wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census.tar.gz
tar -zxvf data/census.tar.gz -C data/
PaddlePaddle>=1.7.2
python data_preparation.py --train_path ${train_path} \
--test_path ${test_path} \
--train_data_path ${train_data_path}\
--test_data_path ${test_data_path}
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## 快速开始
### 单机训练
CPU环境
在config.yaml文件中设置好设备,epochs等。
```
dataset:
- name: dataset_train
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
- name: dataset_infer
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
```
生成的格式以逗号为分割点
### 单机预测
CPU环境
在config.yaml文件中设置好epochs、device等参数。
```
0,0,73,0,0,0,0,1700.09,0,0
- name: infer_runner
class: infer
init_model_path: "increment/0"
device: cpu
```
## 论文复现
数据下载,我们提供了在百度云上预处理好的数据,可以直接训练
```
wget https://paddlerec.bj.bcebos.com/mmoe/train_data.csv
wget https://paddlerec.bj.bcebos.com/mmoe/test_data.csv
wget https://paddlerec.bj.bcebos.com/mmoe/config_all.yaml
```
用原论文的完整数据复现论文效果需要在config.yaml中修改batch_size=32 gpu配置等,可参考config_all.yaml
## 运行环境
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## 快速开始
### 单机训练
CPU环境
在config.yaml文件中设置好设备,epochs等。
```
dataset:
- name: dataset_train
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
- name: dataset_infer
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
```
### 单机预测
CPU环境
在config.yaml文件中设置好epochs、device等参数。
```
- name: infer_runner
class: infer
init_model_path: "increment/0"
device: cpu
```
## 论文复现
用原论文的完整数据复现论文效果需要在config.yaml中修改batch_size=1000, thread_num=8, epoch_num=4
使用gpu p100 单卡训练 6.5h 测试auc: best:0.9940, mean:0.9932
修改后运行方案:修改config.yaml中的'workspace'为config.yaml的目录位置,执行
```
python -m paddlerec.run -m /home/your/dir/config.yaml #调试模式 直接指定本地config的绝对路径
```
## 进阶使用
```
python -m paddlerec.run -m /home/your/dir/config_all.yaml #调试模式 直接指定本地config的绝对路径
```
## 进阶使用
## FAQ
......@@ -16,12 +16,12 @@ workspace: "models/multitask/mmoe"
dataset:
- name: dataset_train
batch_size: 1
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
- name: dataset_infer
batch_size: 1
batch_size: 5
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/census_reader.py"
......@@ -38,15 +38,14 @@ hyper_parameters:
strategy: async
#use infer_runner mode and modify 'phase' below if infer
mode: train_runner
#mode: infer_runner
mode: [train_runner, infer_runner]
runner:
- name: train_runner
class: train
device: cpu
epochs: 3
save_checkpoint_interval: 2
save_checkpoint_interval: 1
save_inference_interval: 4
save_checkpoint_path: "increment"
save_inference_path: "inference"
......@@ -61,7 +60,7 @@ phase:
model: "{workspace}/model.py"
dataset_name: dataset_train
thread_num: 1
#- name: infer
# model: "{workspace}/model.py"
# dataset_name: dataset_infer
# thread_num: 1
- name: infer
model: "{workspace}/model.py"
dataset_name: dataset_infer
thread_num: 1
# Skip-Gram W2V
以下是本例的简要目录结构及说明:
```
├── data #样例数据
├── train
├── convert_sample.txt
├── test
├── sample.txt
├── dict
├── word_count_dict.txt
├── word_id_dict.txt
├── preprocess.py # 数据预处理文件
├── __init__.py
├── README.md # 文档
├── model.py #模型文件
├── config.yaml #配置文件
├── data_prepare.sh #一键数据处理脚本
├── w2v_reader.py #训练数据reader
├── w2v_evaluate_reader.py # 预测数据reader
├── infer.py # 自定义预测脚本
├── utils.py # 自定义预测中用到的reader等工具
```
注:在阅读该示例前,建议您先了解以下内容:
[paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
---
## 内容
- [模型简介](#模型简介)
- [数据准备](#数据准备)
- [运行环境](#运行环境)
- [快速开始](#快速开始)
- [论文复现](#论文复现)
- [进阶使用](#进阶使用)
- [FAQ](#FAQ)
## 模型简介
本例实现了skip-gram模式的word2vector模型,如下图所示:
<p align="center">
<img align="center" src="../../../doc/imgs/word2vec.png">
<p>
以每一个词为中心词X,然后在窗口内和临近的词Y组成样本对(X,Y)用于网络训练。在实际训练过程中还会根据自定义的负采样率生成负样本来加强训练的效果
具体的训练思路如下:
<p align="center">
<img align="center" src="../../../doc/imgs/w2v_train.png">
<p>
推荐用户参考[ IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/124377)教程获取更详细的信息。
本模型配置默认使用demo数据集,若进行精度验证,请参考[论文复现](#论文复现)部分。
本项目支持功能
训练:单机CPU、本地模拟参数服务器训练、增量训练,配置请参考 [启动训练](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/train.md)
预测:单机CPU;配置请参考[PaddleRec 离线预测](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/predict.md)
## 数据处理
为和样例数据路径区分,全量训练数据、测试数据、词表文件会依次保存在data/all_train, data/all_test, data/all_dict文件夹中。
```
mkdir -p data/all_dict
mkdir -p data/all_train
mkdir -p data/all_test
```
本示例中全量数据处理共包含三步:
- Step1: 数据下载。
```
# 全量训练集
mkdir raw_data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar
tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/
# 测试集
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir/* data/all_test/
```
- Step2: 训练数据预处理。包含三步,第一步,根据英文语料生成词典,中文语料可以通过修改text_strip方法自定义处理方法。
```
python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt
```
得到的词典格式为词<空格>词频,低频词用'UNK'表示,如下所示:
```
the 1061396
of 593677
and 416629
one 411764
in 372201
a 325873
<UNK> 324608
to 316376
zero 264975
nine 250430
```
第二步,根据词典将文本转成id, 同时进行downsample,按照概率过滤常见词, 同时生成word和id映射的文件,文件名为词典+"word_to_id"。
```
python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
```
第三步,为更好地利用多线程进行训练加速,我们需要将训练文件分成多个子文件,默认拆分成1024个文件。
```
python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/all_train
```
- Step3: 路径整理。
```
mv raw_data/word_count_dict.txt data/all_dict/
mv raw_data/word_count_dict.txt_word_to_id_ data/all_dict/word_id_dict.txt
rm -rf raw_data
```
方便起见, 我们提供了一键式数据处理脚本:
```
sh data_prepare.sh
```
## 运行环境
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## 快速开始
### 单机训练
CPU环境
在config.yaml文件中设置好设备,epochs等。
```
# select runner by name
mode: [single_cpu_train, single_cpu_infer]
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: single_cpu_train
class: train
# num of epochs
epochs: 5
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment_w2v" # save checkpoint path
save_inference_path: "inference_w2v" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
print_interval: 1
phases: [phase1]
```
### 单机预测
我们通过词类比(Word Analogy)任务来检验word2vec模型的训练效果。输入四个词A,B,C,D,假设存在一种关系relation, 使得relation(A, B) = relation(C, D),然后通过A,B,C去预测D,emb(D) = emb(B) - emb(A) + emb(C)。
CPU环境
PaddleRec预测配置:
在config.yaml文件中设置好epochs、device等参数。
```
- name: single_cpu_infer
class: infer
# device to run training or infer
device: cpu
init_model_path: "increment_w2v" # load model path
print_interval: 1
phases: [phase2]
```
为复现论文效果,我们提供了一个自定义预测脚本,在自定义预测中,我们会跳过预测结果是输入A,B,C的情况,然后计算预测准确率。执行命令如下:
```
python infer.py --test_dir ./data/test --dict_path ./data/dict/word_id_dict.txt --batch_size 20000 --model_dir ./increment_w2v/ --start_index 0 --last_index 5 --emb_size 300
```
### 运行
```
python -m paddlerec.run -m paddlerec.models.recall.word2vec
```
### 结果展示
样例数据训练结果展示:
```
Running SingleStartup.
Running SingleRunner.
W0813 11:36:16.129736 43843 build_strategy.cc:170] fusion_group is not enabled for Windows/MacOS now, and only effective when running with CUDA GPU.
batch: 1, LOSS: [3.618 3.684 3.698 3.653 3.736]
batch: 2, LOSS: [3.394 3.453 3.605 3.487 3.553]
batch: 3, LOSS: [3.411 3.402 3.444 3.387 3.357]
batch: 4, LOSS: [3.557 3.196 3.304 3.209 3.299]
batch: 5, LOSS: [3.217 3.141 3.168 3.114 3.315]
batch: 6, LOSS: [3.342 3.219 3.124 3.207 3.282]
batch: 7, LOSS: [3.19 3.207 3.136 3.322 3.164]
epoch 0 done, use time: 0.119026899338, global metrics: LOSS=[3.19 3.207 3.136 3.322 3.164]
...
epoch 4 done, use time: 0.097608089447, global metrics: LOSS=[2.734 2.66 2.763 2.804 2.809]
```
样例数据预测结果展示:
```
Running SingleInferStartup.
Running SingleInferRunner.
load persistables from increment_w2v/4
batch: 1, acc: [1.]
batch: 2, acc: [1.]
batch: 3, acc: [1.]
Infer phase2 of epoch 4 done, use time: 4.89376211166, global metrics: acc=[1.]
...
Infer phase2 of epoch 3 done, use time: 4.43099021912, global metrics: acc=[1.]
```
## 论文复现
1. 用原论文的完整数据复现论文效果需要在config.yaml修改超参:
- name: dataset_train
batch_size: 100 # 1. 修改batch_size为100
type: DataLoader
data_path: "{workspace}/data/all_train" # 2. 修改数据为全量训练数据
word_count_dict_path: "{workspace}/data/all_dict/word_count_dict.txt" # 3. 修改词表为全量词表
data_converter: "{workspace}/w2v_reader.py"
- name: single_cpu_train
- epochs: # 4. 修改config.yaml中runner的epochs为5。
修改后运行方案:修改config.yaml中的'workspace'为config.yaml的目录位置,执行
```
python -m paddlerec.run -m /home/your/dir/config.yaml #调试模式 直接指定本地config的绝对路径
```
2. 使用自定义预测程序预测全量测试集:
```
python infer.py --test_dir ./data/all_test --dict_path ./data/all_dict/word_id_dict.txt --batch_size 20000 --model_dir ./increment_w2v/ --start_index 0 --last_index 5 --emb_size 300
```
结论:使用cpu训练5轮,自定义预测准确率为0.540,每轮训练时间7小时左右。
## 进阶使用
## FAQ
......@@ -22,7 +22,7 @@ dataset:
word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
data_converter: "{workspace}/w2v_reader.py"
- name: dataset_infer # name
batch_size: 50
batch_size: 2000
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
......@@ -42,38 +42,40 @@ hyper_parameters:
window_size: 5
# select runner by name
mode: train_runner
mode: [single_cpu_train, single_cpu_infer]
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
- name: single_cpu_train
class: train
# num of epochs
epochs: 2
epochs: 5
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_checkpoint_path: "increment_w2v" # save checkpoint path
save_inference_path: "inference_w2v" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
print_interval: 1
- name: infer_runner
print_interval: 1000
phases: [phase1]
- name: single_cpu_infer
class: infer
# device to run training or infer
device: cpu
init_model_path: "increment/0" # load model path
init_model_path: "increment_w2v" # load model path
print_interval: 1
phases: [phase2]
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 5
- name: phase2
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_infer # select dataset by name
thread_num: 1
# - name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
......@@ -14,6 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
mkdir -p data/all_dict
mkdir -p data/all_train
mkdir -p data/all_test
# download train_data
mkdir raw_data
......@@ -21,18 +24,16 @@ wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-w
tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/
# download test data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir/* data/all_test/
# preprocess data
python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt
python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
mv raw_data/word_count_dict.txt data/dict/
mv raw_data/word_id_dict.txt data/dict/
python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/all_train
rm -rf data/train/*
rm -rf data/test/*
python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/train
# download test data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir/* data/test/
mv raw_data/word_count_dict.txt data/all_dict/
mv raw_data/word_count_dict.txt_word_to_id_ data/all_dict/word_id_dict.txt
rm -rf raw_data
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import sys
import time
import math
import numpy as np
import six
import paddle.fluid as fluid
import paddle
import utils
# Python 2 only: reload the sys module and switch the interpreter-wide default
# string encoding to UTF-8. Skipped entirely on Python 3.
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
def parse_args():
    """Parse the command-line options of the word2vec inference script.

    Returns:
        argparse.Namespace with the attributes ``dict_path``, ``test_dir``,
        ``print_step``, ``start_index``, ``last_index``, ``model_dir``,
        ``use_cuda``, ``batch_size`` and ``emb_size``.
    """
    parser = argparse.ArgumentParser("PaddlePaddle Word2vec infer example")
    parser.add_argument(
        '--dict_path',
        type=str,
        default='./data/data_c/1-billion_dict_word_to_id_',
        help="The path of dic")
    parser.add_argument(
        '--test_dir', type=str, default='test_data', help='test file address')
    # Integer options use real int defaults instead of strings that argparse
    # has to convert; the parsed values are unchanged.
    parser.add_argument(
        '--print_step', type=int, default=500000, help='print step')
    parser.add_argument(
        '--start_index', type=int, default=0, help='start index')
    parser.add_argument(
        '--last_index', type=int, default=100, help='last index')
    parser.add_argument(
        '--model_dir', type=str, default='model', help='model dir')
    parser.add_argument(
        '--use_cuda', type=int, default=0, help='whether use cuda')
    parser.add_argument(
        '--batch_size', type=int, default=5, help='batch_size')
    # The help text used to be a copy-paste of the --batch_size one.
    parser.add_argument(
        '--emb_size', type=int, default=64, help='embedding size')
    return parser.parse_args()
def infer_network(vocab_size, emb_size):
    """Build the word-analogy inference graph.

    Declares the int64 inputs ``analogy_a``, ``analogy_b``, ``analogy_c``
    (shape ``[None]``) and ``all_label`` (shape ``[vocab_size]``), looks all
    of them up in the embedding parameter named "emb", and scores
    ``emb_b - emb_a + emb_c`` against the L2-normalised embeddings of
    ``all_label``.

    Args:
        vocab_size: number of rows of the embedding table.
        emb_size: width of each embedding row.

    Returns:
        The ``(values, indices)`` pair produced by ``topk`` with ``k=4`` on
        the score matrix.
    """

    def _int64_input(name, shape):
        return fluid.data(name=name, shape=shape, dtype='int64')

    def _lookup(ids):
        # Every lookup goes through the same parameter, "emb".
        return fluid.embedding(
            input=ids, size=[vocab_size, emb_size], param_attr="emb")

    word_a = _int64_input("analogy_a", [None])
    word_b = _int64_input("analogy_b", [None])
    word_c = _int64_input("analogy_c", [None])
    candidates = _int64_input("all_label", [vocab_size])

    candidate_emb = _lookup(candidates)
    vec_a = _lookup(word_a)
    vec_b = _lookup(word_b)
    vec_c = _lookup(word_c)

    # target = emb(b) - emb(a) + emb(c)
    difference = fluid.layers.elementwise_sub(vec_b, vec_a)
    target = fluid.layers.elementwise_add(difference, vec_c)

    candidate_emb_l2 = fluid.layers.l2_normalize(x=candidate_emb, axis=1)
    scores = fluid.layers.matmul(
        x=target, y=candidate_emb_l2, transpose_y=True)
    top_values, top_indices = fluid.layers.topk(input=scores, k=4)
    return top_values, top_indices
def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
"""Score saved checkpoints on the analogy test data and print the accuracy.

For every epoch index in range(start_index, last_index + 1) the persistables
stored under model_dir + "/" + str(epoch) are loaded into a clone of the
inference program, all batches of test_reader() are run through it, and a
running hit count plus a final "epoch ... acc" line are printed.

Args:
    args: parsed options; emb_size and batch_size are read from it.
    vocab_size: passed to infer_network and used to build the "all_label" feed.
    test_reader: callable returning an iterable of batches; every sample in a
        batch is indexed at positions 0..4 below.
    use_cuda: truthy selects fluid.CUDAPlace(0), otherwise fluid.CPUPlace().
    i2w: not referenced anywhere in this function body.

NOTE(review): start_index, last_index and model_dir are neither parameters
nor locals; they are resolved as module-level names, which this file assigns
only under the ``__main__`` guard.
"""
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
emb_size = args.emb_size
# batch_size is assigned but not used again in this function.
batch_size = args.batch_size
with fluid.scope_guard(fluid.Scope()):
main_program = fluid.Program()
with fluid.program_guard(main_program):
values, pred = infer_network(vocab_size, emb_size)
for epoch in range(start_index, last_index + 1):
# Load this epoch's checkpoint into a fresh clone of the program.
copy_program = main_program.clone()
model_path = model_dir + "/" + str(epoch)
fluid.io.load_persistables(
exe, model_path, main_program=copy_program)
# accum_num counts hits, accum_num_sum counts scored samples.
accum_num = 0
accum_num_sum = 0.0
# t0 is recorded but never read afterwards.
t0 = time.time()
step_id = 0
for data in test_reader():
step_id += 1
b_size = len([dat[0] for dat in data])
# Positions 0, 1, 2 of each sample become flat int64 vectors of length b_size.
wa = np.array([dat[0] for dat in data]).astype(
"int64").reshape(b_size)
wb = np.array([dat[1] for dat in data]).astype(
"int64").reshape(b_size)
wc = np.array([dat[2] for dat in data]).astype(
"int64").reshape(b_size)
# Position 3 is compared with the prediction; position 4 holds the ids that
# a prediction is skipped for.
label = [dat[3] for dat in data]
input_word = [dat[4] for dat in data]
para = exe.run(copy_program,
feed={
"analogy_a": wa,
"analogy_b": wb,
"analogy_c": wc,
"all_label": np.arange(vocab_size)
.reshape(vocab_size).astype("int64"),
},
fetch_list=[pred.name, values],
return_numpy=False)
pre = np.array(para[0])
# val is fetched but not used below.
val = np.array(para[1])
for ii in range(len(label)):
top4 = pre[ii]
accum_num_sum += 1
for idx in top4:
# Candidates that are one of the sample's own input ids are skipped.
if int(idx) in input_word[ii]:
continue
if int(idx) == int(label[ii][0]):
accum_num += 1
# NOTE(review): whether this break belongs to the label-match branch or to
# the loop body decides between "any of the top 4" and "first non-input
# candidate" scoring -- confirm against the indented original.
break
# step_id % 1 is always 0, so this prints after every batch.
if step_id % 1 == 0:
print("step:%d %d " % (step_id, accum_num))
# NOTE(review): accum_num_sum stays 0.0 if the reader yields nothing, which
# would make this division fail -- confirm whether that case can occur.
print("epoch:%d \t acc:%.3f " %
(epoch, 1.0 * accum_num / accum_num_sum))
if __name__ == "__main__":
    args = parse_args()

    # infer_epoch resolves these three names at module level, so they have to
    # be bound here under exactly these names.
    start_index, last_index = args.start_index, args.last_index
    model_dir = args.model_dir

    use_cuda = bool(args.use_cuda)
    print("start index: ", start_index, " last_index:", last_index)

    # Build the vocabulary maps and the batched test reader.
    vocab_size, test_reader, id2word = utils.prepare_data(
        args.test_dir, args.dict_path, batch_size=args.batch_size)
    print("vocab_size:", vocab_size)

    infer_epoch(args, vocab_size, test_reader, use_cuda, id2word)
......@@ -209,10 +209,10 @@ class Model(ModelBase):
emb_all_label_l2 = fluid.layers.l2_normalize(x=emb_all_label, axis=1)
dist = fluid.layers.matmul(
x=target, y=emb_all_label_l2, transpose_y=True)
values, pred_idx = fluid.layers.topk(input=dist, k=4)
values, pred_idx = fluid.layers.topk(input=dist, k=1)
label = fluid.layers.expand(
fluid.layers.unsqueeze(
inputs[3], axes=[1]), expand_times=[1, 4])
inputs[3], axes=[1]), expand_times=[1, 1])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
......
......@@ -228,7 +228,7 @@ def data_split(args):
contents.extend(f.readlines())
num = int(args.file_nums)
lines_per_file = len(contents) / num
lines_per_file = int(math.ceil(len(contents) / float(num)))
print("contents: ", str(len(contents)))
print("lines_per_file: ", str(lines_per_file))
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import collections
import six
import time
import numpy as np
import paddle.fluid as fluid
import paddle
import os
import preprocess
import io
def BuildWord_IdMap(dict_path):
    """Load a "word id" dictionary file into forward and reverse maps.

    Each non-blank line is split on single spaces; the first field is the
    word and the second field is parsed as its integer id.

    Args:
        dict_path: path of the UTF-8 encoded dictionary file.

    Returns:
        A ``(word_to_id, id_to_word)`` tuple of dicts.

    Raises:
        IndexError: a non-blank line has no second space-separated field.
        ValueError: the second field is not an integer.
    """
    word_to_id = dict()
    id_to_word = dict()
    with io.open(dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no entry.
            if not line.strip():
                continue
            # Split once instead of once per field access.
            fields = line.split(' ')
            word, word_id = fields[0], int(fields[1])
            word_to_id[word] = word_id
            id_to_word[word_id] = word
    return word_to_id, id_to_word
def prepare_data(file_dir, dict_path, batch_size):
    """Build the vocabulary maps and a batched reader over the test files.

    Args:
        file_dir: directory handed to ``test`` to create the sample reader.
        dict_path: dictionary file loaded with ``BuildWord_IdMap``.
        batch_size: batch size given to ``fluid.io.batch``.

    Returns:
        ``(vocab_size, reader, id_to_word)`` where ``vocab_size`` is the
        number of entries in the id-to-word map.
    """
    word_to_id, id_to_word = BuildWord_IdMap(dict_path)
    batched_reader = fluid.io.batch(test(file_dir, word_to_id), batch_size)
    return len(id_to_word), batched_reader, id_to_word
def check_version(with_shuffle_batch=False):
    """
    Log error and exit when the installed version of paddlepaddle is
    not satisfied.

    Args:
        with_shuffle_batch: when true, version 1.7.0 is required instead
            of 1.6.0.

    Exits the process with status 1 if ``fluid.require_version`` raises.
    """
    # Parenthesised literal instead of backslash continuations: the old
    # trailing backslash after the last fragment joined it to the next line.
    err = ("PaddlePaddle version 1.6 or higher is required, "
           "or a suitable develop version is satisfied as well. \n"
           "Please make sure the version is good with your code.")
    required = '1.7.0' if with_shuffle_batch else '1.6.0'

    try:
        fluid.require_version(required)
    except Exception:
        # No logger is defined in this module, so the former logger.error()
        # call raised NameError instead of reporting; write to stderr.
        sys.stderr.write(err + "\n")
        sys.exit(1)
def native_to_unicode(s):
    """Return *s* as unicode text.

    Text that is already unicode is returned unchanged. Otherwise it is
    decoded strictly, and decoded again with errors ignored if the strict
    attempt raises UnicodeDecodeError.
    """
    if not _is_unicode(s):
        try:
            s = _to_unicode(s)
        except UnicodeDecodeError:
            # Second attempt drops the bytes that cannot be decoded.
            s = _to_unicode(s, ignore_errors=True)
    return s
def _is_unicode(s):
    """Return True if *s* is the unicode text type of the running Python.

    That type is ``unicode`` on Python 2 and ``str`` otherwise.
    """
    # The conditional expression only evaluates the branch it selects, so the
    # name ``unicode`` is never looked up outside Python 2.
    text_type = unicode if six.PY2 else str
    return isinstance(s, text_type)
def _to_unicode(s, ignore_errors=False):
    """Decode *s* from UTF-8 unless it is already unicode text.

    Args:
        s: value to convert.
        ignore_errors: decode with the "ignore" error handler instead of
            "strict".
    """
    if not _is_unicode(s):
        s = s.decode(
            "utf-8", errors="ignore" if ignore_errors else "strict")
    return s
def strip_lines(line, vocab):
    """Convert *line* to unicode and replace words missing from *vocab*."""
    unicode_line = native_to_unicode(line)
    return _replace_oov(vocab, unicode_line)
def _replace_oov(original_vocab, line):
"""Replace out-of-vocab words with "<UNK>".
This maintains compatibility with published results.
Args:
original_vocab: a set of strings (The standard vocabulary for the dataset)
line: a unicode string - a space-delimited sequence of words.
Returns:
a unicode string - a space-delimited sequence of words.
"""
return u" ".join([
word if word in original_vocab else u"<UNK>" for word in line.split()
])
def reader_creator(file_dir, word_to_id):
    """Create a sample generator over every file in *file_dir*.

    Lines containing ':' are skipped. Every other line is lower-cased, passed
    through ``strip_lines`` and split on whitespace. Its first four words are
    mapped through *word_to_id* and yielded as the tuple
    ``([a], [b], [c], [d], [a, b, c])``.
    """

    def reader():
        for name in os.listdir(file_dir):
            path = os.path.join(file_dir, name)
            with io.open(path, "r", encoding='utf-8') as f:
                for raw in f:
                    if ':' in raw:
                        continue
                    words = strip_lines(raw.lower(), word_to_id).split()
                    ids = [word_to_id[words[k]] for k in range(4)]
                    yield [ids[0]], [ids[1]], [ids[2]], [ids[3]], ids[:3]

    return reader
def test(test_dir, w2i):
    """Return the reader that ``reader_creator`` builds for *test_dir*."""
    return reader_creator(file_dir=test_dir, word_to_id=w2i)
......@@ -76,7 +76,7 @@ class Reader(ReaderBase):
def generate_sample(self, line):
def reader():
if ':' in line:
pass
return
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
yield [('analogy_a', [self.word_to_id[features[0]]]),
......
......@@ -15,6 +15,7 @@
import io
import numpy as np
import paddle.fluid as fluid
from paddlerec.core.reader import ReaderBase
from paddlerec.core.utils import envs
......@@ -47,6 +48,10 @@ class Reader(ReaderBase):
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.batch_size = envs.get_global_env(
"dataset.dataset_train.batch_size")
self.is_dataloader = envs.get_global_env(
"dataset.dataset_train.type") == "DataLoader"
self.cs = None
if not self.with_shuffle_batch:
......@@ -88,11 +93,46 @@ class Reader(ReaderBase):
for context_id in context_word_ids:
output = [('input_word', [int(target_id)]),
('true_label', [int(context_id)])]
if not self.with_shuffle_batch:
if self.with_shuffle_batch or self.is_dataloader:
yield output
else:
neg_array = self.cs.searchsorted(
np.random.sample(self.neg_num))
output += [('neg_label',
[int(str(i)) for i in neg_array])]
yield output
yield output
return reader
# Wrap a per-sample reader into a generator of ready-made batches of
# fluid.Tensor objects. Returns the inner generator function.
def batch_tensor_creator(self, sample_reader):
def __reader__():
# Two accumulators: element i of every sample is appended to result[i].
result = [[], []]
for sample in sample_reader():
for i, fea in enumerate(sample):
result[i].append(fea)
# A batch is emitted once the first accumulator holds batch_size entries.
if len(result[0]) == self.batch_size:
tensor_result = []
for tensor in result:
t = fluid.Tensor()
dat = np.array(tensor, dtype='int64')
# Arrays with more than two axes keep axes 0 and 2; 1-D arrays become a column.
if len(dat.shape) > 2:
dat = dat.reshape((dat.shape[0], dat.shape[2]))
elif len(dat.shape) == 1:
dat = dat.reshape((-1, 1))
t.set(dat, fluid.CPUPlace())
tensor_result.append(t)
if self.with_shuffle_batch:
yield tensor_result
else:
# Without shuffle-batch a third tensor of sampled ids is appended.
# NOTE(review): self.cs is set outside this view -- presumably a cumulative
# sampling distribution; confirm.
tt = fluid.Tensor()
neg_array = self.cs.searchsorted(
np.random.sample(self.neg_num))
# One draw of neg_num ids is tiled batch_size times, so every row of the
# batch receives the same sampled ids.
neg_array = np.tile(neg_array, self.batch_size)
tt.set(
neg_array.reshape((self.batch_size, self.neg_num)),
fluid.CPUPlace())
tensor_result.append(tt)
yield tensor_result
# Start collecting the next batch.
# NOTE(review): nothing after the loop yields a final batch shorter than
# batch_size -- confirm that dropping it is intended.
result = [[], []]
return __reader__
# Youtube-DNN
以下是本例的简要目录结构及说明:
```
├── data #样例数据
├── train
├── data.txt
├── test
├── data.txt
├── generate_ramdom_data.py # 随机训练数据生成文件
├── __init__.py
├── README.md # 文档
├── model.py #模型文件
├── config.yaml #配置文件
├── data_prepare.sh #一键数据处理脚本
├── reader.py #reader
├── infer.py # 预测程序
```
注:在阅读该示例前,建议您先了解以下内容:
[paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
---
## 内容
- [模型简介](#模型简介)
- [数据准备](#数据准备)
- [运行环境](#运行环境)
- [快速开始](#快速开始)
- [论文复现](#论文复现)
- [进阶使用](#进阶使用)
- [FAQ](#FAQ)
## 模型简介
[《Deep Neural Networks for YouTube Recommendations》](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf) 这篇论文是Google的YouTube团队在推荐系统上DNN方面的尝试,是经典的向量化召回模型,主要通过模型来学习用户和物品的兴趣向量,并通过内积来计算用户和物品之间的相似性,从而得到最终的候选集。YouTube采取了两层深度网络完成整个推荐过程:
1. 第一层是**Candidate Generation Model**完成候选视频的快速筛选,这一步候选视频集合由百万降低到了百的量级。
2. 第二层是用**Ranking Model**完成几百个候选视频的精排。
本项目在paddlepaddle上完成YouTube dnn的召回部分Candidate Generation Model,分别获得用户和物品的向量表示,从而后续可以通过其他方法(如用户和物品的余弦相似度)给用户推荐物品。
由于原论文没有开源数据集,本项目随机构造数据验证网络的正确性。
本项目支持功能
训练:单机CPU、单机单卡GPU、本地模拟参数服务器训练、增量训练,配置请参考 [启动训练](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/train.md)
预测:单机CPU、单机单卡GPU;配置请参考[PaddleRec 离线预测](https://github.com/PaddlePaddle/PaddleRec/blob/master/doc/predict.md)
## 数据准备
调用python generate_ramdom_data.py生成随机训练数据,每行数据格式如下:
```
#watch_vec;search_vec;other_feat;label
0.01,0.02,...,0.09;0.01,0.02,...,0.09;0.01,0.02,...,0.09;20
```
方便起见,我们提供了一键式数据生成脚本:
```
sh data_prepare.sh
```
## 运行环境
PaddlePaddle>=1.7.2
python 2.7/3.5/3.6/3.7
PaddleRec >=0.1
os : windows/linux/macos
## 快速开始
### 单机训练
```
mode: [cpu_single_train]
runner:
- name: cpu_single_train
class: train
device: cpu # if use_gpu, set it to gpu
epochs: 20
save_checkpoint_interval: 1
save_inference_interval: 1
save_checkpoint_path: "increment_youtubednn"
save_inference_path: "inference_youtubednn"
save_inference_feed_varnames: ["watch_vec", "search_vec", "other_feat"] # feed vars of save inference
save_inference_fetch_varnames: ["l3.tmp_2"]
print_interval: 1
```
### 单机预测
通过计算每个用户和每个物品的余弦相似度,给每个用户推荐topk视频:
cpu infer:
```
python infer.py --test_epoch 19 --inference_model_dir ./inference_youtubednn --increment_model_dir ./increment_youtubednn --watch_vec_size 64 --search_vec_size 64 --other_feat_size 64 --topk 5
```
gpu infer:
```
python infer.py --use_gpu 1 --test_epoch 19 --inference_model_dir ./inference_youtubednn --increment_model_dir ./increment_youtubednn --watch_vec_size 64 --search_vec_size 64 --other_feat_size 64 --topk 5
```
### 运行
```
python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn
```
### 结果展示
样例数据训练结果展示:
```
Running SingleStartup.
Running SingleRunner.
batch: 1, acc: [0.03125]
batch: 2, acc: [0.0625]
batch: 3, acc: [0.]
...
epoch 0 done, use time: 0.0605320930481, global metrics: acc=[0.]
...
epoch 19 done, use time: 0.33447098732, global metrics: acc=[0.]
```
样例数据预测结果展示:
```
user:0, top K videos:[40, 31, 4, 33, 93]
user:1, top K videos:[35, 57, 58, 40, 17]
user:2, top K videos:[35, 17, 88, 40, 9]
user:3, top K videos:[73, 35, 39, 58, 38]
user:4, top K videos:[40, 31, 57, 4, 73]
user:5, top K videos:[38, 9, 7, 88, 22]
user:6, top K videos:[35, 73, 14, 58, 28]
user:7, top K videos:[35, 73, 58, 38, 56]
user:8, top K videos:[38, 40, 9, 35, 99]
user:9, top K videos:[88, 73, 9, 35, 28]
user:10, top K videos:[35, 52, 28, 54, 73]
```
## 进阶使用
## FAQ
......@@ -17,11 +17,10 @@ workspace: "models/recall/youtube_dnn"
dataset:
- name: dataset_train
batch_size: 5
type: DataLoader
#type: QueueDataset
batch_size: 32
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/random_reader.py"
data_converter: "{workspace}/reader.py"
hyper_parameters:
watch_vec_size: 64
......@@ -30,22 +29,23 @@ hyper_parameters:
output_size: 100
layers: [128, 64, 32]
optimizer:
class: adam
learning_rate: 0.001
strategy: async
class: SGD
learning_rate: 0.01
mode: train_runner
mode: [cpu_single_train]
runner:
- name: train_runner
- name: cpu_single_train
class: train
device: cpu
epochs: 3
save_checkpoint_interval: 2
save_inference_interval: 4
save_checkpoint_path: "increment"
save_inference_path: "inference"
print_interval: 10
epochs: 20
save_checkpoint_interval: 1
save_inference_interval: 1
save_checkpoint_path: "increment_youtubednn"
save_inference_path: "inference_youtubednn"
save_inference_feed_varnames: ["watch_vec", "search_vec", "other_feat"] # feed vars of save inference
save_inference_fetch_varnames: ["l3.tmp_2"]
print_interval: 1
phase:
- name: train
......
此差异已折叠。
4764,174,1
4764,2958,0
4764,452,0
4764,1946,0
4764,3208,0
2044,2237,1
2044,1998,0
2044,328,0
2044,1542,0
2044,1932,0
4276,65,1
4276,3247,0
4276,942,0
4276,3666,0
4276,2222,0
3933,682,1
3933,2451,0
3933,3695,0
3933,1643,0
3933,3568,0
1151,1265,1
1151,118,0
1151,2532,0
1151,2083,0
1151,2350,0
1757,876,1
1757,201,0
1757,3633,0
1757,1068,0
1757,2549,0
3370,276,1
3370,2435,0
3370,606,0
3370,910,0
3370,2146,0
5137,1018,1
5137,2163,0
5137,3167,0
5137,2315,0
5137,3595,0
3933,2831,1
3933,2881,0
3933,2949,0
3933,3660,0
3933,417,0
3102,999,1
3102,1902,0
3102,2161,0
3102,3042,0
3102,1113,0
2022,336,1
2022,1672,0
2022,2656,0
2022,3649,0
2022,883,0
2664,655,1
2664,3660,0
2664,1711,0
2664,3386,0
2664,1668,0
25,701,1
25,32,0
25,2482,0
25,3177,0
25,2767,0
1738,1643,1
1738,2187,0
1738,228,0
1738,650,0
1738,3101,0
5411,1241,1
5411,2546,0
5411,3019,0
5411,3618,0
5411,1674,0
638,579,1
638,3512,0
638,783,0
638,2111,0
638,1880,0
3554,200,1
3554,2893,0
3554,2428,0
3554,969,0
3554,2741,0
4283,1074,1
4283,3056,0
4283,2032,0
4283,405,0
4283,1505,0
5111,200,1
5111,3488,0
5111,477,0
5111,2790,0
5111,40,0
3964,515,1
3964,1528,0
3964,2173,0
3964,1701,0
3964,2832,0
此差异已折叠。
4764,174,1
4764,2958,0
4764,452,0
4764,1946,0
4764,3208,0
2044,2237,1
2044,1998,0
2044,328,0
2044,1542,0
2044,1932,0
4276,65,1
4276,3247,0
4276,942,0
4276,3666,0
4276,2222,0
3933,682,1
3933,2451,0
3933,3695,0
3933,1643,0
3933,3568,0
1151,1265,1
1151,118,0
1151,2532,0
1151,2083,0
1151,2350,0
1757,876,1
1757,201,0
1757,3633,0
1757,1068,0
1757,2549,0
3370,276,1
3370,2435,0
3370,606,0
3370,910,0
3370,2146,0
5137,1018,1
5137,2163,0
5137,3167,0
5137,2315,0
5137,3595,0
3933,2831,1
3933,2881,0
3933,2949,0
3933,3660,0
3933,417,0
3102,999,1
3102,1902,0
3102,2161,0
3102,3042,0
3102,1113,0
2022,336,1
2022,1672,0
2022,2656,0
2022,3649,0
2022,883,0
2664,655,1
2664,3660,0
2664,1711,0
2664,3386,0
2664,1668,0
25,701,1
25,32,0
25,2482,0
25,3177,0
25,2767,0
1738,1643,1
1738,2187,0
1738,228,0
1738,650,0
1738,3101,0
5411,1241,1
5411,2546,0
5411,3019,0
5411,3618,0
5411,1674,0
638,579,1
638,3512,0
638,783,0
638,2111,0
638,1880,0
3554,200,1
3554,2893,0
3554,2428,0
3554,969,0
3554,2741,0
4283,1074,1
4283,3056,0
4283,2032,0
4283,405,0
4283,1505,0
5111,200,1
5111,3488,0
5111,477,0
5111,2790,0
5111,40,0
3964,515,1
3964,1528,0
3964,2173,0
3964,1701,0
3964,2832,0
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import numpy as np
# Build a random data set.
sample_size = 100
batch_size = 32
watch_vec_size = 64
search_vec_size = 64
other_feat_size = 64
output_size = 100

# Total number of rows written to the output file.
row_count = batch_size * sample_size

# The three feature matrices and the labels are drawn in this fixed order.
watch_vecs = np.random.rand(row_count, watch_vec_size).tolist()
search_vecs = np.random.rand(row_count, search_vec_size).tolist()
other_vecs = np.random.rand(row_count, other_feat_size).tolist()
labels = np.random.randint(output_size, size=(row_count)).tolist()

output_path = "./data/train/data.txt"
with open(output_path, 'w') as fout:
    # One row per sample: "watch;search;other;label", where each vector is a
    # comma-separated list of floats.
    for watch, search, other, label in zip(watch_vecs, search_vecs,
                                           other_vecs, labels):
        fields = [
            ','.join(map(str, watch)),
            ','.join(map(str, search)),
            ','.join(map(str, other)),
            str(label),
        ]
        fout.write(";".join(fields) + "\n")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import copy
import numpy as np
import argparse
import paddle.fluid as fluid
import pandas as pd
from paddle.fluid.incubate.fleet.utils import utils
def parse_args():
    """Parse the command-line options of the Youtube DNN inference example.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv`` with the
        attributes ``use_gpu``, ``batch_size``, ``test_epoch``,
        ``inference_model_dir``, ``increment_model_dir``, ``watch_vec_size``,
        ``search_vec_size``, ``other_feat_size`` and ``topk``.
    """
    parser = argparse.ArgumentParser("PaddlePaddle Youtube DNN infer example")

    # (flag, type, default, help), registered in this order.
    option_table = (
        ('--use_gpu', int, '0', 'whether use gpu'),
        ("--batch_size", int, 32, "batch_size"),
        ("--test_epoch", int, 19, "test_epoch"),
        ('--inference_model_dir', str, './inference_youtubednn',
         'inference_model_dir'),
        ('--increment_model_dir', str, './increment_youtubednn',
         'persistable_model_dir'),
        ('--watch_vec_size', int, 64, 'watch_vec_size'),
        ('--search_vec_size', int, 64, 'search_vec_size'),
        ('--other_feat_size', int, 64, 'other_feat_size'),
        ('--topk', int, 5, 'topk'),
    )
    for flag, value_type, default_value, help_text in option_table:
        parser.add_argument(
            flag, type=value_type, default=default_value, help=help_text)

    return parser.parse_args()
def infer(args):
    """Run the saved model on random users and print each user's top videos.

    The ``l4_weight`` variable (loaded with shape ``[32, 100]`` as float32
    from ``<increment_model_dir>/<test_epoch>/l4_weight``) provides one
    column per video. The inference model stored under
    ``<inference_model_dir>/<test_epoch>`` is executed once per randomly
    generated sample, and the first fetched output of every run is stacked
    into the user matrix. For every user row the similarity to every video
    column is computed with ``cos_sim`` and the indices of the ``args.topk``
    largest similarities are printed.

    Args:
        args: namespace providing ``increment_model_dir``,
            ``inference_model_dir``, ``test_epoch``, ``use_gpu``,
            ``watch_vec_size``, ``search_vec_size``, ``other_feat_size``
            and ``topk``.
    """
    epoch_name = str(args.test_epoch)

    video_save_path = os.path.join(args.increment_model_dir, epoch_name,
                                   "l4_weight")
    video_vec, = utils.load_var("l4_weight", [32, 100], 'float32',
                                video_save_path)

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    cur_model_path = os.path.join(args.inference_model_dir, epoch_name)

    user_vec = None
    with fluid.scope_guard(fluid.Scope()):
        infer_program, feed_target_names, fetch_vars = \
            fluid.io.load_inference_model(cur_model_path, exe)

        # Build a random data set.
        sample_size = 100
        samples = []
        for _ in range(sample_size):
            watch = np.random.rand(1, args.watch_vec_size)
            search = np.random.rand(1, args.search_vec_size)
            other = np.random.rand(1, args.other_feat_size)
            samples.append((watch, search, other))

        # Run the samples one at a time and stack the first fetched output.
        for watch, search, other in samples:
            feed = {
                "watch_vec": watch.astype('float32'),
                "search_vec": search.astype('float32'),
                "other_feat": other.astype('float32'),
            }
            l3 = exe.run(infer_program,
                         feed=feed,
                         return_numpy=True,
                         fetch_list=fetch_vars)
            if user_vec is None:
                user_vec = l3[0]
            else:
                user_vec = np.concatenate([user_vec, l3[0]], axis=0)

    # get topk result
    for user_idx in range(user_vec.shape[0]):
        sims = [
            cos_sim(user_vec[user_idx], video_vec[:, video_idx])
            for video_idx in range(video_vec.shape[1])
        ]
        best_first = sorted(sims)[::-1][:args.topk]
        # list.index returns the first match, so equal similarities map to
        # the same video index.
        max_sim_index = [sims.index(value) for value in best_first]
        print("user:{0}, top K videos:{1}".format(user_idx, max_sim_index))
def cos_sim(vector_a, vector_b):
    """Return the cosine similarity of two vectors, rescaled around 0.5.

    The similarity is computed as ``0.5 + 0.5 * cos``, where ``cos`` is the
    dot product divided by the product of the two norms plus ``1e-4``; the
    added constant keeps the division defined for all-zero vectors, for
    which the result is exactly ``0.5``.

    Plain arrays are used instead of ``np.mat``, which is no longer part of
    the NumPy 2.0 namespace.

    Args:
        vector_a: array-like holding the first vector; it is flattened.
        vector_b: array-like holding the second vector; it is flattened and
            must contain as many elements as ``vector_a``.

    Returns:
        The rescaled similarity as a floating-point scalar.
    """
    vector_a = np.asarray(vector_a).ravel()
    vector_b = np.asarray(vector_b).ravel()
    num = float(np.dot(vector_a, vector_b))
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    cos = num / (denom + 1e-4)
    return 0.5 + 0.5 * cos
if __name__ == "__main__":
    # Parse the command-line options and pass them straight to inference.
    infer(parse_args())
......@@ -39,13 +39,17 @@ class Reader(ReaderBase):
"""
This function needs to be implemented by the user, based on data format
"""
features = line.rstrip().split(";")
watch_vec = features[0].split(',')
search_vec = features[1].split(',')
other_feat = features[2].split(',')
label = features[3]
assert (len(watch_vec) == self.watch_vec_size)
assert (len(search_vec) == self.search_vec_size)
assert (len(other_feat) == self.other_feat_size)
feature_name = ["watch_vec", "search_vec", "other_feat", "label"]
yield list(
zip(feature_name, [
np.random.rand(self.watch_vec_size).tolist()
] + [np.random.rand(self.search_vec_size).tolist()] + [
np.random.rand(self.other_feat_size).tolist()
] + [[np.random.randint(self.output_size)]]))
zip(feature_name, [watch_vec] + [search_vec] + [other_feat] +
[label]))
return reader
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/python
#-*- coding:utf-8 -*-
"""
docstring
"""
import os
import sys
def _count_pairs(score_list, label_list):
    """Count ordered, reversed and tied pairs inside one query group.

    Only pairs whose labels differ are considered. A pair is ordered when
    the score difference and the label difference have the same sign,
    reversed when the signs differ, and tied when the scores are equal.

    Args:
        score_list: scores of the group, one per record.
        label_list: integer labels of the group, aligned with the scores.

    Returns:
        A tuple ``(pos, neg, equal)`` with the three pair counts.
    """
    pos = neg = equal = 0
    size = len(score_list)
    for i in range(size):
        for j in range(i + 1, size):
            if label_list[i] == label_list[j]:
                continue
            product = (score_list[i] - score_list[j]) * (
                label_list[i] - label_list[j])
            if product < 0:
                neg += 1
            elif product > 0:
                pos += 1
            else:
                equal += 1
    return pos, neg, equal


if len(sys.argv) < 2:
    print("usage:python %s input" % (sys.argv[0]))
    sys.exit(-1)

pos_num = 0
neg_num = 0
score_list = []
label_list = []
last_query = "-1"
cnt = 0
query_num = 0
pair_num = 0
equal_num = 0

# Expected input: one tab-separated record per line, "query score label",
# for example:
#   0 12.786960 1
#   0 -1.480890 0
# Records of the same query must be adjacent; lines that do not have exactly
# three columns are ignored.
with open(sys.argv[1]) as fin:
    for line in fin:
        cols = line.strip().split("\t")
        cnt += 1
        if cnt % 500000 == 0:
            # The running ratio is only defined once a reversed pair exists.
            if neg_num > 0:
                print("cnt: %s %s" % (cnt, 1.0 * pos_num / neg_num))
            else:
                print("cnt: %s" % cnt)
        if len(cols) != 3:
            continue
        cur_query = cols[0]
        if cur_query != last_query:
            # A new query starts: account for the group collected so far.
            query_num += 1
            pos, neg, equal = _count_pairs(score_list, label_list)
            pos_num += pos
            neg_num += neg
            equal_num += equal
            pair_num += pos + neg + equal
            score_list = []
            label_list = []
            last_query = cur_query
        label = int(cols[2])
        score_list.append(round(float(cols[1]), 6))
        label_list.append(label)

# Account for the last group, which no following query closes.
pos, neg, equal = _count_pairs(score_list, label_list)
pos_num += pos
neg_num += neg
equal_num += equal
pair_num += pos + neg + equal

if neg_num > 0:
    print("pnr: %s" % (1.0 * pos_num / neg_num))
print("query_num: %s" % query_num)
print("pair_num: %s %s" % (pos_num + neg_num + equal_num, pair_num))
print("equal_num: %s" % equal_num)
# The ordered-pair rate is undefined when no ordered or reversed pair exists.
if pos_num + neg_num > 0:
    print("正序率: %s" % (1.0 * pos_num / (pos_num + neg_num)))
print("%s %s" % (pos_num, neg_num))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册