# preprocess.py (2.9 KB) -- last committed by yinhaofeng
#encoding=utf-8

import os
import sys
import numpy as np
import random

# Load the corpus once. Each line of ./zhidao is tab-separated; the code below
# uses field 0 as the query, field 1 as a candidate text and field 2 as the
# "0"/"1" label. Query and candidate are space-separated word sequences.
# The context manager guarantees the handle is closed even if reading fails.
with open("./zhidao", "r") as f:
    lines = [line.strip().split("\t") for line in f]

# Build the vocabulary: every distinct word seen in a query or candidate gets
# an integer id, assigned in first-seen order starting from 1.
word_dict = {}
for line in lines:
    for word in line[0].split(" ") + line[1].split(" "):
        if word not in word_dict:
            word_dict[word] = len(word_dict) + 1

# Group the negative candidates by query: neg_dict maps each query string to
# the list of candidate texts whose label field is "0".
neg_dict = {}
for line in lines:
    if line[2] == "0":
        # setdefault creates the list the first time a query is seen.
        neg_dict.setdefault(line[0], []).append(line[1])

# Group the positive candidates by query: pos_dict maps each query string to
# the list of candidate texts whose label field is "1".
pos_dict = {}
for line in lines:
    if line[2] == "1":
        # setdefault creates the list the first time a query is seen.
        pos_dict.setdefault(line[0], []).append(line[1])

# Split the queries into a training part and a test part: shuffle every query
# that has at least one positive candidate, keep the first 90 for training and
# use whatever remains for testing.
query_list = list(pos_dict)
random.shuffle(query_list)
train_query, test_query = query_list[:90], query_list[90:]

# Build the training set: one [query, positive, negative] triple for every
# positive/negative pairing of each training query, then shuffle the triples.
train_set = []
for query in train_query:
    # A query without negatives contributes no triples; test this once per
    # query rather than once per positive candidate.
    if query not in neg_dict:
        continue
    for pos in pos_dict[query]:
        for neg in neg_dict[query]:
            train_set.append([query, pos, neg])
random.shuffle(train_set)

# Build the test set: one [query, candidate, label] entry per candidate of each
# test query, with label 1 for positives and 0 for negatives, then shuffle.
test_set = []
for query in test_query:
    test_set.extend([query, pos, 1] for pos in pos_dict[query])
    # Queries that have no negatives simply add nothing here.
    test_set.extend([query, neg, 0] for neg in neg_dict.get(query, []))
random.shuffle(test_set)

# Convert the training triples to word ids and write them to train.txt, one
# sample per line. Every id is prefixed with the slot it belongs to:
# "0:" for query words, "1:" for positive words, "2:" for negative words,
# and all "slot:id" tokens on a line are separated by single spaces.
# The context manager closes the file even if a word lookup raises.
with open("train.txt", "w") as f:
    for line in train_set:
        tokens = []
        # line is [query, pos, neg], so the position doubles as the slot id.
        for slot, text in enumerate(line):
            for word in text.strip().split(" "):
                tokens.append(str(slot) + ":" + str(word_dict[word]))
        f.write(" ".join(tokens) + "\n")

# Convert the test entries to word ids and write three line-aligned files:
#   test.txt      - "0:id" tokens for the query followed by "1:id" tokens for
#                   the candidate (positive or negative), space-separated
#   label.txt     - the entry's label (1 or 0)
#   testquery.txt - the query's word ids, comma-separated
# A single `with` closes all three files even if a word lookup raises.
with open("test.txt", "w") as f, \
        open("label.txt", "w") as fa, \
        open("testquery.txt", "w") as fb:
    for line in test_set:
        query_ids = [word_dict[word] for word in line[0].strip().split(" ")]
        cand_ids = [word_dict[word] for word in line[1].strip().split(" ")]
        label = line[2]
        f.write(" ".join("0:" + str(x) for x in query_ids) + " " +
                " ".join("1:" + str(x) for x in cand_ids) + "\n")
        fa.write(str(label) + "\n")
        fb.write(",".join(str(x) for x in query_ids) + "\n")