提交 d86fb1d1 编写于 作者: D dzhwinter

"precommit format with github style"

上级 82eb0fe4
...@@ -23,7 +23,6 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20 ...@@ -23,7 +23,6 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
""" """
import os import os
import random import random
import functools import functools
...@@ -31,25 +30,24 @@ import rarfile ...@@ -31,25 +30,24 @@ import rarfile
from common import download from common import download
import numpy as np import numpy as np
# Earlier archive location, no longer used:
# http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar
# Location handed to download() by fetch().
URL = ("http://www.bigdatalab.ac.cn/benchmark/upload/download_source/"
       "7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar")
# Handed to download() together with URL; presumably the archive's expected
# MD5 checksum -- confirm against common.download.
MD5 = "7be1640ae95c6408dab0ae7207bdc706"
def __initialize_meta_info__():
    """
    download and extract the MQ2007 dataset

    The archive returned by fetch() is extracted into the directory that
    contains it.

    Returns:
    --------
    dirpath : string
        directory of the downloaded archive, which now also holds the
        extracted files
    """
    fn = fetch()
    dirpath = os.path.dirname(fn)
    # Use the archive as a context manager so its handle is closed as soon as
    # extraction finishes (or fails) instead of being leaked.
    with rarfile.RarFile(fn) as rar:
        rar.extractall(path=dirpath)
    return dirpath
class Query(object):
    """
    queries used for learning to rank algorithms. It is created from relevance scores, query-document feature vectors

    Parameters:
    ----------
    query_id : int
        id of the query this document belongs to, defaults to -1
    relevance_score : int
        relevance label of the document for the query, defaults to -1
    feature_vector : list
        query-document feature values; a fresh empty list is used when None
    description : string
        comment section in query doc pair data
    """

    def __init__(self,
                 query_id=-1,
                 relevance_score=-1,
                 feature_vector=None,
                 description=""):
        self.query_id = query_id
        self.relevance_score = relevance_score
        # None instead of [] as the default so instances never share one list.
        if feature_vector is None:
            self.feature_vector = []
        else:
            self.feature_vector = feature_vector
        self.description = description

    def __str__(self):
        """
        render as "<relevance_score> <query_id> <features separated by spaces>"
        """
        string = "%s %s %s" % (str(self.relevance_score), str(self.query_id),
                               " ".join(str(f) for f in self.feature_vector))
        return string

    def _parse_(self, text):
        """
        parse line into Query

        Expected line format:
            0 qid:10 1:0.000272 2:0.000000 .... # optional comment

        The parsed feature values are appended to self.feature_vector, and
        everything after the first '#' is stored as the description.

        Parameters:
        ----------
        text : string
            one line of the data file

        Returns:
        --------
        self, updated in place

        Raises:
        --------
        AssertionError
            when the data part does not split into exactly 48 fields
        """
        # partition() splits on the first '#' and, unlike find(), has no -1
        # sentinel: a line without a comment keeps its last character and
        # gets an empty description.
        line, _, comment = text.partition('#')
        line = line.strip()
        self.description = comment.strip()
        parts = line.split()
        assert (len(parts) == 48), "expect 48 space split parts, get %d" % (
            len(parts))
        # format : 0 qid:10 1:0.000272 2:0.000000 ....
        self.relevance_score = int(parts[0])
        self.query_id = int(parts[1].split(':')[1])
        for p in parts[2:]:
            pair = p.split(':')
            self.feature_vector.append(float(pair[1]))
        return self
class QueryList(object):
    """
    group query into list, every item in list is a Query

    All items must carry the same query_id. self.query_id is -1 until the
    first item is seen and then holds that item's query_id.

    Parameters:
    ----------
    querylist : list
        initial items; stored by reference (not copied). A new empty list is
        used when None.
    """

    def __init__(self, querylist=None):
        self.query_id = -1
        if querylist is None:
            self.querylist = []
        else:
            self.querylist = querylist
            for query in self.querylist:
                self._check_query_id(query)

    def _check_query_id(self, query):
        # Adopt the first query_id seen, then reject any item that differs.
        if self.query_id == -1:
            self.query_id = query.query_id
        elif self.query_id != query.query_id:
            raise ValueError("query in list must be same query_id")

    def __iter__(self):
        for query in self.querylist:
            yield query

    def __len__(self):
        return len(self.querylist)

    def __getitem__(self, index):
        # Positional access, needed by callers that walk the list by index
        # (e.g. querylist[k] / querylist[k + 1]).
        return self.querylist[index]

    def _correct_ranking_(self):
        """
        sort the items in place by relevance_score, highest first
        """
        if self.querylist is None:
            return
        self.querylist.sort(key=lambda x: x.relevance_score, reverse=True)

    def _add_query(self, query):
        """
        append query; raises ValueError when its query_id differs from the
        one already held by this list
        """
        self._check_query_id(query)
        self.querylist.append(query)
def gen_pair(querylist, partial_order="full"):
    """
    gen pair for pair-wise learning to rank algorithm

    The items are first sorted by relevance_score, highest first (this sorts
    the underlying list in place). A pair is emitted only when the left item
    is strictly more relevant than the right one; ties carry no preference
    and are skipped.

    Parameters:
    --------
    querylist : QueryList or list of Query
        items that all belong to one query
    partial_order : string
        "full" pairs every item with every lower-ranked item, C(n,2)
        candidates; "neighbour" pairs only adjacent items of the ranking

    Yields:
    --------
    label : int
        always 1, the left item is the more relevant one
    query_left : np.array
        feature vector of the more relevant item
    query_right : same as left

    Raises:
    --------
    ValueError
        when partial_order is neither "full" nor "neighbour"
    """
    import itertools

    if not isinstance(querylist, QueryList):
        querylist = QueryList(querylist)
    querylist._correct_ranking_()
    # Work on a plain list so no indexing support is required of QueryList.
    ranked = list(querylist)
    if partial_order == "full":
        # C(n,2): each unordered pair once, never an item with itself
        candidates = itertools.combinations(ranked, 2)
    elif partial_order == "neighbour":
        # n - 1 adjacent pairs of the ranking
        candidates = zip(ranked, ranked[1:])
    else:
        raise ValueError(
            "unsupport parameter of partial_order, Only can be neighbour or full"
        )
    for query_left, query_right in candidates:
        # ranked is in descending order, so left >= right; equal scores
        # express no ordering and are dropped.
        if query_left.relevance_score > query_right.relevance_score:
            yield 1, np.array(query_left.feature_vector), np.array(
                query_right.feature_vector)
def gen_list(querylist):
    """
    gen item in list for list-wise learning to rank algorithm

    The items are taken in their current order (no re-ranking is applied)
    and a single (label, querylist) tuple is yielded.

    Parameters:
    --------
    querylist : QueryList or list of Query

    Yields:
    --------
    label : np.array built from the relevance_score of every item
    querylist : np.array built from the feature_vector of every item
    """
    if not isinstance(querylist, QueryList):
        querylist = QueryList(querylist)
    labels = []
    features = []
    for item in querylist:
        labels.append(item.relevance_score)
        features.append(item.feature_vector)
    yield np.array(labels).T, np.array(features)
def load_from_text(filepath, shuffle=True, fill_missing=-1):
    """
    parse data file into querys

    Consecutive lines that share a query_id are grouped into one QueryList.
    The dataset is downloaded and extracted first through
    __initialize_meta_info__().

    Parameters:
    --------
    filepath : string
        path of the data file, relative to the extraction directory
    shuffle : bool
        shuffle the order of the returned groups when true
    fill_missing : int
        not used by this function at present

    Returns:
    --------
    querylists : list of QueryList
    """
    prev_query_id = -1
    querylists = []
    querylist = None
    fn = __initialize_meta_info__()
    with open(os.path.join(fn, filepath)) as f:
        for line in f:
            query = Query()._parse_(line)
            if query.query_id != prev_query_id:
                # A new query starts: store the finished group, open a new one.
                if querylist is not None:
                    querylists.append(querylist)
                querylist = QueryList()
                prev_query_id = query.query_id
            querylist._add_query(query)
    # The final group is never followed by a query_id change, so it has to be
    # stored here or the last query of the file would be lost.
    if querylist is not None:
        querylists.append(querylist)
    if shuffle:
        random.shuffle(querylists)
    return querylists
def __reader__(filepath, format="pairwise", shuffle=True, fill_missing=-1):
    """
    Generator over the samples of one data file.

    Parameters
    --------
    filepath : string
        data file, forwarded to load_from_text
    format : string
        "pairwise" yields every tuple produced by gen_pair for each query;
        "listwise" yields the first tuple produced by gen_list for each
        query. Any other value yields nothing.
    shuffle : bool
        forwarded to load_from_text
    fill_missing : int
        forwarded to load_from_text

    Yields
    --------
    label query_left, query_right  # format = "pairwise"
    label querylist                # format = "listwise"
    """
    groups = load_from_text(
        filepath, shuffle=shuffle, fill_missing=fill_missing)
    for group in groups:
        if format == "listwise":
            yield next(gen_list(group))
        elif format == "pairwise":
            for sample in gen_pair(group):
                yield sample
# Readers bound to the Fold1 train / test files; the remaining __reader__
# arguments (format, shuffle, fill_missing) can still be passed by keyword.
train = functools.partial(
    __reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
test = functools.partial(
    __reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")
def fetch():
    """
    Call download() for the MQ2007 archive and return whatever it returns.
    """
    archive = download(URL, "MQ2007", MD5)
    return archive
if __name__ == "__main__":
    fetch()
    # Write the label array of every query of the training file, in file
    # order, to its own text file query_<index>.
    listwise_samples = train(format="listwise", shuffle=False)
    for i, (score, samples) in enumerate(listwise_samples):
        np.savetxt("query_%d" % (i), score, fmt="%.2f")
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册