"""
MQ2007 learning-to-rank dataset (LETOR 4.0).

Downloads the MQ2007 archive, extracts it next to the downloaded file and
exposes the ``train`` / ``test`` readers, which parse the ``Fold1`` text files
and yield either pairwise or listwise samples.

Original dataset location:
http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar
"""

import os
import random
import functools
import itertools
import rarfile
from common import download
import numpy as np

# The archive is fetched from a mirror; the original location was
# http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar
URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar"
MD5 = "7be1640ae95c6408dab0ae7207bdc706"


def __initialize_meta_info__():
    """
    download and extract the MQ2007 dataset

    Returns:
    --------
    dirpath : string
        directory holding the downloaded archive; the archive is extracted
        into this same directory
    """
    fn = fetch()
    rar = rarfile.RarFile(fn)
    dirpath = os.path.dirname(fn)
    rar.extractall(path=dirpath)
    return dirpath


class Query(object):
    """
    queries used for learning to rank algorithms.
    It is created from relevance scores, query-document feature vectors

    Parameters:
    ----------
    query_id : int
        query_id in dataset, -1 when not set
    relevance_score : int
        relevance label of the query-document pair, -1 when not set
    feature_vector : list
        feature values of the query-document pair; a fresh empty list is
        used when None is given
    description : string
        comment section in query doc pair data
    """

    def __init__(self,
                 query_id=-1,
                 relevance_score=-1,
                 feature_vector=None,
                 description=""):
        self.query_id = query_id
        self.relevance_score = relevance_score
        if feature_vector is None:
            self.feature_vector = []
        else:
            self.feature_vector = feature_vector
        self.description = description

    def __str__(self):
        string = "%s %s %s" % (str(self.relevance_score), str(self.query_id),
                               " ".join(str(f) for f in self.feature_vector))
        return string

    def _parse_(self, text):
        """
        parse line into Query

        Expected line format (48 whitespace separated fields, optionally
        followed by a ``#`` comment):

            0 qid:10 1:0.000272 2:0.000000 ....  # comment

        Parameters:
        ----------
        text : string
            one line of the dataset file

        Returns:
        --------
        self, with relevance_score, query_id, feature_vector and
        description overwritten from ``text``
        """
        # partition() keeps the whole line as data when there is no '#';
        # slicing with the -1 returned by find() would drop the last
        # character and treat the entire line as the comment.
        line, _, comment = text.partition('#')
        self.description = comment.strip()
        parts = line.split()
        assert (len(parts) == 48), "expect 48 space split parts, get %d" % (
            len(parts))
        self.relevance_score = int(parts[0])
        self.query_id = int(parts[1].split(':')[1])
        # Replace rather than extend, so parsing twice (or parsing into an
        # instance built with an initial vector) does not accumulate values.
        self.feature_vector = [float(p.split(':')[1]) for p in parts[2:]]
        return self


class QueryList(object):
    """
    group query into list, every item in list is a Query

    All queries held by one QueryList must share the same query_id.

    Parameters:
    ----------
    querylist : list of Query, optional
        initial content; the list object is stored as given, not copied
    """

    def __init__(self, querylist=None):
        self.query_id = -1
        if querylist is None:
            self.querylist = []
        else:
            self.querylist = querylist
            for query in self.querylist:
                if self.query_id == -1:
                    self.query_id = query.query_id
                else:
                    if self.query_id != query.query_id:
                        raise ValueError("query in list must be same query_id")

    def __iter__(self):
        for query in self.querylist:
            yield query

    def __len__(self):
        return len(self.querylist)

    def __getitem__(self, index):
        # Positional access, so callers can index a QueryList like a list.
        return self.querylist[index]

    def _correct_ranking_(self):
        """sort the queries in place by relevance_score, highest first"""
        if self.querylist is None:
            return
        self.querylist.sort(key=lambda x: x.relevance_score, reverse=True)

    def _add_query(self, query):
        """append a query; raises ValueError when its query_id differs"""
        if self.query_id == -1:
            self.query_id = query.query_id
        else:
            if self.query_id != query.query_id:
                raise ValueError("query in list must be same query_id")
        self.querylist.append(query)


def gen_pair(querylist, partial_order="full"):
    """
    gen pair for pair-wise learning to rank algorithm

    The queries are first sorted by relevance_score (highest first, in
    place). Each yielded pair has the strictly more relevant document on
    the left; pairs with equal relevance carry no preference and are
    skipped.

    Parameters:
    --------
    querylist : QueryList or list of Query
    partial_order : "full" or "neighbour"
        "full" considers every unordered pair of documents once,
        "neighbour" only considers documents adjacent in the ranking

    Returns:
    --------
    generator of (label, query_left, query_right)
    label : 1, meaning query_left is more relevant than query_right
    query_left : np.array built from the left feature_vector
    query_right : same as left

    Raises:
    --------
    ValueError when partial_order is neither "full" nor "neighbour"
    (raised on first iteration, since this is a generator)
    """
    if not isinstance(querylist, QueryList):
        querylist = QueryList(querylist)
    querylist._correct_ranking_()
    ranked = list(querylist)
    if partial_order == "full":
        # C(n,2): every unordered pair exactly once, no self pairs
        candidates = itertools.combinations(ranked, 2)
    elif partial_order == "neighbour":
        # n-1 adjacent pairs
        candidates = zip(ranked, ranked[1:])
    else:
        raise ValueError(
            "unsupport parameter of partial_order, Only can be neighbour or full"
        )
    for query_left, query_right in candidates:
        # ranked is sorted descending, so left >= right always holds here;
        # only a strict preference makes a valid training pair.
        if query_left.relevance_score > query_right.relevance_score:
            yield 1, np.array(query_left.feature_vector), np.array(
                query_right.feature_vector)


def gen_list(querylist):
    """
    gen item in list for list-wise learning to rank algorithm

    The queries are used in the order given (they are not re-sorted).

    Parameters:
    --------
    querylist : QueryList or list of Query

    Returns:
    --------
    generator yielding a single (label, querylist) tuple
    label : np.array, shape=(samples_num, )
    querylist : np.array, shape=(samples_num, feature_dimension)
    """
    if not isinstance(querylist, QueryList):
        querylist = QueryList(querylist)
    relevance_score_list = [query.relevance_score for query in querylist]
    feature_vector_list = [query.feature_vector for query in querylist]
    yield np.array(relevance_score_list).T, np.array(feature_vector_list)


def load_from_text(filepath, shuffle=True, fill_missing=-1):
    """
    parse data file into querys

    Consecutive lines sharing a query_id are grouped into one QueryList.

    Parameters:
    --------
    filepath : string
        path of the data file, relative to the extraction directory
    shuffle : bool
        shuffle the order of the query lists when true
    fill_missing : int
        currently unused

    Returns:
    --------
    list of QueryList
    """
    prev_query_id = -1
    querylists = []
    querylist = None
    fn = __initialize_meta_info__()
    with open(os.path.join(fn, filepath)) as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            query = Query()._parse_(line)
            # "querylist is None" also covers a first query whose id
            # happens to equal the -1 sentinel.
            if querylist is None or query.query_id != prev_query_id:
                if querylist is not None:
                    querylists.append(querylist)
                querylist = QueryList()
                prev_query_id = query.query_id
            querylist._add_query(query)
    # flush the group still being built when the file ends
    if querylist is not None:
        querylists.append(querylist)
    if shuffle:
        random.shuffle(querylists)
    return querylists


def __reader__(filepath, format="pairwise", shuffle=True, fill_missing=-1):
    """
    read the data file and yield training samples

    Parameters
    --------
    filepath : string
        path of the data file, relative to the extraction directory
    format : "pairwise" or "listwise"
    shuffle : bool
        shuffle the order of the query lists when true
    fill_missing : int
        passed through to load_from_text, currently unused there

    Returns
    ------
    generator of
    label query_left, query_right  # format = "pairwise"
    label querylist                # format = "listwise"

    Raises
    ------
    ValueError when format is neither "pairwise" nor "listwise"
    (raised on first iteration, since this is a generator)
    """
    if format not in ("pairwise", "listwise"):
        # fail loudly instead of silently producing an empty reader
        raise ValueError(
            "unsupported format %r, expected 'pairwise' or 'listwise'" %
            (format, ))
    querylists = load_from_text(
        filepath, shuffle=shuffle, fill_missing=fill_missing)
    for querylist in querylists:
        if format == "pairwise":
            for pair in gen_pair(querylist):
                yield pair
        else:
            # gen_list yields exactly one (label, features) tuple
            yield next(gen_list(querylist))


train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")


def fetch():
    """download the MQ2007 archive and return the value of download()"""
    return download(URL, "MQ2007", MD5)


if __name__ == "__main__":
    fetch()
    # dump the relevance labels of each training query to ./query_<i>
    for i, (score,
            samples) in enumerate(train(
                format="listwise", shuffle=False)):
        np.savetxt("query_%d" % (i), score, fmt="%.2f")