#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This script will output 2 files:
1. feature_dict.pkl
2. item_freq.pkl
"""
import argparse
import os
import sys

try:
    import cPickle
except ImportError:  # Python 3: the C pickler lives in the pickle module
    import pickle as cPickle

from utils import logger


class FeatureGenerator(object):
    """
    Encode feature values with low-frequency filtering.

    Values are counted through add_feat_val(). The value --> id dict is built
    lazily by the first getter call and is not rebuilt afterwards, so every
    value has to be added before any getter is used.

    The empty string '' is the bucket for unknown values: "NULL" inputs are
    counted under it and get_feat_id() falls back to its id.
    """

    def __init__(self, feat_appear_limit=20):
        """
        @feat_appear_limit: int, a value is kept only when it appears strictly
            more often than this limit
        """
        self._dic = None  # feature value --> id
        self._count = None  # numbers of appearances of feature values
        self._items = None  # kept (value, count) pairs, most frequent first
        self._feat_appear_limit = feat_appear_limit

    def add_feat_val(self, feat_val):
        """
        Add feature values and count numbers of its appearance.
        """
        if self._count is None:
            self._count = {'': 0}
        if feat_val == "NULL":
            feat_val = ''
        if feat_val not in self._count:
            self._count[feat_val] = 1
        else:
            self._count[feat_val] += 1
            # NOTE(review): the '' bucket is bumped on repeated values only;
            # confirm this statement belongs to the else branch.
            self._count[''] += 1

    def _ensure_built(self):
        """
        Build the dict on first use.
        """
        if self._dic is None:
            self._build_dict()

    def _filter_feat(self):
        """
        Filter low-frequency feature values.
        """
        # No value added yet means nothing to keep.
        counts = self._count or {}
        kept = [
            pair for pair in counts.items()
            if pair[1] > self._feat_appear_limit
        ]
        # Most frequent first; list.sort is stable, so ties keep dict order.
        kept.sort(key=lambda pair: pair[1], reverse=True)
        self._items = kept

    def _build_dict(self):
        """
        Build feature values --> ids dict.
        """
        # Filter before publishing _dic so a failure never leaves a
        # half-built state that later calls would mistake for "built".
        self._filter_feat()
        self._dic = {
            feat_val: feat_id
            for feat_id, (feat_val, _) in enumerate(self._items)
        }
        self.dim = len(self._dic)

    def get_feat_id(self, feat_val):
        """
        Get id of feature value after encoding.

        Values missing from the dict map to the id of ''. Raises KeyError
        when '' itself did not pass the frequency filter.
        """
        self._ensure_built()
        if feat_val in self._dic:
            return self._dic[feat_val]
        return self._dic['']

    def get_dim(self):
        """
        Get dim.
        """
        self._ensure_built()
        return len(self._dic)

    def get_dict(self):
        """
        Get dict.
        """
        self._ensure_built()
        return self._dic

    def get_total_count(self):
        """
        Compute total num of count.

        Only values that passed the frequency filter are summed.
        """
        self._ensure_built()
        return sum(count for _, count in self._items)

    def count_iterator(self):
        """
        Iterate feature values and its num of appearance.

        Pairs are yielded in id order (most frequent first).
        """
        self._ensure_built()
        for feat_val, count in self._items:
            yield feat_val, count

    def __repr__(self):
        """
        Show the current dict size (0 while the dict is not built yet).
        """
        # Deliberately does not trigger _build_dict(): printing a generator
        # in the middle of counting must not freeze its dict.
        dim = 0 if self._dic is None else len(self._dic)
        return '<FeatureGenerator %d>' % dim


def scan_build_dict(data_path, features_dict):
    """
    Scan the raw data and add all feature values.

    Each line is tab separated: user_id, province, city, item_infos, phone.
    item_infos holds ';' separated entries of the form item:category:tags,
    where tags are joined with '_'.

    @data_path: str, path of the raw data file
    @features_dict: dict, feature name --> FeatureGenerator
    """
    logger.info('scan data set')
    with open(data_path, 'r') as f:
        for line in f:
            line = line.strip('\n')
            if not line:
                # A blank line carries no fields; skip it instead of failing
                # with an IndexError below.
                continue
            fields = line.split('\t')
            user_id = fields[0]
            province = fields[1]
            city = fields[2]
            item_infos = fields[3]
            phone = fields[4]

            features_dict['province'].add_feat_val(province)
            features_dict['city'].add_feat_val(city)
            features_dict['phone'].add_feat_val(phone)

            for item_info in item_infos.split(";"):
                item_info_array = item_info.split(":")
                item = item_info_array[0]
                features_dict['history_clicked_items'].add_feat_val(item)
                # user_id is counted once per clicked item, not once per line.
                features_dict['user_id'].add_feat_val(user_id)
                category = item_info_array[1]
                features_dict['history_clicked_categories'].add_feat_val(
                    category)
                tags = item_info_array[2]
                for tag in tags.split("_"):
                    features_dict['history_clicked_tags'].add_feat_val(tag)


def parse_args():
    """
    parse arguments
    """
    parser = argparse.ArgumentParser(
        description="PaddlePaddle Youtube Recall Model Example")
    parser.add_argument(
        '--train_set_path',
        type=str,
        required=True,
        help="path of the train set")
    parser.add_argument(
        '--output_dir', type=str, required=True, help="directory to output")
    parser.add_argument(
        '--feat_appear_limit',
        type=int,
        default=20,
        help="the minimum number of feature values appears (default: 20)")
    return parser.parse_args()


def main():
    """
    Build the feature dicts from the train set and dump both output files.
    """
    args = parse_args()

    # check argument (explicit raise: an assert is stripped under -O)
    if not os.path.exists(args.train_set_path):
        raise IOError('The train set path does not exist.')
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    # features used
    features = [
        'user_id', 'province', 'city', 'phone', 'history_clicked_items',
        'history_clicked_tags', 'history_clicked_categories'
    ]

    # init feature generators
    features_dict = {}
    for feature in features:
        features_dict[feature] = FeatureGenerator(
            feat_appear_limit=args.feat_appear_limit)

    # scan data for building dict
    scan_build_dict(args.train_set_path, features_dict)

    # generate feature_dict.pkl
    feature_encoding_dict = {}
    for feature in features:
        d = features_dict[feature].get_dict()
        feature_encoding_dict[feature] = d
        logger.info('Feature:%s, dimension is %d' % (feature, len(d)))
    output_dict_path = os.path.join(args.output_dir, 'feature_dict.pkl')
    # Protocol -1 is a binary protocol, so the file must be opened in binary
    # mode.
    with open(output_dict_path, "wb") as f:
        cPickle.dump(feature_encoding_dict, f, -1)

    # generate item_freq.pkl
    g = features_dict['history_clicked_items']
    total_count = g.get_total_count()
    item_freq_list = [
        float(feat_count) / total_count
        for _, feat_count in g.count_iterator()
    ]
    logger.info('item_freq, dimension is %d' % (len(item_freq_list)))
    output_item_freq_path = os.path.join(args.output_dir, 'item_freq.pkl')
    with open(output_item_freq_path, "wb") as f:
        cPickle.dump(item_freq_list, f, -1)

    logger.info('Complete!')


if __name__ == '__main__':
    main()