# data_processor.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import argparse
import os
import cPickle

from utils import logger
"""
This script will output 2 files:
1. feature_dict.pkl
2. item_freq.pkl
"""


class FeatureGenerator(object):
    """
    Encode feature values as integer ids with low-frequency filtering.

    Values appearing no more than `feat_appear_limit` times are dropped;
    unseen or filtered values map to the reserved '<unk>' id at lookup time.
    The value->id dict is built lazily on first access.
    """

    def __init__(self, feat_appear_limit=20):
        """
        @feat_appear_limit: int, a value must appear strictly more than
            this many times to receive its own id.
        """
        self._dic = None  # feature value --> id (built lazily)
        self._count = None  # feature value --> number of appearances
        self._feat_appear_limit = feat_appear_limit

    def add_feat_val(self, feat_val):
        """
        Record one appearance of `feat_val`; the literal "NULL" is folded
        into the reserved '<unk>' token.

        NOTE(review): the counting scheme is kept exactly as originally
        written: every repeat of an already-seen value ALSO increments
        '<unk>'.  That inflation keeps '<unk>' frequent enough to survive
        the low-frequency filter, so get_feat_id() always has a fallback.
        """
        if self._count is None:
            self._count = {'<unk>': 0}
        if feat_val == "NULL":
            feat_val = '<unk>'
        if feat_val not in self._count:
            self._count[feat_val] = 1
        else:
            self._count[feat_val] += 1
            self._count['<unk>'] += 1

    def _filter_feat(self):
        """
        Keep only values appearing more than `_feat_appear_limit` times,
        sorted by descending frequency.
        """
        # Tolerate the case where add_feat_val() was never called.
        counts = self._count if self._count is not None else {}
        # list(...) so the result is indexable/sortable on both py2 and py3
        # (py3 filter() returns a lazy iterator).
        self._items = list(
            filter(lambda x: x[1] > self._feat_appear_limit, counts.items()))
        self._items.sort(key=lambda x: x[1], reverse=True)

    def _build_dict(self):
        """
        Build the feature value --> id dict from the filtered counts.
        Ids are assigned by descending frequency (0 = most frequent).
        """
        self._dic = {}
        self._filter_feat()
        for i in range(len(self._items)):
            self._dic[self._items[i][0]] = i
        self.dim = len(self._dic)

    def get_feat_id(self, feat_val):
        """
        Return the id of `feat_val`, or the '<unk>' id for unknown values.
        """
        # build dict lazily
        if self._dic is None:
            self._build_dict()
        if feat_val in self._dic:
            return self._dic[feat_val]
        else:
            return self._dic['<unk>']

    def get_dim(self):
        """
        Return the number of distinct ids (the encoded dimension).
        """
        if self._dic is None:
            self._build_dict()
        return len(self._dic)

    def get_dict(self):
        """
        Return the feature value --> id dict, building it if necessary.
        """
        if self._dic is None:
            self._build_dict()
        return self._dic

    def get_total_count(self):
        """
        Return the total appearance count over all kept feature values.
        """
        # Fix: build lazily so this is safe before get_dict(); the original
        # raised AttributeError on self._items if called first.
        if self._dic is None:
            self._build_dict()
        total_count = 0
        for feat_val, c in self._items:
            total_count += c
        return total_count

    def count_iterator(self):
        """
        Yield (feature value, appearance count) pairs, most frequent first.
        """
        # Same lazy-build fix as get_total_count().
        if self._dic is None:
            self._build_dict()
        for feat_val, c in self._items:
            yield feat_val, c

    def __repr__(self):
        """
        Debug representation showing the encoded dimension.
        """
        # Fix: the original read self._dim, which is never set anywhere
        # (the attribute assigned in _build_dict is self.dim), so repr()
        # always raised AttributeError.
        if self._dic is None:
            self._build_dict()
        return '<FeatureGenerator %d>' % self.dim


def scan_build_dict(data_path, features_dict):
    """
    Scan the raw train set and feed every feature value into the matching
    FeatureGenerator in `features_dict`.

    Each line is expected to be tab-separated:
        user_id \\t province \\t city \\t item_infos \\t phone
    where item_infos is a ';'-separated list of "item:category:tag1_tag2_..."
    entries.

    NOTE(review): user_id is added once per clicked item, so its count is
    weighted by the click-history length — confirm this is intended.
    """
    logger.info('scan data set')

    with open(data_path, 'r') as f:
        for line_id, line in enumerate(f):
            fields = line.strip('\n').split('\t')
            user_id = fields[0]
            features_dict['province'].add_feat_val(fields[1])
            features_dict['city'].add_feat_val(fields[2])
            features_dict['phone'].add_feat_val(fields[4])
            for item_info in fields[3].split(";"):
                parts = item_info.split(":")
                features_dict['history_clicked_items'].add_feat_val(parts[0])
                features_dict['user_id'].add_feat_val(user_id)
                features_dict['history_clicked_categories'].add_feat_val(
                    parts[1])
                for tag in parts[2].split("_"):
                    features_dict['history_clicked_tags'].add_feat_val(tag)


def parse_args():
    """
    Parse the command-line arguments for this script.

    Returns an argparse.Namespace with:
        train_set_path (str, required), output_dir (str, required),
        feat_appear_limit (int, default 20).
    """
    parser = argparse.ArgumentParser(
        description="PaddlePaddle Youtube Recall Model Example")
    parser.add_argument(
        '--train_set_path',
        type=str,
        required=True,
        help="path of the train set")
    parser.add_argument(
        '--output_dir',
        type=str,
        required=True,
        help="directory to output")
    parser.add_argument(
        '--feat_appear_limit',
        type=int,
        default=20,
        help="the minimum number of feature values appears (default: 20)")
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()

    # Validate arguments explicitly: the original used `assert`, which is
    # silently stripped under `python -O`.
    if not os.path.exists(args.train_set_path):
        sys.exit('The train set path does not exist.')
    # Create the output directory up front instead of failing on open().
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    # features used
    features = [
        'user_id', 'province', 'city', 'phone', 'history_clicked_items',
        'history_clicked_tags', 'history_clicked_categories'
    ]

    # init one feature generator per feature, sharing the frequency limit
    features_dict = {}
    for feature in features:
        features_dict[feature] = FeatureGenerator(
            feat_appear_limit=args.feat_appear_limit)

    # scan data for building dict
    scan_build_dict(args.train_set_path, features_dict)

    # generate feature_dict.pkl: feature name --> {value: id}
    feature_encoding_dict = {}
    for feature in features:
        d = features_dict[feature].get_dict()
        feature_encoding_dict[feature] = d
        logger.info('Feature:%s, dimension is %d' % (feature, len(d)))
    output_dict_path = os.path.join(args.output_dir, 'feature_dict.pkl')
    # Fix: protocol -1 selects the highest (binary) pickle protocol, so the
    # file must be opened in binary mode; text-mode "w" corrupts the pickle
    # on platforms that translate newlines.
    with open(output_dict_path, "wb") as f:
        cPickle.dump(feature_encoding_dict, f, -1)

    # generate item_freq.pkl: per-item click frequency, in the same
    # descending-count order that _build_dict used to assign item ids
    item_freq_list = []
    g = features_dict['history_clicked_items']
    total_count = g.get_total_count()
    for feat_val, feat_count in g.count_iterator():
        item_freq_list.append(float(feat_count) / total_count)
    logger.info('item_freq, dimension is %d' % (len(item_freq_list)))
    output_item_freq_path = os.path.join(args.output_dir, 'item_freq.pkl')
    # Same binary-mode fix as above.
    with open(output_item_freq_path, "wb") as f:
        cPickle.dump(item_freq_list, f, -1)

    logger.info('Complete!')