feature_preprocess.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
################################################################################
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

 
"""
File: feature_preprocess.py
"""
import sys
import random
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import coo_matrix

from datasets.house_price.baseline_sklearn import CityInfo


def parse_line(line, labels, data, row, col, radius, city_info, num_row,
        max_house_num, max_public_num):
    """
        parse line
    """
    ll = line.strip('\r\n').split('\t')
    labels.append(ll[0].split()[0])
    business = int(ll[1].split()[0])
    dis_info = ll[3].split()
    if business >= 0 and business < city_info.business_num:
        data.append(1)
        row.append(num_row)
        col.append(business)

    idx = 0
    h_num = 0
    p_num = 0
    for i in ll[2].split():
        if float(dis_info[idx]) > radius:
            idx += 1
            continue
        if ':' in i: 
            if h_num > max_house_num:
                continue
            h_num += 1
            data.append(1)
            row.append(num_row)
            col.append(city_info.business_num + int(i.split(':')[0]) - 1)
        else:
            if p_num > max_public_num:
                break
            p_num += 1
            data.append(1)
            row.append(num_row)
            col.append(city_info.business_num + city_info.house_num + int(i) - 1)
        idx += 1
             

if __name__ == '__main__':
    test = sys.argv[1]
    radius = float(sys.argv[2])
    max_house_num = float(sys.argv[3])
    max_public_num = float(sys.argv[4])
    city_info = CityInfo(sys.argv[5])

    train_data = []
    train_row = []
    train_col = []
    train_labels = []
 
    num_row = 0
    for line in sys.stdin:
        parse_line(line, train_labels, train_data, train_row, train_col, radius, city_info,
                num_row, max_house_num, max_public_num) 
        num_row += 1 

    coo = coo_matrix((train_data, (train_row, train_col)),
            shape=(num_row, city_info.business_num + city_info.house_num + city_info.public_num))
    
    svd = TruncatedSVD(n_components=200, n_iter=10, random_state=0)
    svd.fit(coo.tocsr())

    x_train = svd.transform(coo.tocsr())
    for i in range(len(x_train)):
        print("train %s %s" % (train_labels[i], " ".join(map(str, x_train[i]))))
    
    test_data = []
    test_row = []
    test_col = []
    test_labels = []
      
    with open(test, 'r') as f:
        num_row = 0
        for line in f:
            parse_line(line, test_labels, test_data, test_row, test_col, radius, city_info,
                    num_row, max_house_num, max_public_num) 
            num_row += 1 

    coo = coo_matrix((test_data, (test_row, test_col)),
            shape=(num_row, city_info.business_num + city_info.house_num + city_info.public_num))
    x_test = svd.transform(coo.tocsr())
    for i in range(len(x_test)):
        print("test %s %s" % (test_labels[i], " ".join(map(str, x_test[i]))))