# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
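"""Prepare the Boston housing dataset for training.

Downloads the raw data from the UCI repository, normalizes the 13 input
features, splits the samples into training and test sets saved as .npy
files, writes train.list/test.list file lists, and renders a bar chart of
per-feature value ranges to image/ranges.png.
"""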
from urllib.request import urlopen
import argparse
import os
import logging

import numpy as np

logging.basicConfig(level=logging.INFO)
data_url = 'https://archive.ics.uci.edu/ml/machine' \
           '-learning-databases/housing/housing.data'
raw_data = 'housing.data'
train_data = 'housing.train.npy'
test_data = 'housing.test.npy'
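# The 13 input features of each sample; the 14th column of the raw data
# (MEDV, the median home value) is the target and is not listed here.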
feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT'
]
root_dir = os.path.abspath(os.pardir)


def maybe_download(url, file_path):
    """Download the raw housing data to file_path unless it already exists."""
    if not os.path.exists(file_path):
        logging.info('data does not exist at %s, downloading from [%s]',
                     file_path, url)
        resp = urlopen(url).read()
        with open(file_path, 'wb') as f:  # urlopen returns bytes
            f.write(resp)

    logging.info('got raw housing data')


def save_list():
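    """Write train.list/test.list pointing at the saved .npy files."""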
    with open('train.list', 'w') as f:
        f.write('data/' + train_data + '\n')
    with open('test.list', 'w') as f:
        f.write('data/' + test_data + '\n')


def feature_range(maximums, minimums):
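    """Plot the value range (max - min) of each feature as a bar chart."""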
    import matplotlib
    matplotlib.use('Agg')  # non-interactive backend; works without a display
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    feature_num = len(maximums)
    ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
    ax.set_title('feature scale')
    plt.xticks(range(feature_num), feature_names)
    plt.xlim([-1, feature_num])
    fig.set_figheight(6)
    fig.set_figwidth(10)
    fig.savefig('%s/image/ranges.png' % root_dir, dpi=48)
    plt.close(fig)


def preprocess(file_path, feature_num=14, shuffle=False, ratio=0.8):
    """Normalize the input features, then split into training/test sets.

    Each feature is rescaled to (x - mean) / (max - min); the target
    column (MEDV) is left untouched.
    """
    data = np.fromfile(file_path, sep=' ')
    data = data.reshape(data.shape[0] // feature_num, feature_num)
    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
        axis=0) / data.shape[0]
    feature_range(maximums[:-1], minimums[:-1])
    for i in range(feature_num - 1):  # skip the target column
        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
    if shuffle:
        np.random.shuffle(data)
    offset = int(data.shape[0] * ratio)
    np.save(train_data, data[:offset])
    logging.info('saved training data to %s', train_data)
    np.save(test_data, data[offset:])
    logging.info('saved test data to %s', test_data)
    save_list()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Download the Boston housing price dataset and preprocess '
        'it (normalize the features and split into training and test sets).')
    parser.add_argument(
        '-r',
        '--ratio',
        dest='ratio',
        type=float,
        default=0.8,
        help='fraction of the data used for training')
    parser.add_argument(
        '-s',
        '--shuffle',
        dest='shuffle',
        type=int,
        default=0,
        choices=[1, 0],
        help='shuffle the data before splitting, 1=shuffle, 0=do not shuffle')
    args = parser.parse_args()

    maybe_download(data_url, raw_data)
    preprocess(raw_data, shuffle=bool(args.shuffle), ratio=args.ratio)
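
# Typical invocation (assuming the script is run from a `data/` subdirectory,
# so that `root_dir` resolves to the project root containing `image/`):
#
#   python prepare_data.py --ratio 0.8 --shuffle 1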