# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
UCI Housing dataset.

This module downloads the dataset (see
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/) and
parses the training set and test set into paddle reader creators.
"""

from __future__ import print_function

import os
import tarfile
import tempfile

import numpy as np
import six

import paddle.dataset.common
import paddle.utils.deprecated as deprecated

__all__ = []

# Download location and expected MD5 checksum of the raw housing data file.
URL = 'http://paddlemodels.bj.bcebos.com/uci_housing/housing.data'
MD5 = 'd4accdce7a25600298819f8e28e8d593'

# Names of the 13 input features; used as x-axis labels by feature_range().
feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT'
]

# Module-level cache filled by load_data(); both stay None until it has run.
UCI_TRAIN_DATA = None
UCI_TEST_DATA = None

# Download location and expected MD5 checksum of the archive fetched by
# fluid_model().
FLUID_URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fluid/fit_a_line.fluid.tar'
FLUID_MD5_MODEL = '6e6dd637ccd5993961f68bfbde46090b'


def feature_range(maximums, minimums):
    """
    Plot the per-feature value range as a bar chart and save it to disk.

    One bar of height ``maximums[i] - minimums[i]`` is drawn per feature and
    labelled with the matching entry of ``feature_names``. The figure is
    written to ``image/ranges.png`` relative to the current working
    directory; the ``image`` directory is created when it is missing.

    :param maximums: per-feature maximum values; its length sets the number
        of bars. Must support element-wise subtraction with ``minimums``
        (e.g. a numpy array).
    :param minimums: per-feature minimum values
    """
    # Imported lazily so matplotlib is only needed when a plot is requested.
    import matplotlib
    # The backend has to be selected before pyplot is imported.
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    feature_num = len(maximums)
    positions = list(range(feature_num))

    fig, ax = plt.subplots()
    try:
        ax.bar(positions, maximums - minimums, color='r', align='center')
        ax.set_title('feature scale')
        # Configure the axes object directly instead of going through
        # pyplot's "current figure" global state.
        ax.set_xticks(positions)
        ax.set_xticklabels(feature_names)
        ax.set_xlim([-1, feature_num])
        fig.set_figheight(6)
        fig.set_figwidth(10)
        if not os.path.isdir('./image'):
            try:
                os.makedirs('./image')
            except OSError:
                # Another process may have created it in the meantime;
                # only re-raise when the directory is really unavailable.
                if not os.path.isdir('./image'):
                    raise
        fig.savefig('image/ranges.png', dpi=48)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)


def load_data(filename, feature_num=14, ratio=0.8):
    """
    Load the housing data, normalize the inputs and cache the train/test split.

    The file is parsed as whitespace-separated numbers and reshaped into rows
    of ``feature_num`` columns. Every column except the last one is rescaled
    to ``(x - mean) / (max - min)``; the last column is left untouched. The
    first ``int(rows * ratio)`` rows are stored in ``UCI_TRAIN_DATA`` and the
    remaining rows in ``UCI_TEST_DATA``.

    The result is cached: once both globals are set, further calls return
    immediately and ignore their arguments.

    :param filename: path of the data file
    :param feature_num: number of columns per row (inputs plus the final
        column)
    :param ratio: fraction of the rows assigned to the training split
    :return: None
    """
    global UCI_TRAIN_DATA, UCI_TEST_DATA
    if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
        return

    data = np.fromfile(filename, sep=' ')
    data = data.reshape(data.shape[0] // feature_num, feature_num)
    maximums = data.max(axis=0)
    minimums = data.min(axis=0)
    avgs = data.sum(axis=0) / data.shape[0]
    # To inspect the distribution of the input data, call
    # feature_range(maximums[:-1], minimums[:-1]) here.

    # Normalize all input columns in one vectorised step.
    input_num = feature_num - 1
    spans = maximums[:input_num] - minimums[:input_num]
    # A constant column has a zero span; divide by 1 instead so it becomes
    # all zeros rather than NaN.
    spans = np.where(spans == 0, 1.0, spans)
    data[:, :input_num] = (data[:, :input_num] - avgs[:input_num]) / spans

    offset = int(data.shape[0] * ratio)
    UCI_TRAIN_DATA = data[:offset]
    UCI_TEST_DATA = data[offset:]


@deprecated(
    since="2.0.0",
    update_to="paddle.text.datasets.UCIHousing",
    reason="Please use new dataset API which supports paddle.io.DataLoader")
def train():
    """
    UCI_HOUSING training set creator.

    It returns a reader creator. Each sample produced by the reader is a
    pair ``(features, price)``: the normalized feature columns of one row
    and a length-1 slice holding that row's last column.

    :return: Training reader creator
    :rtype: callable
    """
    # Fills the module-level cache on first use; later calls are no-ops.
    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))

    def reader():
        for d in UCI_TRAIN_DATA:
            yield d[:-1], d[-1:]

    return reader


@deprecated(
    since="2.0.0",
    update_to="paddle.text.datasets.UCIHousing",
    reason="Please use new dataset API which supports paddle.io.DataLoader")
def test():
    """
    UCI_HOUSING test set creator.

    It returns a reader creator. Each sample produced by the reader is a
    pair ``(features, price)``: the normalized feature columns of one row
    and a length-1 slice holding that row's last column.

    :return: Test reader creator
    :rtype: callable
    """
    # Fills the module-level cache on first use; later calls are no-ops.
    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))

    def reader():
        for d in UCI_TEST_DATA:
            yield d[:-1], d[-1:]

    return reader


def fluid_model():
    """
    Download the ``fit_a_line.fluid.tar`` archive and unpack it.

    The archive is extracted into a freshly created temporary directory.
    That directory is not removed by this function, so cleaning it up is
    left to the caller.

    :return: path of the temporary directory holding the extracted files
    :rtype: str
    """
    parameter_tar = paddle.dataset.common.download(
        FLUID_URL_MODEL, 'uci_housing', FLUID_MD5_MODEL, 'fit_a_line.fluid.tar')

    # The context manager closes the archive handle even if extraction fails.
    with tarfile.TarFile(parameter_tar, mode='r') as tar:
        dirpath = tempfile.mkdtemp()
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive. This presumably relies on the download being checked
        # against FLUID_MD5_MODEL -- confirm before pointing it elsewhere.
        tar.extractall(path=dirpath)

    return dirpath


@deprecated(
    since="2.0.0",
    update_to="paddle.text.datasets.UCIHousing",
    reason="Please use new dataset API which supports paddle.io.DataLoader")
def predict_reader():
    """
    Return a single sample for inference.

    The sample is the first row of the test split with its last column
    dropped, wrapped in a one-element tuple.

    :return: one tuple data
    :rtype: tuple
    """
    # Fills the module-level cache on first use; later calls are no-ops.
    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
    return (UCI_TEST_DATA[0][:-1], )


@deprecated(
    since="2.0.0",
    update_to="paddle.text.datasets.UCIHousing",
    reason="Please use new dataset API which supports paddle.io.DataLoader")
def fetch():
    """
    Download the UCI housing data file without loading it.

    Calls ``paddle.dataset.common.download`` with the module's ``URL`` and
    ``MD5``; the returned path is discarded.
    """
    paddle.dataset.common.download(URL, 'uci_housing', MD5)