diff --git a/fit_a_line/data/prepare_data.py b/fit_a_line/data/prepare_data.py deleted file mode 100644 index 4a3782752c7964a7203e7e78afe1d36cd003037a..0000000000000000000000000000000000000000 --- a/fit_a_line/data/prepare_data.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from collections import Counter -from urllib2 import urlopen -import argparse -import os -import random -import logging - -import numpy as np - -logging.basicConfig(level=logging.INFO) -data_url = 'https://archive.ics.uci.edu/ml/machine' \ - '-learning-databases/housing/housing.data' -raw_data = 'housing.data' -train_data = 'housing.train.npy' -test_data = 'housing.test.npy' -feature_names = [ - 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', - 'PTRATIO', 'B', 'LSTAT' -] -root_dir = os.path.abspath(os.pardir) - - -def maybe_download(url, file_path): - if not os.path.exists(file_path): - logging.info('data doesn\'t exist on %s, download from [%s]' % - (file_path, url)) - resp = urlopen(url).read() - with open(file_path, 'w') as f: - f.write(resp) - - logging.info('got raw housing data') - - -def save_list(): - with open('train.list', 'w') as f: - f.write('data/' + train_data + '\n') - with open('test.list', 'w') as f: - f.write('data/' + test_data + '\n') - - -def feature_range(maximums, minimums): - import matplotlib - matplotlib.use('Agg') - import matplotlib.pyplot as plt - fig, ax = plt.subplots() - feature_num = len(maximums) - ax.bar(range(feature_num), maximums - minimums, color='r', align='center') - ax.set_title('feature scale') - plt.xticks(range(feature_num), feature_names) - plt.xlim([-1, feature_num]) - fig.set_figheight(6) - fig.set_figwidth(10) - fig.savefig('%s/image/ranges.png' % root_dir, dpi=48) - plt.close(fig) - - -def preprocess(file_path, feature_num=14, shuffle=False, ratio=0.8): - data = np.fromfile(file_path, sep=' ') - data = data.reshape(data.shape[0] / feature_num, feature_num) - maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( - axis=0) / data.shape[0] - feature_range(maximums[:-1], minimums[:-1]) - for i in xrange(feature_num - 1): - data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) - if shuffle: - np.random.shuffle(data) - offset = int(data.shape[0] * ratio) - np.save(train_data, data[:offset]) - logging.info('saved training data to %s' % train_data) - np.save(test_data, data[offset:]) - logging.info('saved test data to %s' % test_data) - save_list() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='download boston housing price data set and preprocess the data(normalization and split dataset)' - ) - parser.add_argument( - '-r', - '--ratio', - dest='ratio', - default='0.8', - help='ratio of data used for training') - parser.add_argument( - '-s', - '--shuffle', - dest='shuffle', - default='0', - choices={'1', '0'}, - help='shuffle the data before splitting, 1=shuffle, 0=do not shuffle') - args = parser.parse_args() - - maybe_download(data_url, raw_data) - preprocess(raw_data, shuffle=int(args.shuffle), ratio=float(args.ratio)) diff --git a/fit_a_line/dataprovider.py b/fit_a_line/dataprovider.py deleted file mode 100644 index f93fe4cafb470c21ac7cf1bd0f34b9fd676856dc..0000000000000000000000000000000000000000 --- a/fit_a_line/dataprovider.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * -import numpy as np - - -# define data types of input -@provider(input_types=[dense_vector(13), dense_vector(1)]) -def process(settings, input_file): - data = np.load(input_file.strip()) - for row in data: - yield row[:-1].tolist(), row[-1:].tolist() diff --git a/fit_a_line/predict.py b/fit_a_line/predict.py deleted file mode 100644 index 0afbf76099435e8c8680cefa92f12afa53c127a4..0000000000000000000000000000000000000000 --- a/fit_a_line/predict.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import argparse -import numpy as np -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import * -from paddle.trainer.config_parser import parse_config - -logging.basicConfig(level=logging.INFO) - - -def predict(input_file, model_dir): - # prepare PaddlePaddle environment, load models - swig_paddle.initPaddle("--use_gpu=0") - conf = parse_config('trainer_config.py', 'is_predict=1') - network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - network.loadParameters(model_dir) - slots = [dense_vector(13)] - converter = DataProviderConverter(slots) - - data = np.load(input_file) - ys = [] - for row in data: - result = network.forwardTest(converter([[row[:-1].tolist()]])) - y_true = row[-1:].tolist()[0] - y_predict = result[0]['value'][0][0] - ys.append([y_true, y_predict]) - - ys = np.matrix(ys) - avg_err = np.average(np.square((ys[:, 0] - ys[:, 1]))) - logging.info('MSE of test set is %f' % avg_err) - - # draw a scatter plot - import matplotlib - matplotlib.use('Agg') - import matplotlib.pyplot as plt - fig, ax = plt.subplots() - - ax.scatter(ys[:, 0], ys[:, 1]) - y_range = [ys[:, 0].min(), ys[:, 0].max()] - ax.plot(y_range, y_range, 'k--', lw=4) - ax.set_xlabel('True ($1000)') - ax.set_ylabel('Predicted ($1000)') - ax.set_title('Predictions on boston housing price') - fig.savefig('image/predictions.png', dpi=60) - plt.close(fig) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='predict house price and save the result as image.') - parser.add_argument( - '-m', - '--model', - dest='model', - default='output/pass-00029', - help='model path') - parser.add_argument( - '-t', - '--test_data', - dest='test_data', - default='data/housing.test.npy', - help='test data path') - args = parser.parse_args() - - predict(input_file=args.test_data, model_dir=args.model) diff --git a/fit_a_line/train.sh b/fit_a_line/train.sh deleted file mode 100755 index 7fd01321145fb7d0748945e99096dc7c180eb206..0000000000000000000000000000000000000000 --- a/fit_a_line/train.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 diff --git a/fit_a_line/trainer_config.py b/fit_a_line/trainer_config.py deleted file mode 100644 index 347cbcef760111cd1b2f10b30d55b04011e16425..0000000000000000000000000000000000000000 --- a/fit_a_line/trainer_config.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg('is_predict', bool, False) - -# 1. read data -define_py_data_sources2( - train_list='data/train.list', - test_list='data/test.list', - module='dataprovider', - obj='process') - -# 2. learning algorithm -settings(batch_size=2) - -# 3. Network configuration - -x = data_layer(name='x', size=13) - -y_predict = fc_layer( - input=x, - param_attr=ParamAttr(name='w'), - size=1, - act=LinearActivation(), - bias_attr=ParamAttr(name='b')) - -if not is_predict: - y = data_layer(name='y', size=1) - cost = regression_cost(input=y_predict, label=y) - outputs(cost) -else: - outputs(y_predict)