#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Download and unpack the traffic demo dataset next to this script, then write
# the train/test/pred list files that trainer_config.py points at.

set -e
set -x

# Work from the directory that contains this script, whatever the caller's
# working directory is. `&&` (instead of `;`) keeps `pwd` from silently
# reporting the caller's directory when the `cd` fails.
DIR="$( cd "$(dirname "$0")" && pwd -P )"
cd "$DIR"

ARCHIVE=traffic_data.tar.gz

# Download the dataset.
echo "Downloading traffic data..."
# -O pins the output file name: without it a second run stores the download as
# traffic_data.tar.gz.1 and the old archive is the one that gets extracted.
wget -O "$ARCHIVE" http://paddlepaddle.cdn.bcebos.com/demo/traffic/traffic_data.tar.gz

# Extract the package.
echo "Unzipping..."
tar -zxvf "$ARCHIVE"

# Training, testing and prediction all read the same CSV file. The path is
# written relative to the demo directory (the parent of this one), which is
# where the README runs train.sh and predict.sh from.
echo "data/speeds.csv" > train.list
echo "data/speeds.csv" > test.list
echo "data/speeds.csv" > pred.list

echo "Done."
# All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.trainer.PyDataProvider2 import *
import sys
import numpy as np

# Number of past time steps used as the dense input feature.
TERM_NUM = 24
# Number of future time steps to predict (one integer label per step).
FORECASTING_NUM = 24
# Number of distinct values a label can take (0 .. LABEL_VALUE_NUM - 1).
LABEL_VALUE_NUM = 4

# `sys.maxint` exists only on Python 2; use `sys.maxsize` where it is missing.
_POOL_SIZE = getattr(sys, 'maxint', sys.maxsize)


def _parse_speeds(line):
    """Return the integer values of one CSV row, without its first (id) column."""
    return [int(field) for field in line.rstrip('\r\n').split(",")[1:]]


def initHook(settings, file_list, **kwargs):
    """
    Init hook for the training/testing provider; declares the input types.

    One dense vector of TERM_NUM floats is followed by FORECASTING_NUM integer
    labels, each with LABEL_VALUE_NUM possible values.

    :param settings: global object. It is also passed to the process routine.
    :type settings: object
    :param file_list: the file list given in trainer_config.py; unused here.
    :param kwargs: other arguments; unused.
    """
    del kwargs  #unused

    settings.pool_size = _POOL_SIZE
    #Use a time series of the past as feature.
    #Dense_vector's expression form is [float,float,...,float]
    settings.input_types = [dense_vector(TERM_NUM)]
    #There are next FORECASTING_NUM fragments you need predict.
    #Every predicted condition at time point has four states.
    for _ in range(FORECASTING_NUM):
        settings.input_types.append(integer_value(LABEL_VALUE_NUM))


@provider(
    init_hook=initHook, cache=CacheType.CACHE_PASS_IN_MEM, should_shuffle=True)
def process(settings, file_name):
    """
    Yield training samples from a CSV file using a sliding window over each row.

    Each sample is ``[features] + labels``: ``features`` holds the TERM_NUM
    values before position ``i`` as floats, ``labels`` the FORECASTING_NUM
    values from position ``i`` on, each decremented by one so that they start
    at 0. Windows whose labels contain a raw 0 (missing value) are skipped.

    :param settings: provider settings object; unused here.
    :param file_name: path of the CSV file; its first line is a header and the
        first column of every row is skipped.
    """
    with open(file_name) as f:
        # Skip the row of field names; the default keeps an empty file from
        # raising StopIteration inside the generator.
        next(f, None)
        for line in f:
            speeds = _parse_speeds(line)
            # Get the max index.
            end_time = len(speeds)
            # Scanning and generating samples.
            # NOTE(review): the stop bound excludes
            # i == end_time - FORECASTING_NUM, which is the last start index
            # that still has a full label window. Kept as in the original;
            # confirm whether the final window should be included.
            for i in range(TERM_NUM, end_time - FORECASTING_NUM):
                # For dense slot
                pre_spd = [float(s) for s in speeds[i - TERM_NUM:i]]

                # Integer value need predicting, values start from 0, so every
                # one minus 1.
                fol_spd = [s - 1 for s in speeds[i:i + FORECASTING_NUM]]

                # Predicting label is missing, abandon the sample.
                if -1 in fol_spd:
                    continue
                yield [pre_spd] + fol_spd


def predict_initHook(settings, file_list, **kwargs):
    """
    Init hook for the prediction provider; the only input is the dense feature.

    :param settings: global object. It is also passed to the process routine.
    :param file_list: the file list given in trainer_config.py; unused here.
    :param kwargs: other arguments; unused.
    """
    del kwargs  #unused

    settings.pool_size = _POOL_SIZE
    settings.input_types = [dense_vector(TERM_NUM)]


@provider(init_hook=predict_initHook, should_shuffle=False)
def process_predict(settings, file_name):
    """
    Yield one feature vector per CSV row: its last TERM_NUM values as floats.

    :param settings: provider settings object; unused here.
    :param file_name: path of the CSV file; its first line is a header and the
        first column of every row is skipped, as in ``process``.
    :raises ValueError: if a row has fewer than TERM_NUM values, because the
        declared input type is a dense vector of exactly TERM_NUM entries.
    """
    with open(file_name) as f:
        # Skip the row of field names.
        next(f, None)
        for row_num, line in enumerate(f):
            speeds = _parse_speeds(line)
            if len(speeds) < TERM_NUM:
                raise ValueError(
                    "data row %d of %s has %d values, need at least %d" %
                    (row_num, file_name, len(speeds), TERM_NUM))
            yield [float(s) for s in speeds[-TERM_NUM:]]
"""Turn the raw prediction file written by ``paddle train --job=test`` into CSV.

Reads ``./rank-00000`` (one ``;``-separated row of predicted class ids per
line) and the data file named in ``./data/pred.list``, and prints one CSV row
per data row: the row's id followed by its predictions shifted from 0..3 back
to 1..4.
"""

RANK_FILE = './rank-00000'
PRED_LIST_FILE = './data/pred.list'

# Number of forecast steps; `header` lists one timestamp per step after 'id'.
FORECASTING_NUM = 24
header = [
    'id',
    '201604200805',
    '201604200810',
    '201604200815',
    '201604200820',
    '201604200825',
    '201604200830',
    '201604200835',
    '201604200840',
    '201604200845',
    '201604200850',
    '201604200855',
    '201604200900',
    '201604200905',
    '201604200910',
    '201604200915',
    '201604200920',
    '201604200925',
    '201604200930',
    '201604200935',
    '201604200940',
    '201604200945',
    '201604200950',
    '201604200955',
    '201604201000',
]


def parse_predictions(lines):
    """
    Parse raw prediction lines into lists of 1-based class values.

    :param lines: iterable of strings such as ``"0;3;1;\\n"``.
    :return: one list of ints per line, each raw value incremented by one.
    :raises ValueError: if a field is not an integer (including a blank line).
    """
    res = []
    for line in lines:
        #raw prediction range from 0 to 3
        res.append([int(v) + 1 for v in line.strip('\r\n;').split(";")])
    return res


def to_csv_lines(data_lines, predictions):
    """
    Yield the CSV output: the header, then ``id,pred,...`` for every data row.

    :param data_lines: iterable of CSV rows (header row already consumed); only
        the first field of each row, the id, is used.
    :param predictions: list of prediction lists, indexed by row number.
    :raises IndexError: if there are more data rows than prediction rows.
    """
    yield ','.join(header)
    for row_num, line in enumerate(data_lines):
        linkid = line.rstrip('\r\n').split(',')[0]
        yield linkid + ',' + ','.join(map(str, predictions[row_num]))


def main():
    """Read the prediction and data files and print the CSV to stdout."""
    with open(RANK_FILE) as f:
        res = parse_predictions(f)

    # `with` closes the list file; the original left this handle open.
    with open(PRED_LIST_FILE) as f:
        file_name = f.read().strip('\r\n')

    ###################
    ## To CSV format ##
    ###################
    with open(file_name) as f:
        # Skip the row of field names.
        next(f)
        for out_line in to_csv_lines(f, res):
            # Single-argument form prints the same on Python 2 and Python 3.
            print(out_line)


if __name__ == '__main__':
    main()
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Train the traffic prediction model described by trainer_config.py, saving
# each pass under ./output and copying all output to train.log.

set -e
# A pipeline's exit status is that of its last command, so without pipefail a
# failing `paddle train` would be masked by a succeeding `tee` and the script
# would exit 0. The option is probed in a subshell first because the README
# runs this script with `sh`, and not every `sh` implements pipefail.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

cfg=trainer_config.py
paddle train \
  --config="$cfg" \
  --save_dir=./output \
  --trainer_count=4 \
  --log_period=1000 \
  --dot_period=10 \
  --num_passes=10 \
  --use_gpu=false \
  --show_parameter_stats_period=3000 \
  2>&1 | tee 'train.log'
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *

# Network configuration for the traffic prediction demo: one shared hidden
# layer over the past TERM_NUM values feeds FORECASTING_NUM 4-way softmax
# outputs, one per forecast step.

################################### DATA Configuration #############################################
# Prediction mode is selected with the `is_predict` config argument
# (predict.sh passes --config_args=is_predict=1); it defaults to False.
is_predict = get_config_arg('is_predict', bool, False)
# No training list in prediction mode; the test list then points at pred.list.
trn = './data/train.list' if not is_predict else None
tst = './data/test.list' if not is_predict else './data/pred.list'
# Name of the provider function to use from the `dataprovider` module.
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(
    train_list=trn, test_list=tst, module="dataprovider", obj=process)
################################### Parameter Configuration ########################################
# TERM_NUM and FORECASTING_NUM mirror the constants of the same name in
# dataprovider.py (input vector length and number of labels per sample).
TERM_NUM = 24
FORECASTING_NUM = 24
emb_size = 16
# Prediction runs with a batch size of 1.
batch_size = 128 if not is_predict else 1
settings(
    batch_size=batch_size,
    learning_rate=1e-3,
    learning_method=RMSPropOptimizer())
################################### Algorithm Configuration ########################################

# Collects one entry per forecast step: a max-id layer in prediction mode,
# a classification cost layer otherwise.
output_label = []

link_encode = data_layer(name='link_encode', size=TERM_NUM)
for i in xrange(FORECASTING_NUM):
    # Every task shares the same weight: each iteration's first fc_layer is
    # given a parameter with the same name, '_link_vec.w'.
    link_param = ParamAttr(
        name='_link_vec.w', initial_max=1.0, initial_min=-1.0)
    link_vec = fc_layer(input=link_encode, size=emb_size, param_attr=link_param)
    # Per-step output layer; size 4 matches LABEL_VALUE_NUM in dataprovider.py.
    score = fc_layer(input=link_vec, size=4, act=SoftmaxActivation())
    if is_predict:
        maxid = maxid_layer(score)
        output_label.append(maxid)
    else:
        # Multi-task training: one label input and one cost per step, named
        # label_5min/cost_5min up to label_120min/cost_120min.
        label = data_layer(name='label_%dmin' % ((i + 1) * 5), size=4)
        cls = classification_cost(
            input=score, name="cost_%dmin" % ((i + 1) * 5), label=label)
        output_label.append(cls)
outputs(output_label)