From 411e234808ff9d0e23d2a65faf8a63cab71f3b52 Mon Sep 17 00:00:00 2001
From: chengxingyi
Date: Fri, 6 Jan 2017 16:22:32 +0800
Subject: [PATCH] A traffic demo for ASC17

---
 demo/traffic_prediction/README            |  7 +++
 demo/traffic_prediction/data/get_data.sh  | 34 ++++++++++
 demo/traffic_prediction/dataprovider.py   | 77 +++++++++++++++++++++++
 demo/traffic_prediction/gen_result.py     | 47 ++++++++++++++
 demo/traffic_prediction/predict.sh        | 30 +++++++++
 demo/traffic_prediction/train.sh          | 30 +++++++++
 demo/traffic_prediction/trainer_config.py | 43 +++++++++++++
 7 files changed, 268 insertions(+)
 create mode 100644 demo/traffic_prediction/README
 create mode 100755 demo/traffic_prediction/data/get_data.sh
 create mode 100644 demo/traffic_prediction/dataprovider.py
 create mode 100644 demo/traffic_prediction/gen_result.py
 create mode 100755 demo/traffic_prediction/predict.sh
 create mode 100755 demo/traffic_prediction/train.sh
 create mode 100755 demo/traffic_prediction/trainer_config.py

diff --git a/demo/traffic_prediction/README b/demo/traffic_prediction/README
new file mode 100644
index 00000000000..4c951885835
--- /dev/null
+++ b/demo/traffic_prediction/README
@@ -0,0 +1,7 @@
+run by:
+cd ./data
+sh get_data.sh
+cd ..
+sh train.sh
+sh predict.sh
+
diff --git a/demo/traffic_prediction/data/get_data.sh b/demo/traffic_prediction/data/get_data.sh
new file mode 100755
index 00000000000..52cf6608df8
--- /dev/null
+++ b/demo/traffic_prediction/data/get_data.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+set -x
+
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd "$DIR"
+
+# Download the dataset.
+echo "Downloading traffic data..."
+wget http://paddlepaddle.bj.bcebos.com/demo/traffic/traffic_data.tar.gz
+
+# Extract the package.
+echo "Unzipping..."
+tar -zxvf traffic_data.tar.gz
+
+echo "data/speeds.csv" > train.list
+echo "data/speeds.csv" > test.list
+echo "data/speeds.csv" > pred.list
+
+echo "Done."
diff --git a/demo/traffic_prediction/dataprovider.py b/demo/traffic_prediction/dataprovider.py
new file mode 100644
index 00000000000..bea0259b031
--- /dev/null
+++ b/demo/traffic_prediction/dataprovider.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.PyDataProvider2 import *
+import sys
+import numpy as np
+TERM_NUM = 24
+FORECASTING_NUM = 25
+LABEL_VALUE_NUM = 4
+def initHook(settings, file_list, **kwargs):
+    """
+    Init hook is invoked before processing data. It sets settings.slots and settings.pool_size.
+
+    :param settings: global object. It will be passed to the process routine.
+    :type settings: object
+    :param file_list: the meta file object, which is passed from trainer_config.py, but unused in this function.
+    :param kwargs: unused other arguments.
+    """
+    del kwargs #unused
+
+    settings.pool_size = sys.maxint
+    #Use a time series of the past as feature.
+    #Dense_vector's expression form is [float,float,...,float]
+    settings.slots = [dense_vector(TERM_NUM)]
+    #The next FORECASTING_NUM time points are the ones to predict.
+    #Every predicted time point is one label out of LABEL_VALUE_NUM states.
+    for i in range(FORECASTING_NUM):
+        settings.slots.append(integer_value(LABEL_VALUE_NUM))
+
+@provider(init_hook=initHook, cache=CacheType.CACHE_PASS_IN_MEM, should_shuffle=True)
+def process(settings, file_name):
+    with open(file_name) as f:
+        #abandon fields name
+        f.next()
+        for row_num, line in enumerate(f):
+            # The first column is the row id, so it is skipped.
+            speeds = map(int,line.rstrip('\r\n').split(",")[1:])
+            end_time = len(speeds)
+            # Slide over the row; the +1 keeps the window that ends at the last column.
+            for i in range(TERM_NUM,end_time - FORECASTING_NUM + 1):
+                # For dense slot
+                pre_spd = map(float,speeds[i-TERM_NUM:i])
+
+                # Labels must start from 0, so every value is shifted down by 1.
+                fol_spd = [spd-1 for spd in speeds[i:i + FORECASTING_NUM]]
+
+                # A raw 0 (now -1) marks a missing label: abandon the sample.
+                if -1 in fol_spd:
+                    continue
+                yield [pre_spd] + fol_spd
+
+def predict_initHook(settings, file_list, **kwargs):
+    settings.pool_size = sys.maxint
+    settings.slots = [dense_vector(TERM_NUM)]
+
+@provider(init_hook=predict_initHook,should_shuffle=False)
+def process_predict(settings, file_name):
+    with open(file_name) as f:
+        #abandon fields name
+        f.next()
+        for row_num, line in enumerate(f):
+            speeds = map(int,line.rstrip('\r\n').split(",")[1:])  # skip the id column, as in process()
+            end_time = len(speeds)
+            pre_spd = map(float,speeds[end_time-TERM_NUM:end_time])
+            yield pre_spd
+
diff --git a/demo/traffic_prediction/gen_result.py b/demo/traffic_prediction/gen_result.py
new file mode 100644
index 00000000000..78e5bd70033
--- /dev/null
+++ b/demo/traffic_prediction/gen_result.py
@@ -0,0 +1,47 @@
+# Read the raw predictions (0..3) and shift them back by +1.
+with open('./rank-00000') as f:
+    res = [[int(v) + 1 for v in line.strip('\r\n;').split(";")]
+           for line in f]
+
+# Only the first line of pred.list is used as the input file name.
+with open('./data/pred.list') as f:
+    file_name = f.readline().strip('\r\n')
+
+FORECASTING_NUM=24
+header=['id',
+        '201604200805',
+        '201604200810',
+        '201604200815',
+        '201604200820',
+        '201604200825',
+        '201604200830',
+        '201604200835',
+        '201604200840',
+        '201604200845',
+        '201604200850',
+        '201604200855',
+        '201604200900',
+        '201604200905',
+        '201604200910',
+        '201604200915',
+        '201604200920',
+        '201604200925',
+        '201604200930',
+        '201604200935',
+        '201604200940',
+        '201604200945',
+        '201604200950',
+        '201604200955',
+        '201604201000',
+       ]
+###################
+## To CSV format ##
+###################
+with open(file_name) as f:
+    next(f)  # skip the field-name row
+    print(','.join(header))
+    for row_num, line in enumerate(f):
+        linkid = line.rstrip('\r\n').split(',')[0]
+        # Keep only the FORECASTING_NUM columns named in the header; the model may emit more.
+        print(linkid+','+','.join(map(str,res[row_num][:FORECASTING_NUM])))
+
diff --git a/demo/traffic_prediction/predict.sh b/demo/traffic_prediction/predict.sh
new file mode 100755
index 00000000000..2cc709f1099
--- /dev/null
+++ b/demo/traffic_prediction/predict.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+cfg=trainer_config.py
+# pass choice
+model="output/pass-00000"
+paddle train \
+    --config=$cfg \
+    --use_gpu=false \
+    --job=test \
+    --init_model_path=$model \
+    --config_args=is_predict=1 \
+    --predict_output_dir=.
+
+python gen_result.py > result.txt
+
+rm -rf rank-00000
diff --git a/demo/traffic_prediction/train.sh b/demo/traffic_prediction/train.sh
new file mode 100755
index 00000000000..bd1a1036b84
--- /dev/null
+++ b/demo/traffic_prediction/train.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+cfg=trainer_config.py
+#TRAINER_BIN="./paddle_trainer"
+paddle train \
+    --config=$cfg \
+    --save_dir=./output \
+    --trainer_count=4 \
+    --log_period=1000 \
+    --dot_period=10 \
+    --num_passes=10 \
+    --use_gpu=false \
+    --show_parameter_stats_period=3000 \
+    --test_wait=1 \
+    2>&1 | tee 'train.log'
+# Optional extra flag for the command above: --test_all_data_in_one_period=1
diff --git a/demo/traffic_prediction/trainer_config.py b/demo/traffic_prediction/trainer_config.py
new file mode 100755
index 00000000000..835b1d688cd
--- /dev/null
+++ b/demo/traffic_prediction/trainer_config.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+#-*python-*-
+from paddle.trainer_config_helpers import *
+
+
+################################### DATA Configuration #############################################
+is_predict = get_config_arg('is_predict', bool, False)
+trn = './data/train.list' if not is_predict else None
+tst = './data/test.list' if not is_predict else './data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+define_py_data_sources2(train_list=trn,
+                        test_list=tst,
+                        module="dataprovider",
+                        obj=process)
+################################### Parameter Configuration ########################################
+TERM_NUM = 24
+FORECASTING_NUM = 25
+emb_size = 16
+batch_size = 128 if not is_predict else 1
+settings(
+    batch_size = batch_size,
+    learning_rate = 1e-3,
+    learning_method = RMSPropOptimizer()
+)
+################################### Algorithm Configuration ########################################
+
+output_label = []
+
+link_encode = data_layer(name='link_encode', size=TERM_NUM)
+# Every task shares the '_link_vec.w' weight, so its attribute is built once.
+link_param = ParamAttr(name='_link_vec.w', initial_max=1.0, initial_min=-1.0)
+for i in xrange(FORECASTING_NUM):
+    link_vec = fc_layer(input=link_encode, size=emb_size, param_attr=link_param)
+    score = fc_layer(input=link_vec, size=4, act=SoftmaxActivation())
+    if is_predict:
+        maxid = maxid_layer(score)
+        output_label.append(maxid)
+    else:
+        # Multi-task training: one label and one cost per 5-minute step.
+        label = data_layer(name='label_%dmin'%((i+1)*5), size=4)
+        cls = classification_cost(input=score,name="cost_%dmin"%((i+1)*5), label=label)
+        output_label.append(cls)
+outputs(output_label)
-- 
GitLab