diff --git a/v1_api_demo/mnist/.gitignore b/v1_api_demo/mnist/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..7e61d5e3a0cabd46d4185454d46610ac2ee2e63f
--- /dev/null
+++ b/v1_api_demo/mnist/.gitignore
@@ -0,0 +1,10 @@
+data/raw_data
+data/*.list
+mnist_vgg_model
+plot.png
+train.log
+*pyc
+.ipynb_checkpoints
+params.pkl
+params.tar
+params.tar.gz
diff --git a/v1_api_demo/mnist/api_train.py b/v1_api_demo/mnist/api_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea1caa7dd9653a2cc2860ace736fe3d25a3767e0
--- /dev/null
+++ b/v1_api_demo/mnist/api_train.py
@@ -0,0 +1,196 @@
+"""
+A very basic example of how to use the current raw SWIG API to train an MNIST network.
+
+The current implementation uses raw SWIG, which means each API call is passed \
+directly to the C++ side of Paddle.
+
+A user-facing API could be simpler and more carefully designed.
+"""
+import random
+
+import numpy as np
+import paddle.v2 as paddle_v2
+import py_paddle.swig_paddle as api
+from paddle.trainer_config_helpers import *
+from py_paddle import DataProviderConverter
+
+from mnist_util import read_from_mnist
+
+
+def init_parameter(network):
+    assert isinstance(network, api.GradientMachine)
+    for each_param in network.getParameters():
+        assert isinstance(each_param, api.Parameter)
+        array_size = len(each_param)
+        array = np.random.uniform(-1.0, 1.0, array_size).astype('float32')
+        each_param.getBuf(api.PARAMETER_VALUE).copyFromNumpyArray(array)
+
+
+def generator_to_batch(generator, batch_size):
+    ret_val = list()
+    for each_item in generator:
+        ret_val.append(each_item)
+        if len(ret_val) == batch_size:
+            yield ret_val
+            ret_val = list()
+    if len(ret_val) != 0:
+        yield ret_val
+
+
+class BatchPool(object):
+    def __init__(self, generator, batch_size):
+        self.data = list(generator)
+        self.batch_size = batch_size
+
+    def __call__(self):
+        random.shuffle(self.data)
+        for offset in xrange(0, len(self.data), self.batch_size):
+            limit = min(offset + self.batch_size, len(self.data))
+            yield self.data[offset:limit]
+
+
+def input_order_converter(generator):
+    for each_item in generator:
+        yield each_item['pixel'], each_item['label']
+
+
+def main():
+    api.initPaddle("-use_gpu=false", "-trainer_count=4")  # use 4 cpu cores
+
+    optimizer = paddle_v2.optimizer.Adam(
+        learning_rate=1e-4,
+        batch_size=1000,
+        model_average=ModelAverage(average_window=0.5),
+        regularization=L2Regularization(rate=0.5))
+
+    # Create a local updater. "Local" means it does not run on a cluster.
+    # For cluster training, this could be changed to createRemoteUpdater
+    # in the future.
+    updater = optimizer.create_local_updater()
+    assert isinstance(updater, api.ParameterUpdater)
+
+    # define network
+    images = paddle_v2.layer.data(
+        name='pixel', type=paddle_v2.data_type.dense_vector(784))
+    label = paddle_v2.layer.data(
+        name='label', type=paddle_v2.data_type.integer_value(10))
+    hidden1 = paddle_v2.layer.fc(input=images, size=200)
+    hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
+    inference = paddle_v2.layer.fc(input=hidden2,
+                                   size=10,
+                                   act=paddle_v2.activation.Softmax())
+    cost = paddle_v2.layer.classification_cost(input=inference, label=label)
+
+    # Create a simple gradient machine.
+    model_config = paddle_v2.layer.parse_network(cost)
+    m = api.GradientMachine.createFromConfigProto(model_config,
+                                                  api.CREATE_MODE_NORMAL,
+                                                  optimizer.enable_types())
+
+    # This type check is not strictly necessary; it only enables type hints
+    # in IDEs such as PyCharm.
+    assert isinstance(m, api.GradientMachine)
+
+    # Initialize parameters with numpy.
+    init_parameter(network=m)
+
+    # Initialize the ParameterUpdater.
+    updater.init(m)
+
+    # DataProviderConverter is a utility that converts Python objects into
+    # Paddle C++ input. The input format is the same as Paddle's DataProvider.
+    converter = DataProviderConverter(input_types=[images.type, label.type])
+
+    train_file = './data/raw_data/train'
+    test_file = './data/raw_data/t10k'
+
+    # Start the gradient machine.
+    # The gradient machine must be started before invoking forward/backward,
+    # not just for training, but also for inference.
+    m.start()
+
+    # An evaluator can print the error rate, etc. It is a C++ class.
+    batch_evaluator = m.makeEvaluator()
+    test_evaluator = m.makeEvaluator()
+
+    # Get the training data.
+    # The training data is stored in a data pool. The current implementation
+    # does not care about memory or speed; it is just a very naive one.
+    train_data_generator = input_order_converter(read_from_mnist(train_file))
+    train_data = BatchPool(train_data_generator, 512)
+
+    # outArgs holds the neural network's forward result. It is not used here;
+    # it is just passed to gradient_machine.forward.
+    outArgs = api.Arguments.createArguments(0)
+
+    for pass_id in xrange(2):  # we train 2 passes.
+        updater.startPass()
+
+        for batch_id, data_batch in enumerate(train_data()):
+            # data_batch contains the input images.
+            # For online learning, data_batch could come from the network.
+
+            # Start updating one batch.
+            pass_type = updater.startBatch(len(data_batch))
+
+            # Start the batch evaluator.
+            # batch_evaluator can be used between start() and finish().
+            batch_evaluator.start()
+
+            # forwardBackward is a shortcut for forward and backward.
+            # It is sometimes faster than invoking forward/backward separately,
+            # because in a GradientMachine it may run asynchronously.
+            m.forwardBackward(converter(data_batch), outArgs, pass_type)
+
+            for each_param in m.getParameters():
+                updater.update(each_param)
+
+            # Get the cost. We use numpy to compute the average cost per batch.
+            cost_vec = outArgs.getSlotValue(0)
+            cost_vec = cost_vec.copyToNumpyMat()
+            cost = cost_vec.sum() / len(data_batch)
+
+            # Run the evaluator.
+            m.eval(batch_evaluator)
+
+            # Print logs.
+            print 'Pass id', pass_id, 'Batch id', batch_id, 'with cost=', \
+                cost, batch_evaluator
+
+            batch_evaluator.finish()
+            # Finish the batch.
+            #  * It clears the gradients.
+            #  * It ensures all values have been updated.
+            updater.finishBatch(cost)
+
+        # Testing stage: use the test data set to test the current network.
+        updater.apply()
+        test_evaluator.start()
+        test_data_generator = input_order_converter(read_from_mnist(test_file))
+        for data_batch in generator_to_batch(test_data_generator, 512):
+            # In the testing stage, only forward is needed.
+            m.forward(converter(data_batch), outArgs, api.PASS_TEST)
+            m.eval(test_evaluator)
+
+        # Print the error rate for the test data set.
+        print 'Pass', pass_id, ' test evaluator: ', test_evaluator
+        test_evaluator.finish()
+        updater.restore()
+
+        updater.catchUpWith()
+        params = m.getParameters()
+        for each_param in params:
+            assert isinstance(each_param, api.Parameter)
+            value = each_param.getBuf(api.PARAMETER_VALUE)
+            value = value.copyToNumpyArray()
+
+            # Here, the parameters could be saved anywhere you want.
+            print each_param.getName(), value
+
+        updater.finishPass()
+
+    m.finish()
+
+
+if __name__ == '__main__':
+    main()
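Note: the end-of-pass loop above only prints each parameter. Below is a minimal sketch (not part of the patch itself) of one way to persist and restore parameters with numpy, reusing only the buffer calls that already appear in api_train.py; the .npy file-naming scheme is made up for illustration.

    import numpy as np
    import py_paddle.swig_paddle as api

    def save_parameters(gradient_machine, directory='.'):
        # Copy each parameter's value buffer out to a .npy file.
        for each_param in gradient_machine.getParameters():
            value = each_param.getBuf(api.PARAMETER_VALUE).copyToNumpyArray()
            np.save('%s/%s.npy' % (directory, each_param.getName()), value)

    def load_parameters(gradient_machine, directory='.'):
        # Copy saved values back into the parameter buffers.
        for each_param in gradient_machine.getParameters():
            value = np.load('%s/%s.npy' % (directory, each_param.getName()))
            each_param.getBuf(api.PARAMETER_VALUE).copyFromNumpyArray(
                value.astype('float32'))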
diff --git a/v1_api_demo/mnist/data/generate_list.py b/v1_api_demo/mnist/data/generate_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..49981cc7a93308bc96ad5097eba749440e958525
--- /dev/null
+++ b/v1_api_demo/mnist/data/generate_list.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+o = open("./" + "train.list", "w")
+o.write("./data/raw_data/train" + "\n")
+o.close()
+
+o = open("./" + "test.list", "w")
+o.write("./data/raw_data/t10k" + "\n")
+o.close()
diff --git a/v1_api_demo/mnist/data/get_mnist_data.sh b/v1_api_demo/mnist/data/get_mnist_data.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5a2e34026d4fe7f8315d4f5453bec7c4ee4f6885
--- /dev/null
+++ b/v1_api_demo/mnist/data/get_mnist_data.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env sh
+# This script downloads the MNIST data and unzips it.
+set -e
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+rm -rf "$DIR/raw_data"
+mkdir "$DIR/raw_data"
+cd "$DIR/raw_data"
+
+echo "Downloading..."
+
+for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
+do
+    if [ ! -e $fname ]; then
+        wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
+        gunzip ${fname}.gz
+    fi
+done
+
+cd $DIR
+rm -f *.list
+python generate_list.py
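The two scripts above produce ./raw_data plus the train/test .list files. Below is a quick standalone sanity check that the downloads are complete; a sketch based on the IDX header sizes that mnist_util.py skips (16 bytes for image files, 8 for label files) and the fixed MNIST sample counts.

    import os

    def check_raw_data(prefix, n):
        # images: 16-byte header + n * 28 * 28 pixel bytes;
        # labels: 8-byte header + n label bytes.
        assert os.path.getsize(prefix + '-images-idx3-ubyte') == 16 + n * 28 * 28
        assert os.path.getsize(prefix + '-labels-idx1-ubyte') == 8 + n

    check_raw_data('./data/raw_data/train', 60000)
    check_raw_data('./data/raw_data/t10k', 10000)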
diff --git a/v1_api_demo/mnist/light_mnist.py b/v1_api_demo/mnist/light_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..33409054357d2f0c6a765b3ab3164eb2e584467e
--- /dev/null
+++ b/v1_api_demo/mnist/light_mnist.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+is_predict = get_config_arg("is_predict", bool, False)
+
+#################### Data Configuration ####################
+
+if not is_predict:
+    data_dir = './data/'
+    define_py_data_sources2(
+        train_list=data_dir + 'train.list',
+        test_list=data_dir + 'test.list',
+        module='mnist_provider',
+        obj='process')
+
+#################### Algorithm Configuration ####################
+settings(batch_size=50, learning_rate=0.001, learning_method=AdamOptimizer())
+
+#################### Network Configuration ####################
+
+data_size = 1 * 28 * 28
+label_size = 10
+img = data_layer(name='pixel', size=data_size)
+
+
+# light cnn
+# A shallower CNN model: [Conv, BN, ReLU, Max-Pooling] x4 + FC x1.
+# It is easier to train on the MNIST dataset and quite efficient.
+# Its final performance is close to deeper models on tasks such as digit and character classification.
+def light_cnn(input_image, num_channels, num_classes):
+    def __light__(ipt,
+                  num_filter=128,
+                  times=1,
+                  conv_filter_size=3,
+                  dropouts=0,
+                  num_channels_=None):
+        return img_conv_group(
+            input=ipt,
+            num_channels=num_channels_,
+            pool_size=2,
+            pool_stride=2,
+            conv_padding=0,
+            conv_num_filter=[num_filter] * times,
+            conv_filter_size=conv_filter_size,
+            conv_act=ReluActivation(),
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type=MaxPooling())
+
+    tmp = __light__(input_image, num_filter=128, num_channels_=num_channels)
+    tmp = __light__(tmp, num_filter=128)
+    tmp = __light__(tmp, num_filter=128)
+    tmp = __light__(tmp, num_filter=128, conv_filter_size=1)
+
+    tmp = fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())
+    return tmp
+
+
+predict = light_cnn(input_image=img, num_channels=1, num_classes=label_size)
+
+if not is_predict:
+    lbl = data_layer(name="label", size=label_size)
+    inputs(img, lbl)
+    outputs(classification_cost(input=predict, label=lbl))
+else:
+    outputs(predict)
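For readers unfamiliar with img_conv_group: each __light__ call above bundles one [Conv, BN, ReLU, Max-Pooling] block. Below is a rough, hedged expansion of a single block using the lower-level helpers from trainer_config_helpers; treat it as an illustration of the intent, not the exact layers the helper emits.

    from paddle.trainer_config_helpers import *

    img = data_layer(name='pixel', size=1 * 28 * 28)
    tmp = img_conv_layer(
        input=img, num_channels=1, num_filters=128, filter_size=3,
        padding=0, act=LinearActivation())
    # With conv_with_batchnorm=True, the ReLU is applied by the batch norm
    # layer rather than by the convolution itself.
    tmp = batch_norm_layer(input=tmp, act=ReluActivation())
    tmp = img_pool_layer(input=tmp, pool_size=2, stride=2, pool_type=MaxPooling())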
diff --git a/v1_api_demo/mnist/mnist_provider.py b/v1_api_demo/mnist/mnist_provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..888cfef1e7e3e1b4f556756c003eeb23e741cabe
--- /dev/null
+++ b/v1_api_demo/mnist/mnist_provider.py
@@ -0,0 +1,12 @@
+from paddle.trainer.PyDataProvider2 import *
+from mnist_util import read_from_mnist
+
+
+# Define a py data provider
+@provider(
+    input_types={'pixel': dense_vector(28 * 28),
+                 'label': integer_value(10)},
+    cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, filename):  # settings is not currently used.
+    for each in read_from_mnist(filename):
+        yield each
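The provider simply forwards whatever read_from_mnist yields. Below is a quick way to sanity-check that underlying generator from a plain Python shell; a sketch whose paths assume get_mnist_data.sh has already been run.

    from mnist_util import read_from_mnist

    sample = next(read_from_mnist('./data/raw_data/train'))
    assert sample['pixel'].shape == (784, )  # 28 * 28, scaled to [-1, 1]
    assert 0 <= sample['label'] < 10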
diff --git a/v1_api_demo/mnist/mnist_util.py b/v1_api_demo/mnist/mnist_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fd88ae7edc821296ca0accbf6dedc083e411744
--- /dev/null
+++ b/v1_api_demo/mnist/mnist_util.py
@@ -0,0 +1,30 @@
+import numpy
+
+__all__ = ['read_from_mnist']
+
+
+def read_from_mnist(filename):
+    imgf = filename + "-images-idx3-ubyte"
+    labelf = filename + "-labels-idx1-ubyte"
+    f = open(imgf, "rb")
+    l = open(labelf, "rb")
+
+    f.read(16)  # skip the 16-byte IDX image file header
+    l.read(8)  # skip the 8-byte IDX label file header
+
+    # Define the number of samples for train/test.
+    if "train" in filename:
+        n = 60000
+    else:
+        n = 10000
+
+    images = numpy.fromfile(
+        f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+    images = images / 255.0 * 2.0 - 1.0  # scale pixels to [-1.0, 1.0]
+    labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+
+    for i in xrange(n):
+        yield {"pixel": images[i, :], 'label': labels[i]}
+
+    f.close()
+    l.close()
diff --git a/v1_api_demo/mnist/train.sh b/v1_api_demo/mnist/train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ca2b1ad9eb960685b95b0f294a9b929e1a4acab1
--- /dev/null
+++ b/v1_api_demo/mnist/train.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+config=vgg_16_mnist.py
+output=./mnist_vgg_model
+log=train.log
+
+paddle train \
+--config=$config \
+--dot_period=10 \
+--log_period=100 \
+--test_all_data_in_one_period=1 \
+--use_gpu=0 \
+--trainer_count=1 \
+--num_passes=100 \
+--save_dir=$output \
+2>&1 | tee $log
+paddle usage -l $log -e $? -n "mnist_train" >/dev/null 2>&1
+
+python -m paddle.utils.plotcurve -i $log > plot.png
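A related note on mnist_util.py above: instead of hard-coding n = 60000 / 10000, the sample count could be read from the IDX header itself, whose first 8 bytes are a big-endian magic number and item count. A pure-stdlib sketch:

    import struct

    def idx_count(path):
        with open(path, 'rb') as f:
            magic, count = struct.unpack('>ii', f.read(8))
            return count

    print idx_count('./data/raw_data/train-labels-idx1-ubyte')  # 60000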
diff --git a/v1_api_demo/mnist/vgg_16_mnist.py b/v1_api_demo/mnist/vgg_16_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..a819b391c690fb473801eb2e7ba3161cc31b5b4b
--- /dev/null
+++ b/v1_api_demo/mnist/vgg_16_mnist.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+is_predict = get_config_arg("is_predict", bool, False)
+
+#################### Data Configuration ####################
+
+if not is_predict:
+    data_dir = './data/'
+    define_py_data_sources2(
+        train_list=data_dir + 'train.list',
+        test_list=data_dir + 'test.list',
+        module='mnist_provider',
+        obj='process')
+
+#################### Algorithm Configuration ####################
+settings(
+    batch_size=128,
+    learning_rate=0.1 / 128.0,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * 128))
+
+#################### Network Configuration ####################
+
+data_size = 1 * 28 * 28
+label_size = 10
+img = data_layer(name='pixel', size=data_size)
+
+# small_vgg is predefined in trainer_config_helpers.network
+predict = small_vgg(input_image=img, num_channels=1, num_classes=label_size)
+
+if not is_predict:
+    lbl = data_layer(name="label", size=label_size)
+    inputs(img, lbl)
+    outputs(classification_cost(input=predict, label=lbl))
+else:
+    outputs(predict)
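Both configs branch on is_predict, but this patch ships no prediction driver. Below is a hedged sketch of how one might start, following the parse_config / createFromConfigProto path that api_train.py already uses; it assumes the v1 config_args mechanism feeds get_config_arg, and the exact calls are illustrative rather than definitive.

    import py_paddle.swig_paddle as api
    from paddle.trainer.config_parser import parse_config

    api.initPaddle("-use_gpu=false")
    # 'is_predict=1' makes the config emit only the inference network.
    conf = parse_config('vgg_16_mnist.py', 'is_predict=1')
    network = api.GradientMachine.createFromConfigProto(conf.model_config)
    network.start()
    # ... load trained parameters and call network.forward(...) here ...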