diff --git a/v1_api_demo/README.md b/v1_api_demo/README.md deleted file mode 100644 index 0460a85fae078800332982751a5d4a9644c50bd6..0000000000000000000000000000000000000000 --- a/v1_api_demo/README.md +++ /dev/null @@ -1,5 +0,0 @@ -The examples in v1_api_demo are using v1_api currently, and will be upgraded to v2_api later. -Thus, v1_api_demo is a temporary directory. We decide not to maintain it and will delete it in future. - -Please go to [PaddlePaddle/book](https://github.com/PaddlePaddle/book) and -[PaddlePaddle/models](https://github.com/PaddlePaddle/models) to learn PaddlePaddle. diff --git a/v1_api_demo/gan/.gitignore b/v1_api_demo/gan/.gitignore deleted file mode 100644 index 93a6f5080a16a601cffb0bff51af9aef3ba3bae7..0000000000000000000000000000000000000000 --- a/v1_api_demo/gan/.gitignore +++ /dev/null @@ -1,11 +0,0 @@ -output/ -uniform_params/ -cifar_params/ -mnist_params/ -*.png -.pydevproject -.project -*.log -*.pyc -data/mnist_data/ -data/cifar-10-batches-py/ diff --git a/v1_api_demo/gan/README.md b/v1_api_demo/gan/README.md deleted file mode 100644 index 1908b534b0c1f63904d5503399b961d74ce0037c..0000000000000000000000000000000000000000 --- a/v1_api_demo/gan/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Generative Adversarial Networks (GAN) - -This demo implements GAN training described in the original GAN paper (https://arxiv.org/abs/1406.2661) and DCGAN (https://arxiv.org/abs/1511.06434). - -The general training procedures are implemented in gan_trainer.py. The neural network configurations are specified in gan_conf.py (for synthetic data) and gan_conf_image.py (for image data). - -In order to run the model, first download the corresponding data by running the shell script in ./data. -Then you can run the command below. The flag -d specifies the training data (cifar, mnist or uniform) and flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu). - -$python gan_trainer.py -d cifar --use_gpu 1 - -The generated images will be stored in ./cifar_samples/ -The corresponding models will be stored in ./cifar_params/ diff --git a/v1_api_demo/gan/data/download_cifar.sh b/v1_api_demo/gan/data/download_cifar.sh deleted file mode 100755 index bbadc7c10c73e45a0948018b8812f79040d14bc4..0000000000000000000000000000000000000000 --- a/v1_api_demo/gan/data/download_cifar.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz -tar zxf cifar-10-python.tar.gz -rm cifar-10-python.tar.gz diff --git a/v1_api_demo/gan/data/get_mnist_data.sh b/v1_api_demo/gan/data/get_mnist_data.sh deleted file mode 100755 index a77c81bf5af9ddb6634ff89460797ca543c5e517..0000000000000000000000000000000000000000 --- a/v1_api_demo/gan/data/get_mnist_data.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env sh -# This script downloads the mnist data and unzips it. -set -e -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -rm -rf "$DIR/mnist_data" -mkdir "$DIR/mnist_data" -cd "$DIR/mnist_data" - -echo "Downloading..." - -for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte -do - if [ ! -e $fname ]; then - wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz - gunzip ${fname}.gz - fi -done diff --git a/v1_api_demo/gan/gan_conf.py b/v1_api_demo/gan/gan_conf.py deleted file mode 100644 index 86ac2dffe5f4490a88e12d1fa5e8cd9fa61a69f4..0000000000000000000000000000000000000000 --- a/v1_api_demo/gan/gan_conf.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle.trainer_config_helpers import * - -mode = get_config_arg("mode", str, "generator") -assert mode in set([ - "generator", "discriminator", "generator_training", "discriminator_training" -]) - -is_generator_training = mode == "generator_training" -is_discriminator_training = mode == "discriminator_training" -is_generator = mode == "generator" -is_discriminator = mode == "discriminator" - -# The network structure below follows the ref https://arxiv.org/abs/1406.2661 -# Here we used two hidden layers and batch_norm - -print('mode=%s' % mode) -# the dim of the noise (z) as the input of the generator network -noise_dim = 10 -# the dim of the hidden layer -hidden_dim = 10 -# the dim of the generated sample -sample_dim = 2 - -settings( - batch_size=128, - learning_rate=1e-4, - learning_method=AdamOptimizer(beta1=0.5)) - - -def discriminator(sample): - """ - discriminator ouputs the probablity of a sample is from generator - or real data. - The output has two dimenstional: dimension 0 is the probablity - of the sample is from generator and dimension 1 is the probabblity - of the sample is from real data. - """ - param_attr = ParamAttr(is_static=is_generator_training) - bias_attr = ParamAttr( - is_static=is_generator_training, initial_mean=1.0, initial_std=0) - - hidden = fc_layer( - input=sample, - name="dis_hidden", - size=hidden_dim, - bias_attr=bias_attr, - param_attr=param_attr, - act=ReluActivation()) - - hidden2 = fc_layer( - input=hidden, - name="dis_hidden2", - size=hidden_dim, - bias_attr=bias_attr, - param_attr=param_attr, - act=LinearActivation()) - - hidden_bn = batch_norm_layer( - hidden2, - act=ReluActivation(), - name="dis_hidden_bn", - bias_attr=bias_attr, - param_attr=ParamAttr( - is_static=is_generator_training, initial_mean=1.0, - initial_std=0.02), - use_global_stats=False) - - return fc_layer( - input=hidden_bn, - name="dis_prob", - size=2, - bias_attr=bias_attr, - param_attr=param_attr, - act=SoftmaxActivation()) - - -def generator(noise): - """ - generator generates a sample given noise - """ - param_attr = ParamAttr(is_static=is_discriminator_training) - bias_attr = ParamAttr( - is_static=is_discriminator_training, initial_mean=1.0, initial_std=0) - - hidden = fc_layer( - input=noise, - name="gen_layer_hidden", - size=hidden_dim, - bias_attr=bias_attr, - param_attr=param_attr, - act=ReluActivation()) - - hidden2 = fc_layer( - input=hidden, - name="gen_hidden2", - size=hidden_dim, - bias_attr=bias_attr, - param_attr=param_attr, - act=LinearActivation()) - - hidden_bn = batch_norm_layer( - hidden2, - act=ReluActivation(), - name="gen_layer_hidden_bn", - bias_attr=bias_attr, - param_attr=ParamAttr( - is_static=is_discriminator_training, - initial_mean=1.0, - initial_std=0.02), - use_global_stats=False) - - return fc_layer( - input=hidden_bn, - name="gen_layer1", - size=sample_dim, - bias_attr=bias_attr, - param_attr=param_attr, - act=LinearActivation()) - - -if is_generator_training: - noise = data_layer(name="noise", size=noise_dim) - sample = generator(noise) - -if is_discriminator_training: - sample = data_layer(name="sample", size=sample_dim) - -if is_generator_training or is_discriminator_training: - label = data_layer(name="label", size=1) - prob = discriminator(sample) - cost = cross_entropy(input=prob, label=label) - classification_error_evaluator( - input=prob, label=label, name=mode + '_error') - outputs(cost) - -if is_generator: - noise = data_layer(name="noise", size=noise_dim) - outputs(generator(noise)) diff --git a/v1_api_demo/gan/gan_conf_image.py b/v1_api_demo/gan/gan_conf_image.py deleted file mode 100644 index c469227994c1a84d1aa73e03bbc74ebeac41d30e..0000000000000000000000000000000000000000 --- a/v1_api_demo/gan/gan_conf_image.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle.trainer_config_helpers import * - -mode = get_config_arg("mode", str, "generator") -dataSource = get_config_arg("data", str, "mnist") -assert mode in set([ - "generator", "discriminator", "generator_training", "discriminator_training" -]) - -is_generator_training = mode == "generator_training" -is_discriminator_training = mode == "discriminator_training" -is_generator = mode == "generator" -is_discriminator = mode == "discriminator" - -# The network structure below follows the dcgan paper -# (https://arxiv.org/abs/1511.06434) - -print('mode=%s' % mode) -# the dim of the noise (z) as the input of the generator network -noise_dim = 100 -# the number of filters in the layer in generator/discriminator that is -# closet to the image -gf_dim = 64 -df_dim = 64 -if dataSource == "mnist": - sample_dim = 28 # image dim - c_dim = 1 # image color -else: - sample_dim = 32 - c_dim = 3 -s2, s4 = int(sample_dim / 2), int(sample_dim / 4), -s8, s16 = int(sample_dim / 8), int(sample_dim / 16) - -settings( - batch_size=128, - learning_rate=2e-4, - learning_method=AdamOptimizer(beta1=0.5)) - - -def conv_bn(input, - channels, - imgSize, - num_filters, - output_x, - stride, - name, - param_attr, - bias_attr, - param_attr_bn, - bn, - trans=False, - act=ReluActivation()): - """ - conv_bn is a utility function that constructs a convolution/deconv layer - with an optional batch_norm layer - - :param bn: whether to use batch_norm_layer - :type bn: bool - :param trans: whether to use conv (False) or deconv (True) - :type trans: bool - """ - - # calculate the filter_size and padding size based on the given - # imgSize and ouput size - tmp = imgSize - (output_x - 1) * stride - if tmp <= 1 or tmp > 5: - raise ValueError("conv input-output dimension does not fit") - elif tmp <= 3: - filter_size = tmp + 2 - padding = 1 - else: - filter_size = tmp - padding = 0 - - print(imgSize, output_x, stride, filter_size, padding) - - if trans: - nameApx = "_convt" - else: - nameApx = "_conv" - - if bn: - conv = img_conv_layer( - input, - filter_size=filter_size, - num_filters=num_filters, - name=name + nameApx, - num_channels=channels, - act=LinearActivation(), - groups=1, - stride=stride, - padding=padding, - bias_attr=bias_attr, - param_attr=param_attr, - shared_biases=True, - layer_attr=None, - filter_size_y=None, - stride_y=None, - padding_y=None, - trans=trans) - - conv_bn = batch_norm_layer( - conv, - act=act, - name=name + nameApx + "_bn", - bias_attr=bias_attr, - param_attr=param_attr_bn, - use_global_stats=False) - - return conv_bn - else: - conv = img_conv_layer( - input, - filter_size=filter_size, - num_filters=num_filters, - name=name + nameApx, - num_channels=channels, - act=act, - groups=1, - stride=stride, - padding=padding, - bias_attr=bias_attr, - param_attr=param_attr, - shared_biases=True, - layer_attr=None, - filter_size_y=None, - stride_y=None, - padding_y=None, - trans=trans) - return conv - - -def generator(noise): - """ - generator generates a sample given noise - """ - param_attr = ParamAttr( - is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.02) - bias_attr = ParamAttr( - is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.0) - - param_attr_bn = ParamAttr( - is_static=is_discriminator_training, initial_mean=1.0, initial_std=0.02) - - h1 = fc_layer( - input=noise, - name="gen_layer_h1", - size=s8 * s8 * gf_dim * 4, - bias_attr=bias_attr, - param_attr=param_attr, - act=LinearActivation()) - - h1_bn = batch_norm_layer( - h1, - act=ReluActivation(), - name="gen_layer_h1_bn", - bias_attr=bias_attr, - param_attr=param_attr_bn, - use_global_stats=False) - - h2_bn = conv_bn( - h1_bn, - channels=gf_dim * 4, - output_x=s8, - num_filters=gf_dim * 2, - imgSize=s4, - stride=2, - name="gen_layer_h2", - param_attr=param_attr, - bias_attr=bias_attr, - param_attr_bn=param_attr_bn, - bn=True, - trans=True) - - h3_bn = conv_bn( - h2_bn, - channels=gf_dim * 2, - output_x=s4, - num_filters=gf_dim, - imgSize=s2, - stride=2, - name="gen_layer_h3", - param_attr=param_attr, - bias_attr=bias_attr, - param_attr_bn=param_attr_bn, - bn=True, - trans=True) - - return conv_bn( - h3_bn, - channels=gf_dim, - output_x=s2, - num_filters=c_dim, - imgSize=sample_dim, - stride=2, - name="gen_layer_h4", - param_attr=param_attr, - bias_attr=bias_attr, - param_attr_bn=param_attr_bn, - bn=False, - trans=True, - act=TanhActivation()) - - -def discriminator(sample): - """ - discriminator ouputs the probablity of a sample is from generator - or real data. - The output has two dimenstional: dimension 0 is the probablity - of the sample is from generator and dimension 1 is the probabblity - of the sample is from real data. - """ - param_attr = ParamAttr( - is_static=is_generator_training, initial_mean=0.0, initial_std=0.02) - bias_attr = ParamAttr( - is_static=is_generator_training, initial_mean=0.0, initial_std=0.0) - - param_attr_bn = ParamAttr( - is_static=is_generator_training, initial_mean=1.0, initial_std=0.02) - - h0 = conv_bn( - sample, - channels=c_dim, - imgSize=sample_dim, - num_filters=df_dim, - output_x=s2, - stride=2, - name="dis_h0", - param_attr=param_attr, - bias_attr=bias_attr, - param_attr_bn=param_attr_bn, - bn=False) - - h1_bn = conv_bn( - h0, - channels=df_dim, - imgSize=s2, - num_filters=df_dim * 2, - output_x=s4, - stride=2, - name="dis_h1", - param_attr=param_attr, - bias_attr=bias_attr, - param_attr_bn=param_attr_bn, - bn=True) - - h2_bn = conv_bn( - h1_bn, - channels=df_dim * 2, - imgSize=s4, - num_filters=df_dim * 4, - output_x=s8, - stride=2, - name="dis_h2", - param_attr=param_attr, - bias_attr=bias_attr, - param_attr_bn=param_attr_bn, - bn=True) - - return fc_layer( - input=h2_bn, - name="dis_prob", - size=2, - bias_attr=bias_attr, - param_attr=param_attr, - act=SoftmaxActivation()) - - -if is_generator_training: - noise = data_layer(name="noise", size=noise_dim) - sample = generator(noise) - -if is_discriminator_training: - sample = data_layer(name="sample", size=sample_dim * sample_dim * c_dim) - -if is_generator_training or is_discriminator_training: - label = data_layer(name="label", size=1) - prob = discriminator(sample) - cost = cross_entropy(input=prob, label=label) - classification_error_evaluator( - input=prob, label=label, name=mode + '_error') - outputs(cost) - -if is_generator: - noise = data_layer(name="noise", size=noise_dim) - outputs(generator(noise)) diff --git a/v1_api_demo/gan/gan_trainer.py b/v1_api_demo/gan/gan_trainer.py deleted file mode 100644 index 4a26c230f7a21cc6dd4a3cdb52e32730b1ce73ca..0000000000000000000000000000000000000000 --- a/v1_api_demo/gan/gan_trainer.py +++ /dev/null @@ -1,349 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import random -import numpy -import cPickle -import sys, os -from PIL import Image - -from paddle.trainer.config_parser import parse_config -from paddle.trainer.config_parser import logger -import py_paddle.swig_paddle as api -import matplotlib.pyplot as plt - - -def plot2DScatter(data, outputfile): - ''' - Plot the data as a 2D scatter plot and save to outputfile - data needs to be two dimensinoal - ''' - x = data[:, 0] - y = data[:, 1] - logger.info("The mean vector is %s" % numpy.mean(data, 0)) - logger.info("The std vector is %s" % numpy.std(data, 0)) - - heatmap, xedges, yedges = numpy.histogram2d(x, y, bins=50) - extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]] - - plt.clf() - plt.scatter(x, y) - plt.savefig(outputfile, bbox_inches='tight') - - -def CHECK_EQ(a, b): - assert a == b, "a=%s, b=%s" % (a, b) - - -def copy_shared_parameters(src, dst): - ''' - copy the parameters from src to dst - :param src: the source of the parameters - :type src: GradientMachine - :param dst: the destination of the parameters - :type dst: GradientMachine - ''' - src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())] - src_params = dict([(p.getName(), p) for p in src_params]) - - for i in xrange(dst.getParameterSize()): - dst_param = dst.getParameter(i) - src_param = src_params.get(dst_param.getName(), None) - if src_param is None: - continue - src_value = src_param.getBuf(api.PARAMETER_VALUE) - dst_value = dst_param.getBuf(api.PARAMETER_VALUE) - CHECK_EQ(len(src_value), len(dst_value)) - dst_value.copyFrom(src_value) - dst_param.setValueUpdated() - - -def print_parameters(src): - src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())] - - print "***************" - for p in src_params: - print "Name is %s" % p.getName() - print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray( - ) - - -def load_mnist_data(imageFile): - f = open(imageFile, "rb") - f.read(16) - - # Define number of samples for train/test - if "train" in imageFile: - n = 60000 - else: - n = 10000 - - data = numpy.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)) - data = data / 255.0 * 2.0 - 1.0 - - f.close() - return data.astype('float32') - - -def load_cifar_data(cifar_path): - batch_size = 10000 - data = numpy.zeros((5 * batch_size, 32 * 32 * 3), dtype="float32") - for i in range(1, 6): - file = cifar_path + "/data_batch_" + str(i) - fo = open(file, 'rb') - dict = cPickle.load(fo) - fo.close() - data[(i - 1) * batch_size:(i * batch_size), :] = dict["data"] - - data = data / 255.0 * 2.0 - 1.0 - return data - - -# synthesize 2-D uniform data -def load_uniform_data(): - data = numpy.random.rand(1000000, 2).astype('float32') - return data - - -def merge(images, size): - if images.shape[1] == 28 * 28: - h, w, c = 28, 28, 1 - else: - h, w, c = 32, 32, 3 - img = numpy.zeros((h * size[0], w * size[1], c)) - for idx in xrange(size[0] * size[1]): - i = idx % size[1] - j = idx // size[1] - img[j*h:j*h+h, i*w:i*w+w, :] = \ - ((images[idx, :].reshape((h, w, c), order="F").transpose(1, 0, 2) + 1.0) / 2.0 * 255.0) - return img.astype('uint8') - - -def save_images(images, path): - merged_img = merge(images, [8, 8]) - if merged_img.shape[2] == 1: - im = Image.fromarray(numpy.squeeze(merged_img)).convert('RGB') - else: - im = Image.fromarray(merged_img, mode="RGB") - im.save(path) - - -def get_real_samples(batch_size, data_np): - return data_np[numpy.random.choice( - data_np.shape[0], batch_size, replace=False), :] - - -def get_noise(batch_size, noise_dim): - return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32') - - -def get_fake_samples(generator_machine, batch_size, noise): - gen_inputs = api.Arguments.createArguments(1) - gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise)) - gen_outputs = api.Arguments.createArguments(0) - generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST) - fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat() - return fake_samples - - -def get_training_loss(training_machine, inputs): - outputs = api.Arguments.createArguments(0) - training_machine.forward(inputs, outputs, api.PASS_TEST) - loss = outputs.getSlotValue(0).copyToNumpyMat() - return numpy.mean(loss) - - -def prepare_discriminator_data_batch_pos(batch_size, data_np): - real_samples = get_real_samples(batch_size, data_np) - labels = numpy.ones(batch_size, dtype='int32') - inputs = api.Arguments.createArguments(2) - inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(real_samples)) - inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels)) - return inputs - - -def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise): - fake_samples = get_fake_samples(generator_machine, batch_size, noise) - labels = numpy.zeros(batch_size, dtype='int32') - inputs = api.Arguments.createArguments(2) - inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(fake_samples)) - inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels)) - return inputs - - -def prepare_generator_data_batch(batch_size, noise): - label = numpy.ones(batch_size, dtype='int32') - inputs = api.Arguments.createArguments(2) - inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise)) - inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(label)) - return inputs - - -def find(iterable, cond): - for item in iterable: - if cond(item): - return item - return None - - -def get_layer_size(model_conf, layer_name): - layer_conf = find(model_conf.layers, lambda x: x.name == layer_name) - assert layer_conf is not None, "Cannot find '%s' layer" % layer_name - return layer_conf.size - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("-d", "--data_source", help="mnist or cifar or uniform") - parser.add_argument( - "--use_gpu", default="1", help="1 means use gpu for training") - parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter") - args = parser.parse_args() - data_source = args.data_source - use_gpu = args.use_gpu - assert data_source in ["mnist", "cifar", "uniform"] - assert use_gpu in ["0", "1"] - - if not os.path.exists("./%s_samples/" % data_source): - os.makedirs("./%s_samples/" % data_source) - - if not os.path.exists("./%s_params/" % data_source): - os.makedirs("./%s_params/" % data_source) - - api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10', - '--log_period=100', '--gpu_id=' + args.gpu_id, - '--save_dir=' + "./%s_params/" % data_source) - - if data_source == "uniform": - conf = "gan_conf.py" - num_iter = 10000 - else: - conf = "gan_conf_image.py" - num_iter = 1000 - - gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source) - dis_conf = parse_config(conf, - "mode=discriminator_training,data=" + data_source) - generator_conf = parse_config(conf, "mode=generator,data=" + data_source) - batch_size = dis_conf.opt_config.batch_size - noise_dim = get_layer_size(gen_conf.model_config, "noise") - - if data_source == "mnist": - data_np = load_mnist_data("./data/mnist_data/train-images-idx3-ubyte") - elif data_source == "cifar": - data_np = load_cifar_data("./data/cifar-10-batches-py/") - else: - data_np = load_uniform_data() - - # this creates a gradient machine for discriminator - dis_training_machine = api.GradientMachine.createFromConfigProto( - dis_conf.model_config) - # this create a gradient machine for generator - gen_training_machine = api.GradientMachine.createFromConfigProto( - gen_conf.model_config) - - # generator_machine is used to generate data only, which is used for - # training discriminator - logger.info(str(generator_conf.model_config)) - generator_machine = api.GradientMachine.createFromConfigProto( - generator_conf.model_config) - - dis_trainer = api.Trainer.create(dis_conf, dis_training_machine) - - gen_trainer = api.Trainer.create(gen_conf, gen_training_machine) - - dis_trainer.startTrain() - gen_trainer.startTrain() - - # Sync parameters between networks (GradientMachine) at the beginning - copy_shared_parameters(gen_training_machine, dis_training_machine) - copy_shared_parameters(gen_training_machine, generator_machine) - - # constrain that either discriminator or generator can not be trained - # consecutively more than MAX_strike times - curr_train = "dis" - curr_strike = 0 - MAX_strike = 5 - - for train_pass in xrange(100): - dis_trainer.startTrainPass() - gen_trainer.startTrainPass() - for i in xrange(num_iter): - # Do forward pass in discriminator to get the dis_loss - noise = get_noise(batch_size, noise_dim) - data_batch_dis_pos = prepare_discriminator_data_batch_pos( - batch_size, data_np) - dis_loss_pos = get_training_loss(dis_training_machine, - data_batch_dis_pos) - - data_batch_dis_neg = prepare_discriminator_data_batch_neg( - generator_machine, batch_size, noise) - dis_loss_neg = get_training_loss(dis_training_machine, - data_batch_dis_neg) - - dis_loss = (dis_loss_pos + dis_loss_neg) / 2.0 - - # Do forward pass in generator to get the gen_loss - data_batch_gen = prepare_generator_data_batch(batch_size, noise) - gen_loss = get_training_loss(gen_training_machine, data_batch_gen) - - if i % 100 == 0: - print "d_pos_loss is %s d_neg_loss is %s" % (dis_loss_pos, - dis_loss_neg) - print "d_loss is %s g_loss is %s" % (dis_loss, gen_loss) - - # Decide which network to train based on the training history - # And the relative size of the loss - if (not (curr_train == "dis" and curr_strike == MAX_strike)) and \ - ((curr_train == "gen" and curr_strike == MAX_strike) or dis_loss > gen_loss): - if curr_train == "dis": - curr_strike += 1 - else: - curr_train = "dis" - curr_strike = 1 - dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_neg) - dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos) - copy_shared_parameters(dis_training_machine, - gen_training_machine) - - else: - if curr_train == "gen": - curr_strike += 1 - else: - curr_train = "gen" - curr_strike = 1 - gen_trainer.trainOneDataBatch(batch_size, data_batch_gen) - # TODO: add API for paddle to allow true parameter sharing between different GradientMachines - # so that we do not need to copy shared parameters. - copy_shared_parameters(gen_training_machine, - dis_training_machine) - copy_shared_parameters(gen_training_machine, generator_machine) - - dis_trainer.finishTrainPass() - gen_trainer.finishTrainPass() - # At the end of each pass, save the generated samples/images - fake_samples = get_fake_samples(generator_machine, batch_size, noise) - if data_source == "uniform": - plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" % - (data_source, train_pass)) - else: - save_images(fake_samples, "./%s_samples/train_pass%s.png" % - (data_source, train_pass)) - dis_trainer.finishTrain() - gen_trainer.finishTrain() - - -if __name__ == '__main__': - main() diff --git a/v1_api_demo/mnist/.gitignore b/v1_api_demo/mnist/.gitignore deleted file mode 100644 index 7e61d5e3a0cabd46d4185454d46610ac2ee2e63f..0000000000000000000000000000000000000000 --- a/v1_api_demo/mnist/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -data/raw_data -data/*.list -mnist_vgg_model -plot.png -train.log -*pyc -.ipynb_checkpoints -params.pkl -params.tar -params.tar.gz diff --git a/v1_api_demo/mnist/api_train.py b/v1_api_demo/mnist/api_train.py deleted file mode 100644 index e42c6cbb7e0eed4f3a3625f18d79b3de64fd8e26..0000000000000000000000000000000000000000 --- a/v1_api_demo/mnist/api_train.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. -""" -A very basic example for how to use current Raw SWIG API to train mnist network. - -Current implementation uses Raw SWIG, which means the API call is directly \ -passed to C++ side of Paddle. - -The user api could be simpler and carefully designed. -""" -import random - -import numpy as np -import paddle.v2 as paddle_v2 -import py_paddle.swig_paddle as api -from paddle.trainer_config_helpers import * -from py_paddle import DataProviderConverter - -from mnist_util import read_from_mnist - - -def init_parameter(network): - assert isinstance(network, api.GradientMachine) - for each_param in network.getParameters(): - assert isinstance(each_param, api.Parameter) - array_size = len(each_param) - array = np.random.uniform(-1.0, 1.0, array_size).astype('float32') - each_param.getBuf(api.PARAMETER_VALUE).copyFromNumpyArray(array) - - -def generator_to_batch(generator, batch_size): - ret_val = list() - for each_item in generator: - ret_val.append(each_item) - if len(ret_val) == batch_size: - yield ret_val - ret_val = list() - if len(ret_val) != 0: - yield ret_val - - -class BatchPool(object): - def __init__(self, generator, batch_size): - self.data = list(generator) - self.batch_size = batch_size - - def __call__(self): - random.shuffle(self.data) - for offset in xrange(0, len(self.data), self.batch_size): - limit = min(offset + self.batch_size, len(self.data)) - yield self.data[offset:limit] - - -def input_order_converter(generator): - for each_item in generator: - yield each_item['pixel'], each_item['label'] - - -def main(): - api.initPaddle("-use_gpu=false", "-trainer_count=4") # use 4 cpu cores - - optimizer = paddle_v2.optimizer.Adam( - learning_rate=1e-4, - batch_size=1000, - model_average=ModelAverage(average_window=0.5), - regularization=L2Regularization(rate=0.5)) - - # Create Local Updater. Local means not run in cluster. - # For a cluster training, here we can change to createRemoteUpdater - # in future. - updater = optimizer.create_local_updater() - assert isinstance(updater, api.ParameterUpdater) - - # define network - images = paddle_v2.layer.data( - name='pixel', type=paddle_v2.data_type.dense_vector(784)) - label = paddle_v2.layer.data( - name='label', type=paddle_v2.data_type.integer_value(10)) - hidden1 = paddle_v2.layer.fc(input=images, size=200) - hidden2 = paddle_v2.layer.fc(input=hidden1, size=200) - inference = paddle_v2.layer.fc(input=hidden2, - size=10, - act=paddle_v2.activation.Softmax()) - cost = paddle_v2.layer.classification_cost(input=inference, label=label) - - # Create Simple Gradient Machine. - model_config = paddle_v2.layer.parse_network(cost) - m = api.GradientMachine.createFromConfigProto(model_config, - api.CREATE_MODE_NORMAL, - optimizer.enable_types()) - - # This type check is not useful. Only enable type hint in IDE. - # Such as PyCharm - assert isinstance(m, api.GradientMachine) - - # Initialize Parameter by numpy. - init_parameter(network=m) - - # Initialize ParameterUpdater. - updater.init(m) - - # DataProvider Converter is a utility convert Python Object to Paddle C++ - # Input. The input format is as same as Paddle's DataProvider. - converter = DataProviderConverter(input_types=[images.type, label.type]) - - train_file = './data/raw_data/train' - test_file = './data/raw_data/t10k' - - # start gradient machine. - # the gradient machine must be started before invoke forward/backward. - # not just for training, but also for inference. - m.start() - - # evaluator can print error rate, etc. It is a C++ class. - batch_evaluator = m.makeEvaluator() - test_evaluator = m.makeEvaluator() - - # Get Train Data. - # TrainData will stored in a data pool. Currently implementation is not care - # about memory, speed. Just a very naive implementation. - train_data_generator = input_order_converter(read_from_mnist(train_file)) - train_data = BatchPool(train_data_generator, 512) - - # outArgs is Neural Network forward result. Here is not useful, just passed - # to gradient_machine.forward - outArgs = api.Arguments.createArguments(0) - - for pass_id in xrange(2): # we train 2 passes. - updater.startPass() - - for batch_id, data_batch in enumerate(train_data()): - # data_batch is input images. - # here, for online learning, we could get data_batch from network. - - # Start update one batch. - pass_type = updater.startBatch(len(data_batch)) - - # Start BatchEvaluator. - # batch_evaluator can be used between start/finish. - batch_evaluator.start() - - # forwardBackward is a shortcut for forward and backward. - # It is sometimes faster than invoke forward/backward separately, - # because in GradientMachine, it may be async. - m.forwardBackward(converter(data_batch), outArgs, pass_type) - - for each_param in m.getParameters(): - updater.update(each_param) - - # Get cost. We use numpy to calculate total cost for this batch. - cost_vec = outArgs.getSlotValue(0) - cost_vec = cost_vec.copyToNumpyMat() - cost = cost_vec.sum() / len(data_batch) - - # Make evaluator works. - m.eval(batch_evaluator) - - # Print logs. - print 'Pass id', pass_id, 'Batch id', batch_id, 'with cost=', \ - cost, batch_evaluator - - batch_evaluator.finish() - # Finish batch. - # * will clear gradient. - # * ensure all values should be updated. - updater.finishBatch(cost) - - # testing stage. use test data set to test current network. - updater.apply() - test_evaluator.start() - test_data_generator = input_order_converter(read_from_mnist(test_file)) - for data_batch in generator_to_batch(test_data_generator, 512): - # in testing stage, only forward is needed. - m.forward(converter(data_batch), outArgs, api.PASS_TEST) - m.eval(test_evaluator) - - # print error rate for test data set - print 'Pass', pass_id, ' test evaluator: ', test_evaluator - test_evaluator.finish() - updater.restore() - - updater.catchUpWith() - params = m.getParameters() - for each_param in params: - assert isinstance(each_param, api.Parameter) - value = each_param.getBuf(api.PARAMETER_VALUE) - value = value.copyToNumpyArray() - - # Here, we could save parameter to every where you want - print each_param.getName(), value - - updater.finishPass() - - m.finish() - - -if __name__ == '__main__': - main() diff --git a/v1_api_demo/mnist/data/generate_list.py b/v1_api_demo/mnist/data/generate_list.py deleted file mode 100644 index 49981cc7a93308bc96ad5097eba749440e958525..0000000000000000000000000000000000000000 --- a/v1_api_demo/mnist/data/generate_list.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -o = open("./" + "train.list", "w") -o.write("./data/raw_data/train" + "\n") -o.close() - -o = open("./" + "test.list", "w") -o.write("./data/raw_data/t10k" + "\n") -o.close() diff --git a/v1_api_demo/mnist/data/get_mnist_data.sh b/v1_api_demo/mnist/data/get_mnist_data.sh deleted file mode 100755 index 5a2e34026d4fe7f8315d4f5453bec7c4ee4f6885..0000000000000000000000000000000000000000 --- a/v1_api_demo/mnist/data/get_mnist_data.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env sh -# This scripts downloads the mnist data and unzips it. -set -e -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -rm -rf "$DIR/raw_data" -mkdir "$DIR/raw_data" -cd "$DIR/raw_data" - -echo "Downloading..." - -for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte -do - if [ ! -e $fname ]; then - wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz - gunzip ${fname}.gz - fi -done - -cd $DIR -rm -f *.list -python generate_list.py diff --git a/v1_api_demo/mnist/light_mnist.py b/v1_api_demo/mnist/light_mnist.py deleted file mode 100644 index 33409054357d2f0c6a765b3ab3164eb2e584467e..0000000000000000000000000000000000000000 --- a/v1_api_demo/mnist/light_mnist.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg("is_predict", bool, False) - -####################Data Configuration ################## - -if not is_predict: - data_dir = './data/' - define_py_data_sources2( - train_list=data_dir + 'train.list', - test_list=data_dir + 'test.list', - module='mnist_provider', - obj='process') - -######################Algorithm Configuration ############# -settings(batch_size=50, learning_rate=0.001, learning_method=AdamOptimizer()) - -#######################Network Configuration ############# - -data_size = 1 * 28 * 28 -label_size = 10 -img = data_layer(name='pixel', size=data_size) - - -# light cnn -# A shallower cnn model: [CNN, BN, ReLU, Max-Pooling] x4 + FC x1 -# Easier to train for mnist dataset and quite efficient -# Final performance is close to deeper ones on tasks such as digital and character classification -def light_cnn(input_image, num_channels, num_classes): - def __light__(ipt, - num_filter=128, - times=1, - conv_filter_size=3, - dropouts=0, - num_channels_=None): - return img_conv_group( - input=ipt, - num_channels=num_channels_, - pool_size=2, - pool_stride=2, - conv_padding=0, - conv_num_filter=[num_filter] * times, - conv_filter_size=conv_filter_size, - conv_act=ReluActivation(), - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type=MaxPooling()) - - tmp = __light__(input_image, num_filter=128, num_channels_=num_channels) - tmp = __light__(tmp, num_filter=128) - tmp = __light__(tmp, num_filter=128) - tmp = __light__(tmp, num_filter=128, conv_filter_size=1) - - tmp = fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation()) - return tmp - - -predict = light_cnn(input_image=img, num_channels=1, num_classes=label_size) - -if not is_predict: - lbl = data_layer(name="label", size=label_size) - inputs(img, lbl) - outputs(classification_cost(input=predict, label=lbl)) -else: - outputs(predict) diff --git a/v1_api_demo/mnist/mnist_provider.py b/v1_api_demo/mnist/mnist_provider.py deleted file mode 100644 index 4192339837620aada84b64a92fef3e05953971c2..0000000000000000000000000000000000000000 --- a/v1_api_demo/mnist/mnist_provider.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. -from paddle.trainer.PyDataProvider2 import * -from mnist_util import read_from_mnist - - -# Define a py data provider -@provider( - input_types={'pixel': dense_vector(28 * 28), - 'label': integer_value(10)}, - cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, filename): # settings is not used currently. - for each in read_from_mnist(filename): - yield each diff --git a/v1_api_demo/mnist/mnist_util.py b/v1_api_demo/mnist/mnist_util.py deleted file mode 100644 index 3fd88ae7edc821296ca0accbf6dedc083e411744..0000000000000000000000000000000000000000 --- a/v1_api_demo/mnist/mnist_util.py +++ /dev/null @@ -1,30 +0,0 @@ -import numpy - -__all__ = ['read_from_mnist'] - - -def read_from_mnist(filename): - imgf = filename + "-images-idx3-ubyte" - labelf = filename + "-labels-idx1-ubyte" - f = open(imgf, "rb") - l = open(labelf, "rb") - - f.read(16) - l.read(8) - - # Define number of samples for train/test - if "train" in filename: - n = 60000 - else: - n = 10000 - - images = numpy.fromfile( - f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32') - images = images / 255.0 * 2.0 - 1.0 - labels = numpy.fromfile(l, 'ubyte', count=n).astype("int") - - for i in xrange(n): - yield {"pixel": images[i, :], 'label': labels[i]} - - f.close() - l.close() diff --git a/v1_api_demo/mnist/train.sh b/v1_api_demo/mnist/train.sh deleted file mode 100755 index ca2b1ad9eb960685b95b0f294a9b929e1a4acab1..0000000000000000000000000000000000000000 --- a/v1_api_demo/mnist/train.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -config=vgg_16_mnist.py -output=./mnist_vgg_model -log=train.log - -paddle train \ ---config=$config \ ---dot_period=10 \ ---log_period=100 \ ---test_all_data_in_one_period=1 \ ---use_gpu=0 \ ---trainer_count=1 \ ---num_passes=100 \ ---save_dir=$output \ -2>&1 | tee $log -paddle usage -l $log -e $? -n "mnist_train" >/dev/null 2>&1 - -python -m paddle.utils.plotcurve -i $log > plot.png diff --git a/v1_api_demo/mnist/vgg_16_mnist.py b/v1_api_demo/mnist/vgg_16_mnist.py deleted file mode 100644 index a819b391c690fb473801eb2e7ba3161cc31b5b4b..0000000000000000000000000000000000000000 --- a/v1_api_demo/mnist/vgg_16_mnist.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg("is_predict", bool, False) - -####################Data Configuration ################## - -if not is_predict: - data_dir = './data/' - define_py_data_sources2( - train_list=data_dir + 'train.list', - test_list=data_dir + 'test.list', - module='mnist_provider', - obj='process') - -######################Algorithm Configuration ############# -settings( - batch_size=128, - learning_rate=0.1 / 128.0, - learning_method=MomentumOptimizer(0.9), - regularization=L2Regularization(0.0005 * 128)) - -#######################Network Configuration ############# - -data_size = 1 * 28 * 28 -label_size = 10 -img = data_layer(name='pixel', size=data_size) - -# small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, num_channels=1, num_classes=label_size) - -if not is_predict: - lbl = data_layer(name="label", size=label_size) - inputs(img, lbl) - outputs(classification_cost(input=predict, label=lbl)) -else: - outputs(predict) diff --git a/v1_api_demo/model_zoo/embedding/.gitignore b/v1_api_demo/model_zoo/embedding/.gitignore deleted file mode 100644 index 908f5a3fb2f7c34368ea24d0fc3ac9cac29a4fdb..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/embedding/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -baidu.dict -model_*.emb diff --git a/v1_api_demo/model_zoo/embedding/extract_para.py b/v1_api_demo/model_zoo/embedding/extract_para.py deleted file mode 100755 index 570b90c1f772c8f6abfc6cda02560fd3471ef0b6..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/embedding/extract_para.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/bin/env python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Example: - python extract_para.py --preModel PREMODEL --preDict PREDICT \ - --usrModel USRMODEL --usrDict USRDICT -d DIM - -Options: - -h, --help show this help message and exit - --preModel PREMODEL the name of pretrained embedding model - --preDict PREDICT the name of pretrained dictionary - --usrModel usrModel the name of output usr embedding model - --usrDict usrDict the name of user specified dictionary - -d DIM dimension of parameter -""" -from optparse import OptionParser -import struct - - -def get_row_index(preDict, usrDict): - """ - Get the row positions for all words in user dictionary from pre-trained dictionary. - return: a list of row positions - Example: preDict='a\nb\nc\n', usrDict='a\nc\n', then return [0,2] - """ - pos = [] - index = dict() - with open(preDict, "r") as f: - for line_index, line in enumerate(f): - word = line.strip().split()[0] - index[word] = line_index - with open(usrDict, "r") as f: - for line in f: - word = line.strip().split()[0] - pos.append(index[word]) - return pos - - -def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, - paraDim): - """ - Extract desired parameters from a pretrained embedding model based on user dictionary - """ - if paraDim not in [32, 64, 128, 256]: - raise RuntimeError("We only support 32, 64, 128, 256 dimensions now") - - fi = open(preModel, "rb") - fo = open(usrModel, "wb") - - # write filehead - rowIndex = get_row_index(preDict, usrDict) - newHead = struct.pack("iil", 0, 4, len(rowIndex) * paraDim) - fo.write(newHead) - bytes = 4 * paraDim - for i in range(0, len(rowIndex)): - # find the absolute position of input file - fi.seek(rowIndex[i] * bytes + 16, 0) - fo.write(fi.read(bytes)) - - print "extract parameters finish, total", len(rowIndex), "lines" - fi.close() - - -def main(): - """ - Main entry for running paraconvert.py - """ - usage = "usage: \n" \ - "python %prog --preModel PREMODEL --preDict PREDICT" \ - " --usrModel USRMODEL --usrDict USRDICT -d DIM" - parser = OptionParser(usage) - parser.add_option( - "--preModel", - action="store", - dest="preModel", - help="the name of pretrained embedding model") - parser.add_option( - "--preDict", - action="store", - dest="preDict", - help="the name of pretrained dictionary") - parser.add_option( - "--usrModel", - action="store", - dest="usrModel", - help="the name of output usr embedding model") - parser.add_option( - "--usrDict", - action="store", - dest="usrDict", - help="the name of user specified dictionary") - parser.add_option( - "-d", action="store", dest="dim", help="dimension of parameter") - (options, args) = parser.parse_args() - extract_parameters_by_usrDict(options.preModel, options.preDict, - options.usrModel, options.usrDict, - int(options.dim)) - - -if __name__ == '__main__': - main() diff --git a/v1_api_demo/model_zoo/embedding/paraconvert.py b/v1_api_demo/model_zoo/embedding/paraconvert.py deleted file mode 100755 index ce7a70efc43d7f85708f1e12bb94739f3588370c..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/embedding/paraconvert.py +++ /dev/null @@ -1,159 +0,0 @@ -#!/bin/env python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Example: - python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM - python paraconvert.py --t2b -i INPUT -o OUTPUT - -Options: - -h, --help show this help message and exit - --b2t convert parameter file of embedding model from binary to text - --t2b convert parameter file of embedding model from text to binary - -i INPUT input parameter file name - -o OUTPUT output parameter file name - -d DIM dimension of parameter -""" -from optparse import OptionParser -import struct - - -def binary2text(input, output, paraDim): - """ - Convert a binary parameter file of embedding model to be a text file. - input: the name of input binary parameter file, the format is: - 1) the first 16 bytes is filehead: - version(4 bytes): version of paddle, default = 0 - floatSize(4 bytes): sizeof(float) = 4 - paraCount(8 bytes): total number of parameter - 2) the next (paraCount * 4) bytes is parameters, each has 4 bytes - output: the name of output text parameter file, for example: - 0,4,32156096 - -0.7845433,1.1937413,-0.1704215,... - 0.0000909,0.0009465,-0.0008813,... - ... - the format is: - 1) the first line is filehead: - version=0, floatSize=4, paraCount=32156096 - 2) other lines print the paramters - a) each line prints paraDim paramters splitted by ',' - b) there is paraCount/paraDim lines (embedding words) - paraDim: dimension of parameters - """ - fi = open(input, "rb") - fo = open(output, "w") - """ - """ - version, floatSize, paraCount = struct.unpack("iil", fi.read(16)) - newHead = ','.join([str(version), str(floatSize), str(paraCount)]) - print >> fo, newHead - - bytes = 4 * int(paraDim) - format = "%df" % int(paraDim) - context = fi.read(bytes) - line = 0 - - while context: - numbers = struct.unpack(format, context) - lst = [] - for i in numbers: - lst.append('%8.7f' % i) - print >> fo, ','.join(lst) - context = fi.read(bytes) - line += 1 - fi.close() - fo.close() - print "binary2text finish, total", line, "lines" - - -def get_para_count(input): - """ - Compute the total number of embedding parameters in input text file. - input: the name of input text file - """ - numRows = 1 - paraDim = 0 - with open(input) as f: - line = f.readline() - paraDim = len(line.split(",")) - for line in f: - numRows += 1 - return numRows * paraDim - - -def text2binary(input, output, paddle_head=True): - """ - Convert a text parameter file of embedding model to be a binary file. - input: the name of input text parameter file, for example: - -0.7845433,1.1937413,-0.1704215,... - 0.0000909,0.0009465,-0.0008813,... - ... - the format is: - 1) it doesn't have filehead - 2) each line stores the same dimension of parameters, - the separator is commas ',' - output: the name of output binary parameter file, the format is: - 1) the first 16 bytes is filehead: - version(4 bytes), floatSize(4 bytes), paraCount(8 bytes) - 2) the next (paraCount * 4) bytes is parameters, each has 4 bytes - """ - fi = open(input, "r") - fo = open(output, "wb") - - newHead = struct.pack("iil", 0, 4, get_para_count(input)) - fo.write(newHead) - - count = 0 - for line in fi: - line = line.strip().split(",") - for i in range(0, len(line)): - binary_data = struct.pack("f", float(line[i])) - fo.write(binary_data) - count += 1 - fi.close() - fo.close() - print "text2binary finish, total", count, "lines" - - -def main(): - """ - Main entry for running paraconvert.py - """ - usage = "usage: \n" \ - "python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \ - "python %prog --t2b -i INPUT -o OUTPUT" - parser = OptionParser(usage) - parser.add_option( - "--b2t", - action="store_true", - help="convert parameter file of embedding model from binary to text") - parser.add_option( - "--t2b", - action="store_true", - help="convert parameter file of embedding model from text to binary") - parser.add_option( - "-i", action="store", dest="input", help="input parameter file name") - parser.add_option( - "-o", action="store", dest="output", help="output parameter file name") - parser.add_option( - "-d", action="store", dest="dim", help="dimension of parameter") - (options, args) = parser.parse_args() - if options.b2t: - binary2text(options.input, options.output, options.dim) - if options.t2b: - text2binary(options.input, options.output) - - -if __name__ == '__main__': - main() diff --git a/v1_api_demo/model_zoo/embedding/pre_DictAndModel.sh b/v1_api_demo/model_zoo/embedding/pre_DictAndModel.sh deleted file mode 100755 index f61c65a935c76032a06613cfe0b50f1c90bc50d9..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/embedding/pre_DictAndModel.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -set -x -BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding' - -DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb) -ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3 - f88c8325ee6da6187f1080e8fe66c1cd - 927cf70f27f860aff1a5703ebf7f1584 - a52e43655cd25d279777ed509a1ae27b - b92c67fe9ff70fea53596080e351ac80) - -for ((i=0; i<${#ITEM_MD5[@]}; i++)) -do - FILENAME=${DOWNLOAD_ITEMS[${i}]} - REAL_MD5=`wget ${BASE_URL}/${FILENAME} -O - | tee ${FILENAME} | md5sum | cut -d ' ' -f 1` - EXPECTED_MD5=${ITEM_MD5[${i}]} - [ "${EXPECTED_MD5}" = "${REAL_MD5}" ] -done diff --git a/v1_api_demo/model_zoo/resnet/.gitignore b/v1_api_demo/model_zoo/resnet/.gitignore deleted file mode 100644 index 7a64209b62340a5c5a51626821028e63ed5e588e..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -fea_output/ -features/ -model.list -ResNet_50.dot -ResNet_50.png diff --git a/v1_api_demo/model_zoo/resnet/classify.py b/v1_api_demo/model_zoo/resnet/classify.py deleted file mode 100755 index 6074cc1d3a85e13e3e8d336d81e22104f9d8e7cf..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/classify.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import cPickle -import logging -from PIL import Image -import numpy as np -from optparse import OptionParser - -import paddle.utils.image_util as image_util - -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import dense_vector -from paddle.trainer.config_parser import parse_config - -logging.basicConfig( - format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') -logging.getLogger().setLevel(logging.INFO) - - -class ImageClassifier(): - def __init__(self, - train_conf, - model_dir=None, - resize_dim=256, - crop_dim=224, - use_gpu=True, - mean_file=None, - output_layer=None, - oversample=False, - is_color=True): - """ - train_conf: network configure. - model_dir: string, directory of model. - resize_dim: int, resized image size. - crop_dim: int, crop size. - mean_file: string, image mean file. - oversample: bool, oversample means multiple crops, namely five - patches (the four corner patches and the center - patch) as well as their horizontal reflections, - ten crops in all. - """ - self.train_conf = train_conf - self.model_dir = model_dir - if model_dir is None: - self.model_dir = os.path.dirname(train_conf) - - self.resize_dim = resize_dim - self.crop_dims = [crop_dim, crop_dim] - self.oversample = oversample - self.is_color = is_color - - self.output_layer = output_layer - if self.output_layer: - assert isinstance(self.output_layer, basestring) - self.output_layer = self.output_layer.split(",") - - self.transformer = image_util.ImageTransformer(is_color=is_color) - self.transformer.set_transpose((2, 0, 1)) - self.transformer.set_channel_swap((2, 1, 0)) - - self.mean_file = mean_file - if self.mean_file is not None: - mean = np.load(self.mean_file)['data_mean'] - mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1]) - self.transformer.set_mean(mean) # mean pixel - else: - # if you use three mean value, set like: - # this three mean value is calculated from ImageNet. - self.transformer.set_mean(np.array([103.939, 116.779, 123.68])) - - conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (int(use_gpu)) - conf = parse_config(train_conf, conf_args) - swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu))) - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - assert isinstance(self.network, swig_paddle.GradientMachine) - self.network.loadParameters(self.model_dir) - - data_size = 3 * self.crop_dims[0] * self.crop_dims[1] - slots = [dense_vector(data_size)] - self.converter = DataProviderConverter(slots) - - def get_data(self, img_path): - """ - 1. load image from img_path. - 2. resize or oversampling. - 3. transformer data: transpose, channel swap, sub mean. - return K x H x W ndarray. - - img_path: image path. - """ - image = image_util.load_image(img_path, self.is_color) - # Another way to extract oversampled features is that - # cropping and averaging from large feature map which is - # calculated by large size of image. - # This way reduces the computation. - if self.oversample: - # image_util.resize_image: short side is self.resize_dim - image = image_util.resize_image(image, self.resize_dim) - image = np.array(image) - input = np.zeros( - (1, image.shape[0], image.shape[1], 3), dtype=np.float32) - input[0] = image.astype(np.float32) - input = image_util.oversample(input, self.crop_dims) - else: - image = image.resize(self.crop_dims, Image.ANTIALIAS) - input = np.zeros( - (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32) - input[0] = np.array(image).astype(np.float32) - - data_in = [] - for img in input: - img = self.transformer.transformer(img).flatten() - data_in.append([img.tolist()]) - # paddle input: [[[]],[[]],...], [[]] is one sample. - return data_in - - def forward(self, input_data): - """ - return output arguments which are the Outputs() in network configure. - - input_data: py_paddle input data. - call forward. - """ - in_arg = self.converter(input_data) - return self.network.forwardTest(in_arg) - - def forward(self, data, output_layer): - """ - return output arguments which are the Outputs() in network configure. - - input_data: py_paddle input data. - call forward. - """ - input = self.converter(data) - self.network.forwardTest(input) - output = self.network.getLayerOutputs(output_layer) - res = {} - if isinstance(output_layer, basestring): - output_layer = [output_layer] - for name in output_layer: - # For oversampling, average predictions across crops. - # If not, the shape of output[name]: (1, class_number), - # the mean is also applicable. - res[name] = output[name]['value'].mean(0) - - return res - - def predict(self, data_file): - """ - call forward and predicting. - - data_file: input image list. - """ - image_files = open(data_file, 'rb').readlines() - results = {} - if self.output_layer is None: - self.output_layer = ["output"] - for line in image_files: - image = line.split()[0] - data = self.get_data(image) - prob = self.forward(data, self.output_layer) - lab = np.argsort(-prob[self.output_layer[0]]) - results[image] = lab[0] - logging.info("Label of %s is: %d", image, lab[0]) - return results - - def extract(self, data_file, output_dir, batch_size=10000): - """ - extract and save features of output layers, which are - specify in Outputs() in network configure. - - data_file: file name of input data. - output_dir: saved directory of extracted features. - batch_size: sample number of one batch file. - """ - if not os.path.exists(output_dir): - os.mkdir(output_dir) - - sample_num = 0 - batch_num = 0 - image_feature = {} - image_files = open(data_file, 'rb').readlines() - for idx, line in enumerate(image_files): - image = line.split()[0] - data = self.get_data(image) - feature = self.forward(data, self.output_layer) - # save extracted features - file_name = image.split("/")[-1] - image_feature[file_name] = feature - sample_num += 1 - if sample_num == batch_size: - batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num)) - self.save_file(image_feature, batch_name) - logging.info('Finish batch %d', batch_num) - batch_num += 1 - sample_num = 0 - image_feature = {} - if idx % 1000 == 0: - logging.info('%d/%d, %s', idx, len(image_files), file_name) - if sample_num > 0: - batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num)) - self.save_file(image_feature, batch_name) - logging.info('Finish batch %d', batch_num) - logging.info('Done: make image feature batch') - - def save_file(self, data, file): - of = open(file, 'wb') - cPickle.dump(data, of, protocol=cPickle.HIGHEST_PROTOCOL) - - -def option_parser(): - """ - Main entry for predciting - """ - usage = "%prog -c config -i data_list -w model_dir [options]" - parser = OptionParser(usage="usage: %s" % usage) - parser.add_option( - "-j", - "--job", - action="store", - dest="job_type", - help="job type: predict, extract\ - predict: predicting,\ - extract: extract features") - parser.add_option( - "-c", - "--conf", - action="store", - dest="train_conf", - help="network config") - parser.add_option( - "-i", "--data", action="store", dest="data_file", help="image list") - parser.add_option( - "-w", - "--model", - action="store", - dest="model_path", - default=None, - help="model path") - parser.add_option( - "-g", - "--use_gpu", - action="store", - dest="use_gpu", - default=True, - help="Whether to use gpu mode.") - parser.add_option( - "-o", - "--output_dir", - action="store", - dest="output_dir", - default="output", - help="output path") - parser.add_option( - "-m", - "--mean", - action="store", - dest="mean", - default=None, - help="mean file.") - parser.add_option( - "-p", - "--multi_crop", - action="store_true", - dest="multi_crop", - default=False, - help="Wether to use multiple crops on image.") - parser.add_option("-l", "--output_layer", action="store", - dest="output_layer", default=None, - help="--job=extract, specify layers to extract "\ - "features, --job=predict, specify layer of " - "classification probability, output in resnet.py.") - return parser.parse_args() - - -def main(): - """ - 1. parse input arguments. - 2. predicting or extract features according job type. - """ - options, args = option_parser() - obj = ImageClassifier( - options.train_conf, - options.model_path, - use_gpu=options.use_gpu, - mean_file=options.mean, - output_layer=options.output_layer, - oversample=options.multi_crop) - if options.job_type == "predict": - obj.predict(options.data_file) - - elif options.job_type == "extract": - obj.extract(options.data_file, options.output_dir) - - -if __name__ == '__main__': - main() diff --git a/v1_api_demo/model_zoo/resnet/example/.gitignore b/v1_api_demo/model_zoo/resnet/example/.gitignore deleted file mode 100644 index 4a2b5962a6800f251cba655c026331f14648c86e..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/example/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*image_list_provider_copy_1.py diff --git a/v1_api_demo/model_zoo/resnet/example/__init__.py b/v1_api_demo/model_zoo/resnet/example/__init__.py deleted file mode 100644 index f662d6826321eb840739382558f76327d27b5847..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/example/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/v1_api_demo/model_zoo/resnet/example/cat.jpg b/v1_api_demo/model_zoo/resnet/example/cat.jpg deleted file mode 100644 index 47b01db90eddc46ff845f10bc2accaf2364c272d..0000000000000000000000000000000000000000 Binary files a/v1_api_demo/model_zoo/resnet/example/cat.jpg and /dev/null differ diff --git a/v1_api_demo/model_zoo/resnet/example/dog.jpg b/v1_api_demo/model_zoo/resnet/example/dog.jpg deleted file mode 100644 index b9cc33cf069da5c453b97dbb7383838edd07c199..0000000000000000000000000000000000000000 Binary files a/v1_api_demo/model_zoo/resnet/example/dog.jpg and /dev/null differ diff --git a/v1_api_demo/model_zoo/resnet/example/image_list_provider.py b/v1_api_demo/model_zoo/resnet/example/image_list_provider.py deleted file mode 100644 index 2cd8eb8bf850f41282ed5db2885dc0b7218c79f7..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/example/image_list_provider.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.utils.image_util import * -from paddle.trainer.PyDataProvider2 import * - - -def hook(settings, image_size, crop_size, color, file_list, is_train, **kwargs): - """ - Description: Init with a list of data file - file_list is the name list of input files. - kwargs["load_data_args"] is the value of 'load_data_args' - which can be set in config. - Each args is separated by a column. - image_size: the crop image size. - mean_meta: the path of the meta file to store the mean image. - mean_value: can be mean value, not a file. - can not set mean_meta and mean_value at the same time. - color: 'color' means a color image. Otherwise, it means a gray image. - is_train: whether the data provider is used for training. - Data argumentation might be different for training and testing. - """ - settings.img_size = image_size - settings.crop_size = crop_size - settings.mean_img_size = settings.crop_size - settings.color = color # default is color - settings.is_train = is_train - - settings.is_swap_channel = kwargs.get('swap_channel', None) - if settings.is_swap_channel is not None: - settings.swap_channel = settings.is_swap_channel - settings.is_swap_channel = True - - if settings.color: - settings.img_input_size = settings.crop_size * settings.crop_size * 3 - else: - settings.img_input_size = settings.crop_size * settings.crop_size - - settings.file_list = file_list - settings.mean_meta = kwargs.get('mean_meta', None) - settings.mean_value = kwargs.get('mean_value', None) - # can not specify both mean_meta and mean_value. - assert not (settings.mean_meta and settings.mean_value) - if not settings.mean_meta: - settings.mean_value = kwargs.get('mean_value') - sz = settings.crop_size * settings.crop_size - settings.img_mean = np.zeros(sz * 3, dtype=np.single) - for idx, value in enumerate(settings.mean_value): - settings.img_mean[idx * sz:(idx + 1) * sz] = value - settings.img_mean = settings.img_mean.reshape(3, settings.crop_size, - settings.crop_size) - - else: - settings.img_mean = load_meta(settings.mean_meta, - settings.mean_img_size, - settings.crop_size, settings.color) - - settings.input_types = [ - dense_vector(settings.img_input_size), # image feature - integer_value(1) - ] # labels - - settings.logger.info('Image short side: %s', settings.img_size) - settings.logger.info('Crop size: %s', settings.crop_size) - settings.logger.info('Meta path: %s', settings.mean_meta) - if settings.is_swap_channel: - settings.logger.info('swap channel: %s', settings.swap_channel) - settings.logger.info('DataProvider Initialization finished') - - -@provider(init_hook=hook, should_shuffle=False) -def processData(settings, file_list): - """ - The main function for loading data. - Load the batch, iterate all the images and labels in this batch. - file_name: the batch file name. - """ - img_path, lab = file_list.strip().split(' ') - img = Image.open(img_path) - img.load() - img = img.resize((settings.img_size, settings.img_size), Image.ANTIALIAS) - img = np.array(img).astype(np.float32) - if len(img.shape) == 3: - img = np.swapaxes(img, 1, 2) - img = np.swapaxes(img, 1, 0) - # swap channel - if settings.is_swap_channel: - img = img[settings.swap_channel, :, :] - img_feat = preprocess_img(img, settings.img_mean, settings.crop_size, - settings.is_train, settings.color) - yield img_feat.tolist(), int(lab.strip()) diff --git a/v1_api_demo/model_zoo/resnet/example/test.list b/v1_api_demo/model_zoo/resnet/example/test.list deleted file mode 100644 index 30bbf630b640a26239fc104c9c08f6ebc9dfaa82..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/example/test.list +++ /dev/null @@ -1,2 +0,0 @@ -example/dog.jpg 0 -example/cat.jpg 0 diff --git a/v1_api_demo/model_zoo/resnet/extract_fea_c++.sh b/v1_api_demo/model_zoo/resnet/extract_fea_c++.sh deleted file mode 100755 index 5447aa92dfb5facd3433eb4a1893e96e3c786c73..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/extract_fea_c++.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -#set names of layer which you want to extract feature -#in Outputs() of resnet.py -#like: Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn") -layer_num=50 -configure=./resnet.py -model_path=./model/resnet_$layer_num -fea_dir=fea_output -#Output is text file. -#Each line is one sample's features. -#If you set N layer names in Outputs() -#each line contains N features sperated by ";". - -# create model list file. -model_list=./model.list -touch $model_list | echo $model_path > $model_list - -paddle train \ - --local=true \ - --job=test \ - --config=$configure \ - --model_list=$model_list \ - --use_gpu=1 \ - --predict_output_dir=$fea_dir \ - --config_args=is_test=1,layer_num=$layer_num diff --git a/v1_api_demo/model_zoo/resnet/extract_fea_py.sh b/v1_api_demo/model_zoo/resnet/extract_fea_py.sh deleted file mode 100755 index 2e87152f7f8598f487870291271cdee646105044..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/extract_fea_py.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -#Note if you use CPU mode, you need to set use_gpu=0 in classify.py. like this: -#conf_args = "is_test=0,use_gpu=1,is_predict=1" -#conf = parse_config(train_conf, conf_args) -#swig_paddle.initPaddle("--use_gpu=0") -python classify.py \ - --job=extract \ - --conf=resnet.py \ - --use_gpu=1 \ - --mean=model/mean_meta_224/mean.meta \ - --model=model/resnet_50 \ - --data=./example/test.list \ - --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \ - --output_dir=features diff --git a/v1_api_demo/model_zoo/resnet/get_model.sh b/v1_api_demo/model_zoo/resnet/get_model.sh deleted file mode 100755 index b33d8178ab7859fc0b0d514fb19bec2c28a77c3d..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/get_model.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR - -mkdir model -cd model - -echo "Downloading ResNet models..." - -for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz -do - wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file - tar -xvf $file - rm $file -done - -echo "Done." diff --git a/v1_api_demo/model_zoo/resnet/load_feature.py b/v1_api_demo/model_zoo/resnet/load_feature.py deleted file mode 100644 index 5d3d0c0d30ef710c37c98e93a51b2f813d636b59..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/load_feature.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import cPickle -import logging - -logging.basicConfig( - format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') -logging.getLogger().setLevel(logging.INFO) - - -def load_feature_c(file): - """ - Load feature extracted by C++ interface. - Return a list. - file: feature file. - """ - features = [] - f = open(file, 'r') - for line in f: - sample = [] - for slot in line.strip().split(";"): - fea = [float(val) for val in slot.strip().split()] - if fea: - sample.append(fea) - features.append(sample) - f.close() - return features - - -def load_feature_py(feature_dir): - """ - Load feature extracted by python interface. - Return a dictionary. - feature_dir: directory of feature file. - """ - file_list = os.listdir(feature_dir) - file_list = [os.path.join(feature_dir, f) for f in file_list] - features = {} - for file_name in file_list: - with open(file_name, 'rb') as f: - feature = cPickle.load(f) - features.update(feature) - logging.info('Load feature file %s', file_name) - return features - - -if __name__ == '__main__': - print load_feature_py(sys.argv[1]) - #print load_feature_c(sys.argv[1]) diff --git a/v1_api_demo/model_zoo/resnet/net_diagram.sh b/v1_api_demo/model_zoo/resnet/net_diagram.sh deleted file mode 100755 index 1b06ffa44eec8a0f312420c35699d3902f9a6400..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/net_diagram.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -:' -Visual deep residual network -1. Using make_model_diagram.py to generate dot file. -2. Using graphviz to convert dot file. - -Usage: -./net_diagram.sh -' - -set -e - -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR - -img_type=png -img_fileprefix=ResNet_50 -conf_filename=resnet.py -dot_filename=ResNet_50.dot -config_str="layer_num=50,data_provider=0" - -python -m paddle.utils.make_model_diagram $conf_filename $dot_filename $config_str - -# If you have installed graphviz, running like this: -# dot -Tpng -o ResNet.png ResNet.dot diff --git a/v1_api_demo/model_zoo/resnet/predict.sh b/v1_api_demo/model_zoo/resnet/predict.sh deleted file mode 100755 index 2b67b17c48c60cc8a7b7c46a1c80a3f2bf281870..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/predict.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -python classify.py \ - --job=predict \ - --conf=resnet.py\ - --model=model/resnet_50 \ - --multi_crop \ - --use_gpu=1 \ - --data=./example/test.list diff --git a/v1_api_demo/model_zoo/resnet/resnet.py b/v1_api_demo/model_zoo/resnet/resnet.py deleted file mode 100644 index 6fdd97fefc62392c93ecffae0fc918e8dc4b18c5..0000000000000000000000000000000000000000 --- a/v1_api_demo/model_zoo/resnet/resnet.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * -""" -paper: https://arxiv.org/abs/1512.03385 -""" -is_test = get_config_arg("is_test", bool, False) -is_predict = get_config_arg("is_predict", bool, False) -data_provider = get_config_arg("data_provider", bool, True) -layer_num = get_config_arg("layer_num", int, 50) - -if not is_predict and data_provider: - train_list = 'train.list' if not is_test else None - # mean.meta is mean file of ImageNet dataset. - # mean.meta size : 3 x 224 x 224. - # If you use three mean value, set like: - # "mean_value:103.939,116.779,123.68;" - args = { - 'mean_meta': "model/mean_meta_224/mean.meta", - 'image_size': 224, - 'crop_size': 224, - 'color': True, - 'swap_channel:': [2, 1, 0] - } - define_py_data_sources2( - train_list, - 'example/test.list', - module="example.image_list_provider", - obj="processData", - args=args) - -batch_size = 1 -learning_rate = 0.1 / batch_size -momentum = 0.9 -weight_decay = 0.0001 * batch_size -default_momentum(momentum) -default_decay_rate(weight_decay) - -Settings( - algorithm='sgd', - batch_size=batch_size, - learning_rate=learning_rate, - - # set the appropriate parameters according your schedule - learning_method='momentum', - learning_rate_decay_a=0.5, - learning_rate_decay_b=1200000 * 10, - learning_rate_schedule="discexp", ) - - -def conv_bn_layer(name, - input, - filter_size, - num_filters, - stride, - padding, - channels=None, - active_type=ReluActivation()): - """ - A wrapper for conv layer with batch normalization layers. - Note: - conv layer has no activation. - """ - - tmp = img_conv_layer( - name=name + "_conv", - input=input, - filter_size=filter_size, - num_channels=channels, - num_filters=num_filters, - stride=stride, - padding=padding, - act=LinearActivation(), - bias_attr=False) - return batch_norm_layer( - name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test) - - -def bottleneck_block(name, input, num_filters1, num_filters2): - """ - A wrapper for bottlenect building block in ResNet. - Last conv_bn_layer has no activation. - Addto layer has activation of relu. - """ - last_name = conv_bn_layer( - name=name + '_branch2a', - input=input, - filter_size=1, - num_filters=num_filters1, - stride=1, - padding=0) - last_name = conv_bn_layer( - name=name + '_branch2b', - input=last_name, - filter_size=3, - num_filters=num_filters1, - stride=1, - padding=1) - last_name = conv_bn_layer( - name=name + '_branch2c', - input=last_name, - filter_size=1, - num_filters=num_filters2, - stride=1, - padding=0, - active_type=LinearActivation()) - - return addto_layer( - name=name + "_addto", input=[input, last_name], act=ReluActivation()) - - -def mid_projection(name, input, num_filters1, num_filters2, stride=2): - """ - A wrapper for middile projection in ResNet. - projection shortcuts are used for increasing dimensions, - and other shortcuts are identity - branch1: projection shortcuts are used for increasing - dimensions, has no activation. - branch2x: bottleneck building block, shortcuts are identity. - """ - # stride = 2 - branch1 = conv_bn_layer( - name=name + '_branch1', - input=input, - filter_size=1, - num_filters=num_filters2, - stride=stride, - padding=0, - active_type=LinearActivation()) - - last_name = conv_bn_layer( - name=name + '_branch2a', - input=input, - filter_size=1, - num_filters=num_filters1, - stride=stride, - padding=0) - last_name = conv_bn_layer( - name=name + '_branch2b', - input=last_name, - filter_size=3, - num_filters=num_filters1, - stride=1, - padding=1) - - last_name = conv_bn_layer( - name=name + '_branch2c', - input=last_name, - filter_size=1, - num_filters=num_filters2, - stride=1, - padding=0, - active_type=LinearActivation()) - - return addto_layer( - name=name + "_addto", input=[branch1, last_name], act=ReluActivation()) - - -def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3): - """ - A wrapper for 50,101,152 layers of ResNet. - res2_num: number of blocks stacked in conv2_x - res3_num: number of blocks stacked in conv3_x - res4_num: number of blocks stacked in conv4_x - res5_num: number of blocks stacked in conv5_x - """ - # For ImageNet - # conv1: 112x112 - img = data_layer(name='input', size=224 * 224 * 3) - tmp = conv_bn_layer( - "conv1", - img, - filter_size=7, - channels=3, - num_filters=64, - stride=2, - padding=3) - tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2) - - # conv2_x: 56x56 - tmp = mid_projection( - name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1) - for i in xrange(2, res2_num + 1, 1): - tmp = bottleneck_block( - name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256) - - # conv3_x: 28x28 - tmp = mid_projection( - name="res3_1", input=tmp, num_filters1=128, num_filters2=512) - for i in xrange(2, res3_num + 1, 1): - tmp = bottleneck_block( - name="res3_" + str(i), - input=tmp, - num_filters1=128, - num_filters2=512) - - # conv4_x: 14x14 - tmp = mid_projection( - name="res4_1", input=tmp, num_filters1=256, num_filters2=1024) - for i in xrange(2, res4_num + 1, 1): - tmp = bottleneck_block( - name="res4_" + str(i), - input=tmp, - num_filters1=256, - num_filters2=1024) - - # conv5_x: 7x7 - tmp = mid_projection( - name="res5_1", input=tmp, num_filters1=512, num_filters2=2048) - for i in xrange(2, res5_num + 1, 1): - tmp = bottleneck_block( - name="res5_" + str(i), - input=tmp, - num_filters1=512, - num_filters2=2048) - - tmp = img_pool_layer( - name='avgpool', - input=tmp, - pool_size=7, - stride=1, - pool_type=AvgPooling()) - - output = fc_layer( - name='output', input=tmp, size=1000, act=SoftmaxActivation()) - - if not is_predict: - classification_cost( - input=output, label=data_layer( - name='label', size=1)) - - -def res_net_50(): - deep_res_net(3, 4, 6, 3) - - -def res_net_101(): - deep_res_net(3, 4, 23, 3) - - -def res_net_152(): - deep_res_net(3, 8, 36, 3) - - -if not is_predict: - Inputs("input", "label") -else: - Inputs("input") -# Outputs("cost-softmax" if not is_predict else "output") -Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn") - -if layer_num == 50: - res_net_50() -elif layer_num == 101: - res_net_101() -elif layer_num == 152: - res_net_152() -else: - print("Wrong layer number.") diff --git a/v1_api_demo/quick_start/.gitignore b/v1_api_demo/quick_start/.gitignore deleted file mode 100644 index f71662563ff96d6227dd568d9951a90b0d09456e..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/.gitignore +++ /dev/null @@ -1,15 +0,0 @@ -*.pyc -data/dict.txt -data/dict_all.txt -data/labels.list -data/mosesdecoder-master/ -data/reviews_Electronics_5.json.gz -data/test.list -data/test.txt -data/train.list -data/train.txt -data/pred.list -data/pred.txt -dataprovider_copy_1.py -train.log -output diff --git a/v1_api_demo/quick_start/api_predict.py b/v1_api_demo/quick_start/api_predict.py deleted file mode 100755 index 9bdffe1006281c58a595e2771561ba62e4c2d6bd..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/api_predict.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os, sys -import numpy as np -from optparse import OptionParser -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import sparse_binary_vector -from paddle.trainer.config_parser import parse_config -""" -Usage: run following command to show help message. - python api_predict.py -h -""" - - -class QuickStartPrediction(): - def __init__(self, train_conf, dict_file, model_dir=None, label_file=None): - """ - train_conf: trainer configure. - dict_file: word dictionary file name. - model_dir: directory of model. - """ - self.train_conf = train_conf - self.dict_file = dict_file - self.word_dict = {} - self.dict_dim = self.load_dict() - self.model_dir = model_dir - if model_dir is None: - self.model_dir = os.path.dirname(train_conf) - - self.label = None - if label_file is not None: - self.load_label(label_file) - - conf = parse_config(train_conf, "is_predict=1") - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - self.network.loadParameters(self.model_dir) - input_types = [sparse_binary_vector(self.dict_dim)] - self.converter = DataProviderConverter(input_types) - - def load_dict(self): - """ - Load dictionary from self.dict_file. - """ - for line_count, line in enumerate(open(self.dict_file, 'r')): - self.word_dict[line.strip().split('\t')[0]] = line_count - return len(self.word_dict) - - def load_label(self, label_file): - """ - Load label. - """ - self.label = {} - for v in open(label_file, 'r'): - self.label[int(v.split('\t')[1])] = v.split('\t')[0] - - def get_index(self, data): - """ - transform word into integer index according to the dictionary. - """ - words = data.strip().split() - word_slot = [self.word_dict[w] for w in words if w in self.word_dict] - return word_slot - - def batch_predict(self, data_batch): - input = self.converter(data_batch) - output = self.network.forwardTest(input) - prob = output[0]["id"].tolist() - print("predicting labels is:") - print prob - - -def option_parser(): - usage = "python predict.py -n config -w model_dir -d dictionary -i input_file " - parser = OptionParser(usage="usage: %s [options]" % usage) - parser.add_option( - "-n", - "--tconf", - action="store", - dest="train_conf", - help="network config") - parser.add_option( - "-d", - "--dict", - action="store", - dest="dict_file", - help="dictionary file") - parser.add_option( - "-b", - "--label", - action="store", - dest="label", - default=None, - help="dictionary file") - parser.add_option( - "-c", - "--batch_size", - type="int", - action="store", - dest="batch_size", - default=1, - help="the batch size for prediction") - parser.add_option( - "-w", - "--model", - action="store", - dest="model_path", - default=None, - help="model path") - return parser.parse_args() - - -def main(): - options, args = option_parser() - train_conf = options.train_conf - batch_size = options.batch_size - dict_file = options.dict_file - model_path = options.model_path - label = options.label - swig_paddle.initPaddle("--use_gpu=0") - predict = QuickStartPrediction(train_conf, dict_file, model_path, label) - - batch = [] - labels = [] - for line in sys.stdin: - [label, text] = line.split("\t") - labels.append(int(label)) - batch.append([predict.get_index(text)]) - print("labels is:") - print labels - predict.batch_predict(batch) - - -if __name__ == '__main__': - main() diff --git a/v1_api_demo/quick_start/api_predict.sh b/v1_api_demo/quick_start/api_predict.sh deleted file mode 100755 index 4d9aa9e8854ed79446a47dbc593f419cdda077b4..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/api_predict.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -#Note the default model is pass-00002, you shold make sure the model path -#exists or change the mode path. -#only test on trainer_config.lr.py -model=output/model/pass-00001/ -config=trainer_config.lr.py -label=data/labels.list -dict=data/dict.txt -batch_size=20 -head -n$batch_size data/test.txt | python api_predict.py \ - --tconf=$config\ - --model=$model \ - --label=$label \ - --dict=$dict \ - --batch_size=$batch_size diff --git a/v1_api_demo/quick_start/api_train.py b/v1_api_demo/quick_start/api_train.py deleted file mode 100644 index 5699789daa4051661b0a72c69f4668f2d8bb9cb2..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/api_train.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import itertools -import random - -from paddle.trainer.config_parser import parse_config -from py_paddle import swig_paddle as api -from py_paddle import DataProviderConverter -from paddle.trainer.PyDataProvider2 \ - import integer_value, integer_value_sequence, sparse_binary_vector - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--train_data", type=str, required=False, help="train data file") - parser.add_argument("--test_data", type=str, help="test data file") - parser.add_argument( - "--config", type=str, required=True, help="config file name") - parser.add_argument("--dict_file", required=True, help="dictionary file") - parser.add_argument( - "--seq", default=1, type=int, help="whether use sequence training") - parser.add_argument( - "--use_gpu", default=0, type=int, help="whether use GPU for training") - parser.add_argument( - "--trainer_count", - default=1, - type=int, - help="Number of threads for training") - parser.add_argument( - "--num_passes", default=5, type=int, help="Number of training passes") - return parser.parse_args() - - -UNK_IDX = 0 - - -def load_data(file_name, word_dict): - with open(file_name, 'r') as f: - for line in f: - label, comment = line.strip().split('\t') - words = comment.split() - word_slot = [word_dict.get(w, UNK_IDX) for w in words] - yield word_slot, int(label) - - -def load_dict(dict_file): - word_dict = dict() - with open(dict_file, 'r') as f: - for i, line in enumerate(f): - w = line.strip().split()[0] - word_dict[w] = i - return word_dict - - -def main(): - options = parse_arguments() - api.initPaddle("--use_gpu=%s" % options.use_gpu, - "--trainer_count=%s" % options.trainer_count) - - word_dict = load_dict(options.dict_file) - train_dataset = list(load_data(options.train_data, word_dict)) - if options.test_data: - test_dataset = list(load_data(options.test_data, word_dict)) - else: - test_dataset = None - - trainer_config = parse_config(options.config, - "dict_file=%s" % options.dict_file) - # No need to have data provider for trainer - trainer_config.ClearField('data_config') - trainer_config.ClearField('test_data_config') - - # create a GradientMachine from the model configuratin - model = api.GradientMachine.createFromConfigProto( - trainer_config.model_config) - # create a trainer for the gradient machine - trainer = api.Trainer.create(trainer_config, model) - - # create a data converter which converts data to PaddlePaddle - # internal format - input_types = [ - integer_value_sequence(len(word_dict)) if options.seq else - sparse_binary_vector(len(word_dict)), integer_value(2) - ] - converter = DataProviderConverter(input_types) - - batch_size = trainer_config.opt_config.batch_size - trainer.startTrain() - for train_pass in xrange(options.num_passes): - trainer.startTrainPass() - random.shuffle(train_dataset) - for pos in xrange(0, len(train_dataset), batch_size): - batch = itertools.islice(train_dataset, pos, pos + batch_size) - size = min(batch_size, len(train_dataset) - pos) - trainer.trainOneDataBatch(size, converter(batch)) - trainer.finishTrainPass() - if test_dataset: - trainer.startTestPeriod() - for pos in xrange(0, len(test_dataset), batch_size): - batch = itertools.islice(test_dataset, pos, pos + batch_size) - size = min(batch_size, len(test_dataset) - pos) - trainer.testOneDataBatch(size, converter(batch)) - trainer.finishTestPeriod() - trainer.finishTrain() - - -if __name__ == '__main__': - main() diff --git a/v1_api_demo/quick_start/api_train.sh b/v1_api_demo/quick_start/api_train.sh deleted file mode 100755 index 9b2a4e2f224b1677c458ede66a6a3bac09d8ad61..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/api_train.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -# Note: if using trainer_config.emb.py, trainer_config.cnn.py -# or trainer_config.lstm.py, you need to change --seq to --seq=1 -# because they are sequence models. -python api_train.py \ - --config=trainer_config.lr.py \ - --trainer_count=2 \ - --num_passes=15 \ - --use_gpu=0 \ - --seq=0 \ - --train_data=data/train.txt \ - --test_data=data/test.txt \ - --dict_file=data/dict.txt \ - 2>&1 | tee 'train.log' diff --git a/v1_api_demo/quick_start/cluster/cluster_train.sh b/v1_api_demo/quick_start/cluster/cluster_train.sh deleted file mode 100755 index a7b1f01064b29cf6abc4cd6b706ee466a6d6da36..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/cluster/cluster_train.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -# Should run pserver.sh before run this script. -bin_dir=$(cd `dirname $0`; pwd) -home_dir=$(cd "${bin_dir}/.."; pwd) -source "$bin_dir/env.sh" - -model_dir="$bin_dir/output" -log_file="$bin_dir/train.log" - -pushd "$home_dir" -cfg=trainer_config.lr.py -paddle train \ - --start_pserver=false \ - --config=$cfg \ - --save_dir=${model_dir} \ - --trainer_count=4 \ - --local=0 \ - --log_period=100 \ - --num_passes=15 \ - --use_gpu=false \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - --num_gradient_servers=1 \ - --nics=`get_nics` \ - --port=7164 \ - --ports_num=1 \ - --pservers="127.0.0.1" \ - --comment="paddle_trainer" \ - 2>&1 | tee "$log_file" -popd diff --git a/v1_api_demo/quick_start/cluster/env.sh b/v1_api_demo/quick_start/cluster/env.sh deleted file mode 100644 index a404993835d0e479f65c89c5561855293b7b66f0..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/cluster/env.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -function get_nics() { - machine=`uname -s` - local nics="" - if [ "$machine" == "Linux" ]; then - nics="lo" - elif [ "$machine" == "Darwin" ]; then - nics="lo0" - else - nics="unsupport" - fi - echo $nics -} diff --git a/v1_api_demo/quick_start/cluster/pserver.sh b/v1_api_demo/quick_start/cluster/pserver.sh deleted file mode 100755 index b187c1d9b9108a607ed310253d54ecc096f0e792..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/cluster/pserver.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -bin_dir=$(cd `dirname $0`; pwd) -source "$bin_dir/env.sh" - -paddle pserver \ - --nics=`get_nics` \ - --port=7164 \ - --ports_num=1 \ - --ports_num_for_sparse=1 \ - --num_gradient_servers=1 \ - --comment="paddle_pserver" \ - 2>&1 | tee 'pserver.log' diff --git a/v1_api_demo/quick_start/data/README.md b/v1_api_demo/quick_start/data/README.md deleted file mode 100644 index 63abcf7ebf31903213e44cf492b93e09f61db14e..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/data/README.md +++ /dev/null @@ -1,9 +0,0 @@ -This dataset consists of electronics product reviews associated with -binary labels (positive/negative) for sentiment classification. - -The preprocessed data can be downloaded by script `get_data.sh`. -The data was derived from reviews_Electronics_5.json.gz at - -http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz - -If you want to process the raw data, you can use the script `proc_from_raw_data/get_data.sh`. diff --git a/v1_api_demo/quick_start/data/get_data.sh b/v1_api_demo/quick_start/data/get_data.sh deleted file mode 100755 index a09a18f919e5a84f1f7c889a43f0a5fbf4a60a77..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/data/get_data.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR - -# Download the preprocessed data -wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz - -# Extract package -tar zxvf preprocessed_data.tar.gz - -# Remove compressed package -rm preprocessed_data.tar.gz diff --git a/v1_api_demo/quick_start/data/proc_from_raw_data/get_data.sh b/v1_api_demo/quick_start/data/proc_from_raw_data/get_data.sh deleted file mode 100755 index d976eaebfaa600778e0ab6bb0adbd7159f1cce2f..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/data/proc_from_raw_data/get_data.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# 1. size of pos : neg = 1:1. -# 2. size of testing set = min(25k, len(all_data) * 0.1), others is traning set. -# 3. distinct train set and test set. - -set -e - -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR - -# Download data -echo "Downloading Amazon Electronics reviews data..." -# http://jmcauley.ucsd.edu/data/amazon/ -wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz -echo "Downloading mosesdecoder..." -# https://github.com/moses-smt/mosesdecoder -wget https://github.com/moses-smt/mosesdecoder/archive/master.zip - -unzip master.zip -rm master.zip - -################## -# Preprocess data -echo "Preprocess data..." -export LC_ALL=C -UNAME_STR=`uname` - -if [ ${UNAME_STR} == 'Linux' ]; then - SHUF_PROG='shuf' -else - SHUF_PROG='gshuf' -fi - -mkdir -p tmp -python preprocess.py -i reviews_Electronics_5.json.gz -# uniq and shuffle -cd tmp -echo 'Uniq and shuffle...' -cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed -cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed - -min_len=`sed -n '$=' neg.shuffed` -test_num=$((min_len/10)) -if [ $test_num -gt 12500 ];then - test_num=12500 -fi -train_num=$((min_len-test_num)) - -head -n$train_num pos.shuffed >train.pos -head -n$train_num neg.shuffed >train.neg -tail -n$test_num pos.shuffed >test.pos -tail -n$test_num neg.shuffed >test.neg - -cat train.pos train.neg | ${SHUF_PROG} >../train.txt -cat test.pos test.neg | ${SHUF_PROG} >../test.txt - -cd - -echo 'train.txt' > train.list -echo 'test.txt' > test.list - -# use 30k dict -rm -rf tmp -mv dict.txt dict_all.txt -cat dict_all.txt | head -n 30001 > dict.txt -echo 'Done.' diff --git a/v1_api_demo/quick_start/data/proc_from_raw_data/preprocess.py b/v1_api_demo/quick_start/data/proc_from_raw_data/preprocess.py deleted file mode 100755 index 5706351a21fbd15d9bbf197156bb0fdabcb07295..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/data/proc_from_raw_data/preprocess.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. -# -*- coding: UTF-8 -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -1. Tokenize the words and punctuation -2. pos sample : rating score 5; neg sample: rating score 1-2. - -Usage: - python preprocess.py -i data_file [random seed] -""" - -import sys -import os -import operator -import gzip -from subprocess import Popen, PIPE -from optparse import OptionParser -import json -from multiprocessing import Queue -from multiprocessing import Pool -import multiprocessing - -batch_size = 5000 -word_count = {} -num_tokenize = max(1, - multiprocessing.cpu_count() - 2) # parse + tokenize + save -max_queue_size = 8 -parse_queue = Queue(maxsize=max_queue_size + num_tokenize) -tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize) - - -def create_dict(data): - """ - Create dictionary based on data, and saved in data_dir/dict.txt. - The first line is unk \t -1. - data: list, input data by batch. - """ - for seq in data: - try: - for w in seq.lower().split(): - if w not in word_count: - word_count[w] = 1 - else: - word_count[w] += 1 - except: - sys.stderr.write(seq + "\tERROR\n") - - -def parse(path): - """ - Open .gz file. - """ - sys.stderr.write(path) - g = gzip.open(path, 'r') - for l in g: - yield json.loads(l) - g.close() - - -def tokenize(sentences): - """ - Use tokenizer.perl to tokenize input sentences. - tokenizer.perl is tool of Moses. - sentences : a list of input sentences. - return: a list of processed text. - """ - dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl' - if not os.path.exists(dir): - sys.exit( - "The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists." - ) - tokenizer_cmd = [dir, '-l', 'en', '-q', '-'] - assert isinstance(sentences, list) - text = "\n".join(sentences) - tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE) - tok_text, _ = tokenizer.communicate(text) - toks = tok_text.split('\n')[:-1] - return toks - - -def save_data(instance, data_dir, pre_fix, batch_num): - """ - save data by batch - """ - label = ['1' if pre_fix == 'pos' else '0' for i in range(len(instance))] - lines = ['%s\t%s' % (label[i], instance[i]) for i in range(len(label))] - file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num)) - file(file_name, 'w').write('\n'.join(lines) + '\n') - - -def tokenize_batch(id): - """ - tokenize data by batch - """ - while True: - num_batch, instance, pre_fix = parse_queue.get() - if num_batch == -1: ### parse_queue finished - tokenize_queue.put((-1, None, None)) - sys.stderr.write("Thread %s finish\n" % (id)) - break - tokenize_instance = tokenize(instance) - tokenize_queue.put((num_batch, tokenize_instance, pre_fix)) - sys.stderr.write('.') - - -def save_batch(data_dir, num_tokenize, data_dir_dict): - """ - save data by batch - build dict.txt - """ - token_count = 0 - while True: - num_batch, instance, pre_fix = tokenize_queue.get() - if num_batch == -1: - token_count += 1 - if token_count == num_tokenize: #### tokenize finished. - break - else: - continue - save_data(instance, data_dir, pre_fix, num_batch) - create_dict(instance) ## update dict - - sys.stderr.write("save file finish\n") - f = open(data_dir_dict, 'w') - f.write('%s\t%s\n' % ('unk', '-1')) - for k, v in sorted(word_count.items(), key=operator.itemgetter(1), \ - reverse=True): - f.write('%s\t%s\n' % (k, v)) - f.close() - sys.stderr.write("build dict finish\n") - - -def parse_batch(data, num_tokenize): - """ - parse data by batch - parse -> tokenize -> save - """ - raw_txt = parse(data) - neg, pos = [], [] - count = 0 - sys.stderr.write("extract raw data\n") - for l in raw_txt: - rating = l["overall"] - text = l["reviewText"].lower() # # convert words to lower case - if rating == 5.0 and text: - pos.append(text) - if rating < 3.0 and text: - neg.append(text) - if len(pos) == batch_size or len(neg) == batch_size: - if len(pos) == batch_size: - batch = pos - pre_fix = 'pos' - else: - batch = neg - pre_fix = 'neg' - - parse_queue.put((count, batch, pre_fix)) - count += 1 - if pre_fix == 'pos': - pos = [] - else: - neg = [] - - if len(pos) > 0: - parse_queue.put((count, pos, 'pos')) - count += 1 - if len(neg) > 0: - parse_queue.put((count, neg, 'neg')) - count += 1 - for i in range(num_tokenize): - parse_queue.put((-1, None, None)) #### for tokenize's input finished - sys.stderr.write("parsing finish\n") - - -def option_parser(): - parser = OptionParser(usage="usage: python preprcoess.py "\ - "-i data_path [options]") - parser.add_option( - "-i", "--data", action="store", dest="input", help="Input data path.") - parser.add_option( - "-s", - "--seed", - action="store", - dest="seed", - default=1024, - help="Set random seed.") - return parser.parse_args() - - -def main(): - reload(sys) - sys.setdefaultencoding('utf-8') - options, args = option_parser() - data = options.input - seed = options.seed - data_dir_dict = os.path.join(os.path.dirname(data), 'dict.txt') - data_dir = os.path.join(os.path.dirname(data), 'tmp') - pool = Pool(processes=num_tokenize + 2) - pool.apply_async(parse_batch, args=(data, num_tokenize)) - for i in range(num_tokenize): - pool.apply_async(tokenize_batch, args=(str(i), )) - pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict)) - pool.close() - pool.join() - - file(os.path.join(os.path.dirname(data), 'labels.list'), - 'w').write('neg\t0\npos\t1\n') - - -if __name__ == '__main__': - main() diff --git a/v1_api_demo/quick_start/dataprovider_bow.py b/v1_api_demo/quick_start/dataprovider_bow.py deleted file mode 100644 index 2745495586449b5d1eb64ae570f73eb6b14dbdfe..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/dataprovider_bow.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * - -# id of the word not in dictionary -UNK_IDX = 0 - - -# initializer is called by the framework during initialization. -# It allows the user to describe the data types and setup the -# necessary data structure for later use. -# `settings` is an object. initializer need to properly fill settings.input_types. -# initializer can also store other data structures needed to be used at process(). -# In this example, dictionary is stored in settings. -# `dictionay` and `kwargs` are arguments passed from trainer_config.lr.py -def initializer(settings, dictionary, **kwargs): - # Put the word dictionary into settings - settings.word_dict = dictionary - - # setting.input_types specifies what the data types the data provider - # generates. - settings.input_types = { - # The first input is a sparse_binary_vector, - # which means each dimension of the vector is either 0 or 1. It is the - # bag-of-words (BOW) representation of the texts. - 'word': sparse_binary_vector(len(dictionary)), - # The second input is an integer. It represents the category id of the - # sample. 2 means there are two labels in the dataset. - # (1 for positive and 0 for negative) - 'label': integer_value(2) - } - - -# Delaring a data provider. It has an initializer 'data_initialzer'. -# It will cache the generated data of the first pass in memory, so that -# during later pass, no on-the-fly data generation will be needed. -# `setting` is the same object used by initializer() -# `file_name` is the name of a file listed train_list or test_list file given -# to define_py_data_sources2(). See trainer_config.lr.py. -@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - # Open the input data file. - with open(file_name, 'r') as f: - # Read each line. - for line in f: - # Each line contains the label and text of the comment, separated by \t. - label, comment = line.strip().split('\t') - - # Split the words into a list. - words = comment.split() - - # convert the words into a list of ids by looking them up in word_dict. - word_vector = [settings.word_dict.get(w, UNK_IDX) for w in words] - - # Return the features for the current comment. The first is a list - # of ids representing a 0-1 binary sparse vector of the text, - # the second is the integer id of the label. - yield {'word': word_vector, 'label': int(label)} - - -def predict_initializer(settings, dictionary, **kwargs): - settings.word_dict = dictionary - settings.input_types = {'word': sparse_binary_vector(len(dictionary))} - - -# Declaring a data provider for prediction. The difference with process -# is that label is not generated. -@provider(init_hook=predict_initializer, should_shuffle=False) -def process_predict(settings, file_name): - with open(file_name, 'r') as f: - for line in f: - comment = line.strip().split() - word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment] - yield {'word': word_vector} diff --git a/v1_api_demo/quick_start/dataprovider_emb.py b/v1_api_demo/quick_start/dataprovider_emb.py deleted file mode 100755 index ddfa3ce9b73555cb3b7f5a44314ca35b12d41ede..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/dataprovider_emb.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * - -UNK_IDX = 0 - - -def initializer(settings, dictionary, **kwargs): - settings.word_dict = dictionary - settings.input_types = { - # Define the type of the first input as sequence of integer. - # The value of the integers range from 0 to len(dictrionary)-1 - 'word': integer_value_sequence(len(dictionary)), - # Define the second input for label id - 'label': integer_value(2) - } - - -@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - with open(file_name, 'r') as f: - for line in f: - label, comment = line.strip().split('\t') - words = comment.split() - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - yield {'word': word_slot, 'label': int(label)} - - -def predict_initializer(settings, dictionary, **kwargs): - settings.word_dict = dictionary - settings.input_types = {'word': integer_value_sequence(len(dictionary))} - - -@provider(init_hook=predict_initializer, should_shuffle=False) -def process_predict(settings, file_name): - with open(file_name, 'r') as f: - for line in f: - comment = line.strip().split() - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment] - yield {'word': word_slot} diff --git a/v1_api_demo/quick_start/predict.sh b/v1_api_demo/quick_start/predict.sh deleted file mode 100755 index e47c2dd01fb5c919203964e298018e6dc2bd366e..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/predict.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -cfg=trainer_config.lr.py -#cfg=trainer_config.emb.py -#cfg=trainer_config.cnn.py -#cfg=trainer_config.lstm.py -model="output/pass-00003" -paddle train \ - --config=$cfg \ - --use_gpu=false \ - --job=test \ - --init_model_path=$model \ - --config_args=is_predict=1 \ - --predict_output_dir=. \ -2>&1 | tee 'predict.log' -paddle usage -l 'predict.log' -e $? -n "quick_start_predict_${cfg}" >/dev/null 2>&1 - -mv rank-00000 result.txt diff --git a/v1_api_demo/quick_start/train.sh b/v1_api_demo/quick_start/train.sh deleted file mode 100755 index 01697fed48054be8ad98a01d4cbb5029e6a1ead0..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/train.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -cfg=trainer_config.lr.py -#cfg=trainer_config.emb.py -#cfg=trainer_config.cnn.py -#cfg=trainer_config.lstm.py -#cfg=trainer_config.bidi-lstm.py -#cfg=trainer_config.db-lstm.py -#cfg=trainer_config.resnet-lstm.py -paddle train \ - --config=$cfg \ - --save_dir=./output \ - --trainer_count=4 \ - --log_period=100 \ - --num_passes=15 \ - --use_gpu=false \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -paddle usage -l "train.log" -e $? -n "quick_start_${cfg}" >/dev/null 2>&1 diff --git a/v1_api_demo/quick_start/trainer_config.bidi-lstm.py b/v1_api_demo/quick_start/trainer_config.bidi-lstm.py deleted file mode 100644 index 3deff4aa00b1ea5d66097514867d1a392393a523..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/trainer_config.bidi-lstm.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. -# edit-mode: -*- python -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -dict_file = "./data/dict.txt" -word_dict = dict() -with open(dict_file, 'r') as f: - for i, line in enumerate(f): - w = line.strip().split()[0] - word_dict[w] = i - -is_predict = get_config_arg('is_predict', bool, False) -trn = 'data/train.list' if not is_predict else None -tst = 'data/test.list' if not is_predict else 'data/pred.list' -process = 'process' if not is_predict else 'process_predict' -define_py_data_sources2( - train_list=trn, - test_list=tst, - module="dataprovider_emb", - obj=process, - args={"dictionary": word_dict}) - -batch_size = 128 if not is_predict else 1 -settings( - batch_size=batch_size, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) - -bias_attr = ParamAttr(initial_std=0., l2_rate=0.) -data = data_layer(name="word", size=len(word_dict)) -emb = embedding_layer(input=data, size=128) - -bi_lstm = bidirectional_lstm(input=emb, size=128) -dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5) - -output = fc_layer( - input=dropout, size=2, bias_attr=bias_attr, act=SoftmaxActivation()) - -if is_predict: - maxid = maxid_layer(output) - outputs([maxid, output]) -else: - label = data_layer(name="label", size=2) - cls = classification_cost(input=output, label=label) - outputs(cls) diff --git a/v1_api_demo/quick_start/trainer_config.cnn.py b/v1_api_demo/quick_start/trainer_config.cnn.py deleted file mode 100644 index e09e41484d30db385a1d276b7f346b444fe79d3d..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/trainer_config.cnn.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. -# edit-mode: -*- python -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -dict_file = "./data/dict.txt" -word_dict = dict() -with open(dict_file, 'r') as f: - for i, line in enumerate(f): - w = line.strip().split()[0] - word_dict[w] = i - -is_predict = get_config_arg('is_predict', bool, False) -trn = 'data/train.list' if not is_predict else None -tst = 'data/test.list' if not is_predict else 'data/pred.list' -process = 'process' if not is_predict else 'process_predict' -define_py_data_sources2( - train_list=trn, - test_list=tst, - module="dataprovider_emb", - obj=process, - args={"dictionary": word_dict}) - -batch_size = 128 if not is_predict else 1 -settings( - batch_size=batch_size, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) - -data = data_layer(name="word", size=len(word_dict)) -embedding = embedding_layer(input=data, size=128) -conv = sequence_conv_pool(input=embedding, context_len=3, hidden_size=512) -output = fc_layer(input=conv, size=2, act=SoftmaxActivation()) -if is_predict: - maxid = maxid_layer(output) - outputs([maxid, output]) -else: - label = data_layer(name="label", size=2) - cls = classification_cost(input=output, label=label) - outputs(cls) diff --git a/v1_api_demo/quick_start/trainer_config.db-lstm.py b/v1_api_demo/quick_start/trainer_config.db-lstm.py deleted file mode 100644 index fba802b4600b33cfbfd0820cce1f47e4d0f948ae..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/trainer_config.db-lstm.py +++ /dev/null @@ -1,74 +0,0 @@ -# edit-mode: -*- python -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -dict_file = "./data/dict.txt" -word_dict = dict() -with open(dict_file, 'r') as f: - for i, line in enumerate(f): - w = line.strip().split()[0] - word_dict[w] = i - -is_predict = get_config_arg('is_predict', bool, False) -trn = 'data/train.list' if not is_predict else None -tst = 'data/test.list' if not is_predict else 'data/pred.list' -process = 'process' if not is_predict else 'process_predict' -define_py_data_sources2( - train_list=trn, - test_list=tst, - module="dataprovider_emb", - obj=process, - args={"dictionary": word_dict}) - -batch_size = 128 if not is_predict else 1 -settings( - batch_size=batch_size, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) - -bias_attr = ParamAttr(initial_std=0., l2_rate=0.) - -data = data_layer(name="word", size=len(word_dict)) -emb = embedding_layer(input=data, size=128) - -hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)]) -lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1)) - -input_layers = [hidden_0, lstm_0] - -for i in range(1, 8): - fc = fc_layer(input=input_layers, size=128) - lstm = lstmemory( - input=fc, - layer_attr=ExtraAttr(drop_rate=0.1), - reverse=(i % 2) == 1, ) - input_layers = [fc, lstm] - -lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling()) - -output = fc_layer( - input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation()) - -if is_predict: - maxid = maxid_layer(output) - outputs([maxid, output]) -else: - label = data_layer(name="label", size=2) - cls = classification_cost(input=output, label=label) - outputs(cls) diff --git a/v1_api_demo/quick_start/trainer_config.emb.py b/v1_api_demo/quick_start/trainer_config.emb.py deleted file mode 100644 index f69f98ff7fc885d3fe16d3aaf66967389b3b3240..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/trainer_config.emb.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. -# edit-mode: -*- python -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -dict_file = "./data/dict.txt" -word_dict = dict() -with open(dict_file, 'r') as f: - for i, line in enumerate(f): - w = line.strip().split()[0] - word_dict[w] = i - -is_predict = get_config_arg('is_predict', bool, False) -trn = 'data/train.list' if not is_predict else None -tst = 'data/test.list' if not is_predict else 'data/pred.list' -process = 'process' if not is_predict else 'process_predict' -define_py_data_sources2( - train_list=trn, - test_list=tst, - module="dataprovider_emb", - obj=process, - args={"dictionary": word_dict}) - -batch_size = 128 if not is_predict else 1 -settings( - batch_size=batch_size, learning_rate=2e-3, learning_method=AdamOptimizer()) - -data = data_layer(name="word", size=len(word_dict)) -embedding = embedding_layer(input=data, size=128) -avg = pooling_layer(input=embedding, pooling_type=AvgPooling()) -output = fc_layer(input=avg, size=2, act=SoftmaxActivation()) -if is_predict: - maxid = maxid_layer(output) - outputs([maxid, output]) -else: - label = data_layer(name="label", size=2) - cls = classification_cost(input=output, label=label) - outputs(cls) diff --git a/v1_api_demo/quick_start/trainer_config.lr.py b/v1_api_demo/quick_start/trainer_config.lr.py deleted file mode 100644 index b7b694940e338acbc40ffd3e5597f209bf07488f..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/trainer_config.lr.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. -# edit-mode: -*- python -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -dict_file = get_config_arg('dict_file', str, "./data/dict.txt") -word_dict = dict() -with open(dict_file, 'r') as f: - for i, line in enumerate(f): - w = line.strip().split()[0] - word_dict[w] = i - -is_predict = get_config_arg('is_predict', bool, False) -trn = 'data/train.list' if not is_predict else None -tst = 'data/test.list' if not is_predict else 'data/pred.list' -process = 'process' if not is_predict else 'process_predict' - -# define the data sources for the model. -# We need to use different process for training and prediction. -# For training, the input data includes both word IDs and labels. -# For prediction, the input data only includs word Ids. -define_py_data_sources2( - train_list=trn, - test_list=tst, - module="dataprovider_bow", - obj=process, - args={"dictionary": word_dict}) - -batch_size = 128 if not is_predict else 1 -settings( - batch_size=batch_size, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) - -# Define the data for text features. The size of the data layer is the number -# of words in the dictionary. -data = data_layer(name="word", size=len(word_dict)) - -# Define a fully connected layer with logistic activation. -# (also called softmax activation). -output = fc_layer(input=data, size=2, act=SoftmaxActivation()) - -if not is_predict: - # For training, we need label and cost - - # define the category id for each example. - # The size of the data layer is the number of labels. - label = data_layer(name="label", size=2) - - # Define cross-entropy classification loss and error. - cls = classification_cost(input=output, label=label) - outputs(cls) -else: - # For prediction, no label is needed. We need to output - # We need to output classification result, and class probabilities. - maxid = maxid_layer(output) - outputs([maxid, output]) diff --git a/v1_api_demo/quick_start/trainer_config.lstm.py b/v1_api_demo/quick_start/trainer_config.lstm.py deleted file mode 100644 index 8967d78807b9bbf990f5dd36240c18199b86954e..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/trainer_config.lstm.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. -# edit-mode: -*- python -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -dict_file = "./data/dict.txt" -word_dict = dict() -with open(dict_file, 'r') as f: - for i, line in enumerate(f): - w = line.strip().split()[0] - word_dict[w] = i - -is_predict = get_config_arg('is_predict', bool, False) -trn = 'data/train.list' if not is_predict else None -tst = 'data/test.list' if not is_predict else 'data/pred.list' -process = 'process' if not is_predict else 'process_predict' -define_py_data_sources2( - train_list=trn, - test_list=tst, - module="dataprovider_emb", - obj=process, - args={"dictionary": word_dict}) - -batch_size = 128 if not is_predict else 1 -settings( - batch_size=batch_size, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) - -data = data_layer(name="word", size=len(word_dict)) -emb = embedding_layer(input=data, size=128) -lstm = simple_lstm( - input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.25)) -lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling()) -output = fc_layer(input=lstm_max, size=2, act=SoftmaxActivation()) -if is_predict: - maxid = maxid_layer(output) - outputs([maxid, output]) -else: - label = data_layer(name="label", size=2) - cls = classification_cost(input=output, label=label) - outputs(cls) diff --git a/v1_api_demo/quick_start/trainer_config.resnet-lstm.py b/v1_api_demo/quick_start/trainer_config.resnet-lstm.py deleted file mode 100644 index 32d0596f250c0f0c5a4004d3af7adb794b3f0f1b..0000000000000000000000000000000000000000 --- a/v1_api_demo/quick_start/trainer_config.resnet-lstm.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. -# edit-mode: -*- python -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This configuration is a demonstration of how to implement the stacked LSTM -with residual connections, i.e. an LSTM layer takes the sum of the hidden states -and inputs of the previous LSTM layer instead of only the hidden states. -This architecture is from: -Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi, -Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, -Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser, -Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens, -George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason Riesa, -Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, Jeffrey Dean. 2016. -Google's Neural Machine Translation System: Bridging the Gap between Human and -Machine Translation. In arXiv https://arxiv.org/pdf/1609.08144v2.pdf -Different from the architecture described in the paper, we use a stack single -direction LSTM layers as the first layer instead of bi-directional LSTM. Also, -since this is a demo code, to reduce computation time, we stacked 4 layers -instead of 8 layers. -""" - -from paddle.trainer_config_helpers import * - -dict_file = "./data/dict.txt" -word_dict = dict() -with open(dict_file, 'r') as f: - for i, line in enumerate(f): - w = line.strip().split()[0] - word_dict[w] = i - -is_predict = get_config_arg('is_predict', bool, False) -trn = 'data/train.list' if not is_predict else None -tst = 'data/test.list' if not is_predict else 'data/pred.list' -process = 'process' if not is_predict else 'process_predict' -define_py_data_sources2( - train_list=trn, - test_list=tst, - module="dataprovider_emb", - obj=process, - args={"dictionary": word_dict}) - -batch_size = 128 if not is_predict else 1 -settings( - batch_size=batch_size, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) - -bias_attr = ParamAttr(initial_std=0., l2_rate=0.) - -data = data_layer(name="word", size=len(word_dict)) -emb = embedding_layer(input=data, size=128) -lstm = simple_lstm(input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1)) - -previous_input, previous_hidden_state = emb, lstm - -for i in range(3): - # The input to the current layer is the sum of the hidden state - # and input of the previous layer. - current_input = addto_layer(input=[previous_input, previous_hidden_state]) - hidden_state = simple_lstm( - input=current_input, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1)) - previous_input, previous_hidden_state = current_input, hidden_state - -lstm = previous_hidden_state - -lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling()) -output = fc_layer( - input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation()) - -if is_predict: - maxid = maxid_layer(output) - outputs([maxid, output]) -else: - label = data_layer(name="label", size=2) - cls = classification_cost(input=output, label=label) - outputs(cls) diff --git a/v1_api_demo/sequence_tagging/data/get_data.sh b/v1_api_demo/sequence_tagging/data/get_data.sh deleted file mode 100755 index 0cdb394035e782b3a647f7f13e79d55b5d3dff48..0000000000000000000000000000000000000000 --- a/v1_api_demo/sequence_tagging/data/get_data.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR - -wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz -wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz diff --git a/v1_api_demo/sequence_tagging/data/test.list b/v1_api_demo/sequence_tagging/data/test.list deleted file mode 100644 index 073c0a0c9063ac55f762ac261746aa73057d70e8..0000000000000000000000000000000000000000 --- a/v1_api_demo/sequence_tagging/data/test.list +++ /dev/null @@ -1 +0,0 @@ -data/test.txt.gz diff --git a/v1_api_demo/sequence_tagging/data/train.list b/v1_api_demo/sequence_tagging/data/train.list deleted file mode 100644 index 43c24d5f6484a90fe883ad5516fe100d27c9ce47..0000000000000000000000000000000000000000 --- a/v1_api_demo/sequence_tagging/data/train.list +++ /dev/null @@ -1 +0,0 @@ -data/train.txt.gz diff --git a/v1_api_demo/sequence_tagging/dataprovider.py b/v1_api_demo/sequence_tagging/dataprovider.py deleted file mode 100644 index bb4b4465bc7e032c50c1d21263651e2578af67be..0000000000000000000000000000000000000000 --- a/v1_api_demo/sequence_tagging/dataprovider.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * -import gzip -import logging - -logging.basicConfig( - format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', ) -logger = logging.getLogger('paddle') -logger.setLevel(logging.INFO) - -OOV_POLICY_IGNORE = 0 -OOV_POLICY_USE = 1 -OOV_POLICY_ERROR = 2 - -num_original_columns = 3 - -# Feature combination patterns. -# [[-1,0], [0,0]] means previous token at column 0 and current token at -# column 0 are combined as one feature. -patterns = [ - [[-2, 0]], - [[-1, 0]], - [[0, 0]], - [[1, 0]], - [[2, 0]], - [[-1, 0], [0, 0]], - [[0, 0], [1, 0]], - [[-2, 1]], - [[-1, 1]], - [[0, 1]], - [[1, 1]], - [[2, 1]], - [[-2, 1], [-1, 1]], - [[-1, 1], [0, 1]], - [[0, 1], [1, 1]], - [[1, 1], [2, 1]], - [[-2, 1], [-1, 1], [0, 1]], - [[-1, 1], [0, 1], [1, 1]], - [[0, 1], [1, 1], [2, 1]], -] - -dict_label = { - 'B-ADJP': 0, - 'I-ADJP': 1, - 'B-ADVP': 2, - 'I-ADVP': 3, - 'B-CONJP': 4, - 'I-CONJP': 5, - 'B-INTJ': 6, - 'I-INTJ': 7, - 'B-LST': 8, - 'I-LST': 9, - 'B-NP': 10, - 'I-NP': 11, - 'B-PP': 12, - 'I-PP': 13, - 'B-PRT': 14, - 'I-PRT': 15, - 'B-SBAR': 16, - 'I-SBAR': 17, - 'B-UCP': 18, - 'I-UCP': 19, - 'B-VP': 20, - 'I-VP': 21, - 'O': 22 -} - - -def make_features(sequence): - length = len(sequence) - num_features = len(sequence[0]) - - def get_features(pos): - if pos < 0: - return ['#B%s' % -pos] * num_features - if pos >= length: - return ['#E%s' % (pos - length + 1)] * num_features - return sequence[pos] - - for i in xrange(length): - for pattern in patterns: - fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern]) - sequence[i].append(fname) - - -''' -Source file format: -Each line is for one timestep. The features are separated by space. -An empty line indicates end of a sequence. - -cutoff: a list of numbers. If count of a feature is smaller than this, - it will be ignored. -if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of -i-th column. - -return a list of dict for each column -''' - - -def create_dictionaries(filename, cutoff, oov_policy): - def add_to_dict(sequence, dicts): - num_features = len(dicts) - for features in sequence: - l = len(features) - assert l == num_features, "Wrong number of features " + line - for i in xrange(l): - if features[i] in dicts[i]: - dicts[i][features[i]] += 1 - else: - dicts[i][features[i]] = 1 - - num_features = len(cutoff) - dicts = [] - for i in xrange(num_features): - dicts.append(dict()) - - f = gzip.open(filename, 'rb') - - sequence = [] - - for line in f: - line = line.strip() - if not line: - make_features(sequence) - add_to_dict(sequence, dicts) - sequence = [] - continue - features = line.split(' ') - sequence.append(features) - - for i in xrange(num_features): - dct = dicts[i] - n = 1 if oov_policy[i] == OOV_POLICY_USE else 0 - todo = [] - for k, v in dct.iteritems(): - if v < cutoff[i]: - todo.append(k) - else: - dct[k] = n - n += 1 - - if oov_policy[i] == OOV_POLICY_USE: - # placeholder so that len(dct) will be the number of features - # including OOV - dct['#OOV#'] = 0 - - logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo))) - for k in todo: - del dct[k] - - f.close() - return dicts - - -def initializer(settings, **xargs): - cutoff = [3, 1, 0] - cutoff += [3] * len(patterns) - oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR] - oov_policy += [OOV_POLICY_IGNORE] * len(patterns) - dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy) - dicts[2] = dict_label - settings.dicts = dicts - settings.oov_policy = oov_policy - input_types = [] - num_features = len(dicts) - for i in xrange(num_original_columns): - input_types.append(integer_sequence(len(dicts[i]))) - logger.info("slot %s size=%s" % (i, len(dicts[i]))) - if patterns: - dim = 0 - for i in xrange(num_original_columns, num_features): - dim += len(dicts[i]) - input_types.append(sparse_binary_vector_sequence(dim)) - logger.info("feature size=%s" % dim) - settings.input_types = input_types - - -''' -if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not -existed in dicts[i] will be assigned to id 0. -if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist -in dicts[i]. -''' - - -@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, filename): - input_file = filename - dicts = settings.dicts - oov_policy = settings.oov_policy - - def gen_sample(sequence): - num_features = len(dicts) - sample = [list() for i in xrange(num_original_columns)] - if patterns: - sample.append([]) - for features in sequence: - assert len(features) == num_features, \ - "Wrong number of features: " + line - for i in xrange(num_original_columns): - id = dicts[i].get(features[i], -1) - if id != -1: - sample[i].append(id) - elif oov_policy[i] == OOV_POLICY_IGNORE: - sample[i].append(0xffffffff) - elif oov_policy[i] == OOV_POLICY_ERROR: - logger.fatal("Unknown token: %s" % features[i]) - else: - sample[i].append(0) - - if patterns: - dim = 0 - vec = [] - for i in xrange(num_original_columns, num_features): - id = dicts[i].get(features[i], -1) - if id != -1: - vec.append(dim + id) - elif oov_policy[i] == OOV_POLICY_IGNORE: - pass - elif oov_policy[i] == OOV_POLICY_ERROR: - logger.fatal("Unknown token: %s" % features[i]) - else: - vec.ids.append(dim + 0) - - dim += len(dicts[i]) - sample[-1].append(vec) - return sample - - num_features = len(dicts) - f = gzip.open(input_file, 'rb') - - num_sequences = 0 - sequence = [] - for line in f: - line = line.strip() - if not line: - make_features(sequence) - yield gen_sample(sequence) - sequence = [] - num_sequences += 1 - continue - features = line.split(' ') - sequence.append(features) - - f.close() - - logger.info("num_sequences=%s" % num_sequences) diff --git a/v1_api_demo/sequence_tagging/linear_crf.py b/v1_api_demo/sequence_tagging/linear_crf.py deleted file mode 100644 index ea012ba1ae9c790ccefd3dd5f066aa92202128a2..0000000000000000000000000000000000000000 --- a/v1_api_demo/sequence_tagging/linear_crf.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -import math - -define_py_data_sources2( - train_list="data/train.list", - test_list="data/test.list", - module="dataprovider", - obj="process") - -batch_size = 1 -settings( - learning_method=MomentumOptimizer(), - batch_size=batch_size, - regularization=L2Regularization(batch_size * 1e-4), - model_average=ModelAverage(0.5), - learning_rate=1e-1, - learning_rate_decay_a=1e-5, - learning_rate_decay_b=0.25, ) - -num_label_types = 23 - - -def get_simd_size(size): - return int(math.ceil(float(size) / 8)) * 8 - - -# Currently, in order to use sparse_update=True, -# the size has to be aligned. -num_label_types = get_simd_size(num_label_types) - -features = data_layer(name="features", size=76328) -word = data_layer(name="word", size=6778) -pos = data_layer(name="pos", size=44) -chunk = data_layer(name="chunk", size=num_label_types) - -crf_input = fc_layer( - input=features, - size=num_label_types, - act=LinearActivation(), - bias_attr=False, - param_attr=ParamAttr( - initial_std=0, sparse_update=True)) - -crf = crf_layer( - input=crf_input, - label=chunk, - param_attr=ParamAttr( - name="crfw", initial_std=0), ) - -crf_decoding = crf_decoding_layer( - size=num_label_types, - input=crf_input, - label=chunk, - param_attr=ParamAttr(name="crfw"), ) - -sum_evaluator( - name="error", - input=crf_decoding, ) - -chunk_evaluator( - name="chunk_f1", - input=crf_decoding, - label=chunk, - chunk_scheme="IOB", - num_chunk_types=11, ) - -inputs(word, pos, chunk, features) -outputs(crf) diff --git a/v1_api_demo/sequence_tagging/readme.md b/v1_api_demo/sequence_tagging/readme.md deleted file mode 100644 index 2e17fffb83c532f5e5fec1227f169c97c1f20e22..0000000000000000000000000000000000000000 --- a/v1_api_demo/sequence_tagging/readme.md +++ /dev/null @@ -1,45 +0,0 @@ -# Sequence Tagging - -This demo is a sequence model for assigning tags to each token in a sentence. The task is described at CONLL2000 Text Chunking task. - -## Download data -```bash -cd demo/sequence_tagging -./data/get_data.sh -``` - -## Train model -```bash -cd demo/sequence_tagging -./train.sh -``` - -## Model description - -We provide two models. One is a linear CRF model (linear_crf.py) with is equivalent to the one at leon.bottou.org/projects/sgd. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py). -
- - - - - - - - - - - - - - - - - - - - - - -
Model nameNumber of parametersF1 score
linear_crf 1.8M 0.937
rnn_crf 960K 0.941
-
-
diff --git a/v1_api_demo/sequence_tagging/rnn_crf.py b/v1_api_demo/sequence_tagging/rnn_crf.py deleted file mode 100644 index 937a34df103663ecf0f0827bbfb9d82823c9b902..0000000000000000000000000000000000000000 --- a/v1_api_demo/sequence_tagging/rnn_crf.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -import math - -define_py_data_sources2( - train_list="data/train.list", - test_list="data/test.list", - module="dataprovider", - obj="process") - -batch_size = 16 -settings( - learning_method=MomentumOptimizer(), - batch_size=batch_size, - regularization=L2Regularization(batch_size * 1e-5), - model_average=ModelAverage(0.5), - learning_rate=2e-3, - learning_rate_decay_a=5e-7, - learning_rate_decay_b=0.5, ) - -word_dim = 128 -hidden_dim = 128 -with_rnn = True - -initial_std = 1 / math.sqrt(hidden_dim) -param_attr = ParamAttr(initial_std=initial_std) -cpu_layer_attr = ExtraLayerAttribute(device=-1) - -default_device(0) - -num_label_types = 23 - -features = data_layer(name="features", size=76328) -word = data_layer(name="word", size=6778) -pos = data_layer(name="pos", size=44) -chunk = data_layer( - name="chunk", size=num_label_types, layer_attr=cpu_layer_attr) - -emb = embedding_layer( - input=word, size=word_dim, param_attr=ParamAttr(initial_std=0)) - -hidden1 = mixed_layer( - size=hidden_dim, - act=STanhActivation(), - bias_attr=True, - input=[ - full_matrix_projection(emb), table_projection( - pos, param_attr=param_attr) - ]) - -if with_rnn: - rnn1 = recurrent_layer( - act=ReluActivation(), - bias_attr=True, - input=hidden1, - param_attr=ParamAttr(initial_std=0), ) - -hidden2 = mixed_layer( - size=hidden_dim, - act=STanhActivation(), - bias_attr=True, - input=[full_matrix_projection(hidden1)] + - ([full_matrix_projection( - rnn1, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), ) - -if with_rnn: - rnn2 = recurrent_layer( - reverse=True, - act=ReluActivation(), - bias_attr=True, - input=hidden2, - param_attr=ParamAttr(initial_std=0), ) - -crf_input = mixed_layer( - size=num_label_types, - bias_attr=False, - input=[full_matrix_projection(hidden2), ] + - ([full_matrix_projection( - rnn2, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), ) - -crf = crf_layer( - input=crf_input, - label=chunk, - param_attr=ParamAttr( - name="crfw", initial_std=0), - layer_attr=cpu_layer_attr, ) - -crf_decoding = crf_decoding_layer( - size=num_label_types, - input=crf_input, - label=chunk, - param_attr=ParamAttr(name="crfw"), - layer_attr=cpu_layer_attr, ) - -sum_evaluator( - name="error", - input=crf_decoding, ) - -chunk_evaluator( - name="chunk_f1", - input=crf_decoding, - label=chunk, - chunk_scheme="IOB", - num_chunk_types=11, ) - -inputs(word, pos, chunk, features) -outputs(crf) diff --git a/v1_api_demo/sequence_tagging/train.sh b/v1_api_demo/sequence_tagging/train.sh deleted file mode 100755 index 37e196c84200dc26ccb523076a81dbc393b1280f..0000000000000000000000000000000000000000 --- a/v1_api_demo/sequence_tagging/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -paddle train \ - --config rnn_crf.py \ - --parallel_nn=1 \ - --use_gpu=1 \ - --dot_period=10 \ - --log_period=1000 \ - --test_period=0 \ - --num_passes=10 \ -2>&1 | tee 'train.log' -paddle usage -l 'train.log' -e $? -n "sequence_tagging_train" >/dev/null 2>&1 diff --git a/v1_api_demo/sequence_tagging/train_linear.sh b/v1_api_demo/sequence_tagging/train_linear.sh deleted file mode 100755 index ad6e2d8ee7f813c69f9dd250c6f7bbb4403a0ed5..0000000000000000000000000000000000000000 --- a/v1_api_demo/sequence_tagging/train_linear.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -paddle train \ - --config linear_crf.py \ - --use_gpu=0 \ - --dot_period=100 \ - --log_period=10000 \ - --test_period=0 \ - --num_passes=10 -2>&1 | tee 'train_linear.log' -paddle usage -l 'train_linear.log' -e $? -n "sequence_tagging_train_linear" >/dev/null 2>&1 diff --git a/v1_api_demo/traffic_prediction/README b/v1_api_demo/traffic_prediction/README deleted file mode 100644 index 4c95188583513c332b7d7cb0a32d59336208e1aa..0000000000000000000000000000000000000000 --- a/v1_api_demo/traffic_prediction/README +++ /dev/null @@ -1,7 +0,0 @@ -run by: -cd ./data -sh get_data.sh -cd .. -sh train.sh -sh predict.sh - diff --git a/v1_api_demo/traffic_prediction/data/get_data.sh b/v1_api_demo/traffic_prediction/data/get_data.sh deleted file mode 100755 index f2fa548d4709c0361334f117bfb49e18d83c32f4..0000000000000000000000000000000000000000 --- a/v1_api_demo/traffic_prediction/data/get_data.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e -set -x - -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR - -#download the dataset -echo "Downloading traffic data..." -wget http://paddlepaddle.cdn.bcebos.com/demo/traffic/traffic_data.tar.gz - -#extract package -echo "Unzipping..." -tar -zxvf traffic_data.tar.gz - -echo "data/speeds.csv" > train.list -echo "data/speeds.csv" > test.list -echo "data/speeds.csv" > pred.list - -echo "Done." diff --git a/v1_api_demo/traffic_prediction/dataprovider.py b/v1_api_demo/traffic_prediction/dataprovider.py deleted file mode 100644 index c7883b6950c369ee67c39b80ce1cefbbf9350459..0000000000000000000000000000000000000000 --- a/v1_api_demo/traffic_prediction/dataprovider.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * -import sys -import numpy as np -TERM_NUM = 24 -FORECASTING_NUM = 24 -LABEL_VALUE_NUM = 4 - - -def initHook(settings, file_list, **kwargs): - """ - Init hook is invoked before process data. It will set obj.slots and store data meta. - - :param settings: global object. It will passed to process routine. - :type obj: object - :param file_list: the meta file object, which passed from trainer_config.py,but unused in this function. - :param kwargs: unused other arguments. - """ - del kwargs #unused - - settings.pool_size = sys.maxint - #Use a time seires of the past as feature. - #Dense_vector's expression form is [float,float,...,float] - settings.input_types = [dense_vector(TERM_NUM)] - #There are next FORECASTING_NUM fragments you need predict. - #Every predicted condition at time point has four states. - for i in range(FORECASTING_NUM): - settings.input_types.append(integer_value(LABEL_VALUE_NUM)) - - -@provider( - init_hook=initHook, cache=CacheType.CACHE_PASS_IN_MEM, should_shuffle=True) -def process(settings, file_name): - with open(file_name) as f: - #abandon fields name - f.next() - for row_num, line in enumerate(f): - speeds = map(int, line.rstrip('\r\n').split(",")[1:]) - # Get the max index. - end_time = len(speeds) - # Scanning and generating samples - for i in range(TERM_NUM, end_time - FORECASTING_NUM): - # For dense slot - pre_spd = map(float, speeds[i - TERM_NUM:i]) - - # Integer value need predicting, values start from 0, so every one minus 1. - fol_spd = [j - 1 for j in speeds[i:i + FORECASTING_NUM]] - - # Predicting label is missing, abandon the sample. - if -1 in fol_spd: - continue - yield [pre_spd] + fol_spd - - -def predict_initHook(settings, file_list, **kwargs): - settings.pool_size = sys.maxint - settings.input_types = [dense_vector(TERM_NUM)] - - -@provider(init_hook=predict_initHook, should_shuffle=False) -def process_predict(settings, file_name): - with open(file_name) as f: - #abandon fields name - f.next() - for row_num, line in enumerate(f): - speeds = map(int, line.rstrip('\r\n').split(",")) - end_time = len(speeds) - pre_spd = map(float, speeds[end_time - TERM_NUM:end_time]) - yield pre_spd diff --git a/v1_api_demo/traffic_prediction/gen_result.py b/v1_api_demo/traffic_prediction/gen_result.py deleted file mode 100644 index 3da70b30315f863fd3582583e9a29540a09c1e7f..0000000000000000000000000000000000000000 --- a/v1_api_demo/traffic_prediction/gen_result.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -res = [] -with open('./rank-00000') as f: - for line in f: - pred = map(int, line.strip('\r\n;').split(";")) - #raw prediction range from 0 to 3 - res.append([i + 1 for i in pred]) - -file_name = open('./data/pred.list').read().strip('\r\n') - -FORECASTING_NUM = 24 -header = [ - 'id', - '201604200805', - '201604200810', - '201604200815', - '201604200820', - '201604200825', - '201604200830', - '201604200835', - '201604200840', - '201604200845', - '201604200850', - '201604200855', - '201604200900', - '201604200905', - '201604200910', - '201604200915', - '201604200920', - '201604200925', - '201604200930', - '201604200935', - '201604200940', - '201604200945', - '201604200950', - '201604200955', - '201604201000', -] -################### -## To CSV format ## -################### -with open(file_name) as f: - f.next() - print ','.join(header) - for row_num, line in enumerate(f): - fields = line.rstrip('\r\n').split(',') - linkid = fields[0] - print linkid + ',' + ','.join(map(str, res[row_num])) diff --git a/v1_api_demo/traffic_prediction/predict.sh b/v1_api_demo/traffic_prediction/predict.sh deleted file mode 100755 index 2dbd5e8805dd97d35c7d58917f8ec6b5033bda03..0000000000000000000000000000000000000000 --- a/v1_api_demo/traffic_prediction/predict.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -cfg=trainer_config.py -# pass choice -model="output/pass-00000" -paddle train \ - --config=$cfg \ - --use_gpu=false \ - --job=test \ - --init_model_path=$model \ - --config_args=is_predict=1 \ - --predict_output_dir=. - -python gen_result.py > result.csv - -rm -rf rank-00000 diff --git a/v1_api_demo/traffic_prediction/train.sh b/v1_api_demo/traffic_prediction/train.sh deleted file mode 100755 index 48dfc5604f80042598c5c779bd450a5808fdfb64..0000000000000000000000000000000000000000 --- a/v1_api_demo/traffic_prediction/train.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -cfg=trainer_config.py -paddle train \ - --config=$cfg \ - --save_dir=./output \ - --trainer_count=4 \ - --log_period=1000 \ - --dot_period=10 \ - --num_passes=10 \ - --use_gpu=false \ - --show_parameter_stats_period=3000 \ - 2>&1 | tee 'train.log' diff --git a/v1_api_demo/traffic_prediction/trainer_config.py b/v1_api_demo/traffic_prediction/trainer_config.py deleted file mode 100755 index 52d678624aff7ca2264c3c20e320004217d14397..0000000000000000000000000000000000000000 --- a/v1_api_demo/traffic_prediction/trainer_config.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle.trainer_config_helpers import * - -################################### DATA Configuration ############################################# -is_predict = get_config_arg('is_predict', bool, False) -trn = './data/train.list' if not is_predict else None -tst = './data/test.list' if not is_predict else './data/pred.list' -process = 'process' if not is_predict else 'process_predict' -define_py_data_sources2( - train_list=trn, test_list=tst, module="dataprovider", obj=process) -################################### Parameter Configuaration ####################################### -TERM_NUM = 24 -FORECASTING_NUM = 24 -emb_size = 16 -batch_size = 128 if not is_predict else 1 -settings( - batch_size=batch_size, - learning_rate=1e-3, - learning_method=RMSPropOptimizer()) -################################### Algorithm Configuration ######################################## - -output_label = [] - -link_encode = data_layer(name='link_encode', size=TERM_NUM) -for i in xrange(FORECASTING_NUM): - # Each task share same weight. - link_param = ParamAttr( - name='_link_vec.w', initial_max=1.0, initial_min=-1.0) - link_vec = fc_layer(input=link_encode, size=emb_size, param_attr=link_param) - score = fc_layer(input=link_vec, size=4, act=SoftmaxActivation()) - if is_predict: - maxid = maxid_layer(score) - output_label.append(maxid) - else: - # Multi-task training. - label = data_layer(name='label_%dmin' % ((i + 1) * 5), size=4) - cls = classification_cost( - input=score, name="cost_%dmin" % ((i + 1) * 5), label=label) - output_label.append(cls) -outputs(output_label) diff --git a/v1_api_demo/vae/README.md b/v1_api_demo/vae/README.md deleted file mode 100644 index e55d483b023773900729622a6cac44116fc79c76..0000000000000000000000000000000000000000 --- a/v1_api_demo/vae/README.md +++ /dev/null @@ -1,13 +0,0 @@ -#Variational Autoencoder (VAE) - -This demo implements VAE training described in the original paper (https://arxiv.org/abs/1312.6114). - - -In order to run the model, first download the MNIST dataset by running the shell script in ./data. - -Then you can run the command below. The flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu). - -$python vae_train.py [--use_gpu 1] - -The generated images will be stored in ./samples/ -The corresponding models will be stored in ./params/ diff --git a/v1_api_demo/vae/data/get_mnist_data.sh b/v1_api_demo/vae/data/get_mnist_data.sh deleted file mode 100755 index a77c81bf5af9ddb6634ff89460797ca543c5e517..0000000000000000000000000000000000000000 --- a/v1_api_demo/vae/data/get_mnist_data.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env sh -# This script downloads the mnist data and unzips it. -set -e -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -rm -rf "$DIR/mnist_data" -mkdir "$DIR/mnist_data" -cd "$DIR/mnist_data" - -echo "Downloading..." - -for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte -do - if [ ! -e $fname ]; then - wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz - gunzip ${fname}.gz - fi -done diff --git a/v1_api_demo/vae/dataloader.py b/v1_api_demo/vae/dataloader.py deleted file mode 100644 index e9ff95d44f825cd941b5687f754618e66d491e7f..0000000000000000000000000000000000000000 --- a/v1_api_demo/vae/dataloader.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np - - -class MNISTloader(): - def __init__(self, - data_path="./data/mnist_data/", - batch_size=60, - process='train'): - self.batch_size = batch_size - self.data_path = data_path - self._pointer = 0 - self.image_batches = np.array([]) - self.process = process - - def _extract_images(self, filename, n): - f = open(filename, 'rb') - f.read(16) - data = np.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)) - #Mapping data into [-1, 1] - data = data / 255. * 2. - 1 - data_batches = np.split(data, 60000 / self.batch_size, 0) - - f.close() - - return data_batches - - @property - def pointer(self): - return self._pointer - - def load_data(self): - TRAIN_IMAGES = '%s/train-images-idx3-ubyte' % self.data_path - TEST_IMAGES = '%s/t10k-images-idx3-ubyte' % self.data_path - - if self.process == 'train': - self.image_batches = self._extract_images(TRAIN_IMAGES, 60000) - else: - self.image_batches = self._extract_images(TEST_IMAGES, 10000) - - def next_batch(self): - batch = self.image_batches[self._pointer] - self._pointer = (self._pointer + 1) % (60000 / self.batch_size) - return np.array(batch) - - def reset_pointer(self): - self._pointer = 0 diff --git a/v1_api_demo/vae/vae_conf.py b/v1_api_demo/vae/vae_conf.py deleted file mode 100644 index 301dd23793d19ec5946cc7bb07e32c53c04a972b..0000000000000000000000000000000000000000 --- a/v1_api_demo/vae/vae_conf.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * -import numpy as np - -is_generating = get_config_arg("is_generating", bool, False) - -settings(batch_size=32, learning_rate=1e-3, learning_method=AdamOptimizer()) - -X_dim = 28 * 28 -h_dim = 128 -z_dim = 100 - - -def reparameterization(mu, logvar): - eps = ParamAttr(initial_mean=0., initial_std=1) - with mixed_layer() as sigma: - sigma += dotmul_projection(layer_math.exp(logvar) * 0.5, param_attr=eps) - return mu + sigma - - -def q_func(X): - """ - xavier initialization - """ - param_attr = ParamAttr( - name='share.w', initial_mean=0., initial_std=1. / np.sqrt(X_dim / 2.)) - mu_param = ParamAttr( - name='mu.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.)) - logvar_param = ParamAttr( - name='logvar.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.)) - - bias_attr = ParamAttr(name='share.bias', initial_mean=0., initial_std=0.) - mu_bias = ParamAttr(name='mu.bias', initial_mean=0., initial_std=0.) - logvar_bias = ParamAttr(name='logvar.bias', initial_mean=0., initial_std=0.) - - share_layer = fc_layer( - X, - size=h_dim, - param_attr=param_attr, - bias_attr=bias_attr, - act=ReluActivation()) - - return (fc_layer( - share_layer, - size=z_dim, - param_attr=mu_param, - bias_attr=mu_bias, - act=LinearActivation()), fc_layer( - share_layer, - size=z_dim, - param_attr=logvar_param, - bias_attr=logvar_bias, - act=LinearActivation())) - - -def generator(z): - - hidden_param = ParamAttr( - name='hidden.w', initial_mean=0., initial_std=1. / np.sqrt(z_dim / 2.)) - hidden_bias = ParamAttr(name='hidden.bias', initial_mean=0., initial_std=0.) - prob_param = ParamAttr( - name='prob.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.)) - prob_bias = ParamAttr(name='prob.bias', initial_mean=0., initial_std=0.) - - hidden_layer = fc_layer( - z, - size=h_dim, - act=ReluActivation(), - param_attr=hidden_param, - bias_attr=hidden_bias) - prob = fc_layer( - hidden_layer, - size=X_dim, - act=SigmoidActivation(), - param_attr=prob_param, - bias_attr=prob_bias) - - return prob - - -def reconstruct_error(prob, X): - cost = multi_binary_label_cross_entropy(input=prob, label=X) - return cost - - -def KL_loss(mu, logvar): - with mixed_layer() as mu_square: - mu_square += dotmul_operator(mu, mu, scale=1.) - - cost = 0.5 * sum_cost(layer_math.exp(logvar) + mu_square - 1. - logvar) - - return cost - - -if not is_generating: - x_batch = data_layer(name='x_batch', size=X_dim) - mu, logvar = q_func(x_batch) - z_samples = reparameterization(mu, logvar) - prob = generator(z_samples) - outputs(reconstruct_error(prob, x_batch) + KL_loss(mu, logvar)) -else: - z_samples = data_layer(name='noise', size=z_dim) - outputs(generator(z_samples)) diff --git a/v1_api_demo/vae/vae_train.py b/v1_api_demo/vae/vae_train.py deleted file mode 100644 index 1babb011c77b92861cc680a2e1aaa8c9ae5d97b5..0000000000000000000000000000000000000000 --- a/v1_api_demo/vae/vae_train.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import random -import numpy as np -import cPickle -import sys, os -from PIL import Image - -from paddle.trainer.config_parser import parse_config -from paddle.trainer.config_parser import logger -import py_paddle.swig_paddle as api -import dataloader -import matplotlib.pyplot as plt - - -def plot_samples(samples): - fig = plt.figure(figsize=(4, 4)) - gs = gridspec.GridSpec(4, 4) - gs.update(wspace=0.05, hspace=0.05) - for i, sample in enumerate(samples): - plt.subplot(gs[i]) - plt.axis('off') - plt.imshow(sample.reshape(28, 28), cmap='Greys_r') - - return fig - - -def CHECK_EQ(a, b): - assert a == b, "a=%s, b=%s" % (a, b) - - -def get_fake_samples(generator_machine, batch_size, noise): - gen_inputs = api.Arguments.createArguments(1) - gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise)) - gen_outputs = api.Arguments.createArguments(0) - generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST) - fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat() - return fake_samples - - -def copy_shared_parameters(src, dst): - ''' - copy the parameters from src to dst - :param src: the source of the parameters - :type src: GradientMachine - :param dst: the destination of the parameters - :type dst: GradientMachine - ''' - src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())] - src_params = dict([(p.getName(), p) for p in src_params]) - - for i in xrange(dst.getParameterSize()): - dst_param = dst.getParameter(i) - src_param = src_params.get(dst_param.getName(), None) - if src_param is None: - continue - src_value = src_param.getBuf(api.PARAMETER_VALUE) - dst_value = dst_param.getBuf(api.PARAMETER_VALUE) - CHECK_EQ(len(src_value), len(dst_value)) - dst_value.copyFrom(src_value) - dst_param.setValueUpdated() - - -def find(iterable, cond): - for item in iterable: - if cond(item): - return item - return None - - -def get_layer_size(model_conf, layer_name): - layer_conf = find(model_conf.layers, lambda x: x.name == layer_name) - assert layer_conf is not None, "Cannot find '%s' layer" % layer_name - return layer_conf.size - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--use_gpu", default="1", help="1 means use gpu for training") - parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter") - args = parser.parse_args() - use_gpu = args.use_gpu - assert use_gpu in ["0", "1"] - - if not os.path.exists("./samples/"): - os.makedirs("./samples/") - - if not os.path.exists("./params/"): - os.makedirs("./params/") - - api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10', - '--log_period=1000', '--gpu_id=' + args.gpu_id, - '--save_dir=' + "./params/") - - conf = "vae_conf.py" - - trainer_conf = parse_config(conf, "is_generating=False") - gener_conf = parse_config(conf, "is_generating=True") - - batch_size = trainer_conf.opt_config.batch_size - - noise_dim = get_layer_size(gener_conf.model_config, "noise") - - mnist = dataloader.MNISTloader(batch_size=batch_size) - mnist.load_data() - - training_machine = api.GradientMachine.createFromConfigProto( - trainer_conf.model_config) - - generator_machine = api.GradientMachine.createFromConfigProto( - gener_conf.model_config) - - trainer = api.Trainer.create(trainer_conf, training_machine) - - trainer.startTrain() - - for train_pass in xrange(100): - trainer.startTrainPass() - mnist.reset_pointer() - i = 0 - it = 0 - while mnist.pointer != 0 or i == 0: - X = mnist.next_batch().astype('float32') - - inputs = api.Arguments.createArguments(1) - inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(X)) - - trainer.trainOneDataBatch(batch_size, inputs) - - if it % 1000 == 0: - - outputs = api.Arguments.createArguments(0) - training_machine.forward(inputs, outputs, api.PASS_TEST) - loss = np.mean(outputs.getSlotValue(0).copyToNumpyMat()) - print "\niter: {}".format(str(it).zfill(3)) - print "VAE loss: {}".format(str(loss).zfill(3)) - - #Sync parameters between networks (GradientMachine) at the beginning - copy_shared_parameters(training_machine, generator_machine) - - z_samples = np.random.randn(batch_size, - noise_dim).astype('float32') - samples = get_fake_samples(generator_machine, batch_size, - z_samples) - - #Generating the first 16 images for a picture. - figure = plot_samples(samples[:16]) - plt.savefig( - "./samples/{}_{}.png".format( - str(train_pass).zfill(3), str(i).zfill(3)), - bbox_inches='tight') - plt.close(figure) - i += 1 - it += 1 - - trainer.finishTrainPass() - trainer.finishTrain() - - -if __name__ == '__main__': - main()