diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py new file mode 100644 index 0000000000000000000000000000000000000000..cc31d098328bc237c018ebf8f158bdab5c37bff1 --- /dev/null +++ b/benchmark/fluid/machine_translation.py @@ -0,0 +1,349 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""seq2seq model for fluid.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import distutils.util + +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--embedding_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") +parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") +parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") +parser.add_argument( + "--batch_size", + type=int, + default=16, + help="The sequence number of a mini-batch data. (default: %(default)d)") +parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") +parser.add_argument( + "--pass_num", + type=int, + default=2, + help="The pass number to train. (default: %(default)d)") +parser.add_argument( + "--learning_rate", + type=float, + default=0.0002, + help="Learning rate used to train the model. (default: %(default)f)") +parser.add_argument( + "--infer_only", action='store_true', help="If set, run forward only.") +parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") +parser.add_argument( + "--use_gpu", + type=distutils.util.strtobool, + default=True, + help="Whether to use gpu. (default: %(default)d)") +parser.add_argument( + "--max_length", + type=int, + default=250, + help="The maximum length of sequence when doing generation. " + "(default: %(default)d)") + + +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + """Construct a seq2seq network.""" + + def bi_lstm_encoder(input_seq, gate_size): + # Linear transformation part for input gate, output gate, forget gate + # and cell activation vectors need be done outside of dynamic_lstm. + # So the output size is 4 times of gate_size. + input_forward_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, size=gate_size * 4, use_peepholes=False) + input_reversed_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + reversed, _ = fluid.layers.dynamic_lstm( + input=input_reversed_proj, + size=gate_size * 4, + is_reverse=True, + use_peepholes=False) + return forward, reversed + + src_word_idx = fluid.layers.data( + name='source_sequence', shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + + src_forward, src_reversed = bi_lstm_encoder( + input_seq=src_embedding, gate_size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward, src_reversed], axis=1) + + encoded_proj = fluid.layers.fc(input=encoded_vector, + size=decoder_size, + bias_attr=False) + + backward_first = fluid.layers.sequence_pool( + input=src_reversed, pool_type='first') + + decoder_boot = fluid.layers.fc(input=backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj, + decoder_boot, decoder_size): + def simple_attention(encoder_vec, encoder_proj, decoder_state): + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=decoder_size, + bias_attr=False) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + concated = fluid.layers.concat( + input=[encoder_proj, decoder_state_expand], axis=1) + attention_weights = fluid.layers.fc(input=concated, + size=1, + act='tanh', + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax( + input=attention_weights) + weigths_reshape = fluid.layers.reshape( + x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weigths_reshape, axis=0) + context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') + return context + + rnn = fluid.layers.DynamicRNN() + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + with rnn.block(): + current_word = rnn.step_input(target_embedding) + encoder_vec = rnn.static_input(encoder_vec) + encoder_proj = rnn.static_input(encoder_proj) + hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True) + cell_mem = rnn.memory(init=cell_init) + context = simple_attention(encoder_vec, encoder_proj, hidden_mem) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size) + rnn.update_memory(hidden_mem, h) + rnn.update_memory(cell_mem, c) + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=True, + act='softmax') + rnn.output(out) + return rnn() + + if not is_generating: + trg_word_idx = fluid.layers.data( + name='target_sequence', shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], + dtype='float32') + + prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector, + encoded_proj, decoder_boot, + decoder_size) + label = fluid.layers.data( + name='label_sequence', shape=[1], dtype='int64', lod_level=1) + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + feeding_list = ["source_sequence", "target_sequence", "label_sequence"] + + return avg_cost, feeding_list + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + lod_t = core.LoDTensor() + lod_t.set(flattened_data, place) + lod_t.set_lod([lod]) + return lod_t, lod[-1] + + +def lodtensor_to_ndarray(lod_tensor): + dims = lod_tensor.get_dims() + ndarray = np.zeros(shape=dims).astype('float32') + for i in xrange(np.product(dims)): + ndarray.ravel()[i] = lod_tensor.get_float_element(i) + return ndarray + + +def train(): + avg_cost, feeding_list = seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + False, + beam_size=args.beam_size, + max_length=args.max_length) + + # clone from default main program + inference_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() + exe = Executor(place) + exe.run(framework.default_startup_program()) + + def do_validation(): + total_loss = 0.0 + count = 0 + for batch_id, data in enumerate(test_batch_generator()): + src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0] + trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0] + lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0] + + fetch_outs = exe.run(inference_program, + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost], + return_numpy=False) + + total_loss += lodtensor_to_ndarray(fetch_outs[0])[0] + count += 1 + + return total_loss / count + + for pass_id in xrange(args.pass_num): + pass_start_time = time.time() + words_seen = 0 + for batch_id, data in enumerate(train_batch_generator()): + src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) + words_seen += word_num + trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) + words_seen += word_num + lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) + + fetch_outs = exe.run(framework.default_main_program(), + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost]) + + avg_cost_val = np.array(fetch_outs[0]) + print('pass_id=%d, batch_id=%d, train_loss: %f' % + (pass_id, batch_id, avg_cost_val)) + + pass_end_time = time.time() + test_loss = do_validation() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % + (pass_id, test_loss, words_per_sec, time_consumed)) + + +def infer(): + pass + + +if __name__ == '__main__': + args = parser.parse_args() + if args.infer_only: + infer() + else: + train() diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..7f7afaeb11447d936b65a1d83701b0176ecbc111 --- /dev/null +++ b/benchmark/fluid/mnist.py @@ -0,0 +1,205 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time + +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler + +SEED = 1 +DTYPE = "float32" + +# random seed must set before configuring the network. +# fluid.default_startup_program().random_seed = SEED + + +def parse_args(): + parser = argparse.ArgumentParser("mnist model benchmark.") + parser.add_argument( + '--batch_size', type=int, default=128, help='The minibatch size.') + parser.add_argument( + '--iterations', type=int, default=35, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=5, help='The number of passes.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + args = parser.parse_args() + return args + + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def eval_test(exe, batch_acc, batch_size_tensor, inference_program): + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=args.batch_size) + test_pass_acc = fluid.average.WeightedAverage() + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]), + data)).astype(DTYPE) + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([len(y_data), 1]) + + acc, weight = exe.run(inference_program, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size_tensor]) + test_pass_acc.add(value=acc, weight=weight) + pass_acc = test_pass_acc.eval() + return pass_acc + + +def run_benchmark(model, args): + if args.use_cprof: + pr = cProfile.Profile() + pr.enable() + start_time = time.time() + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + opt.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + # Initialize executor + place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=args.batch_size) + + accuracy = fluid.average.WeightedAverage() + for pass_id in range(args.pass_num): + accuracy.reset() + pass_start = time.time() + for batch_id, data in enumerate(train_reader()): + img_data = np.array( + map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE) + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([len(y_data), 1]) + + start = time.time() + outs = exe.run( + fluid.default_main_program(), + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost, batch_acc, batch_size_tensor] + ) # The accuracy is the accumulation of batches, but not the current batch. + accuracy.add(value=outs[1], weight=outs[2]) + end = time.time() + loss = np.array(outs[0]) + acc = np.array(outs[1]) + print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" % + (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000)) + + pass_end = time.time() + + train_avg_acc = accuracy.eval() + test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor, + inference_program) + + print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" % + (pass_id, train_avg_acc, test_avg_acc, + (pass_end - pass_start) / 1000)) + + +if __name__ == '__main__': + args = parse_args() + print_arguments(args) + if args.use_nvprof and args.device == 'GPU': + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + run_benchmark(cnn_model, args) + else: + run_benchmark(cnn_model, args) diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f0f1db979fa7fb640679beacafd66dfbe1f62ab8 --- /dev/null +++ b/benchmark/fluid/resnet.py @@ -0,0 +1,323 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import functools +import numpy as np +import time + +import cProfile, pstats, StringIO + +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.profiler as profiler + + +def parse_args(): + parser = argparse.ArgumentParser('Convolution model benchmark.') + parser.add_argument( + '--model', + type=str, + choices=['resnet_imagenet', 'resnet_cifar10'], + default='resnet_imagenet', + help='The model architecture.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='use real data or fake data') + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') + args = parser.parse_args() + return args + + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): + conv1 = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv1, act=act) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1] + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + +def basicblock(input, ch_out, stride): + short = shortcut(input, ch_out, stride) + conv1 = conv_bn_layer(input, ch_out, 3, stride, 1) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def bottleneck(input, ch_out, stride): + short = shortcut(input, ch_out * 4, stride) + conv1 = conv_bn_layer(input, ch_out, 1, stride, 0) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1) + conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None) + return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') + + +def layer_warp(block_func, input, ch_out, count, stride): + res_out = block_func(input, ch_out, stride) + for i in range(1, count): + res_out = block_func(res_out, ch_out, 1) + return res_out + + +def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): + + cfg = { + 18: ([2, 2, 2, 1], basicblock), + 34: ([3, 4, 6, 3], basicblock), + 50: ([3, 4, 6, 3], bottleneck), + 101: ([3, 4, 23, 3], bottleneck), + 152: ([3, 8, 36, 3], bottleneck) + } + stages, block_func = cfg[depth] + conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) + pool1 = fluid.layers.pool2d( + input=conv1, pool_type='avg', pool_size=3, pool_stride=2) + res1 = layer_warp(block_func, pool1, 64, stages[0], 1) + res2 = layer_warp(block_func, res1, 128, stages[1], 2) + res3 = layer_warp(block_func, res2, 256, stages[2], 2) + res4 = layer_warp(block_func, res3, 512, stages[3], 2) + pool2 = fluid.layers.pool2d( + input=res4, + pool_size=7, + pool_type='avg', + pool_stride=1, + global_pooling=True) + out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') + return out + + +def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): + assert (depth - 2) % 6 == 0 + + n = (depth - 2) // 6 + + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, n, 1) + res2 = layer_warp(basicblock, res1, 32, n, 2) + res3 = layer_warp(basicblock, res2, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') + return out + + +def run_benchmark(model, args): + if args.use_cprof: + pr = cProfile.Profile() + pr.enable() + + if args.data_set == "cifar10": + class_dim = 10 + if args.data_format == 'NCHW': + dshape = [3, 32, 32] + else: + dshape = [32, 32, 3] + else: + class_dim = 102 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + + input = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + predict = model(input, class_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + opts = optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + def test(exe): + test_accuracy = fluid.average.WeightedAverage() + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape(dshape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + acc, weight = exe.run(inference_program, + feed={"data": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size_tensor]) + test_accuracy.add(value=acc, weight=weight) + + return test_accuracy.eval() + + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + accuracy = fluid.average.WeightedAverage() + if args.use_fake_data: + data = train_reader().next() + image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype( + 'float32') + label = np.array(map(lambda x: x[1], data)).astype('int64') + label = label.reshape([-1, 1]) + + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.pass_num): + accuracy.reset() + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + if not args.use_fake_data: + image = np.array(map(lambda x: x[0].reshape(dshape), + data)).astype('float32') + label = np.array(map(lambda x: x[1], data)).astype('int64') + label = label.reshape([-1, 1]) + loss, acc, weight = exe.run( + fluid.default_main_program(), + feed={'data': image, + 'label': label}, + fetch_list=[avg_cost, batch_acc, batch_size_tensor]) + iters += 1 + num_samples += label[0] + accuracy.add(value=acc, weight=weight) + train_losses.append(loss) + train_accs.append(acc) + print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % + (pass_id, iters, loss, acc)) + pass_train_acc = accuracy.eval() + # evaluation + if args.with_test: + pass_test_acc = test(exe) + train_elapsed = time.time() - start_time + print("Pass: %d, Loss: %f, Train Accuray: %f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + + examples_per_sec = num_samples / train_elapsed + + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + + if args.use_cprof: + pr.disable() + s = StringIO.StringIO() + sortby = 'cumulative' + ps = pstats.Stats(pr, stream=s).sort_stats(sortby) + ps.print_stats() + print(s.getvalue()) + + +if __name__ == '__main__': + model_map = { + 'resnet_imagenet': resnet_imagenet, + 'resnet_cifar10': resnet_cifar10 + } + args = parse_args() + print_arguments(args) + if args.data_format == 'NHWC': + raise ValueError('Only support NCHW data_format now.') + if args.use_nvprof and args.device == 'GPU': + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + run_benchmark(model_map[args.model], args) + else: + run_benchmark(model_map[args.model], args) diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..663e2efd5392a6cd1a71f51fa0d017070b489341 --- /dev/null +++ b/benchmark/fluid/run.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# This script benchmarking the PaddlePaddle Fluid on +# single thread single GPU. +export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib + +# disable openmp and mkl parallel +#https://github.com/PaddlePaddle/Paddle/issues/7199 +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` +if [ $ht -eq 1 ]; then # HT is OFF + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,0,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="FALSE" + fi +else # HT is ON + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,1,0" + fi +fi +# disable multi-gpu if have more than one +export CUDA_VISIBLE_DEVICES=0 +export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH + + +# vgg16 +# cifar10 gpu cifar10 128 +FLAGS_benchmark=true python fluid/vgg.py \ + --device=GPU \ + --batch_size=128 \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 > vgg16_gpu_128.log + +# resnet50 +# resnet50 gpu cifar10 128 +FLAGS_benchmark=true python fluid/resnet.py \ + --device=GPU \ + --batch_size=128 \ + --data_set=cifar10 \ + --model=resnet_cifar10 \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 > resnet50_gpu_128.log + +# lstm diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..4e063549e0239abf9d946ed8735f0306203509d0 --- /dev/null +++ b/benchmark/fluid/stacked_dynamic_lstm.py @@ -0,0 +1,209 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import cPickle +import os +import random +import time + +import numpy +import paddle.v2 as paddle +import paddle.v2.dataset.imdb as imdb +import paddle.fluid as fluid +from paddle.v2 import batch +import paddle.fluid.profiler as profiler + + +def parse_args(): + parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.") + parser.add_argument( + '--batch_size', + type=int, + default=32, + help='The sequence number of a batch data. (default: %(default)d)') + parser.add_argument( + '--emb_dim', + type=int, + default=512, + help='Dimension of embedding table. (default: %(default)d)') + parser.add_argument( + '--hidden_dim', + type=int, + default=512, + help='Hidden size of lstm unit. (default: %(default)d)') + parser.add_argument( + '--pass_num', + type=int, + default=100, + help='Epoch number to train. (default: %(default)d)') + parser.add_argument( + '--device', + type=str, + default='CPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--crop_size', + type=int, + default=int(os.environ.get('CROP_SIZE', '1500')), + help='The max sentence length of input. Since this model use plain RNN,' + ' Gradient could be explored if sentence is too long') + args = parser.parse_args() + return args + + +word_dict = imdb.word_dict() + + +def crop_sentence(reader, crop_size): + unk_value = word_dict[''] + + def __impl__(): + for item in reader(): + if len([x for x in item[0] if x != unk_value]) < crop_size: + yield item + + return __impl__ + + +def main(): + args = parse_args() + lstm_size = args.hidden_dim + + data = fluid.layers.data( + name="words", shape=[1], lod_level=1, dtype='int64') + sentence = fluid.layers.embedding( + input=data, size=[len(word_dict), args.emb_dim]) + + sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + word = rnn.step_input(sentence) + prev_hidden = rnn.memory(value=0.0, shape=[lstm_size]) + prev_cell = rnn.memory(value=0.0, shape=[lstm_size]) + + def gate_common( + ipt, + hidden, + size, ): + gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True) + gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False) + gate = fluid.layers.sums(input=[gate0, gate1]) + return gate + + forget_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + input_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + output_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + cell_gate = fluid.layers.tanh( + x=gate_common(word, prev_hidden, lstm_size)) + + cell = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul( + x=input_gate, y=cell_gate) + ]) + + hidden = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell)) + + rnn.update_memory(prev_cell, cell) + rnn.update_memory(prev_hidden, hidden) + rnn.output(hidden) + + last = fluid.layers.sequence_pool(rnn(), 'last') + logit = fluid.layers.fc(input=last, size=2, act='softmax') + loss = fluid.layers.cross_entropy( + input=logit, + label=fluid.layers.data( + name='label', shape=[1], dtype='int64')) + loss = fluid.layers.mean(x=loss) + + # add acc + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \ + shape=[1], dtype='int64'), total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + adam = fluid.optimizer.Adam() + adam.minimize(loss) + + fluid.memory_optimize(fluid.default_main_program()) + + place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + def train_loop(pass_num, crop_size): + with profiler.profiler(args.device, 'total') as prof: + for pass_id in range(pass_num): + train_reader = batch( + paddle.reader.shuffle( + crop_sentence(imdb.train(word_dict), crop_size), + buf_size=25000), + batch_size=args.batch_size) + word_nums = 0 + pass_start_time = time.time() + for batch_id, data in enumerate(train_reader()): + tensor_words = to_lodtensor([x[0] for x in data], place) + for x in data: + word_nums += len(x[0]) + label = numpy.array([x[1] for x in data]).astype("int64") + label = label.reshape((-1, 1)) + loss_np, acc, weight = exe.run( + fluid.default_main_program(), + feed={"words": tensor_words, + "label": label}, + fetch_list=[loss, batch_acc, batch_size_tensor]) + print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" % + (pass_id, batch_id, loss_np, acc)) + + pass_end_time = time.time() + time_consumed = pass_end_time - pass_start_time + words_per_sec = word_nums / time_consumed + print("pass_id=%d, sec/pass: %f, words/s: %f" % + (pass_id, time_consumed, words_per_sec)) + + train_loop(args.pass_num, args.crop_size) + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = numpy.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +if __name__ == '__main__': + main() diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..3bf78e4cf08d43127a05c740fa30ca6d2bc416b0 --- /dev/null +++ b/benchmark/fluid/vgg.py @@ -0,0 +1,220 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in Fluid""" +from __future__ import print_function + +import sys +import time +import numpy as np +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import argparse +import functools + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test') +parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') +parser.add_argument( + '--learning_rate', + type=float, + default=1e-3, + help="Learning rate for training.") +parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.") +parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data order, now only support NCHW.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') +parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') +args = parser.parse_args() + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def main(): + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + opts = optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + # Initialize executor + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + # test + def test(exe): + test_accuracy = fluid.average.WeightedAverage() + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape(data_shape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + acc, weight = exe.run(inference_program, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size_tensor]) + test_accuracy.add(value=acc, weight=weight) + return test_accuracy.eval() + + iters, num_samples, start_time = 0, 0, time.time() + accuracy = fluid.average.WeightedAverage() + for pass_id in range(args.pass_num): + accuracy.reset() + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + img_data = np.array(map(lambda x: x[0].reshape(data_shape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + loss, acc, weight = exe.run( + fluid.default_main_program(), + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost, batch_acc, batch_size_tensor]) + accuracy.add(value=acc, weight=weight) + iters += 1 + num_samples += len(data) + print( + "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % + (pass_id, iters, loss, acc) + ) # The accuracy is the accumulation of batches, but not the current batch. + + pass_train_acc = accuracy.eval() + train_losses.append(loss) + train_accs.append(acc) + # evaluation + if args.with_test: + pass_test_acc = test(exe) + train_elapsed = time.time() - start_time + print("Pass: %d, Loss: %f, Train Accuray: %f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == "__main__": + print_arguments() + main() diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst index 614db457d715665073cec1a495d4d7df6887532f..7488ed8137d57785f36b9f1e1ed1269f864960bc 100644 --- a/doc/v2/faq/build_and_install/index_en.rst +++ b/doc/v2/faq/build_and_install/index_en.rst @@ -1,5 +1,143 @@ -############################ -Install, Build and Unit test -############################ +.. _install_faq: -TBD +############################### +Compile, Install, and Unit Test +############################### + +.. contents:: + +1. Insufficient CUDA driver version +---------------------------------------------------------------- + +Many users usually face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is that you may not map the local CUDA driver to a container directory. +You can solve the issue by running the following commands: + +.. code-block:: bash + + $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +For more infomation about Docker's installation and usage, please refer to `PaddlePaddle Docker documentation `_ . + + +2. Version mismatch between PythonLibs and PythonInterpreter +---------------------------------------------------------------- + +It is a common bug when CMake looks up Python. If you install multiple versions of Python, Cmake may find the version mismatch between PythonLibs and PythonInterpreter . You are forced to specify a Python version, as follows. + + .. code-block:: bash + + cmake .. -DPYTHON_EXECUTABLE= -DPYTHON_LIBRARY= -DPYTHON_INCLUDE_DIR= + +You should specify ````, ````, ```` to your local paths. + +3. PaddlePaddle version is 0.0.0 +------------------------------------------------ +This issue would happen when you run the code `paddle version` or `cmake ..` + +.. code-block:: bash + + CMake Warning at cmake/version.cmake:20 (message): + Cannot add paddle version from git tag + +You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then run :code:`cmake` + +4. paddlepaddle\*.whl is not a supported wheel on this platform. +------------------------------------------------------------------------ + +The primary cause for this issue is that it can not find the correct PaddlePaddle installation package that matches your current system.The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12 os including Python2.7 and Pip 9.0.1. + +You can upgrade Pip with the following command\: + +.. code-block:: bash + + pip install --upgrade pip + +If it does not work for you, you can run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the suffix of Python package which your system may support and then compare it with the suffix of your installation. + +If the system supports :code:`linux_x86_64` and the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest + +if the system supports :code:`manylinux_x86_64` and the local installation package is :code:`linux1_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again. + + +5. ImportError: No module named v2 +---------------------------------- +Please uninstall Paddle V1 if you have installed it before. + +.. code-block:: bash + + pip uninstall py_paddle paddle + +Then install Python for PaddlePaddle , enter the build directory and run the following commands + +pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl + +6. Illegal instruction +----------------------- +This issue may be caused by the wrong usage of PaddlePaddle binary version which uses avx SIMD instructions to increase the performance of cpu. Please choose the correct version. + +7. Python unittest fails +-------------------------------- + +If the following python unittest testcases fail: + +.. code-block:: bash + + 24 - test_PyDataProvider (Failed) + 26 - test_RecurrentGradientMachine (Failed) + 27 - test_NetworkCompare (Failed) + 28 - test_PyDataProvider2 (Failed) + 32 - test_Prediction (Failed) + 33 - test_Compare (Failed) + 34 - test_Trainer (Failed) + 35 - test_TrainerOnePass (Failed) + 36 - test_CompareTwoNets (Failed) + 37 - test_CompareTwoOpts (Failed) + 38 - test_CompareSparse (Failed) + 39 - test_recurrent_machine_generation (Failed) + 40 - test_PyDataProviderWrapper (Failed) + 41 - test_config_parser (Failed) + 42 - test_swig_api (Failed) + 43 - layers_test (Failed) + +Please check the PaddlePaddle unittest logs which may suggest the following: + +.. code-block:: bash + + paddle package is already in your PYTHONPATH. But unittest need a clean environment. + Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'. + +The solution is: + +* Remove old PaddlePaddle to make a clean environment for the unit tests. If PaddlePaddle package is already in Python's site-packages, unit tests would refer Python package in site-packages instead of Python package in the :code:`/python` directory of the source directory. Setting :code:`PYTHONPATH` to :code:`/python` is also useless because Python's search path would give the priority to the installed Python package. + + +8. Failed to download the MKLML library +---------------------------------------------- + +.. code-block:: bash + + make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4 + make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2 + make[1]: *** waiting for the unfinished jobs.... + +Cause: The network speed or SSL link causes the MKLML library to download unsuccessfully. + +The solution is: manually download and install, the specific steps are as follows. + +.. code-block:: bash + + // 1. enter the directory + cd build/third_party/mklml/src/extern_mklml + + // 2. check the size of the package, normally 75M, if less than 75M, the download fails + du -sh mklml_lnx_2018.0.1.20171007.tgz + + // 3. manually download and unzip and make the download success tag: + wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz + tar zxf mklml_lnx_2018.0.1.20171007.tgz + touch ../extern_mklml-stamp/extern_mklml-download + + // 4. then compile + diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index a1b913a863cc1853ea3a786d22e6e8baa8c98a02..c277bd7cb69bba899296efe64107ee538c4aa847 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -55,6 +55,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( const ProgramDesc &program) const { auto graph = new SSAGraph(); SSAGraph &result = *graph; + std::unordered_set og_has_been_broadcast; result.vars_.resize(places_.size()); bool is_forwarding = true; @@ -122,9 +123,15 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( if (!is_forwarding) { auto var_names = op->OutputArgumentNames(); + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. But there are no + // other cases, for example, we need to adjust the gradient according to + // the input when we get the gradient, which is not considered at present. for (auto &og : var_names) { - if (grad_names_.count(og) != 0) { // is param grad - // Insert NCCL AllReduce Op + if (grad_names_.count(og) != 0 && + og_has_been_broadcast.count(og) == 0) { // is param grad + // Insert NCCL AllReduce Op + og_has_been_broadcast.insert(og); #ifdef PADDLE_WITH_CUDA result.ops_.emplace_back( new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 893cc15f6c8b34fcfc33554f8ef48ffeb00cd75c..569dda17c6e91d5658c4f8b9ba0b8c8fbd966832 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -22,7 +22,7 @@ namespace paddle { namespace framework { namespace details { -struct OpHandleBase; +class OpHandleBase; // VarHandleBase is the var node in the dependency graph. // A variable can only be generated by a single operator. i.e. diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 91f2db9354c2a00ec7e51ea4595c7cfa00da23ea..577eea92d217f9feb54085b4d9f071494c3c5165 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" -#include -#include "ThreadPool.h" +#include +#include #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index c9c2c1bb721f2c527fa52f45cc54883f639f4ef8..9458d56a01df432aea573d796456b9be31350038 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -10,6 +10,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" @@ -52,7 +55,7 @@ class SelectedRows { private: // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. - // SelectedRows are simplely concated when adding together. Until a + // SelectedRows are simply concated when adding together. Until a // SelectedRows add a Tensor, will the duplicate rows be handled. Vector rows_; std::unique_ptr value_{nullptr}; diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 50eeadab72e71f39325c5eda69e9a3c3e6517d7d..deabcdc99f819851b2df9bb0c7b05a5b339568f3 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -18,6 +18,22 @@ limitations under the License. */ namespace paddle { namespace operators { +static inline framework::OpKernelType ExpectedKernelType( + const framework::ExecutionContext& ctx) { + auto* table_var = ctx.InputVar("W"); + if (table_var->IsType()) { + return framework::OpKernelType( + framework::ToDataType(table_var->Get().type()), + ctx.device_context()); + } else if (table_var->IsType()) { + return framework::OpKernelType( + framework::ToDataType(table_var->Get().value().type()), + ctx.device_context()); + } else { + PADDLE_THROW("W should be LoDTensor or SelectedRows"); + } +} + class LookupTableOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -51,9 +67,7 @@ class LookupTableOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("W")->type()), - ctx.device_context()); + return ExpectedKernelType(ctx); } }; @@ -84,7 +98,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "If the value is -1, it makes no effect to lookup. " "Otherwise the given value indicates padding the output " "with zeros whenever lookup encounters it in Ids.") - .SetDefault(-1); + .SetDefault(kNoPadding); AddComment(R"DOC( Lookup Table Operator. @@ -124,9 +138,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("W")->type()), - ctx.device_context()); + return ExpectedKernelType(ctx); } }; diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index c92ce78eeffb8f1517e61c6d6624d406e04d974d..fff5edda62d4b115605a4cab35ed5457b4db5f21 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -14,6 +14,9 @@ limitations under the License. */ #pragma once +#include +#include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -25,16 +28,37 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +static constexpr int64_t kNoPadding = -1; + +inline size_t getIndex(const std::vector &rows, int64_t value) { + auto it = std::find(rows.begin(), rows.end(), value); + PADDLE_ENFORCE(it != rows.end(), "id should be in rows"); + return static_cast(std::distance(rows.begin(), it)); +} template class LookupTableKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* table_t = context.Input("W"); - auto* ids_var = context.InputVar("Ids"); - Tensor* output_t = context.Output("Out"); + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + auto *ids_var = context.InputVar("Ids"); + Tensor *output_t = context.Output("Out"); + int64_t padding_idx = context.Attr("padding_idx"); + + DDim table_dim; - int64_t* ids; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW("table only support LoDTensor and SelectedRows"); + } + + int64_t *ids; int64_t ids_numel; // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type @@ -42,39 +66,50 @@ class LookupTableKernel : public framework::OpKernel { // when Ids's type is SelectedRows, the rows of Ids contains the // ids to be looked up in W. if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); + auto *ids_t = context.Input("Ids"); + ids = const_cast(ids_t->data()); ids_numel = ids_t->numel(); } else if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().data()); + auto *ids_t = context.Input("Ids"); + ids = const_cast(ids_t->rows().data()); ids_numel = ids_t->rows().size(); - output_t->Resize({ids_numel, table_t->dims()[1]}); + output_t->Resize({ids_numel, table_dim[1]}); } else { PADDLE_THROW("Unsupported Variable Type of Ids"); } - int64_t padding_idx = context.Attr("padding_idx"); + if (table_var->IsType()) { + auto *table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; - int N = table_t->dims()[0]; - int D = table_t->dims()[1]; - auto* table = table_t->data(); - auto* output = output_t->mutable_data(context.GetPlace()); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); - if (padding_idx == -1) { for (int64_t i = 0; i < ids_numel; ++i) { - PADDLE_ENFORCE_LT(ids[i], N); - PADDLE_ENFORCE_GE(ids[i], 0); - memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], row_number); + PADDLE_ENFORCE_GE(ids[i], 0); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(T)); + } } - } else { + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); + int64_t row_width = table_t.value().dims()[1]; + const auto *table = table_t.value().data(); + auto *output = output_t->mutable_data(context.GetPlace()); + for (int64_t i = 0; i < ids_numel; ++i) { - if (ids[i] == padding_idx) { - memset(output + i * D, 0, D * sizeof(T)); + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); } else { - PADDLE_ENFORCE_LT(ids[i], N); PADDLE_ENFORCE_GE(ids[i], 0); - memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + auto id_index = getIndex(table_t.rows(), ids[i]); + memcpy(output + i * row_width, table + id_index * row_width, + row_width * sizeof(T)); } } } @@ -84,17 +119,27 @@ class LookupTableKernel : public framework::OpKernel { template class LookupTableGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW("table only support LoDTensor and SelectedRows"); + } + bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { - auto* ids = context.Input("Ids"); - auto* table = context.Input("W"); - auto* d_output = context.Input(framework::GradVarName("Out")); - auto* d_table = context.Output(framework::GradVarName("W")); + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); - auto* ids_data = ids->data(); + auto *ids_data = ids->data(); auto ids_dim = ids->dims(); framework::Vector new_rows; @@ -104,31 +149,30 @@ class LookupTableGradKernel : public framework::OpKernel { } d_table->set_rows(new_rows); - auto* d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_dim[0], table->dims()[1]}); + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_dim[0], table_dim[1]}); d_table_value->mutable_data(context.GetPlace()); - d_table->set_height(table->dims()[0]); + d_table->set_height(table_dim[0]); - auto* d_output_data = d_output->data(); - auto* d_table_data = d_table_value->data(); + auto *d_output_data = d_output->data(); + auto *d_table_data = d_table_value->data(); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); } else { - auto* ids = context.Input("Ids"); - auto* d_output = context.Input(framework::GradVarName("Out")); - auto* d_table = context.Output(framework::GradVarName("W")); - auto* table = context.Input("W"); + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); - auto* ids_data = ids->data(); + auto *ids_data = ids->data(); auto ids_dim = ids->dims(); - int N = table->dims()[0]; + int N = table_dim[0]; int D = d_output->dims()[1]; - auto* d_output_data = d_output->data(); - auto* d_table_data = d_table->mutable_data(context.GetPlace()); + auto *d_output_data = d_output->data(); + auto *d_table_data = d_table->mutable_data(context.GetPlace()); memset(d_table_data, 0, d_table->numel() * sizeof(T)); diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc index 90f6f955cea51ded2dbb2bde459113458d7749a4..a31d64e899df33f16f707e96d7ff7b85eca8d6ea 100644 --- a/paddle/fluid/operators/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -137,6 +137,8 @@ class NCCLTester : public ::testing::Test { TEST_F(NCCLTester, ncclInitOp) {} // ncclAllReduceOp with desc +// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367 +/* TEST_F(NCCLTester, ncclAllReduceOp) { std::unique_ptr op2(new f::OpDesc); op2->SetType("ncclAllReduce"); @@ -184,6 +186,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { } } } +*/ // ncclReduceOp with desc TEST_F(NCCLTester, ncclReduceOp) { @@ -236,6 +239,8 @@ TEST_F(NCCLTester, ncclReduceOp) { } // ncclBcastOp with desc +// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540 +/* TEST_F(NCCLTester, ncclBcastOp) { std::unique_ptr op2(new f::OpDesc); const int kRoot = 0; @@ -281,3 +286,4 @@ TEST_F(NCCLTester, ncclBcastOp) { ASSERT_NEAR(ct[j], result, 1e-5); } } +*/ diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 141a3eb93555c32efabc2465dc6daadf41c9d659..f9a8058f2a32b6736d6513b017b761a31ddc2e37 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -20,12 +20,29 @@ namespace paddle { namespace operators { namespace reader { -static constexpr size_t kDoubleBufferSize = 2; +// 'Double buffer' means we shall maintain two batches of input data at the same +// time. So the kCacheSize shoul be at least 2. +static constexpr size_t kCacheSize = 2; +// There will be two bacthes out of the channel during training: +// 1. the one waiting to be sent to the channel +// 2. the one just be received from the channel, which is also being used by +// subsequent operators. +// So the channel size should be kChacheSize - 2 +static constexpr size_t kChannelSize = 0; // kCacheSize - 2 class DoubleBufferReader : public framework::DecoratedReader { public: struct Item { Item() : ctx_(nullptr) {} + Item(Item&& b) { + payloads_ = std::move(b.payloads_); + ctx_ = std::move(b.ctx_); + } + Item& operator=(Item&& b) { + payloads_ = std::move(b.payloads_); + ctx_ = std::move(b.ctx_); + return *this; + } std::vector payloads_; platform::DeviceContext* ctx_; @@ -34,42 +51,44 @@ class DoubleBufferReader : public framework::DecoratedReader { explicit DoubleBufferReader( ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) : DecoratedReader(reader), place_(target_place) { - for (size_t i = 0; i < kDoubleBufferSize; ++i) { - if (platform::is_gpu_place(place_)) { #ifdef PADDLE_WITH_CUDA + for (size_t i = 0; i < kCacheSize; ++i) { + if (platform::is_gpu_place(place_)) { ctxs_.emplace_back(new platform::CUDADeviceContext( boost::get(place_))); -#endif } } - - start_thread(); - } - - void start_thread() { - buffer_ = framework::MakeChannel(kDoubleBufferSize); - prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); +#endif + StartPrefetcher(); } + bool HasNext() const override; void ReadNext(std::vector* out) override; void ReInit() override; - ~DoubleBufferReader() { - buffer_->Close(); - prefetcher_.join(); - delete buffer_; + ~DoubleBufferReader() { EndPrefetcher(); } + + private: + void StartPrefetcher() { + channel_ = framework::MakeChannel(kChannelSize); + prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); } - bool HasNext() const override; + void EndPrefetcher() { + channel_->Close(); + if (prefetcher_.joinable()) { + prefetcher_.join(); + } + delete channel_; + channel_ = nullptr; + } - private: void PrefetchThreadFunc(); std::thread prefetcher_; - framework::Channel* buffer_; + framework::Channel* channel_; platform::Place place_; std::vector> ctxs_; - mutable Item local_buffer_; }; class CreateDoubleBufferReaderOp : public framework::OperatorBase { @@ -123,70 +142,70 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { } }; +bool DoubleBufferReader::HasNext() const { + while (!channel_->IsClosed() && !channel_->CanReceive()) { + } + return channel_->CanReceive(); +} + void DoubleBufferReader::ReadNext(std::vector* out) { if (!HasNext()) { PADDLE_THROW("There is no next data!"); } - if (local_buffer_.payloads_.empty()) { - buffer_->Receive(&local_buffer_); - } - *out = local_buffer_.payloads_; - local_buffer_.payloads_.clear(); - if (local_buffer_.ctx_) { - local_buffer_.ctx_->Wait(); + Item batch; + channel_->Receive(&batch); + *out = batch.payloads_; + if (batch.ctx_) { + batch.ctx_->Wait(); } } void DoubleBufferReader::ReInit() { reader_->ReInit(); - buffer_->Close(); - prefetcher_.join(); - delete buffer_; - start_thread(); + EndPrefetcher(); + StartPrefetcher(); } void DoubleBufferReader::PrefetchThreadFunc() { VLOG(5) << "A new prefetch thread starts."; - size_t gpu_ctx_offset = 0; + std::vector> cpu_tensor_cache(kCacheSize); + std::vector> gpu_tensor_cache(kCacheSize); + size_t cached_tensor_id = 0; + while (reader_->HasNext()) { Item batch; - reader_->ReadNext(&batch.payloads_); + auto& cpu_batch = cpu_tensor_cache[cached_tensor_id]; + reader_->ReadNext(&cpu_batch); if (platform::is_gpu_place(place_)) { - std::vector gpu_batch; - auto& gpu_ctx = this->ctxs_[gpu_ctx_offset++]; - gpu_ctx_offset %= this->ctxs_.size(); - gpu_batch.resize(batch.payloads_.size()); - for (size_t i = 0; i < batch.payloads_.size(); ++i) { - framework::TensorCopy(batch.payloads_[i], place_, *gpu_ctx, - &gpu_batch[i]); - gpu_batch[i].set_lod(batch.payloads_[i].lod()); + auto& gpu_batch = gpu_tensor_cache[cached_tensor_id]; + auto* gpu_ctx = ctxs_[cached_tensor_id].get(); + gpu_batch.resize(cpu_batch.size()); + for (size_t i = 0; i < cpu_batch.size(); ++i) { + framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]); + gpu_batch[i].set_lod(cpu_batch[i].lod()); } - batch.ctx_ = gpu_ctx.get(); - std::swap(gpu_batch, batch.payloads_); + batch.payloads_ = gpu_batch; + batch.ctx_ = gpu_ctx; + } else { + // CPUPlace + batch.payloads_ = cpu_batch; } + ++cached_tensor_id; + cached_tensor_id %= kCacheSize; try { - buffer_->Send(&batch); + channel_->Send(&batch); } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The double buffer channel has been closed. The " "prefetch thread will terminate."; break; } } - buffer_->Close(); + channel_->Close(); VLOG(5) << "Prefetch thread terminates."; } -bool DoubleBufferReader::HasNext() const { - if (local_buffer_.payloads_.empty()) { - bool ok = buffer_->Receive(&local_buffer_); - return ok; - } else { - return true; - } -} - } // namespace reader } // namespace operators } // namespace paddle diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index f916295cd7bc762e2052553b321344845f504648..4885b74e6c6644704cff01dbf49975d6e87ce0c4 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -104,7 +104,9 @@ EOF # make install should also be test when unittest make install -j `nproc` pip install /usr/local/opt/paddle/share/wheels/*.whl - paddle version + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then + paddle version + fi fi } @@ -183,6 +185,14 @@ EOF NCCL_DEPS="" fi + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then + PADDLE_VERSION="paddle version" + CMD='"paddle", "version"' + else + PADDLE_VERSION="true" + CMD='"true"' + fi + cat >> /paddle/build/Dockerfile <= line_count and i % line_count == 0: + with open(suffix % indx_f, "w") as f: + dumper(lines, f) + lines = [] + indx_f += 1 + if lines: + with open(suffix % indx_f, "w") as f: + dumper(lines, f) + + +def cluster_files_reader(files_pattern, + trainer_count, + trainer_id, + loader=cPickle.load): + """ + Create a reader that yield element from the given files, select + a file set according trainer count and trainer_id + + :param files_pattern: the files which generating by split(...) + :param trainer_count: total trainer count + :param trainer_id: the trainer rank id + :param loader: is a callable function that load object from file, this + function will be called as loader(f) and f is a file object. + Default is cPickle.load + """ + + def reader(): + if not callable(loader): + raise TypeError("loader should be callable.") + file_list = glob.glob(files_pattern) + file_list.sort() + my_file_list = [] + for idx, fn in enumerate(file_list): + if idx % trainer_count == trainer_id: + print "append file: %s" % fn + my_file_list.append(fn) + for fn in my_file_list: + with open(fn, "r") as f: + lines = loader(f) + for line in lines: + yield line + + return reader + + +def convert(output_path, reader, line_count, name_prefix): + import recordio + """ + Convert data from reader to recordio format files. + + :param output_path: directory in which output files will be saved. + :param reader: a data reader, from which the convert program will read + data instances. + :param name_prefix: the name prefix of generated files. + :param max_lines_to_shuffle: the max lines numbers to shuffle before + writing. + """ + + assert line_count >= 1 + indx_f = 0 + + def write_data(indx_f, lines): + filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f) + writer = recordio.writer(filename) + for l in lines: + # FIXME(Yancey1989): + # dumps with protocol: pickle.HIGHEST_PROTOCOL + writer.write(cPickle.dumps(l)) + writer.close() + + lines = [] + for i, d in enumerate(reader()): + lines.append(d) + if i % line_count == 0 and i >= line_count: + write_data(indx_f, lines) + lines = [] + indx_f += 1 + continue + + write_data(indx_f, lines) diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py new file mode 100644 index 0000000000000000000000000000000000000000..0d544efac9cd20157f87b5cd3b68f97ab5ed2dbc --- /dev/null +++ b/python/paddle/v2/dataset/conll05.py @@ -0,0 +1,257 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Conll05 dataset. +Paddle semantic role labeling Book and demo use this dataset as an example. +Because Conll05 is not free in public, the default downloaded URL is test set +of Conll05 (which is public). Users can change URL and MD5 to their Conll +dataset. And a pre-trained word vector model based on Wikipedia corpus is used +to initialize SRL model. +""" + +import tarfile +import gzip +import itertools +import paddle.v2.dataset.common + +__all__ = ['test, get_dict', 'get_embedding', 'convert'] + +DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' +DATA_MD5 = '387719152ae52d60422c016e92a742fc' +WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' +VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' +TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' +EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' + +UNK_IDX = 0 + + +def load_label_dict(filename): + d = dict() + tag_dict = set() + with open(filename, 'r') as f: + for i, line in enumerate(f): + line = line.strip() + if line.startswith("B-"): + tag_dict.add(line[2:]) + elif line.startswith("I-"): + tag_dict.add(line[2:]) + index = 0 + for tag in tag_dict: + d["B-" + tag] = index + index += 1 + d["I-" + tag] = index + index += 1 + d["O"] = index + return d + + +def load_dict(filename): + d = dict() + with open(filename, 'r') as f: + for i, line in enumerate(f): + d[line.strip()] = i + return d + + +def corpus_reader(data_path, words_name, props_name): + """ + Read one corpus. It returns an iterator. Each element of + this iterator is a tuple including sentence and labels. The sentence is + consist of a list of word IDs. The labels include a list of label IDs. + :return: a iterator of data. + :rtype: iterator + """ + + def reader(): + tf = tarfile.open(data_path) + wf = tf.extractfile(words_name) + pf = tf.extractfile(props_name) + with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile( + fileobj=pf) as props_file: + sentences = [] + labels = [] + one_seg = [] + for word, label in itertools.izip(words_file, props_file): + word = word.strip() + label = label.strip().split() + + if len(label) == 0: # end of sentence + for i in xrange(len(one_seg[0])): + a_kind_lable = [x[i] for x in one_seg] + labels.append(a_kind_lable) + + if len(labels) >= 1: + verb_list = [] + for x in labels[0]: + if x != '-': + verb_list.append(x) + + for i, lbl in enumerate(labels[1:]): + cur_tag = 'O' + is_in_bracket = False + lbl_seq = [] + verb_word = '' + for l in lbl: + if l == '*' and is_in_bracket == False: + lbl_seq.append('O') + elif l == '*' and is_in_bracket == True: + lbl_seq.append('I-' + cur_tag) + elif l == '*)': + lbl_seq.append('I-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') != -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') == -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = True + else: + raise RuntimeError('Unexpected label: %s' % + l) + + yield sentences, verb_list[i], lbl_seq + + sentences = [] + labels = [] + one_seg = [] + else: + sentences.append(word) + one_seg.append(label) + + pf.close() + wf.close() + tf.close() + + return reader + + +def reader_creator(corpus_reader, + word_dict=None, + predicate_dict=None, + label_dict=None): + def reader(): + for sentence, predicate, labels in corpus_reader(): + + sen_len = len(sentence) + + verb_index = labels.index('B-V') + mark = [0] * len(labels) + if verb_index > 0: + mark[verb_index - 1] = 1 + ctx_n1 = sentence[verb_index - 1] + else: + ctx_n1 = 'bos' + + if verb_index > 1: + mark[verb_index - 2] = 1 + ctx_n2 = sentence[verb_index - 2] + else: + ctx_n2 = 'bos' + + mark[verb_index] = 1 + ctx_0 = sentence[verb_index] + + if verb_index < len(labels) - 1: + mark[verb_index + 1] = 1 + ctx_p1 = sentence[verb_index + 1] + else: + ctx_p1 = 'eos' + + if verb_index < len(labels) - 2: + mark[verb_index + 2] = 1 + ctx_p2 = sentence[verb_index + 2] + else: + ctx_p2 = 'eos' + + word_idx = [word_dict.get(w, UNK_IDX) for w in sentence] + + ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len + ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len + ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len + ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len + ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len + + pred_idx = [predicate_dict.get(predicate)] * sen_len + label_idx = [label_dict.get(w) for w in labels] + + yield word_idx, ctx_n2_idx, ctx_n1_idx, \ + ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx + + return reader + + +def get_dict(): + """ + Get the word, verb and label dictionary of Wikipedia corpus. + """ + word_dict = load_dict( + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', + WORDDICT_MD5)) + verb_dict = load_dict( + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', + VERBDICT_MD5)) + label_dict = load_label_dict( + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', + TRGDICT_MD5)) + return word_dict, verb_dict, label_dict + + +def get_embedding(): + """ + Get the trained word vector based on Wikipedia corpus. + """ + return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + + +def test(): + """ + Conll05 test set creator. + + Because the training dataset is not free, the test dataset is used for + training. It returns a reader creator, each sample in the reader is nine + features, including sentence sequence, predicate, predicate context, + predicate context flag and tagged sequence. + + :return: Training reader creator + :rtype: callable + """ + word_dict, verb_dict, label_dict = get_dict() + reader = corpus_reader( + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5), + words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', + props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') + return reader_creator(reader, word_dict, verb_dict, label_dict) + + +def fetch(): + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) + paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test") diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py new file mode 100644 index 0000000000000000000000000000000000000000..7bdddeaabec733ef26b3f766c6437f5c53d65044 --- /dev/null +++ b/python/paddle/v2/dataset/flowers.py @@ -0,0 +1,199 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module will download dataset from +http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html +and parse train/test set intopaddle reader creators. + +This set contains images of flowers belonging to 102 different categories. +The images were acquired by searching the web and taking pictures. There are a +minimum of 40 images for each category. + +The database was used in: + +Nilsback, M-E. and Zisserman, A. Automated flower classification over a large + number of classes.Proceedings of the Indian Conference on Computer Vision, +Graphics and Image Processing (2008) +http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}. + +""" +import cPickle +import itertools +import functools +from common import download +import tarfile +import scipy.io as scio +from paddle.v2.image import * +from paddle.v2.reader import * +import os +import numpy as np +from multiprocessing import cpu_count +__all__ = ['train', 'test', 'valid'] + +DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' +LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' +SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' +DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118' +LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' +SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' +# In official 'readme', tstid is the flag of test data +# and trnid is the flag of train data. But test data is more than train data. +# So we exchange the train data and test data. +TRAIN_FLAG = 'tstid' +TEST_FLAG = 'trnid' +VALID_FLAG = 'valid' + + +def default_mapper(is_train, sample): + ''' + map image bytes data to type needed by model input layer + ''' + img, label = sample + img = load_image_bytes(img) + img = simple_transform( + img, 256, 224, is_train, mean=[103.94, 116.78, 123.68]) + return img.flatten().astype('float32'), label + + +train_mapper = functools.partial(default_mapper, True) +test_mapper = functools.partial(default_mapper, False) + + +def reader_creator(data_file, + label_file, + setid_file, + dataset_name, + mapper, + buffered_size=1024, + use_xmap=True): + ''' + 1. read images from tar file and + merge images into batch files in 102flowers.tgz_batch/ + 2. get a reader to read sample from batch file + + :param data_file: downloaded data file + :type data_file: string + :param label_file: downloaded label file + :type label_file: string + :param setid_file: downloaded setid file containing information + about how to split dataset + :type setid_file: string + :param dataset_name: data set name (tstid|trnid|valid) + :type dataset_name: string + :param mapper: a function to map image bytes data to type + needed by model input layer + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: data reader + :rtype: callable + ''' + labels = scio.loadmat(label_file)['labels'][0] + indexes = scio.loadmat(setid_file)[dataset_name][0] + img2label = {} + for i in indexes: + img = "jpg/image_%05d.jpg" % i + img2label[img] = labels[i - 1] + file_list = batch_images_from_tar(data_file, dataset_name, img2label) + + def reader(): + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'r') as f: + batch = cPickle.load(f) + data = batch['data'] + labels = batch['label'] + for sample, label in itertools.izip(data, batch['label']): + yield sample, int(label) - 1 + + if use_xmap: + return xmap_readers(mapper, reader, cpu_count(), buffered_size) + else: + return map_readers(mapper, reader) + + +def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers training set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: train data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper, + buffered_size, use_xmap) + + +def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers test set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: test data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper, + buffered_size, use_xmap) + + +def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers validation set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: test data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper, + buffered_size, use_xmap) + + +def fetch(): + download(DATA_URL, 'flowers', DATA_MD5) + download(LABEL_URL, 'flowers', LABEL_MD5) + download(SETID_URL, 'flowers', SETID_MD5) diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py new file mode 100644 index 0000000000000000000000000000000000000000..37c4296f9bcea7e16daa46f778934331513c30c4 --- /dev/null +++ b/python/paddle/v2/dataset/imdb.py @@ -0,0 +1,148 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +IMDB dataset. + +This module downloads IMDB dataset from +http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set +of 25,000 highly polar movie reviews for training, and 25,000 for testing. +Besides, this module also provides API for building dictionary. +""" + +import paddle.v2.dataset.common +import collections +import tarfile +import re +import string + +__all__ = ['build_dict', 'train', 'test', 'convert'] + +URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' +MD5 = '7c2ac02c03563afcf9b574c7e56c153a' + + +def tokenize(pattern): + """ + Read files that match the given pattern. Tokenize and yield each file. + """ + + with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb', + MD5)) as tarf: + # Note that we should use tarfile.next(), which does + # sequential access of member files, other than + # tarfile.extractfile, which does random access and might + # destroy hard disks. + tf = tarf.next() + while tf != None: + if bool(pattern.match(tf.name)): + # newline and punctuations removal and ad-hoc tokenization. + yield tarf.extractfile(tf).read().rstrip("\n\r").translate( + None, string.punctuation).lower().split() + tf = tarf.next() + + +def build_dict(pattern, cutoff): + """ + Build a word dictionary from the corpus. Keys of the dictionary are words, + and values are zero-based IDs of these words. + """ + word_freq = collections.defaultdict(int) + for doc in tokenize(pattern): + for word in doc: + word_freq[word] += 1 + + # Not sure if we should prune less-frequent words here. + word_freq = filter(lambda x: x[1] > cutoff, word_freq.items()) + + dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*dictionary)) + word_idx = dict(zip(words, xrange(len(words)))) + word_idx[''] = len(words) + return word_idx + + +def reader_creator(pos_pattern, neg_pattern, word_idx): + UNK = word_idx[''] + INS = [] + + def load(pattern, out, label): + for doc in tokenize(pattern): + out.append(([word_idx.get(w, UNK) for w in doc], label)) + + load(pos_pattern, INS, 0) + load(neg_pattern, INS, 1) + + def reader(): + for doc, label in INS: + yield doc, label + + return reader + + +def train(word_idx): + """ + IMDB training set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + re.compile("aclImdb/train/pos/.*\.txt$"), + re.compile("aclImdb/train/neg/.*\.txt$"), word_idx) + + +def test(word_idx): + """ + IMDB test set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + re.compile("aclImdb/test/pos/.*\.txt$"), + re.compile("aclImdb/test/neg/.*\.txt$"), word_idx) + + +def word_dict(): + """ + Build a word dictionary from the corpus. + + :return: Word dictionary + :rtype: dict + """ + return build_dict( + re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) + + +def fetch(): + paddle.v2.dataset.common.download(URL, 'imdb', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + w = word_dict() + paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") + paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py new file mode 100644 index 0000000000000000000000000000000000000000..617c722c4165cdfed9e650fc968d623ef6ed4391 --- /dev/null +++ b/python/paddle/v2/dataset/imikolov.py @@ -0,0 +1,161 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +imikolov's simple dataset. + +This module will download dataset from +http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set +into paddle reader creators. +""" +import paddle.v2.dataset.common +import collections +import tarfile + +__all__ = ['train', 'test', 'build_dict', 'convert'] + +URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' +MD5 = '30177ea32e27c525793142b6bf2c8e2d' + + +class DataType(object): + NGRAM = 1 + SEQ = 2 + + +def word_count(f, word_freq=None): + if word_freq is None: + word_freq = collections.defaultdict(int) + + for l in f: + for w in l.strip().split(): + word_freq[w] += 1 + word_freq[''] += 1 + word_freq[''] += 1 + + return word_freq + + +def build_dict(min_word_freq=50): + """ + Build a word dictionary from the corpus, Keys of the dictionary are words, + and values are zero-based IDs of these words. + """ + train_filename = './simple-examples/data/ptb.train.txt' + test_filename = './simple-examples/data/ptb.valid.txt' + with tarfile.open( + paddle.v2.dataset.common.download( + paddle.v2.dataset.imikolov.URL, 'imikolov', + paddle.v2.dataset.imikolov.MD5)) as tf: + trainf = tf.extractfile(train_filename) + testf = tf.extractfile(test_filename) + word_freq = word_count(testf, word_count(trainf)) + if '' in word_freq: + # remove for now, since we will set it as last index + del word_freq[''] + + word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items()) + + word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*word_freq_sorted)) + word_idx = dict(zip(words, xrange(len(words)))) + word_idx[''] = len(words) + + return word_idx + + +def reader_creator(filename, word_idx, n, data_type): + def reader(): + with tarfile.open( + paddle.v2.dataset.common.download( + paddle.v2.dataset.imikolov.URL, 'imikolov', + paddle.v2.dataset.imikolov.MD5)) as tf: + f = tf.extractfile(filename) + + UNK = word_idx[''] + for l in f: + if DataType.NGRAM == data_type: + assert n > -1, 'Invalid gram length' + l = [''] + l.strip().split() + [''] + if len(l) >= n: + l = [word_idx.get(w, UNK) for w in l] + for i in range(n, len(l) + 1): + yield tuple(l[i - n:i]) + elif DataType.SEQ == data_type: + l = l.strip().split() + l = [word_idx.get(w, UNK) for w in l] + src_seq = [word_idx['']] + l + trg_seq = l + [word_idx['']] + if n > 0 and len(src_seq) > n: continue + yield src_seq, trg_seq + else: + assert False, 'Unknow data type' + + return reader + + +def train(word_idx, n, data_type=DataType.NGRAM): + """ + imikolov training set creator. + + It returns a reader creator, each sample in the reader is a word ID + tuple. + + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size if type is ngram, otherwise max length of sequence + :type n: int + :param data_type: data type (ngram or sequence) + :type data_type: member variable of DataType (NGRAM or SEQ) + :return: Training reader creator + :rtype: callable + """ + return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n, + data_type) + + +def test(word_idx, n, data_type=DataType.NGRAM): + """ + imikolov test set creator. + + It returns a reader creator, each sample in the reader is a word ID + tuple. + + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size if type is ngram, otherwise max length of sequence + :type n: int + :param data_type: data type (ngram or sequence) + :type data_type: member variable of DataType (NGRAM or SEQ) + :return: Test reader creator + :rtype: callable + """ + return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n, + data_type) + + +def fetch(): + paddle.v2.dataset.common.download(URL, "imikolov", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + N = 5 + word_dict = build_dict() + paddle.v2.dataset.common.convert(path, + train(word_dict, N), 1000, + "imikolov_train") + paddle.v2.dataset.common.convert(path, + test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..9f675bed895223e054cd3bb6e504fe1607f19858 --- /dev/null +++ b/python/paddle/v2/dataset/mnist.py @@ -0,0 +1,123 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MNIST dataset. + +This module will download dataset from http://yann.lecun.com/exdb/mnist/ and +parse training set and test set into paddle reader creators. +""" +import paddle.v2.dataset.common +import subprocess +import numpy +import platform +__all__ = ['train', 'test', 'convert'] + +URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/' +TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' +TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3' +TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz' +TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c' +TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz' +TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' +TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz' +TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' + + +def reader_creator(image_filename, label_filename, buffer_size): + def reader(): + if platform.system() == 'Darwin': + zcat_cmd = 'gzcat' + elif platform.system() == 'Linux': + zcat_cmd = 'zcat' + else: + raise NotImplementedError() + + # According to http://stackoverflow.com/a/38061619/724872, we + # cannot use standard package gzip here. + m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE) + m.stdout.read(16) # skip some magic bytes + + l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE) + l.stdout.read(8) # skip some magic bytes + + try: # reader could be break. + while True: + labels = numpy.fromfile( + l.stdout, 'ubyte', count=buffer_size).astype("int") + + if labels.size != buffer_size: + break # numpy.fromfile returns empty slice after EOF. + + images = numpy.fromfile( + m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape( + (buffer_size, 28 * 28)).astype('float32') + + images = images / 255.0 * 2.0 - 1.0 + + for i in xrange(buffer_size): + yield images[i, :], int(labels[i]) + finally: + m.terminate() + l.terminate() + + return reader + + +def train(): + """ + MNIST training set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', + TRAIN_IMAGE_MD5), + paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', + TRAIN_LABEL_MD5), 100) + + +def test(): + """ + MNIST test set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', + TEST_IMAGE_MD5), + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', + TEST_LABEL_MD5), 100) + + +def fetch(): + paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) + paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py new file mode 100644 index 0000000000000000000000000000000000000000..5b61a9420af1bb81e1d826f8a7b69f34c306d382 --- /dev/null +++ b/python/paddle/v2/dataset/movielens.py @@ -0,0 +1,262 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Movielens 1-M dataset. + +Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 +movies, which was collected by GroupLens Research. This module will download +Movielens 1-M dataset from +http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training +set and test set into paddle reader creators. + +""" + +import zipfile +import paddle.v2.dataset.common +import re +import random +import functools + +__all__ = [ + 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', + 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info', + 'convert' +] + +age_table = [1, 18, 25, 35, 45, 50, 56] + +URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' +MD5 = 'c4d9eecfca2ab87c1945afe126590906' + + +class MovieInfo(object): + """ + Movie id, title and categories information are stored in MovieInfo. + """ + + def __init__(self, index, categories, title): + self.index = int(index) + self.categories = categories + self.title = title + + def value(self): + """ + Get information from a movie. + """ + return [ + self.index, [CATEGORIES_DICT[c] for c in self.categories], + [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()] + ] + + def __str__(self): + return "" % ( + self.index, self.title, self.categories) + + def __repr__(self): + return self.__str__() + + +class UserInfo(object): + """ + User id, gender, age, and job information are stored in UserInfo. + """ + + def __init__(self, index, gender, age, job_id): + self.index = int(index) + self.is_male = gender == 'M' + self.age = age_table.index(int(age)) + self.job_id = int(job_id) + + def value(self): + """ + Get information from a user. + """ + return [self.index, 0 if self.is_male else 1, self.age, self.job_id] + + def __str__(self): + return "" % ( + self.index, "M" + if self.is_male else "F", age_table[self.age], self.job_id) + + def __repr__(self): + return str(self) + + +MOVIE_INFO = None +MOVIE_TITLE_DICT = None +CATEGORIES_DICT = None +USER_INFO = None + + +def __initialize_meta_info__(): + fn = paddle.v2.dataset.common.download(URL, "movielens", MD5) + global MOVIE_INFO + if MOVIE_INFO is None: + pattern = re.compile(r'^(.*)\((\d+)\)$') + with zipfile.ZipFile(file=fn) as package: + for info in package.infolist(): + assert isinstance(info, zipfile.ZipInfo) + MOVIE_INFO = dict() + title_word_set = set() + categories_set = set() + with package.open('ml-1m/movies.dat') as movie_file: + for i, line in enumerate(movie_file): + movie_id, title, categories = line.strip().split('::') + categories = categories.split('|') + for c in categories: + categories_set.add(c) + title = pattern.match(title).group(1) + MOVIE_INFO[int(movie_id)] = MovieInfo( + index=movie_id, categories=categories, title=title) + for w in title.split(): + title_word_set.add(w.lower()) + + global MOVIE_TITLE_DICT + MOVIE_TITLE_DICT = dict() + for i, w in enumerate(title_word_set): + MOVIE_TITLE_DICT[w] = i + + global CATEGORIES_DICT + CATEGORIES_DICT = dict() + for i, c in enumerate(categories_set): + CATEGORIES_DICT[c] = i + + global USER_INFO + USER_INFO = dict() + with package.open('ml-1m/users.dat') as user_file: + for line in user_file: + uid, gender, age, job, _ = line.strip().split("::") + USER_INFO[int(uid)] = UserInfo( + index=uid, gender=gender, age=age, job_id=job) + return fn + + +def __reader__(rand_seed=0, test_ratio=0.1, is_test=False): + fn = __initialize_meta_info__() + rand = random.Random(x=rand_seed) + with zipfile.ZipFile(file=fn) as package: + with package.open('ml-1m/ratings.dat') as rating: + for line in rating: + if (rand.random() < test_ratio) == is_test: + uid, mov_id, rating, _ = line.strip().split("::") + uid = int(uid) + mov_id = int(mov_id) + rating = float(rating) * 2 - 5.0 + + mov = MOVIE_INFO[mov_id] + usr = USER_INFO[uid] + yield usr.value() + mov.value() + [[rating]] + + +def __reader_creator__(**kwargs): + return lambda: __reader__(**kwargs) + + +train = functools.partial(__reader_creator__, is_test=False) +test = functools.partial(__reader_creator__, is_test=True) + + +def get_movie_title_dict(): + """ + Get movie title dictionary. + """ + __initialize_meta_info__() + return MOVIE_TITLE_DICT + + +def __max_index_info__(a, b): + if a.index > b.index: + return a + else: + return b + + +def max_movie_id(): + """ + Get the maximum value of movie id. + """ + __initialize_meta_info__() + return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index + + +def max_user_id(): + """ + Get the maximum value of user id. + """ + __initialize_meta_info__() + return reduce(__max_index_info__, USER_INFO.viewvalues()).index + + +def __max_job_id_impl__(a, b): + if a.job_id > b.job_id: + return a + else: + return b + + +def max_job_id(): + """ + Get the maximum value of job id. + """ + __initialize_meta_info__() + return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id + + +def movie_categories(): + """ + Get movie categoriges dictionary. + """ + __initialize_meta_info__() + return CATEGORIES_DICT + + +def user_info(): + """ + Get user info dictionary. + """ + __initialize_meta_info__() + return USER_INFO + + +def movie_info(): + """ + Get movie info dictionary. + """ + __initialize_meta_info__() + return MOVIE_INFO + + +def unittest(): + for train_count, _ in enumerate(train()()): + pass + for test_count, _ in enumerate(test()()): + pass + + print train_count, test_count + + +def fetch(): + paddle.v2.dataset.common.download(URL, "movielens", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") + + +if __name__ == '__main__': + unittest() diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py new file mode 100644 index 0000000000000000000000000000000000000000..d3b3dd524c34be660c5f2d4fc5ce2fa0420efbc1 --- /dev/null +++ b/python/paddle/v2/dataset/mq2007.py @@ -0,0 +1,333 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MQ2007 dataset + +MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross +validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set, +validation set and testing set. + +MQ2007 dataset from website +http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar and parse training set and test set into paddle reader creators + +""" + +import os +import functools +import rarfile +from common import download +import numpy as np + +# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar" +URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar" +MD5 = "7be1640ae95c6408dab0ae7207bdc706" + + +def __initialize_meta_info__(): + """ + download and extract the MQ2007 dataset + """ + fn = fetch() + rar = rarfile.RarFile(fn) + dirpath = os.path.dirname(fn) + rar.extractall(path=dirpath) + return dirpath + + +class Query(object): + """ + queries used for learning to rank algorithms. It is created from relevance scores, query-document feature vectors + + Parameters: + ---------- + query_id : int + query_id in dataset, mapping from query to relevance documents + relevance_score : int + relevance score of query and document pair + feature_vector : array, dense feature + feature in vector format + description : string + comment section in query doc pair data + """ + + def __init__(self, + query_id=-1, + relevance_score=-1, + feature_vector=None, + description=""): + self.query_id = query_id + self.relevance_score = relevance_score + if feature_vector is None: + self.feature_vector = [] + else: + self.feature_vector = feature_vector + self.description = description + + def __str__(self): + string = "%s %s %s" % (str(self.relevance_score), str(self.query_id), + " ".join(str(f) for f in self.feature_vector)) + return string + + # @classmethod + def _parse_(self, text): + """ + parse line into Query + """ + comment_position = text.find('#') + line = text[:comment_position].strip() + self.description = text[comment_position + 1:].strip() + parts = line.split() + if len(parts) != 48: + sys.stdout.write("expect 48 space split parts, get %d" % + (len(parts))) + return None + # format : 0 qid:10 1:0.000272 2:0.000000 .... + self.relevance_score = int(parts[0]) + self.query_id = int(parts[1].split(':')[1]) + for p in parts[2:]: + pair = p.split(':') + self.feature_vector.append(float(pair[1])) + return self + + +class QueryList(object): + """ + group query into list, every item in list is a Query + """ + + def __init__(self, querylist=None): + self.query_id = -1 + if querylist is None: + self.querylist = [] + else: + self.querylist = querylist + for query in self.querylist: + if self.query_id == -1: + self.query_id = query.query_id + else: + if self.query_id != query.query_id: + raise ValueError("query in list must be same query_id") + + def __iter__(self): + for query in self.querylist: + yield query + + def __len__(self): + return len(self.querylist) + + def __getitem__(self, i): + return self.querylist[i] + + def _correct_ranking_(self): + if self.querylist is None: + return + self.querylist.sort(key=lambda x: x.relevance_score, reverse=True) + + def _add_query(self, query): + if self.query_id == -1: + self.query_id = query.query_id + else: + if self.query_id != query.query_id: + raise ValueError("query in list must be same query_id") + self.querylist.append(query) + + +def gen_plain_txt(querylist): + """ + gen plain text in list for other usage + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + + return : + ------ + query_id : np.array, shape=(samples_num, ) + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + for query in querylist: + yield querylist.query_id, query.relevance_score, np.array( + query.feature_vector) + + +def gen_point(querylist): + """ + gen item in list for point-wise learning to rank algorithm + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + + return : + ------ + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + for query in querylist: + yield query.relevance_score, np.array(query.feature_vector) + + +def gen_pair(querylist, partial_order="full"): + """ + gen pair for pair-wise learning to rank algorithm + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + pairtial_order : "full" or "neighbour" + there is redudant in all possiable pair combinations, which can be simplifed + gen pairs for neighbour items or the full partial order pairs + + return : + ------ + label : np.array, shape=(1) + query_left : np.array, shape=(1, feature_dimension) + query_right : same as left + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + labels = [] + docpairs = [] + + # C(n,2) + for i in range(len(querylist)): + query_left = querylist[i] + for j in range(i + 1, len(querylist)): + query_right = querylist[j] + if query_left.relevance_score > query_right.relevance_score: + labels.append([1]) + docpairs.append([ + np.array(query_left.feature_vector), + np.array(query_right.feature_vector) + ]) + elif query_left.relevance_score < query_right.relevance_score: + labels.append([1]) + docpairs.append([ + np.array(query_right.feature_vector), + np.array(query_left.feature_vector) + ]) + for label, pair in zip(labels, docpairs): + yield np.array(label), pair[0], pair[1] + + +def gen_list(querylist): + """ + gen item in list for list-wise learning to rank algorithm + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + + return : + ------ + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + relevance_score_list = [[query.relevance_score] for query in querylist] + feature_vector_list = [query.feature_vector for query in querylist] + yield np.array(relevance_score_list), np.array(feature_vector_list) + + +def query_filter(querylists): + """ + filter query get only document with label 0. + label 0, 1, 2 means the relevance score document with query + parameters : + querylist : QueyList list + + return : + querylist : QueyList list + """ + filter_query = [] + for querylist in querylists: + relevance_score_list = [query.relevance_score for query in querylist] + if sum(relevance_score_list) != .0: + filter_query.append(querylist) + return filter_query + + +def load_from_text(filepath, shuffle=False, fill_missing=-1): + """ + parse data file into querys + """ + prev_query_id = -1 + querylists = [] + querylist = None + fn = __initialize_meta_info__() + with open(os.path.join(fn, filepath)) as f: + for line in f: + query = Query() + query = query._parse_(line) + if query == None: + continue + if query.query_id != prev_query_id: + if querylist is not None: + querylists.append(querylist) + querylist = QueryList() + prev_query_id = query.query_id + querylist._add_query(query) + if querylist is not None: + querylists.append(querylist) + return querylists + + +def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1): + """ + Parameters + -------- + filename : string + fill_missing : fill the missing value. default in MQ2007 is -1 + + Returns + ------ + yield + label query_left, query_right # format = "pairwise" + label querylist # format = "listwise" + """ + querylists = query_filter( + load_from_text( + filepath, shuffle=shuffle, fill_missing=fill_missing)) + for querylist in querylists: + if format == "plain_txt": + yield next(gen_plain_txt(querylist)) + elif format == "pointwise": + yield next(gen_point(querylist)) + elif format == "pairwise": + for pair in gen_pair(querylist): + yield pair + elif format == "listwise": + yield next(gen_list(querylist)) + + +train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt") +test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt") + + +def fetch(): + return download(URL, "MQ2007", MD5) + + +if __name__ == "__main__": + fetch() + mytest = functools.partial( + __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise") + for label, query in mytest(): + print label, query diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b9757c1a75d215cf8945b5cedbb1239fd43af7 --- /dev/null +++ b/python/paddle/v2/dataset/sentiment.py @@ -0,0 +1,141 @@ +# /usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The script fetch and preprocess movie_reviews data set that provided by NLTK + +TODO(yuyang18): Complete dataset. +""" + +import collections +from itertools import chain + +import nltk +from nltk.corpus import movie_reviews + +import paddle.v2.dataset.common + +__all__ = ['train', 'test', 'get_word_dict', 'convert'] +NUM_TRAINING_INSTANCES = 1600 +NUM_TOTAL_INSTANCES = 2000 + + +def download_data_if_not_yet(): + """ + Download the data set, if the data set is not download. + """ + try: + # make sure that nltk can find the data + if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path: + nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME) + movie_reviews.categories() + except LookupError: + print "Downloading movie_reviews data set, please wait....." + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + print "Download data set success....." + print "Path is " + nltk.data.find('corpora/movie_reviews').path + + +def get_word_dict(): + """ + Sorted the words by the frequency of words which occur in sample + :return: + words_freq_sorted + """ + words_freq_sorted = list() + word_freq_dict = collections.defaultdict(int) + download_data_if_not_yet() + + for category in movie_reviews.categories(): + for field in movie_reviews.fileids(category): + for words in movie_reviews.words(field): + word_freq_dict[words] += 1 + words_sort_list = word_freq_dict.items() + words_sort_list.sort(cmp=lambda a, b: b[1] - a[1]) + for index, word in enumerate(words_sort_list): + words_freq_sorted.append((word[0], index)) + return words_freq_sorted + + +def sort_files(): + """ + Sorted the sample for cross reading the sample + :return: + files_list + """ + files_list = list() + neg_file_list = movie_reviews.fileids('neg') + pos_file_list = movie_reviews.fileids('pos') + files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list))) + return files_list + + +def load_sentiment_data(): + """ + Load the data set + :return: + data_set + """ + data_set = list() + download_data_if_not_yet() + words_ids = dict(get_word_dict()) + for sample_file in sort_files(): + words_list = list() + category = 0 if 'neg' in sample_file else 1 + for word in movie_reviews.words(sample_file): + words_list.append(words_ids[word.lower()]) + data_set.append((words_list, category)) + return data_set + + +def reader_creator(data): + """ + Reader creator, generate an iterator for data set + :param data: + train data set or test data set + """ + for each in data: + yield each[0], each[1] + + +def train(): + """ + Default training set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[0:NUM_TRAINING_INSTANCES]) + + +def test(): + """ + Default test set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[NUM_TRAINING_INSTANCES:]) + + +def fetch(): + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") + paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e0e18229da7818be5752ee592e094a00da286ad9 --- /dev/null +++ b/python/paddle/v2/dataset/tests/cifar_test.py @@ -0,0 +1,56 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.cifar +import unittest + + +class TestCIFAR(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + for l in reader(): + self.assertEqual(l[0].size, 3072) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_test10(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.test10()) + self.assertEqual(instances, 10000) + self.assertEqual(max_label_value, 9) + + def test_train10(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.train10()) + self.assertEqual(instances, 50000) + self.assertEqual(max_label_value, 9) + + def test_test100(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.test100()) + self.assertEqual(instances, 10000) + self.assertEqual(max_label_value, 99) + + def test_train100(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.train100()) + self.assertEqual(instances, 50000) + self.assertEqual(max_label_value, 99) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py new file mode 100644 index 0000000000000000000000000000000000000000..cfa194eba38ea70311c4deeac2635dc0a0103576 --- /dev/null +++ b/python/paddle/v2/dataset/tests/common_test.py @@ -0,0 +1,94 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.common +import unittest +import tempfile +import glob + + +class TestCommon(unittest.TestCase): + def test_md5file(self): + _, temp_path = tempfile.mkstemp() + with open(temp_path, 'w') as f: + f.write("Hello\n") + self.assertEqual('09f7e02f1290be211da707a266f153b3', + paddle.v2.dataset.common.md5file(temp_path)) + + def test_download(self): + yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460' + self.assertEqual( + paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460', + paddle.v2.dataset.common.download( + yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d')) + + def test_split(self): + def test_reader(): + def reader(): + for x in xrange(10): + yield x + + return reader + + _, temp_path = tempfile.mkstemp() + paddle.v2.dataset.common.split( + test_reader(), 4, suffix=temp_path + '/test-%05d.pickle') + files = glob.glob(temp_path + '/test-%05d.pickle') + self.assertEqual(len(files), 3) + + def test_cluster_file_reader(self): + _, temp_path = tempfile.mkstemp() + for x in xrange(5): + with open(temp_path + '/%05d.test' % x) as f: + f.write('%d\n' % x) + reader = paddle.v2.dataset.common.cluster_files_reader( + temp_path + '/*.test', 5, 0) + for idx, e in enumerate(reader()): + self.assertEqual(e, str("0")) + + def test_convert(self): + record_num = 10 + num_shards = 4 + + def test_reader(): + def reader(): + for x in xrange(record_num): + yield x + + return reader + + path = tempfile.mkdtemp() + paddle.v2.dataset.common.convert(path, + test_reader(), num_shards, + 'random_images') + + files = glob.glob(path + '/random_images-*') + self.assertEqual(len(files), num_shards) + + recs = [] + for i in range(0, num_shards): + n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1) + r = recordio.reader(n) + while True: + d = r.read() + if d is None: + break + recs.append(d) + + recs.sort() + self.assertEqual(total, record_num) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py new file mode 100644 index 0000000000000000000000000000000000000000..a8ae9a07acc22eb9d3c0cc5ebb07f8f11ed21233 --- /dev/null +++ b/python/paddle/v2/dataset/tests/flowers_test.py @@ -0,0 +1,51 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.flowers +import unittest + + +class TestFlowers(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + size = 224 * 224 * 3 + for l in reader(): + self.assertEqual(l[0].size, size) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_train(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.train()) + self.assertEqual(instances, 6149) + self.assertEqual(max_label_value, 102) + + def test_test(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.test()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + def test_valid(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.valid()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c4d82f26895d77d05c6e936bd636b1239e1a0cd8 --- /dev/null +++ b/python/paddle/v2/dataset/tests/imdb_test.py @@ -0,0 +1,57 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.imdb +import unittest +import re + +TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$") +TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$") +TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$") + +TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$") +TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$") +TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$") + + +class TestIMDB(unittest.TestCase): + word_idx = None + + def test_build_dict(self): + if self.word_idx == None: + self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, + 150) + + self.assertEqual(len(self.word_idx), 7036) + + def check_dataset(self, dataset, expected_size): + if self.word_idx == None: + self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, + 150) + + sum = 0 + for l in dataset(self.word_idx): + self.assertEqual(l[1], sum % 2) + sum += 1 + self.assertEqual(sum, expected_size) + + def test_train(self): + self.check_dataset(paddle.v2.dataset.imdb.train, 25000) + + def test_test(self): + self.check_dataset(paddle.v2.dataset.imdb.test, 25000) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py new file mode 100644 index 0000000000000000000000000000000000000000..714a75d6f1ff31697eec2d893d350a726d6390fe --- /dev/null +++ b/python/paddle/v2/dataset/tests/imikolov_test.py @@ -0,0 +1,67 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.imikolov +import unittest + +WORD_DICT = paddle.v2.dataset.imikolov.build_dict() + + +class TestMikolov(unittest.TestCase): + def check_reader(self, reader, n): + for l in reader(): + self.assertEqual(len(l), n) + + def test_train(self): + n = 5 + self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n) + + first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\ + 'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\ + 'rake regatta rubens sim snack-food ssangyong swapo wachter' + first_line = [ + WORD_DICT.get(ch, WORD_DICT['']) + for ch in first_line.split(' ') + ] + for l in paddle.v2.dataset.imikolov.train( + WORD_DICT, n=-1, + data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): + read_line = l[0][1:] + break + self.assertEqual(first_line, read_line) + + def test_test(self): + n = 5 + self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n) + + first_line = 'consumers may want to move their telephones a little '\ + 'closer to the tv set' + first_line = [ + WORD_DICT.get(ch, WORD_DICT['']) + for ch in first_line.split(' ') + ] + for l in paddle.v2.dataset.imikolov.test( + WORD_DICT, n=-1, + data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): + read_line = l[0][1:] + break + self.assertEqual(first_line, read_line) + + def test_total(self): + _, idx = zip(*WORD_DICT.items()) + self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1d344cac3e7483a351033570fbec75a4d19f4a55 --- /dev/null +++ b/python/paddle/v2/dataset/tests/mnist_test.py @@ -0,0 +1,44 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.mnist +import unittest + + +class TestMNIST(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + for l in reader(): + self.assertEqual(l[0].size, 784) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_train(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.mnist.train()) + self.assertEqual(instances, 60000) + self.assertEqual(max_label_value, 9) + + def test_test(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.mnist.test()) + self.assertEqual(instances, 10000) + self.assertEqual(max_label_value, 9) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/mq2007_test.py b/python/paddle/v2/dataset/tests/mq2007_test.py new file mode 100644 index 0000000000000000000000000000000000000000..59847b6c18eadb12123cae824e8bce1051a69d4c --- /dev/null +++ b/python/paddle/v2/dataset/tests/mq2007_test.py @@ -0,0 +1,33 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.mq2007 +import unittest + + +class TestMQ2007(unittest.TestCase): + def test_pairwise(self): + for label, query_left, query_right in paddle.v2.dataset.mq2007.test( + format="pairwise"): + self.assertEqual(query_left.shape(), (46, )) + self.assertEqual(query_right.shape(), (46, )) + + def test_listwise(self): + for label_array, query_array in paddle.v2.dataset.mq2007.test( + format="listwise"): + self.assertEqual(len(label_array), len(query_array)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..407405290734609059c1767600748d530e8a13a6 --- /dev/null +++ b/python/paddle/v2/dataset/tests/test_sentiment.py @@ -0,0 +1,55 @@ +# /usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import nltk +import paddle.v2.dataset.sentiment as st +from nltk.corpus import movie_reviews + + +class TestSentimentMethods(unittest.TestCase): + def test_get_word_dict(self): + word_dict = st.get_word_dict()[0:10] + test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3), + (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7), + (u'is', 8), (u'in', 9)] + for idx, each in enumerate(word_dict): + self.assertEqual(each, test_word_list[idx]) + self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path) + + def test_sort_files(self): + last_label = '' + for sample_file in st.sort_files(): + current_label = sample_file.split("/")[0] + self.assertNotEqual(current_label, last_label) + last_label = current_label + + def test_data_set(self): + data_set = st.load_sentiment_data() + last_label = -1 + for each in st.test(): + self.assertNotEqual(each[1], last_label) + last_label = each[1] + self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES) + self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES) + self.assertEqual( + len(list(st.test())), + (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py new file mode 100644 index 0000000000000000000000000000000000000000..31e72ebf5eac0508d12783f9ceaa6eef0fa6d353 --- /dev/null +++ b/python/paddle/v2/dataset/tests/voc2012_test.py @@ -0,0 +1,42 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.voc2012 +import unittest + + +class TestVOC(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + for l in reader(): + self.assertEqual(l[0].size, 3 * l[1].size) + sum += 1 + return sum + + def test_train(self): + count = self.check_reader(paddle.v2.dataset.voc_seg.train()) + self.assertEqual(count, 2913) + + def test_test(self): + count = self.check_reader(paddle.v2.dataset.voc_seg.test()) + self.assertEqual(count, 1464) + + def test_val(self): + count = self.check_reader(paddle.v2.dataset.voc_seg.val()) + self.assertEqual(count, 1449) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/wmt16_test.py b/python/paddle/v2/dataset/tests/wmt16_test.py new file mode 100644 index 0000000000000000000000000000000000000000..cef6c3216e7de8d9785a063976e63f88d90b24df --- /dev/null +++ b/python/paddle/v2/dataset/tests/wmt16_test.py @@ -0,0 +1,66 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.wmt16 +import unittest + + +class TestWMT16(unittest.TestCase): + def checkout_one_sample(self, sample): + # train data has 3 field: source language word indices, + # target language word indices, and target next word indices. + self.assertEqual(len(sample), 3) + + # test start mark and end mark in source word indices. + self.assertEqual(sample[0][0], 0) + self.assertEqual(sample[0][-1], 1) + + # test start mask in target word indices + self.assertEqual(sample[1][0], 0) + + # test en mask in target next word indices + self.assertEqual(sample[2][-1], 1) + + def test_train(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.train( + src_dict_size=100000, trg_dict_size=100000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_test(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.test( + src_dict_size=1000, trg_dict_size=1000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_val(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.validation( + src_dict_size=1000, trg_dict_size=1000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_get_dict(self): + dict_size = 1000 + word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True) + self.assertEqual(len(word_dict), dict_size) + self.assertEqual(word_dict[0], "") + self.assertEqual(word_dict[1], "") + self.assertEqual(word_dict[2], "") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py new file mode 100644 index 0000000000000000000000000000000000000000..f10bf7e42a1ead09b3eba0d61e55701215e4360f --- /dev/null +++ b/python/paddle/v2/dataset/uci_housing.py @@ -0,0 +1,134 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +UCI Housing dataset. + +This module will download dataset from +https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and +parse training set and test set into paddle reader creators. +""" + +import numpy as np +import os +import paddle.v2.dataset.common +from paddle.v2.parameters import Parameters + +__all__ = ['train', 'test'] + +URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data' +MD5 = 'd4accdce7a25600298819f8e28e8d593' +feature_names = [ + 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', + 'PTRATIO', 'B', 'LSTAT', 'convert' +] + +UCI_TRAIN_DATA = None +UCI_TEST_DATA = None +URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar' +MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b' + + +def feature_range(maximums, minimums): + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + fig, ax = plt.subplots() + feature_num = len(maximums) + ax.bar(range(feature_num), maximums - minimums, color='r', align='center') + ax.set_title('feature scale') + plt.xticks(range(feature_num), feature_names) + plt.xlim([-1, feature_num]) + fig.set_figheight(6) + fig.set_figwidth(10) + if not os.path.exists('./image'): + os.makedirs('./image') + fig.savefig('image/ranges.png', dpi=48) + plt.close(fig) + + +def load_data(filename, feature_num=14, ratio=0.8): + global UCI_TRAIN_DATA, UCI_TEST_DATA + if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None: + return + + data = np.fromfile(filename, sep=' ') + data = data.reshape(data.shape[0] / feature_num, feature_num) + maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( + axis=0) / data.shape[0] + feature_range(maximums[:-1], minimums[:-1]) + for i in xrange(feature_num - 1): + data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) + offset = int(data.shape[0] * ratio) + UCI_TRAIN_DATA = data[:offset] + UCI_TEST_DATA = data[offset:] + + +def train(): + """ + UCI_HOUSING training set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Training reader creator + :rtype: callable + """ + global UCI_TRAIN_DATA + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TRAIN_DATA: + yield d[:-1], d[-1:] + + return reader + + +def test(): + """ + UCI_HOUSING test set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Test reader creator + :rtype: callable + """ + global UCI_TEST_DATA + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TEST_DATA: + yield d[:-1], d[-1:] + + return reader + + +def model(): + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', + MD5_MODEL) + with open(tar_file, 'r') as f: + parameters = Parameters.from_tar(f) + return parameters + + +def fetch(): + paddle.v2.dataset.common.download(URL, 'uci_housing', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py new file mode 100644 index 0000000000000000000000000000000000000000..617e212d67fbe37f9d9663e9c83c62045411fa77 --- /dev/null +++ b/python/paddle/v2/dataset/voc2012.py @@ -0,0 +1,85 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image dataset for segmentation. +The 2012 dataset contains images from 2008-2011 for which additional +segmentations have been prepared. As in previous years the assignment +to training/test sets has been maintained. The total number of images +with segmentation has been increased from 7,062 to 9,993. +""" + +import tarfile +import io +import numpy as np +from paddle.v2.dataset.common import download +from paddle.v2.image import * +from PIL import Image + +__all__ = ['train', 'test', 'val'] + +VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\ +VOCtrainval_11-May-2012.tar' + +VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd' +SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' +DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg' +LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png' + +CACHE_DIR = 'voc2012' + + +def reader_creator(filename, sub_name): + + tarobject = tarfile.open(filename) + name2mem = {} + for ele in tarobject.getmembers(): + name2mem[ele.name] = ele + + def reader(): + set_file = SET_FILE.format(sub_name) + sets = tarobject.extractfile(name2mem[set_file]) + for line in sets: + line = line.strip() + data_file = DATA_FILE.format(line) + label_file = LABEL_FILE.format(line) + data = tarobject.extractfile(name2mem[data_file]).read() + label = tarobject.extractfile(name2mem[label_file]).read() + data = Image.open(io.BytesIO(data)) + label = Image.open(io.BytesIO(label)) + data = np.array(data) + label = np.array(label) + yield data, label + + return reader + + +def train(): + """ + Create a train dataset reader containing 2913 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval') + + +def test(): + """ + Create a test dataset reader containing 1464 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train') + + +def val(): + """ + Create a val dataset reader containing 1449 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val') diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py new file mode 100644 index 0000000000000000000000000000000000000000..5104e29051e4480f3a7eb18421f1b519841b009b --- /dev/null +++ b/python/paddle/v2/dataset/wmt14.py @@ -0,0 +1,182 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +WMT14 dataset. +The original WMT14 dataset is too large and a small set of data for set is +provided. This module will download dataset from +http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and +parse training set and test set into paddle reader creators. + +""" +import tarfile +import gzip + +import paddle.v2.dataset.common +from paddle.v2.parameters import Parameters + +__all__ = [ + 'train', + 'test', + 'get_dict', + 'convert', +] + +URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/' + 'cslm_joint_paper/data/dev+test.tgz') +MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' +# this is a small set of data for test. The original data is too large and +# will be add later. +URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' + 'wmt_shrinked_data/wmt14.tgz') +MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' +# BLEU of this trained model is 26.92 +URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' + +START = "" +END = "" +UNK = "" +UNK_IDX = 2 + + +def __read_to_dict(tar_file, dict_size): + def __to_dict(fd, size): + out_dict = dict() + for line_count, line in enumerate(fd): + if line_count < size: + out_dict[line.strip()] = line_count + else: + break + return out_dict + + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith("src.dict") + ] + assert len(names) == 1 + src_dict = __to_dict(f.extractfile(names[0]), dict_size) + names = [ + each_item.name for each_item in f + if each_item.name.endswith("trg.dict") + ] + assert len(names) == 1 + trg_dict = __to_dict(f.extractfile(names[0]), dict_size) + return src_dict, trg_dict + + +def reader_creator(tar_file, file_name, dict_size): + def reader(): + src_dict, trg_dict = __read_to_dict(tar_file, dict_size) + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith(file_name) + ] + for name in names: + for line in f.extractfile(name): + line_split = line.strip().split('\t') + if len(line_split) != 2: + continue + src_seq = line_split[0] # one source sequence + src_words = src_seq.split() + src_ids = [ + src_dict.get(w, UNK_IDX) + for w in [START] + src_words + [END] + ] + + trg_seq = line_split[1] # one target sequence + trg_words = trg_seq.split() + trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words] + + # remove sequence whose length > 80 in training mode + if len(src_ids) > 80 or len(trg_ids) > 80: + continue + trg_ids_next = trg_ids + [trg_dict[END]] + trg_ids = [trg_dict[START]] + trg_ids + + yield src_ids, trg_ids, trg_ids_next + + return reader + + +def train(dict_size): + """ + WMT14 training set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'train/train', dict_size) + + +def test(dict_size): + """ + WMT14 test set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. + + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'test/test', dict_size) + + +def gen(dict_size): + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'gen/gen', dict_size) + + +def model(): + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) + with gzip.open(tar_file, 'r') as f: + parameters = Parameters.from_tar(f) + return parameters + + +def get_dict(dict_size, reverse=True): + # if reverse = False, return dict = {'a':'001', 'b':'002', ...} + # else reverse = true, return dict = {'001':'a', '002':'b', ...} + tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + src_dict, trg_dict = __read_to_dict(tar_file, dict_size) + if reverse: + src_dict = {v: k for k, v in src_dict.items()} + trg_dict = {v: k for k, v in trg_dict.items()} + return src_dict, trg_dict + + +def fetch(): + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) + + +def convert(path): + """ + Converts dataset to recordio format + """ + dict_size = 30000 + paddle.v2.dataset.common.convert(path, + train(dict_size), 1000, "wmt14_train") + paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test") diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py new file mode 100644 index 0000000000000000000000000000000000000000..c8818f715beadd9499ae588f2c19a57fbf26f372 --- /dev/null +++ b/python/paddle/v2/dataset/wmt16.py @@ -0,0 +1,349 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +ACL2016 Multimodal Machine Translation. Please see this website for more +details: http://www.statmt.org/wmt16/multimodal-task.html#task1 + +If you use the dataset created for your task, please cite the following paper: +Multi30K: Multilingual English-German Image Descriptions. + +@article{elliott-EtAl:2016:VL16, + author = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.}, + title = {Multi30K: Multilingual English-German Image Descriptions}, + booktitle = {Proceedings of the 6th Workshop on Vision and Language}, + year = {2016}, + pages = {70--74}, + year = 2016 +} +""" + +import os +import tarfile +import gzip +from collections import defaultdict + +import paddle.v2.dataset.common + +__all__ = [ + "train", + "test", + "validation", + "convert", + "fetch", + "get_dict", +] + +DATA_URL = ("http://cloud.dlnel.org/filepub/" + "?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed") +DATA_MD5 = "0c38be43600334966403524a40dcd81e" + +TOTAL_EN_WORDS = 11250 +TOTAL_DE_WORDS = 19220 + +START_MARK = "" +END_MARK = "" +UNK_MARK = "" + + +def __build_dict(tar_file, dict_size, save_path, lang): + word_dict = defaultdict(int) + with tarfile.open(tar_file, mode="r") as f: + for line in f.extractfile("wmt16/train"): + line_split = line.strip().split("\t") + if len(line_split) != 2: continue + sen = line_split[0] if lang == "en" else line_split[1] + for w in sen.split(): + word_dict[w] += 1 + + with open(save_path, "w") as fout: + fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)) + for idx, word in enumerate( + sorted( + word_dict.iteritems(), key=lambda x: x[1], reverse=True)): + if idx + 3 == dict_size: break + fout.write("%s\n" % (word[0])) + + +def __load_dict(tar_file, dict_size, lang, reverse=False): + dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME, + "wmt16/%s_%d.dict" % (lang, dict_size)) + if not os.path.exists(dict_path) or ( + len(open(dict_path, "r").readlines()) != dict_size): + __build_dict(tar_file, dict_size, dict_path, lang) + + word_dict = {} + with open(dict_path, "r") as fdict: + for idx, line in enumerate(fdict): + if reverse: + word_dict[idx] = line.strip() + else: + word_dict[line.strip()] = idx + return word_dict + + +def __get_dict_size(src_dict_size, trg_dict_size, src_lang): + src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else + TOTAL_DE_WORDS)) + trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else + TOTAL_ENG_WORDS)) + return src_dict_size, trg_dict_size + + +def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang): + def reader(): + src_dict = __load_dict(tar_file, src_dict_size, src_lang) + trg_dict = __load_dict(tar_file, trg_dict_size, + ("de" if src_lang == "en" else "en")) + + # the indice for start mark, end mark, and unk are the same in source + # language and target language. Here uses the source language + # dictionary to determine their indices. + start_id = src_dict[START_MARK] + end_id = src_dict[END_MARK] + unk_id = src_dict[UNK_MARK] + + src_col = 0 if src_lang == "en" else 1 + trg_col = 1 - src_col + + with tarfile.open(tar_file, mode="r") as f: + for line in f.extractfile(file_name): + line_split = line.strip().split("\t") + if len(line_split) != 2: + continue + src_words = line_split[src_col].split() + src_ids = [start_id] + [ + src_dict.get(w, unk_id) for w in src_words + ] + [end_id] + + trg_words = line_split[trg_col].split() + trg_ids = [trg_dict.get(w, unk_id) for w in trg_words] + + trg_ids_next = trg_ids + [end_id] + trg_ids = [start_id] + trg_ids + + yield src_ids, trg_ids, trg_ids_next + + return reader + + +def train(src_dict_size, trg_dict_size, src_lang="en"): + """ + WMT16 train set reader. + + This function returns the reader for train data. Each sample the reader + returns is made up of three fields: the source language word index sequence, + target language word index sequence and next word index sequence. + + + NOTE: + The original like for training data is: + http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz + + paddle.dataset.wmt16 provides a tokenized version of the original dataset by + using moses's tokenization script: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl + + Args: + src_dict_size(int): Size of the source language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + trg_dict_size(int): Size of the target language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + src_lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + + Returns: + callable: The train reader. + """ + + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. Only support: " + "en (for English); de(for Germany).") + src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, + src_lang) + + return reader_creator( + tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), + file_name="wmt16/train", + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang) + + +def test(src_dict_size, trg_dict_size, src_lang="en"): + """ + WMT16 test set reader. + + This function returns the reader for test data. Each sample the reader + returns is made up of three fields: the source language word index sequence, + target language word index sequence and next word index sequence. + + NOTE: + The original like for test data is: + http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz + + paddle.dataset.wmt16 provides a tokenized version of the original dataset by + using moses's tokenization script: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl + + Args: + src_dict_size(int): Size of the source language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + trg_dict_size(int): Size of the target language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + src_lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + + Returns: + callable: The test reader. + """ + + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. " + "Only support: en (for English); de(for Germany).") + + src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, + src_lang) + + return reader_creator( + tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), + file_name="wmt16/test", + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang) + + +def validation(src_dict_size, trg_dict_size, src_lang="en"): + """ + WMT16 validation set reader. + + This function returns the reader for validation data. Each sample the reader + returns is made up of three fields: the source language word index sequence, + target language word index sequence and next word index sequence. + + NOTE: + The original like for validation data is: + http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz + + paddle.dataset.wmt16 provides a tokenized version of the original dataset by + using moses's tokenization script: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl + + Args: + src_dict_size(int): Size of the source language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + trg_dict_size(int): Size of the target language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + src_lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + + Returns: + callable: The validation reader. + """ + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. " + "Only support: en (for English); de(for Germany).") + src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, + src_lang) + + return reader_creator( + tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), + file_name="wmt16/val", + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang) + + +def get_dict(lang, dict_size, reverse=False): + """ + return the word dictionary for the specified language. + + Args: + lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + dict_size(int): Size of the specified language dictionary. + reverse(bool): If reverse is set to False, the returned python + dictionary will use word as key and use index as value. + If reverse is set to True, the returned python + dictionary will use index as key and word as value. + + Returns: + dict: The word dictionary for the specific language. + """ + + if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS) + else: dict_size = min(dict_size, TOTAL_DE_WORDS) + + dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME, + "wmt16/%s_%d.dict" % (lang, dict_size)) + assert os.path.exists(dict_path), "Word dictionary does not exist. " + "Please invoke paddle.dataset.wmt16.train/test/validation first " + "to build the dictionary." + tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz") + return __load_dict(tar_file, dict_size, lang, reverse) + + +def fetch(): + """download the entire dataset. + """ + paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz") + + +def convert(path, src_dict_size, trg_dict_size, src_lang): + """Converts dataset to recordio format. + """ + + paddle.v2.dataset.common.convert( + path, + train( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang), + 1000, + "wmt16_train") + paddle.v2.dataset.common.convert( + path, + test( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang), + 1000, + "wmt16_test") + paddle.v2.dataset.common.convert( + path, + validation( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang), + 1000, + "wmt16_validation") diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py new file mode 100644 index 0000000000000000000000000000000000000000..9235c41e9eb95b25a0dc53a494a203e7a4525981 --- /dev/null +++ b/python/paddle/v2/image.py @@ -0,0 +1,381 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file contains some common interfaces for image preprocess. +Many users are confused about the image layout. We introduce +the image layout as follows. + +- CHW Layout + + - The abbreviations: C=channel, H=Height, W=Width + - The default layout of image opened by cv2 or PIL is HWC. + PaddlePaddle only supports the CHW layout. And CHW is simply + a transpose of HWC. It must transpose the input image. + +- Color format: RGB or BGR + + OpenCV use BGR color format. PIL use RGB color format. Both + formats can be used for training. Noted that, the format should + be keep consistent between the training and inference peroid. +""" +import numpy as np +try: + import cv2 +except ImportError: + cv2 = None +import os +import tarfile +import cPickle + +__all__ = [ + "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop", + "random_crop", "left_right_flip", "simple_transform", "load_and_transform", + "batch_images_from_tar" +] + + +def batch_images_from_tar(data_file, + dataset_name, + img2label, + num_per_batch=1024): + """ + Read images from tar file and batch them into batch file. + + :param data_file: path of image tar file + :type data_file: string + :param dataset_name: 'train','test' or 'valid' + :type dataset_name: string + :param img2label: a dic with image file name as key + and image's label as value + :type img2label: dic + :param num_per_batch: image number per batch file + :type num_per_batch: int + :return: path of list file containing paths of batch file + :rtype: string + """ + batch_dir = data_file + "_batch" + out_path = "%s/%s" % (batch_dir, dataset_name) + meta_file = "%s/%s.txt" % (batch_dir, dataset_name) + + if os.path.exists(out_path): + return meta_file + else: + os.makedirs(out_path) + + tf = tarfile.open(data_file) + mems = tf.getmembers() + data = [] + labels = [] + file_id = 0 + for mem in mems: + if mem.name in img2label: + data.append(tf.extractfile(mem).read()) + labels.append(img2label[mem.name]) + if len(data) == num_per_batch: + output = {} + output['label'] = labels + output['data'] = data + cPickle.dump( + output, + open('%s/batch_%d' % (out_path, file_id), 'w'), + protocol=cPickle.HIGHEST_PROTOCOL) + file_id += 1 + data = [] + labels = [] + if len(data) > 0: + output = {} + output['label'] = labels + output['data'] = data + cPickle.dump( + output, + open('%s/batch_%d' % (out_path, file_id), 'w'), + protocol=cPickle.HIGHEST_PROTOCOL) + + with open(meta_file, 'a') as meta: + for file in os.listdir(out_path): + meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n") + return meta_file + + +def load_image_bytes(bytes, is_color=True): + """ + Load an color or gray image from bytes array. + + Example usage: + + .. code-block:: python + + with open('cat.jpg') as f: + im = load_image_bytes(f.read()) + + :param bytes: the input image bytes array. + :type bytes: str + :param is_color: If set is_color True, it will load and + return a color image. Otherwise, it will + load and return a gray image. + :type is_color: bool + """ + flag = 1 if is_color else 0 + file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8) + img = cv2.imdecode(file_bytes, flag) + return img + + +def load_image(file, is_color=True): + """ + Load an color or gray image from the file path. + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + + :param file: the input image path. + :type file: string + :param is_color: If set is_color True, it will load and + return a color image. Otherwise, it will + load and return a gray image. + :type is_color: bool + """ + # cv2.IMAGE_COLOR for OpenCV3 + # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version + # cv2.IMAGE_GRAYSCALE for OpenCV3 + # cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV Version + # Here, use constant 1 and 0 + # 1: COLOR, 0: GRAYSCALE + flag = 1 if is_color else 0 + im = cv2.imread(file, flag) + return im + + +def resize_short(im, size): + """ + Resize an image so that the length of shorter edge is size. + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + im = resize_short(im, 256) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the shorter edge size of image after resizing. + :type size: int + """ + h, w = im.shape[:2] + h_new, w_new = size, size + if h > w: + h_new = size * h / w + else: + w_new = size * w / h + im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + return im + + +def to_chw(im, order=(2, 0, 1)): + """ + Transpose the input image order. The image layout is HWC format + opened by cv2 or PIL. Transpose the input image to CHW layout + according the order (2,0,1). + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + im = resize_short(im, 256) + im = to_chw(im) + + :param im: the input image with HWC layout. + :type im: ndarray + :param order: the transposed order. + :type order: tuple|list + """ + assert len(im.shape) == len(order) + im = im.transpose(order) + return im + + +def center_crop(im, size, is_color=True): + """ + Crop the center of image with size. + + Example usage: + + .. code-block:: python + + im = center_crop(im, 224) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the cropping size. + :type size: int + :param is_color: whether the image is color or not. + :type is_color: bool + """ + h, w = im.shape[:2] + h_start = (h - size) / 2 + w_start = (w - size) / 2 + h_end, w_end = h_start + size, w_start + size + if is_color: + im = im[h_start:h_end, w_start:w_end, :] + else: + im = im[h_start:h_end, w_start:w_end] + return im + + +def random_crop(im, size, is_color=True): + """ + Randomly crop input image with size. + + Example usage: + + .. code-block:: python + + im = random_crop(im, 224) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the cropping size. + :type size: int + :param is_color: whether the image is color or not. + :type is_color: bool + """ + h, w = im.shape[:2] + h_start = np.random.randint(0, h - size + 1) + w_start = np.random.randint(0, w - size + 1) + h_end, w_end = h_start + size, w_start + size + if is_color: + im = im[h_start:h_end, w_start:w_end, :] + else: + im = im[h_start:h_end, w_start:w_end] + return im + + +def left_right_flip(im, is_color=True): + """ + Flip an image along the horizontal direction. + Return the flipped image. + + Example usage: + + .. code-block:: python + + im = left_right_flip(im) + + :param im: input image with HWC layout or HW layout for gray image + :type im: ndarray + :param is_color: whether input image is color or not + :type is_color: bool + """ + if len(im.shape) == 3 and is_color: + return im[:, ::-1, :] + else: + return im[:, ::-1] + + +def simple_transform(im, + resize_size, + crop_size, + is_train, + is_color=True, + mean=None): + """ + Simply data argumentation for training. These operations include + resizing, croping and flipping. + + Example usage: + + .. code-block:: python + + im = simple_transform(im, 256, 224, True) + + :param im: The input image with HWC layout. + :type im: ndarray + :param resize_size: The shorter edge length of the resized image. + :type resize_size: int + :param crop_size: The cropping size. + :type crop_size: int + :param is_train: Whether it is training or not. + :type is_train: bool + :param is_color: whether the image is color or not. + :type is_color: bool + :param mean: the mean values, which can be element-wise mean values or + mean values per channel. + :type mean: numpy array | list + """ + im = resize_short(im, resize_size) + if is_train: + im = random_crop(im, crop_size, is_color=is_color) + if np.random.randint(2) == 0: + im = left_right_flip(im, is_color) + else: + im = center_crop(im, crop_size, is_color) + im = center_crop(im, crop_size, is_color=is_color) + if len(im.shape) == 3: + im = to_chw(im) + + im = im.astype('float32') + if mean is not None: + mean = np.array(mean, dtype=np.float32) + # mean value, may be one value per channel + if mean.ndim == 1 and is_color: + mean = mean[:, np.newaxis, np.newaxis] + elif mean.ndim == 1: + mean = mean + else: + # elementwise mean + assert len(mean.shape) == len(im) + im -= mean + + return im + + +def load_and_transform(filename, + resize_size, + crop_size, + is_train, + is_color=True, + mean=None): + """ + Load image from the input file `filename` and transform image for + data argumentation. Please refer to the `simple_transform` interface + for the transform operations. + + Example usage: + + .. code-block:: python + + im = load_and_transform('cat.jpg', 256, 224, True) + + :param filename: The file name of input image. + :type filename: string + :param resize_size: The shorter edge length of the resized image. + :type resize_size: int + :param crop_size: The cropping size. + :type crop_size: int + :param is_train: Whether it is training or not. + :type is_train: bool + :param is_color: whether the image is color or not. + :type is_color: bool + :param mean: the mean values, which can be element-wise mean values or + mean values per channel. + :type mean: numpy array | list + """ + im = load_image(filename, is_color) + im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean) + return im diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py new file mode 100644 index 0000000000000000000000000000000000000000..317cf037c69f8639e3760fbfce20565127794fcb --- /dev/null +++ b/python/paddle/v2/minibatch.py @@ -0,0 +1,41 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['batch'] + + +def batch(reader, batch_size): + """ + Create a batched reader. + + :param reader: the data reader to read from. + :type reader: callable + :param batch_size: size of each mini-batch + :type batch_size: int + :return: the batched reader. + :rtype: callable + """ + + def batch_reader(): + r = reader() + b = [] + for instance in r: + b.append(instance) + if len(b) == batch_size: + yield b + b = [] + if b: + yield b + + return batch_reader diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3b059735a924d58714cd88a761eb83143f1192d6 --- /dev/null +++ b/python/paddle/v2/reader/__init__.py @@ -0,0 +1,74 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +At training and testing time, PaddlePaddle programs need to read data. To ease +the users' work to write data reading code, we define that + +- A *reader* is a function that reads data (from file, network, random number + generator, etc) and yields data items. +- A *reader creator* is a function that returns a reader function. +- A *reader decorator* is a function, which accepts one or more readers, and + returns a reader. +- A *batch reader* is a function that reads data (from *reader*, file, network, + random number generator, etc) and yields a batch of data items. + +##################### +Data Reader Interface +##################### + +Indeed, *data reader* doesn't have to be a function that reads and yields data +items. It can be any function with no parameter that creates a iterable +(anything can be used in :code:`for x in iterable`)\: + +.. code-block:: python + + iterable = data_reader() + +Element produced from the iterable should be a **single** entry of data, +**not** a mini batch. That entry of data could be a single item, or a tuple of +items. +Item should be of `supported type `_ (e.g., numpy 1d +array of float32, int, list of int) + +An example implementation for single item data reader creator: + +.. code-block:: python + + def reader_creator_random_image(width, height): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height) + return reader + +An example implementation for multiple item data reader creator: + +.. code-block:: python + + def reader_creator_random_image_and_label(width, height, label): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height), label + return reader + + +TODO(yuyang18): Should we add whole design doc here? +""" + +import decorator +from decorator import * + +import creator + +__all__ = decorator.__all__ + ['creator'] diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py new file mode 100644 index 0000000000000000000000000000000000000000..fda5246d74f598200b439774a25e80ec3e504077 --- /dev/null +++ b/python/paddle/v2/reader/creator.py @@ -0,0 +1,130 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Creator package contains some simple reader creator, which could +be used in user program. +""" + +__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader'] + + +def np_array(x): + """ + Creates a reader that yields elements of x, if it is a + numpy vector. Or rows of x, if it is a numpy matrix. + Or any sub-hyperplane indexed by the highest dimension. + + :param x: the numpy array to create reader from. + :returns: data reader created from x. + """ + + def reader(): + if x.ndim < 1: + yield x + + for e in x: + yield e + + return reader + + +def text_file(path): + """ + Creates a data reader that outputs text line by line from given text file. + Trailing new line ('\\\\n') of each line will be removed. + + :path: path of the text file. + :returns: data reader of text file + """ + + def reader(): + f = open(path, "r") + for l in f: + yield l.rstrip('\n') + f.close() + + return reader + + +def recordio(paths, buf_size=100): + """ + Creates a data reader from given RecordIO file paths separated by ",", + glob pattern is supported. + :path: path of recordio files, can be a string or a string list. + :returns: data reader of recordio files. + """ + + import recordio as rec + import paddle.v2.reader.decorator as dec + import cPickle as pickle + + def reader(): + if isinstance(paths, basestring): + path = paths + else: + path = ",".join(paths) + f = rec.reader(path) + while True: + r = f.read() + if r is None: + break + yield pickle.loads(r) + f.close() + + return dec.buffered(reader, buf_size) + + +pass_num = 0 + + +def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64): + """ + Create a data reader that yield a record one by one from + the paths: + :paths: path of recordio files, can be a string or a string list. + :etcd_endpoints: the endpoints for etcd cluster + :returns: data reader of recordio files. + + .. code-block:: python + from paddle.v2.reader.creator import cloud_reader + etcd_endpoints = "http://127.0.0.1:2379" + trainer.train.( + reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints), + ) + """ + import os + import cPickle as pickle + import paddle.v2.master as master + c = master.client(etcd_endpoints, timeout_sec, buf_size) + + if isinstance(paths, basestring): + path = [paths] + else: + path = paths + c.set_dataset(path) + + def reader(): + global pass_num + c.paddle_start_get_records(pass_num) + pass_num += 1 + + while True: + r, e = c.next_record() + if not r: + if e != -2: + print "get record error: ", e + break + yield pickle.loads(r) + + return reader diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py new file mode 100644 index 0000000000000000000000000000000000000000..44a6e344630bb35d28ee29078bf8727053a24bef --- /dev/null +++ b/python/paddle/v2/reader/decorator.py @@ -0,0 +1,405 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [ + 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' +] + +from threading import Thread +import subprocess + +from Queue import Queue +import itertools +import random +import zlib + + +def map_readers(func, *readers): + """ + Creates a data reader that outputs return value of function using + output of each data readers as arguments. + + :param func: function to use. The type of func should be (Sample) => Sample + :type: callable + :param readers: readers whose outputs will be used as arguments of func. + :return: the created data reader. + :rtype: callable + """ + + def reader(): + rs = [] + for r in readers: + rs.append(r()) + for e in itertools.imap(func, *rs): + yield e + + return reader + + +def shuffle(reader, buf_size): + """ + Creates a data reader whose data output is shuffled. + + Output from the iterator that created by original reader will be + buffered into shuffle buffer, and then shuffled. The size of shuffle buffer + is determined by argument buf_size. + + :param reader: the original reader whose output will be shuffled. + :type reader: callable + :param buf_size: shuffle buffer size. + :type buf_size: int + + :return: the new reader whose output is shuffled. + :rtype: callable + """ + + def data_reader(): + buf = [] + for e in reader(): + buf.append(e) + if len(buf) >= buf_size: + random.shuffle(buf) + for b in buf: + yield b + buf = [] + + if len(buf) > 0: + random.shuffle(buf) + for b in buf: + yield b + + return data_reader + + +def chain(*readers): + """ + Creates a data reader whose output is the outputs of input data + readers chained together. + + If input readers output following data entries: + [0, 0, 0] + [1, 1, 1] + [2, 2, 2] + The chained reader will output: + [0, 0, 0, 1, 1, 1, 2, 2, 2] + + :param readers: input readers. + :return: the new data reader. + :rtype: callable + """ + + def reader(): + rs = [] + for r in readers: + rs.append(r()) + + for e in itertools.chain(*rs): + yield e + + return reader + + +class ComposeNotAligned(ValueError): + pass + + +def compose(*readers, **kwargs): + """ + Creates a data reader whose output is the combination of input readers. + + If input readers output following data entries: + (1, 2) 3 (4, 5) + The composed reader will output: + (1, 2, 3, 4, 5) + + :param readers: readers that will be composed together. + :param check_alignment: if True, will check if input readers are aligned + correctly. If False, will not check alignment and trailing outputs + will be discarded. Defaults to True. + :type check_alignment: bool + + :return: the new data reader. + + :raises ComposeNotAligned: outputs of readers are not aligned. + Will not raise when check_alignment is set to False. + """ + check_alignment = kwargs.pop('check_alignment', True) + + def make_tuple(x): + if isinstance(x, tuple): + return x + else: + return (x, ) + + def reader(): + rs = [] + for r in readers: + rs.append(r()) + if not check_alignment: + for outputs in itertools.izip(*rs): + yield sum(map(make_tuple, outputs), ()) + else: + for outputs in itertools.izip_longest(*rs): + for o in outputs: + if o is None: + # None will be not be present if compose is aligned + raise ComposeNotAligned( + "outputs of readers are not aligned.") + yield sum(map(make_tuple, outputs), ()) + + return reader + + +def buffered(reader, size): + """ + Creates a buffered data reader. + + The buffered data reader will read and save data entries into a + buffer. Reading from the buffered data reader will proceed as long + as the buffer is not empty. + + :param reader: the data reader to read from. + :type reader: callable + :param size: max buffer size. + :type size: int + + :returns: the buffered data reader. + """ + + class EndSignal(): + pass + + end = EndSignal() + + def read_worker(r, q): + for d in r: + q.put(d) + q.put(end) + + def data_reader(): + r = reader() + q = Queue(maxsize=size) + t = Thread( + target=read_worker, args=( + r, + q, )) + t.daemon = True + t.start() + e = q.get() + while e != end: + yield e + e = q.get() + + return data_reader + + +def firstn(reader, n): + """ + Limit the max number of samples that reader could return. + + :param reader: the data reader to read from. + :type reader: callable + :param n: the max number of samples that return. + :type n: int + :return: the decorated reader. + :rtype: callable + """ + + # TODO(yuyang18): Check if just drop the reader, could clean the opened + # resource or not? + + def firstn_reader(): + for i, item in enumerate(reader()): + if i == n: + break + yield item + + return firstn_reader + + +class XmapEndSignal(): + pass + + +def xmap_readers(mapper, reader, process_num, buffer_size, order=False): + """ + Use multiprocess to map samples from reader by a mapper defined by user. + And this function contains a buffered decorator. + :param mapper: a function to map sample. + :type mapper: callable + :param reader: the data reader to read from + :type reader: callable + :param process_num: process number to handle original sample + :type process_num: int + :param buffer_size: max buffer size + :type buffer_size: int + :param order: keep the order of reader + :type order: bool + :return: the decarated reader + :rtype: callable + """ + end = XmapEndSignal() + + # define a worker to read samples from reader to in_queue + def read_worker(reader, in_queue): + for i in reader(): + in_queue.put(i) + in_queue.put(end) + + # define a worker to read samples from reader to in_queue with order flag + def order_read_worker(reader, in_queue): + in_order = 0 + for i in reader(): + in_queue.put((in_order, i)) + in_order += 1 + in_queue.put(end) + + # define a worker to handle samples from in_queue by mapper + # and put mapped samples into out_queue + def handle_worker(in_queue, out_queue, mapper): + sample = in_queue.get() + while not isinstance(sample, XmapEndSignal): + r = mapper(sample) + out_queue.put(r) + sample = in_queue.get() + in_queue.put(end) + out_queue.put(end) + + # define a worker to handle samples from in_queue by mapper + # and put mapped samples into out_queue by order + def order_handle_worker(in_queue, out_queue, mapper, out_order): + ins = in_queue.get() + while not isinstance(ins, XmapEndSignal): + order, sample = ins + r = mapper(sample) + while order != out_order[0]: + pass + out_queue.put(r) + out_order[0] += 1 + ins = in_queue.get() + in_queue.put(end) + out_queue.put(end) + + def xreader(): + in_queue = Queue(buffer_size) + out_queue = Queue(buffer_size) + out_order = [0] + # start a read worker in a thread + target = order_read_worker if order else read_worker + t = Thread(target=target, args=(reader, in_queue)) + t.daemon = True + t.start() + # start several handle_workers + target = order_handle_worker if order else handle_worker + args = (in_queue, out_queue, mapper, out_order) if order else ( + in_queue, out_queue, mapper) + workers = [] + for i in xrange(process_num): + worker = Thread(target=target, args=args) + worker.daemon = True + workers.append(worker) + for w in workers: + w.start() + + sample = out_queue.get() + while not isinstance(sample, XmapEndSignal): + yield sample + sample = out_queue.get() + finish = 1 + while finish < process_num: + sample = out_queue.get() + if isinstance(sample, XmapEndSignal): + finish += 1 + else: + yield sample + + return xreader + + +def _buf2lines(buf, line_break="\n"): + # FIXME: line_break should be automatically configured. + lines = buf.split(line_break) + return lines[:-1], lines[-1] + + +class PipeReader: + """ + PipeReader read data by stream from a command, take it's + stdout into a pipe buffer and redirect it to the parser to + parse, then yield data as your desired format. + + You can using standard linux command or call another program + to read data, from HDFS, Ceph, URL, AWS S3 etc: + + .. code-block:: python + cmd = "hadoop fs -cat /path/to/some/file" + cmd = "cat sample_file.tar.gz" + cmd = "curl http://someurl" + cmd = "python print_s3_bucket.py" + + An example: + + .. code-block:: python + + def example_reader(): + for f in myfiles: + pr = PipeReader("cat %s"%f) + for l in pr.get_line(): + sample = l.split(" ") + yield sample + """ + + def __init__(self, command, bufsize=8192, file_type="plain"): + if not isinstance(command, str): + raise TypeError("left_cmd must be a string") + if file_type == "gzip": + self.dec = zlib.decompressobj( + 32 + zlib.MAX_WBITS) # offset 32 to skip the header + self.file_type = file_type + self.bufsize = bufsize + self.process = subprocess.Popen( + command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + + def get_line(self, cut_lines=True, line_break="\n"): + """ + :param cut_lines: cut buffer to lines + :type cut_lines: bool + :param line_break: line break of the file, like \n or \r + :type line_break: string + + :return: one line or a buffer of bytes + :rtype: string + """ + remained = "" + while True: + buff = self.process.stdout.read(self.bufsize) + if buff: + if self.file_type == "gzip": + decomp_buff = self.dec.decompress(buff) + elif self.file_type == "plain": + decomp_buff = buff + else: + raise TypeError("file_type %s is not allowed" % + self.file_type) + + if cut_lines: + lines, remained = _buf2lines(''.join( + [remained, decomp_buff]), line_break) + for line in lines: + yield line + else: + yield decomp_buff + else: + break diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..107d5912e1567e0c8721987a281272c7feb51e63 --- /dev/null +++ b/python/paddle/v2/reader/tests/CMakeLists.txt @@ -0,0 +1,2 @@ +py_test(creator_test SRCS creator_test.py) +py_test(decorator_test SRCS decorator_test.py) diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eca2dce114b069bf9b455d77ce670d73b5047fd2 --- /dev/null +++ b/python/paddle/v2/reader/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7fe374e663607607cd0839eb6ca9c70c4d15eef8 --- /dev/null +++ b/python/paddle/v2/reader/tests/creator_test.py @@ -0,0 +1,74 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright PaddlePaddle contributors. All Rights Reservedd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest +import numpy as np +import paddle.v2.reader.creator + + +class TestNumpyArray(unittest.TestCase): + def test_numpy_array(self): + l = [[1, 2, 3], [4, 5, 6]] + x = np.array(l, np.int32) + reader = paddle.v2.reader.creator.np_array(x) + for idx, e in enumerate(reader()): + self.assertItemsEqual(e, l[idx]) + + +class TestTextFile(unittest.TestCase): + def test_text_file(self): + path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt") + reader = paddle.v2.reader.creator.text_file(path) + for idx, e in enumerate(reader()): + self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1)) + + +class TestRecordIO(unittest.TestCase): + def do_test(self, path): + reader = paddle.v2.reader.creator.recordio(path) + idx = 0 + for e in reader(): + if idx == 0: + self.assertEqual(e, (1, 2, 3)) + elif idx == 1: + self.assertEqual(e, (4, 5, 6)) + idx += 1 + self.assertEqual(idx, 2) + + def test_recordIO(self): + self.do_test( + os.path.join( + os.path.dirname(__file__), "test_reader_recordio.dat")) + self.do_test([ + os.path.join( + os.path.dirname(__file__), "test_reader_recordio.dat") + ]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..6b680e39f3fb299a14e7d8162470996d1d16b83d --- /dev/null +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -0,0 +1,178 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import unittest + +import paddle.v2.reader + + +def reader_creator_10(dur): + def reader(): + for i in range(10): + # this invocation helps testing paddle.reader.buffer + time.sleep(dur) + yield i + + return reader + + +class TestMap(unittest.TestCase): + def test_map(self): + d = {"h": 0, "i": 1} + + def tokenize(x): + return d[x] + + def read(): + yield "h" + yield "i" + + r = paddle.v2.reader.map_readers(tokenize, read) + for i, e in enumerate(r()): + self.assertEqual(e, i) + + +class TestBuffered(unittest.TestCase): + def test_read(self): + for size in range(20): + b = paddle.v2.reader.buffered(reader_creator_10(0), size) + c = 0 + for i in b(): + self.assertEqual(i, c) + c += 1 + self.assertEqual(c, 10) + + def test_buffering(self): + # read have 30ms delay. + b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10) + last_time = time.time() + for idx, i in enumerate(b()): + elapsed_time = time.time() - last_time + if i == 0: + time.sleep(0.3) + else: + # read time should be short, meaning already buffered. + self.assertLess(elapsed_time, 0.05) + last_time = time.time() + + +class TestCompose(unittest.TestCase): + def test_compse(self): + reader = paddle.v2.reader.compose( + reader_creator_10(0), reader_creator_10(0)) + for idx, e in enumerate(reader()): + self.assertEqual(e, (idx, idx)) + + def test_compose_not_aligned(self): + total = 0 + reader = paddle.v2.reader.compose( + paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)), + reader_creator_10(0)) + with self.assertRaises(paddle.v2.reader.ComposeNotAligned): + for e in reader(): + total += 1 + # expecting 10, not 20 + self.assertEqual(total, 10) + + def test_compose_not_aligned_no_check(self): + total = 0 + reader = paddle.v2.reader.compose( + paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)), + reader_creator_10(0), + check_alignment=False) + for e in reader(): + total += 1 + # expecting 10, not 20 + self.assertEqual(total, 10) + + +class TestChain(unittest.TestCase): + def test_chain(self): + c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)) + idx = 0 + for e in c(): + self.assertEqual(e, idx % 10) + idx += 1 + self.assertEqual(idx, 20) + + +class TestShuffle(unittest.TestCase): + def test_shuffle(self): + case = [(0, True), (1, True), (10, False), (100, False)] + a = reader_creator_10(0) + for size, checkEq in case: + s = paddle.v2.reader.shuffle(a, size) + total = 0 + for idx, e in enumerate(s()): + if checkEq: + self.assertEqual(idx, e) + total += 1 + self.assertEqual(total, 10) + + +class TestXmap(unittest.TestCase): + def test_xmap(self): + def mapper(x): + return (x + 1) + + orders = (True, False) + thread_nums = (1, 2, 4, 8, 16) + buffered_size = (1, 2, 4, 8, 16) + for order in orders: + for tNum in thread_nums: + for size in buffered_size: + reader = paddle.v2.reader.xmap_readers(mapper, + reader_creator_10(0), + tNum, size, order) + for n in xrange(3): + result = [] + for i in reader(): + result.append(i) + if not order: + result.sort() + for idx, e in enumerate(result): + self.assertEqual(e, mapper(idx)) + + +class TestPipeReader(unittest.TestCase): + def test_pipe_reader(self): + def example_reader(myfiles): + for f in myfiles: + pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128) + for l in pr.get_line(): + yield l + + import tempfile + + records = [str(i) for i in xrange(5)] + temp = tempfile.NamedTemporaryFile() + try: + with open(temp.name, 'w') as f: + for r in records: + f.write('%s\n' % r) + + result = [] + for r in example_reader([temp.name]): + result.append(r) + + for idx, e in enumerate(records): + self.assertEqual(e, result[idx]) + finally: + # delete the temporary file + temp.close() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2a8d47d43868d369083808497697da79e620e31 --- /dev/null +++ b/python/paddle/v2/reader/tests/test_data_creator.txt @@ -0,0 +1,3 @@ +0 1 +2 3 +4 5 diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/v2/reader/tests/test_reader_recordio.dat new file mode 100644 index 0000000000000000000000000000000000000000..a99a35bb829e066c4845d0b85b96cd1eb3a12491 Binary files /dev/null and b/python/paddle/v2/reader/tests/test_reader_recordio.dat differ diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat new file mode 100644 index 0000000000000000000000000000000000000000..17aa89b6796184407e83246d3f342a55a66b4a69 Binary files /dev/null and b/python/paddle/v2/reader/tests/test_recordio_creator.dat differ diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt index 46e4feb8e1ce1d12f214f5c49b1b589a46110603..b4333ed530ce464095ec38d72706949cc464fbe4 100644 --- a/python/paddle/v2/tests/CMakeLists.txt +++ b/python/paddle/v2/tests/CMakeLists.txt @@ -1,4 +1,5 @@ py_test(test_op SRCS test_op.py) +py_test(test_image SRCS test_image.py) py_test(test_layer SRCS test_layer.py) py_test(test_topology SRCS test_topology.py) py_test(test_rnn_layer SRCS test_rnn_layer.py) diff --git a/python/paddle/v2/tests/cat.jpg b/python/paddle/v2/tests/cat.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bc1fbbd371216b9904b522ed302700c79d2e4876 Binary files /dev/null and b/python/paddle/v2/tests/cat.jpg differ diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/v2/tests/test_image.py new file mode 100644 index 0000000000000000000000000000000000000000..c78bbdc40a25878b21ba7e678afedf9d8f0a87cf --- /dev/null +++ b/python/paddle/v2/tests/test_image.py @@ -0,0 +1,43 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle.v2.image as image + + +class Image(unittest.TestCase): + def test_resize_flip_chw(self): + # resize + im = image.load_image('cat.jpg') + im = image.resize_short(im, 256) + self.assertEqual(256, min(im.shape[:2])) + self.assertEqual(3, im.shape[2]) + + # flip + im = image.left_right_flip(im) + im2 = np.flip(im, 1) + self.assertEqual(im.all(), im2.all()) + + # to_chw + h, w, c = im.shape + im = image.to_chw(im) + self.assertEqual(c, im.shape[0]) + self.assertEqual(h, im.shape[1]) + self.assertEqual(w, im.shape[2]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py index 8320217da2795da756cf12a80f39279182789eef..264442be182ea69c95b39b3bdb4c389d52eff66e 100644 --- a/python/paddle/v2/tests/test_paramconf_order.py +++ b/python/paddle/v2/tests/test_paramconf_order.py @@ -27,7 +27,6 @@ # limitations under the License. import unittest import math -import paddle.dataset as dataset import paddle.v2 as paddle @@ -41,7 +40,7 @@ def wordemb(inlayer): def train(): - word_dict = dataset.imikolov.build_dict() + word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) # Every layer takes integer value of range [0, dict_size) firstword = paddle.layer.data( diff --git a/python/setup.py.in b/python/setup.py.in index d73a3a6a1c41b87efb9600ac59983bd16547ec6a..08a448934d3248b46618acdef9e1894f94a93893 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -77,6 +77,8 @@ if '${WITH_FLUID_ONLY}'== 'OFF': 'paddle.v2', 'paddle.v2.master', 'paddle.v2.plot', + 'paddle.v2.reader', + 'paddle.v2.dataset', 'py_paddle'] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: