diff --git a/.travis.yml b/.travis.yml index bf6a41d13c4eabc2d8543ab821ce0ff747a061df..929c847bd36d64e79a199b2634ebf68c3225429b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,7 +34,7 @@ addons: - automake - libtool - ccache - ssh_known_hosts: 52.76.173.135 + ssh_known_hosts: 13.229.163.131 before_install: - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py new file mode 100644 index 0000000000000000000000000000000000000000..cc31d098328bc237c018ebf8f158bdab5c37bff1 --- /dev/null +++ b/benchmark/fluid/machine_translation.py @@ -0,0 +1,349 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""seq2seq model for fluid.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import distutils.util + +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--embedding_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") +parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") +parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") +parser.add_argument( + "--batch_size", + type=int, + default=16, + help="The sequence number of a mini-batch data. (default: %(default)d)") +parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") +parser.add_argument( + "--pass_num", + type=int, + default=2, + help="The pass number to train. (default: %(default)d)") +parser.add_argument( + "--learning_rate", + type=float, + default=0.0002, + help="Learning rate used to train the model. (default: %(default)f)") +parser.add_argument( + "--infer_only", action='store_true', help="If set, run forward only.") +parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") +parser.add_argument( + "--use_gpu", + type=distutils.util.strtobool, + default=True, + help="Whether to use gpu. (default: %(default)d)") +parser.add_argument( + "--max_length", + type=int, + default=250, + help="The maximum length of sequence when doing generation. 
" + "(default: %(default)d)") + + +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + """Construct a seq2seq network.""" + + def bi_lstm_encoder(input_seq, gate_size): + # Linear transformation part for input gate, output gate, forget gate + # and cell activation vectors need be done outside of dynamic_lstm. + # So the output size is 4 times of gate_size. + input_forward_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, size=gate_size * 4, use_peepholes=False) + input_reversed_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + reversed, _ = fluid.layers.dynamic_lstm( + input=input_reversed_proj, + size=gate_size * 4, + is_reverse=True, + use_peepholes=False) + return forward, reversed + + src_word_idx = fluid.layers.data( + name='source_sequence', shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + + src_forward, src_reversed = bi_lstm_encoder( + input_seq=src_embedding, gate_size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward, src_reversed], axis=1) + + encoded_proj = fluid.layers.fc(input=encoded_vector, + size=decoder_size, + bias_attr=False) + + backward_first = fluid.layers.sequence_pool( + input=src_reversed, pool_type='first') + + decoder_boot = fluid.layers.fc(input=backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj, + decoder_boot, decoder_size): + def simple_attention(encoder_vec, encoder_proj, decoder_state): + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=decoder_size, + bias_attr=False) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + concated = fluid.layers.concat( + input=[encoder_proj, decoder_state_expand], axis=1) + attention_weights = fluid.layers.fc(input=concated, + size=1, + act='tanh', + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax( + input=attention_weights) + weigths_reshape = fluid.layers.reshape( + x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weigths_reshape, axis=0) + context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') + return context + + rnn = fluid.layers.DynamicRNN() + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + with rnn.block(): + current_word = rnn.step_input(target_embedding) + encoder_vec = 
rnn.static_input(encoder_vec) + encoder_proj = rnn.static_input(encoder_proj) + hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True) + cell_mem = rnn.memory(init=cell_init) + context = simple_attention(encoder_vec, encoder_proj, hidden_mem) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size) + rnn.update_memory(hidden_mem, h) + rnn.update_memory(cell_mem, c) + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=True, + act='softmax') + rnn.output(out) + return rnn() + + if not is_generating: + trg_word_idx = fluid.layers.data( + name='target_sequence', shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], + dtype='float32') + + prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector, + encoded_proj, decoder_boot, + decoder_size) + label = fluid.layers.data( + name='label_sequence', shape=[1], dtype='int64', lod_level=1) + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + feeding_list = ["source_sequence", "target_sequence", "label_sequence"] + + return avg_cost, feeding_list + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + lod_t = core.LoDTensor() + lod_t.set(flattened_data, place) + lod_t.set_lod([lod]) + return lod_t, lod[-1] + + +def lodtensor_to_ndarray(lod_tensor): + dims = lod_tensor.get_dims() + ndarray = np.zeros(shape=dims).astype('float32') + for i in xrange(np.product(dims)): + ndarray.ravel()[i] = lod_tensor.get_float_element(i) + return ndarray + + +def train(): + avg_cost, feeding_list = seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + False, + beam_size=args.beam_size, + max_length=args.max_length) + + # clone from default main program + inference_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() + exe = Executor(place) + exe.run(framework.default_startup_program()) + + def do_validation(): + total_loss = 0.0 + count = 0 + for batch_id, data in enumerate(test_batch_generator()): + src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0] + trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0] + lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0] + + fetch_outs = exe.run(inference_program, + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost], + return_numpy=False) + + total_loss += lodtensor_to_ndarray(fetch_outs[0])[0] + count += 1 + + return total_loss / count + + for pass_id in xrange(args.pass_num): + pass_start_time = time.time() + words_seen = 0 + for batch_id, 
data in enumerate(train_batch_generator()): + src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) + words_seen += word_num + trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) + words_seen += word_num + lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) + + fetch_outs = exe.run(framework.default_main_program(), + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost]) + + avg_cost_val = np.array(fetch_outs[0]) + print('pass_id=%d, batch_id=%d, train_loss: %f' % + (pass_id, batch_id, avg_cost_val)) + + pass_end_time = time.time() + test_loss = do_validation() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % + (pass_id, test_loss, words_per_sec, time_consumed)) + + +def infer(): + pass + + +if __name__ == '__main__': + args = parser.parse_args() + if args.infer_only: + infer() + else: + train() diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..7f7afaeb11447d936b65a1d83701b0176ecbc111 --- /dev/null +++ b/benchmark/fluid/mnist.py @@ -0,0 +1,205 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time + +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler + +SEED = 1 +DTYPE = "float32" + +# random seed must set before configuring the network. 
+# fluid.default_startup_program().random_seed = SEED + + +def parse_args(): + parser = argparse.ArgumentParser("mnist model benchmark.") + parser.add_argument( + '--batch_size', type=int, default=128, help='The minibatch size.') + parser.add_argument( + '--iterations', type=int, default=35, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=5, help='The number of passes.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + args = parser.parse_args() + return args + + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def eval_test(exe, batch_acc, batch_size_tensor, inference_program): + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=args.batch_size) + test_pass_acc = fluid.average.WeightedAverage() + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]), + data)).astype(DTYPE) + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([len(y_data), 1]) + + acc, weight = exe.run(inference_program, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size_tensor]) + test_pass_acc.add(value=acc, weight=weight) + pass_acc = test_pass_acc.eval() + return pass_acc + + +def run_benchmark(model, args): + if args.use_cprof: + pr = cProfile.Profile() + pr.enable() + start_time = time.time() + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + opt = 
fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + opt.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + # Initialize executor + place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=args.batch_size) + + accuracy = fluid.average.WeightedAverage() + for pass_id in range(args.pass_num): + accuracy.reset() + pass_start = time.time() + for batch_id, data in enumerate(train_reader()): + img_data = np.array( + map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE) + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([len(y_data), 1]) + + start = time.time() + outs = exe.run( + fluid.default_main_program(), + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost, batch_acc, batch_size_tensor] + ) # The accuracy is the accumulation of batches, but not the current batch. + accuracy.add(value=outs[1], weight=outs[2]) + end = time.time() + loss = np.array(outs[0]) + acc = np.array(outs[1]) + print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" % + (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000)) + + pass_end = time.time() + + train_avg_acc = accuracy.eval() + test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor, + inference_program) + + print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" % + (pass_id, train_avg_acc, test_avg_acc, + (pass_end - pass_start) / 1000)) + + +if __name__ == '__main__': + args = parse_args() + print_arguments(args) + if args.use_nvprof and args.device == 'GPU': + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + run_benchmark(cnn_model, args) + else: + run_benchmark(cnn_model, args) diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f0f1db979fa7fb640679beacafd66dfbe1f62ab8 --- /dev/null +++ b/benchmark/fluid/resnet.py @@ -0,0 +1,323 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import functools +import numpy as np +import time + +import cProfile, pstats, StringIO + +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.profiler as profiler + + +def parse_args(): + parser = argparse.ArgumentParser('Convolution model benchmark.') + parser.add_argument( + '--model', + type=str, + choices=['resnet_imagenet', 'resnet_cifar10'], + default='resnet_imagenet', + help='The model architecture.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='use real data or fake data') + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') + args = parser.parse_args() + return args + + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): + conv1 = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv1, act=act) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1] + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + +def basicblock(input, ch_out, stride): + short = shortcut(input, ch_out, stride) + conv1 = conv_bn_layer(input, ch_out, 3, stride, 1) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def bottleneck(input, ch_out, stride): + short = shortcut(input, ch_out * 4, stride) + conv1 = conv_bn_layer(input, ch_out, 1, stride, 0) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1) + conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None) + return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') + + +def layer_warp(block_func, input, ch_out, count, stride): + res_out = block_func(input, ch_out, stride) 
+ for i in range(1, count): + res_out = block_func(res_out, ch_out, 1) + return res_out + + +def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): + + cfg = { + 18: ([2, 2, 2, 1], basicblock), + 34: ([3, 4, 6, 3], basicblock), + 50: ([3, 4, 6, 3], bottleneck), + 101: ([3, 4, 23, 3], bottleneck), + 152: ([3, 8, 36, 3], bottleneck) + } + stages, block_func = cfg[depth] + conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) + pool1 = fluid.layers.pool2d( + input=conv1, pool_type='avg', pool_size=3, pool_stride=2) + res1 = layer_warp(block_func, pool1, 64, stages[0], 1) + res2 = layer_warp(block_func, res1, 128, stages[1], 2) + res3 = layer_warp(block_func, res2, 256, stages[2], 2) + res4 = layer_warp(block_func, res3, 512, stages[3], 2) + pool2 = fluid.layers.pool2d( + input=res4, + pool_size=7, + pool_type='avg', + pool_stride=1, + global_pooling=True) + out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') + return out + + +def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): + assert (depth - 2) % 6 == 0 + + n = (depth - 2) // 6 + + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, n, 1) + res2 = layer_warp(basicblock, res1, 32, n, 2) + res3 = layer_warp(basicblock, res2, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') + return out + + +def run_benchmark(model, args): + if args.use_cprof: + pr = cProfile.Profile() + pr.enable() + + if args.data_set == "cifar10": + class_dim = 10 + if args.data_format == 'NCHW': + dshape = [3, 32, 32] + else: + dshape = [32, 32, 3] + else: + class_dim = 102 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + + input = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + predict = model(input, class_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + opts = optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + def test(exe): + test_accuracy = fluid.average.WeightedAverage() + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape(dshape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + acc, weight = exe.run(inference_program, + feed={"data": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size_tensor]) + test_accuracy.add(value=acc, weight=weight) + + return 
test_accuracy.eval() + + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + accuracy = fluid.average.WeightedAverage() + if args.use_fake_data: + data = train_reader().next() + image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype( + 'float32') + label = np.array(map(lambda x: x[1], data)).astype('int64') + label = label.reshape([-1, 1]) + + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.pass_num): + accuracy.reset() + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + if not args.use_fake_data: + image = np.array(map(lambda x: x[0].reshape(dshape), + data)).astype('float32') + label = np.array(map(lambda x: x[1], data)).astype('int64') + label = label.reshape([-1, 1]) + loss, acc, weight = exe.run( + fluid.default_main_program(), + feed={'data': image, + 'label': label}, + fetch_list=[avg_cost, batch_acc, batch_size_tensor]) + iters += 1 + num_samples += label[0] + accuracy.add(value=acc, weight=weight) + train_losses.append(loss) + train_accs.append(acc) + print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % + (pass_id, iters, loss, acc)) + pass_train_acc = accuracy.eval() + # evaluation + if args.with_test: + pass_test_acc = test(exe) + train_elapsed = time.time() - start_time + print("Pass: %d, Loss: %f, Train Accuray: %f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + + examples_per_sec = num_samples / train_elapsed + + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + + if args.use_cprof: + pr.disable() + s = StringIO.StringIO() + sortby = 'cumulative' + ps = pstats.Stats(pr, stream=s).sort_stats(sortby) + ps.print_stats() + print(s.getvalue()) + + +if __name__ == '__main__': + model_map = { + 'resnet_imagenet': resnet_imagenet, + 'resnet_cifar10': resnet_cifar10 + } + args = parse_args() + print_arguments(args) + if args.data_format == 'NHWC': + raise ValueError('Only support NCHW data_format now.') + if args.use_nvprof and args.device == 'GPU': + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + run_benchmark(model_map[args.model], args) + else: + run_benchmark(model_map[args.model], args) diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..663e2efd5392a6cd1a71f51fa0d017070b489341 --- /dev/null +++ b/benchmark/fluid/run.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# This script benchmarking the PaddlePaddle Fluid on +# single thread single GPU. 
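+# Usage sketch (assumed invocation, not stated in the script): run it from the
+# parent benchmark directory so that the fluid/*.py paths below resolve, e.g.
+#   bash fluid/run.sh
+# Each model then writes its log (such as vgg16_gpu_128.log) to that directory.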
+export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib + +# disable openmp and mkl parallel +#https://github.com/PaddlePaddle/Paddle/issues/7199 +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` +if [ $ht -eq 1 ]; then # HT is OFF + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,0,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="FALSE" + fi +else # HT is ON + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,1,0" + fi +fi +# disable multi-gpu if have more than one +export CUDA_VISIBLE_DEVICES=0 +export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH + + +# vgg16 +# cifar10 gpu cifar10 128 +FLAGS_benchmark=true python fluid/vgg.py \ + --device=GPU \ + --batch_size=128 \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 > vgg16_gpu_128.log + +# resnet50 +# resnet50 gpu cifar10 128 +FLAGS_benchmark=true python fluid/resnet.py \ + --device=GPU \ + --batch_size=128 \ + --data_set=cifar10 \ + --model=resnet_cifar10 \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 > resnet50_gpu_128.log + +# lstm diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..4e063549e0239abf9d946ed8735f0306203509d0 --- /dev/null +++ b/benchmark/fluid/stacked_dynamic_lstm.py @@ -0,0 +1,209 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import cPickle +import os +import random +import time + +import numpy +import paddle.v2 as paddle +import paddle.v2.dataset.imdb as imdb +import paddle.fluid as fluid +from paddle.v2 import batch +import paddle.fluid.profiler as profiler + + +def parse_args(): + parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.") + parser.add_argument( + '--batch_size', + type=int, + default=32, + help='The sequence number of a batch data. (default: %(default)d)') + parser.add_argument( + '--emb_dim', + type=int, + default=512, + help='Dimension of embedding table. (default: %(default)d)') + parser.add_argument( + '--hidden_dim', + type=int, + default=512, + help='Hidden size of lstm unit. (default: %(default)d)') + parser.add_argument( + '--pass_num', + type=int, + default=100, + help='Epoch number to train. (default: %(default)d)') + parser.add_argument( + '--device', + type=str, + default='CPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--crop_size', + type=int, + default=int(os.environ.get('CROP_SIZE', '1500')), + help='The max sentence length of input. 
Since this model use plain RNN,' + ' Gradient could be explored if sentence is too long') + args = parser.parse_args() + return args + + +word_dict = imdb.word_dict() + + +def crop_sentence(reader, crop_size): + unk_value = word_dict[''] + + def __impl__(): + for item in reader(): + if len([x for x in item[0] if x != unk_value]) < crop_size: + yield item + + return __impl__ + + +def main(): + args = parse_args() + lstm_size = args.hidden_dim + + data = fluid.layers.data( + name="words", shape=[1], lod_level=1, dtype='int64') + sentence = fluid.layers.embedding( + input=data, size=[len(word_dict), args.emb_dim]) + + sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + word = rnn.step_input(sentence) + prev_hidden = rnn.memory(value=0.0, shape=[lstm_size]) + prev_cell = rnn.memory(value=0.0, shape=[lstm_size]) + + def gate_common( + ipt, + hidden, + size, ): + gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True) + gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False) + gate = fluid.layers.sums(input=[gate0, gate1]) + return gate + + forget_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + input_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + output_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + cell_gate = fluid.layers.tanh( + x=gate_common(word, prev_hidden, lstm_size)) + + cell = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul( + x=input_gate, y=cell_gate) + ]) + + hidden = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell)) + + rnn.update_memory(prev_cell, cell) + rnn.update_memory(prev_hidden, hidden) + rnn.output(hidden) + + last = fluid.layers.sequence_pool(rnn(), 'last') + logit = fluid.layers.fc(input=last, size=2, act='softmax') + loss = fluid.layers.cross_entropy( + input=logit, + label=fluid.layers.data( + name='label', shape=[1], dtype='int64')) + loss = fluid.layers.mean(x=loss) + + # add acc + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \ + shape=[1], dtype='int64'), total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + adam = fluid.optimizer.Adam() + adam.minimize(loss) + + fluid.memory_optimize(fluid.default_main_program()) + + place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + def train_loop(pass_num, crop_size): + with profiler.profiler(args.device, 'total') as prof: + for pass_id in range(pass_num): + train_reader = batch( + paddle.reader.shuffle( + crop_sentence(imdb.train(word_dict), crop_size), + buf_size=25000), + batch_size=args.batch_size) + word_nums = 0 + pass_start_time = time.time() + for batch_id, data in enumerate(train_reader()): + tensor_words = to_lodtensor([x[0] for x in data], place) + for x in data: + word_nums += len(x[0]) + label = numpy.array([x[1] for x in data]).astype("int64") + label = label.reshape((-1, 1)) + loss_np, acc, weight = exe.run( + fluid.default_main_program(), + feed={"words": tensor_words, + "label": label}, + fetch_list=[loss, batch_acc, batch_size_tensor]) + print("pass_id=%d, 
batch_id=%d, loss=%f, acc=%f" % + (pass_id, batch_id, loss_np, acc)) + + pass_end_time = time.time() + time_consumed = pass_end_time - pass_start_time + words_per_sec = word_nums / time_consumed + print("pass_id=%d, sec/pass: %f, words/s: %f" % + (pass_id, time_consumed, words_per_sec)) + + train_loop(args.pass_num, args.crop_size) + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = numpy.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +if __name__ == '__main__': + main() diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..3bf78e4cf08d43127a05c740fa30ca6d2bc416b0 --- /dev/null +++ b/benchmark/fluid/vgg.py @@ -0,0 +1,220 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in Fluid""" +from __future__ import print_function + +import sys +import time +import numpy as np +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import argparse +import functools + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test') +parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') +parser.add_argument( + '--learning_rate', + type=float, + default=1e-3, + help="Learning rate for training.") +parser.add_argument('--pass_num', type=int, default=50, help="No. 
of passes.") +parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data order, now only support NCHW.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') +parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') +args = parser.parse_args() + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def main(): + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + opts = optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + # Initialize executor + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + # test + def test(exe): + test_accuracy = fluid.average.WeightedAverage() + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape(data_shape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], 
data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + acc, weight = exe.run(inference_program, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size_tensor]) + test_accuracy.add(value=acc, weight=weight) + return test_accuracy.eval() + + iters, num_samples, start_time = 0, 0, time.time() + accuracy = fluid.average.WeightedAverage() + for pass_id in range(args.pass_num): + accuracy.reset() + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + img_data = np.array(map(lambda x: x[0].reshape(data_shape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + loss, acc, weight = exe.run( + fluid.default_main_program(), + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost, batch_acc, batch_size_tensor]) + accuracy.add(value=acc, weight=weight) + iters += 1 + num_samples += len(data) + print( + "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % + (pass_id, iters, loss, acc) + ) # The accuracy is the accumulation of batches, but not the current batch. + + pass_train_acc = accuracy.eval() + train_losses.append(loss) + train_accs.append(acc) + # evaluation + if args.with_test: + pass_test_acc = test(exe) + train_elapsed = time.time() - start_time + print("Pass: %d, Loss: %f, Train Accuray: %f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == "__main__": + print_arguments() + main() diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst index 614db457d715665073cec1a495d4d7df6887532f..7488ed8137d57785f36b9f1e1ed1269f864960bc 100644 --- a/doc/v2/faq/build_and_install/index_en.rst +++ b/doc/v2/faq/build_and_install/index_en.rst @@ -1,5 +1,143 @@ -############################ -Install, Build and Unit test -############################ +.. _install_faq: -TBD +############################### +Compile, Install, and Unit Test +############################### + +.. contents:: + +1. Insufficient CUDA driver version +---------------------------------------------------------------- + +Many users usually face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is that you may not map the local CUDA driver to a container directory. +You can solve the issue by running the following commands: + +.. code-block:: bash + + $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +For more infomation about Docker's installation and usage, please refer to `PaddlePaddle Docker documentation `_ . + + +2. Version mismatch between PythonLibs and PythonInterpreter +---------------------------------------------------------------- + +It is a common bug when CMake looks up Python. If you install multiple versions of Python, Cmake may find the version mismatch between PythonLibs and PythonInterpreter . 
You need to explicitly specify a particular Python version, as follows.
+
+    .. code-block:: bash
+
+        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path>
+
+You should replace ``<exc_path>``, ``<lib_path>`` and ``<inc_path>`` with your local paths.
+
+3. PaddlePaddle version is 0.0.0
+------------------------------------------------
+This issue happens when you run :code:`paddle version` or :code:`cmake ..`
+
+.. code-block:: bash
+
+    CMake Warning at cmake/version.cmake:20 (message):
+      Cannot add paddle version from git tag
+
+You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then run :code:`cmake` again.
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+The primary cause of this issue is that pip cannot find a PaddlePaddle installation package that matches your current system. The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12 systems with Python 2.7 and pip 9.0.1 installed.
+
+You can upgrade pip with the following command:
+
+.. code-block:: bash
+
+    pip install --upgrade pip
+
+If that does not work, run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to list the package suffixes your system supports, and compare them with the suffix of your installation package.
+
+If the system supports :code:`linux_x86_64` and the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest version;
+if the system supports :code:`manylinux1_x86_64` and the local installation package is :code:`linux_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again.
+
+
+5. ImportError: No module named v2
+----------------------------------
+Please uninstall Paddle V1 if you have installed it before.
+
+.. code-block:: bash
+
+    pip uninstall py_paddle paddle
+
+Then install the PaddlePaddle Python packages: enter the build directory and run the following commands.
+
+.. code-block:: bash
+
+    pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
+
+6. Illegal instruction
+-----------------------
+This issue may be caused by running a PaddlePaddle binary built with AVX SIMD instructions (used to speed up CPU computation) on a CPU that does not support AVX. Please choose the correct version.
+
+7. Python unittest fails
+--------------------------------
+
+If the following Python unittest test cases fail:
+
+.. code-block:: bash
+
+    24 - test_PyDataProvider (Failed)
+    26 - test_RecurrentGradientMachine (Failed)
+    27 - test_NetworkCompare (Failed)
+    28 - test_PyDataProvider2 (Failed)
+    32 - test_Prediction (Failed)
+    33 - test_Compare (Failed)
+    34 - test_Trainer (Failed)
+    35 - test_TrainerOnePass (Failed)
+    36 - test_CompareTwoNets (Failed)
+    37 - test_CompareTwoOpts (Failed)
+    38 - test_CompareSparse (Failed)
+    39 - test_recurrent_machine_generation (Failed)
+    40 - test_PyDataProviderWrapper (Failed)
+    41 - test_config_parser (Failed)
+    42 - test_swig_api (Failed)
+    43 - layers_test (Failed)
+
+Please check the PaddlePaddle unittest logs, which may suggest the following:
+
+.. code-block:: bash
+
+    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+The solution is:
+
+* Remove old PaddlePaddle to make a clean environment for the unit tests. 
If PaddlePaddle package is already in Python's site-packages, unit tests would refer Python package in site-packages instead of Python package in the :code:`/python` directory of the source directory. Setting :code:`PYTHONPATH` to :code:`/python` is also useless because Python's search path would give the priority to the installed Python package. + + +8. Failed to download the MKLML library +---------------------------------------------- + +.. code-block:: bash + + make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4 + make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2 + make[1]: *** waiting for the unfinished jobs.... + +Cause: The network speed or SSL link causes the MKLML library to download unsuccessfully. + +The solution is: manually download and install, the specific steps are as follows. + +.. code-block:: bash + + // 1. enter the directory + cd build/third_party/mklml/src/extern_mklml + + // 2. check the size of the package, normally 75M, if less than 75M, the download fails + du -sh mklml_lnx_2018.0.1.20171007.tgz + + // 3. manually download and unzip and make the download success tag: + wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz + tar zxf mklml_lnx_2018.0.1.20171007.tgz + touch ../extern_mklml-stamp/extern_mklml-download + + // 4. then compile + diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 893cc15f6c8b34fcfc33554f8ef48ffeb00cd75c..569dda17c6e91d5658c4f8b9ba0b8c8fbd966832 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -22,7 +22,7 @@ namespace paddle { namespace framework { namespace details { -struct OpHandleBase; +class OpHandleBase; // VarHandleBase is the var node in the dependency graph. // A variable can only be generated by a single operator. i.e. diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 91f2db9354c2a00ec7e51ea4595c7cfa00da23ea..577eea92d217f9feb54085b4d9f071494c3c5165 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" -#include -#include "ThreadPool.h" +#include +#include #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index c9c2c1bb721f2c527fa52f45cc54883f639f4ef8..9458d56a01df432aea573d796456b9be31350038 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -10,6 +10,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" @@ -52,7 +55,7 @@ class SelectedRows { private: // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. - // SelectedRows are simplely concated when adding together. Until a + // SelectedRows are simply concated when adding together. Until a // SelectedRows add a Tensor, will the duplicate rows be handled. 
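  // Illustrative example of the behavior described above (not taken from the
  // header itself): adding a SelectedRows with rows {0, 4, 7} to one with rows
  // {0, 5, 7, 9} yields rows {0, 4, 7, 0, 5, 7, 9} with the value rows stacked
  // in the same order; the duplicate indices (0 and 7) are only accumulated
  // later, when the result is added to a dense Tensor.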
Vector rows_; std::unique_ptr value_{nullptr}; diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index d59411dfb9122537e99f483478fdac06fc8275db..3adeeda90645ca983d9d9229b4cc1c4c90302206 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -2,7 +2,8 @@ if(WITH_DISTRIBUTE) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(serde_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc) + cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) endif() diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 6c2b78fb3a72adbaa6123601b0c524ce1c33b50a..ef987d07f08525bff5267cdc2076ae767417e4f1 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -74,8 +74,7 @@ void ProcGetResponse(const VarHandle& var_h, template void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { ::grpc::Slice slice(proto.ByteSizeLong()); - proto.SerializeWithCachedSizesToArray( - const_cast(reinterpret_cast(slice.begin()))); + proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); ::grpc::ByteBuffer tmp(&slice, 1); result->Swap(&tmp); } @@ -154,9 +153,10 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/GetVariable", req, &cq_); + s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, + &cq_); call->StartCall(); - call->Finish(&s->reply_, &s->status_, (void*)s); + call->Finish(&s->reply_, &s->status_, static_cast(s)); }); req_count_++; diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 0833710b35bf77b8c7fef34bfd3281d94b090014..19bba46e3bd49a689fbe1d0c93efe01806fb0228 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -131,6 +131,49 @@ class RequestGet final : public RequestBase { SimpleBlockQueue* queue_; }; +class RequestPrefetch final : public RequestBase { + public: + explicit RequestPrefetch(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + framework::Executor* executor, + framework::ProgramDesc* program, int blkid) + : RequestBase(service, cq, dev_ctx), + responder_(&ctx_), + scope_(scope), + executor_(executor), + program_(program), + blkid_(blkid) { + int method_id = static_cast(detail::GrpcMethod::kPrefetchVariable); + service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, + cq_, this); + } + + virtual ~RequestPrefetch() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + // prefetch process... 
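+    // Sketch of the intended flow (an assumption; only the RPC plumbing is
+    // wired up in this patch): deserialize request_, run the prefetch block
+    // blkid_ of program_ through executor_ against scope_, then serialize
+    // the looked-up rows into reply before calling Finish().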
+ ::grpc::ByteBuffer reply; + // TODO(Yancey1989): execute the Block which containers prefetch ops + + VLOG(3) << "RequestPrefetch Process in"; + + responder_.Finish(reply, ::grpc::Status::OK, this); + status_ = FINISH; + } + + protected: + sendrecv::VariableMessage request_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; + framework::Scope* scope_; + framework::Executor* executor_; + framework::ProgramDesc* program_; + int blkid_; +}; + void AsyncGRPCServer::WaitClientGet(int count) { int fetch_barriers = 0; while (fetch_barriers < count) { @@ -150,6 +193,7 @@ void AsyncGRPCServer::RunSyncUpdate() { cq_send_ = builder.AddCompletionQueue(); cq_get_ = builder.AddCompletionQueue(); + cq_prefetch_ = builder.AddCompletionQueue(); server_ = builder.BuildAndStart(); LOG(INFO) << "Server listening on " << address_ << std::endl; @@ -158,6 +202,8 @@ void AsyncGRPCServer::RunSyncUpdate() { std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this); std::function get_register = std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this); + std::function prefetch_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this); t_send_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, @@ -166,17 +212,21 @@ void AsyncGRPCServer::RunSyncUpdate() { t_get_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, cq_get_.get(), "cq_get", get_register))); - + t_prefetch_.reset(new std::thread( + std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), + "cq_prefetch", prefetch_register))); // wait server server_->Wait(); t_send_->join(); t_get_->join(); + t_prefetch_->join(); } void AsyncGRPCServer::ShutdownQueue() { std::unique_lock lock(cq_mutex_); cq_send_->Shutdown(); cq_get_->Shutdown(); + cq_prefetch_->Shutdown(); } // This URL explains why shutdown is complicate: @@ -189,6 +239,7 @@ void AsyncGRPCServer::ShutDown() { void AsyncGRPCServer::TryToRegisterNewSendOne() { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { + VLOG(3) << "shutdown, do not TryToRegisterNewSendOne"; return; } RequestSend* send = new RequestSend(&service_, cq_send_.get(), scope_, @@ -199,6 +250,7 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() { void AsyncGRPCServer::TryToRegisterNewGetOne() { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { + VLOG(3) << "shutdown, do not TryToRegisterNewGetOne"; return; } RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_, @@ -206,6 +258,19 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() { VLOG(4) << "Create RequestGet status:" << get->Status(); } +void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne"; + return; + } + RequestPrefetch* prefetch = + new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_, + executor_, program_, prefetch_blk_id_); + + VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status(); +} + // FIXME(typhoonzero): change cq_name to enum. 
void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, const std::string& cq_name, @@ -214,11 +279,14 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, void* tag = NULL; bool ok = false; + while (true) { + VLOG(3) << "HandleRequest for " << cq_name << " while in"; if (!cq->Next(&tag, &ok)) { LOG(INFO) << cq_name << " CompletionQueue shutdown!"; break; } + VLOG(3) << "HandleRequest for " << cq_name << " while after Next"; PADDLE_ENFORCE(tag); // FIXME(typhoonzero): de-couple the barriers with recv_op @@ -231,8 +299,8 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I if (!ok) { - LOG(WARNING) << cq_name << " recv no regular event:argument name" - << base->GetReqName(); + LOG(WARNING) << cq_name << " recv no regular event:argument name[" + << base->GetReqName() << "]"; TryToRegisterNewOne(); delete base; continue; diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 0fc9740ccb8652e30ac5287314113e34138ebad4..5b5033018c6aefdc165886fc4e9086ff0a7e9201 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -19,7 +19,9 @@ limitations under the License. */ #include #include "grpc++/grpc++.h" +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" @@ -55,6 +57,12 @@ class AsyncGRPCServer final { void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; } + void SetProgram(framework::ProgramDesc *program) { program_ = program; } + + void SetPrefetchBlkdId(int blkid) { prefetch_blk_id_ = blkid; } + + void SetExecutor(framework::Executor *executor) { executor_ = executor; } + const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } void Push(const std::string &msg_name) { @@ -69,6 +77,7 @@ class AsyncGRPCServer final { std::function TryToRegisterNewOne); void TryToRegisterNewSendOne(); void TryToRegisterNewGetOne(); + void TryToRegisterNewPrefetchOne(); void ShutdownQueue(); private: @@ -76,6 +85,7 @@ class AsyncGRPCServer final { volatile bool is_shut_down_ = false; std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_; std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_; + std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_; GrpcService::AsyncService service_; std::unique_ptr<::grpc::Server> server_; @@ -86,6 +96,7 @@ class AsyncGRPCServer final { // received variable from RPC, operators fetch variable from this queue. SimpleBlockQueue var_get_queue_; + // client send variable to this queue. 
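+  // (note) Variables received from trainers are pushed into this queue by the
+  // send handler and drained by the owning op through Get() above.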
ReceivedQueue var_recv_queue_; // condition of the sub program @@ -95,6 +106,11 @@ class AsyncGRPCServer final { std::unique_ptr t_send_; std::unique_ptr t_get_; + std::unique_ptr t_prefetch_; + + int prefetch_blk_id_; + framework::ProgramDesc *program_; + framework::Executor *executor_; }; }; // namespace detail diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b89aed0157de8e95564015b3e7f42316a39537f5 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_server_test.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "gtest/gtest.h" +#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/grpc_server.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace detail = paddle::operators::detail; + +std::unique_ptr rpc_service_; + +void StartServer(const std::string& endpoint) { + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); + rpc_service_->RunSyncUpdate(); +} + +TEST(PREFETCH, CPU) { + // start up a server instance backend + // TODO(Yancey1989): Need to start a server with optimize blocks and + // prefetch blocks. + std::thread server_thread(StartServer, "127.0.0.1:8889"); + framework::Scope scope; + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + // create var on local scope + std::string in_var_name("in"); + std::string out_var_name("out"); + auto* in_var = scope.Var(in_var_name); + auto* in_tensor = in_var->GetMutable(); + in_tensor->Resize({10, 10}); + VLOG(3) << "before mutable_data"; + in_tensor->mutable_data(place); + + scope.Var(out_var_name); + + VLOG(3) << "before fetch"; + detail::RPCClient client; + client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name, + out_var_name); + client.Wait(); + + rpc_service_->ShutDown(); + server_thread.join(); + rpc_service_.reset(nullptr); +} diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h index acaefd92a26422fb231087ec5a605f06ad474017..e6dab2f5a3a4280f3979417c3ca2d884a0b8ff2f 100644 --- a/paddle/fluid/operators/detail/grpc_service.h +++ b/paddle/fluid/operators/detail/grpc_service.h @@ -76,10 +76,11 @@ namespace detail { enum class GrpcMethod { kSendVariable, kGetVariable, + kPrefetchVariable, }; static const int kGrpcNumMethods = - static_cast(GrpcMethod::kGetVariable) + 1; + static_cast(GrpcMethod::kPrefetchVariable) + 1; inline const char* GrpcMethodName(GrpcMethod id) { switch (id) { @@ -87,6 +88,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kPrefetchVariable: + return "/sendrecv.SendRecvService/PrefetchVariable"; } // Shouldn't be reached. 
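Note: wiring up the new PrefetchVariable RPC means three places have to stay in
sync: the SendRecvService definition in send_recv.proto (next hunk), the
GrpcMethod enum, and the GrpcMethodName() switch above. A minimal sketch of the
resulting dispatch invariant, for illustration only (this loop is not part of
the patch):

    // kGrpcNumMethods is derived from the last enum value, so it grows from 2
    // to 3 with this change, and every method id below it must resolve to a
    // fully qualified gRPC method name.
    for (int i = 0; i < kGrpcNumMethods; ++i) {
      const char* name = GrpcMethodName(static_cast<GrpcMethod>(i));
      (void)name;  // i == 2 now yields "/sendrecv.SendRecvService/PrefetchVariable".
    }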
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index 2d33f026e45c51d9a3812b2391381f74d6fddb29..fc12e82a7e6bd10262092d1ca367980df64e91c2 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -21,6 +21,8 @@ service SendRecvService { rpc SendVariable(VariableMessage) returns (VoidMessage) {} // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} + // Prefetch variable by Ids + rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index c27ea1268321744f6566b8b65e98f0df6d408186..b19add24e2bd325896a96be53d3d9762abfe217c 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -13,22 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include -#include - -#include #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/detail/grpc_server.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/simple_block_queue.h" -#include "paddle/fluid/string/printf.h" namespace paddle { namespace operators { @@ -111,6 +102,11 @@ class ListenAndServOp : public framework::OperatorBase { framework::Executor executor(dev_place); + // TODO(qiao) set proper fields for table lookup and update + rpc_service_->SetExecutor(&executor); + rpc_service_->SetPrefetchBlkdId(0); + rpc_service_->SetProgram(program); + // TODO(typhoonzero): change this to a while_op for every cluster-batch. bool exit_flag = false; // Record received sparse variables, so that @@ -173,7 +169,8 @@ class ListenAndServOp : public framework::OperatorBase { } ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope); - VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts; + VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts + << "(ms)"; // Reset the received sparse variables, the sum operator would not // sum the input sparse variables which rows is empty at the next diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 50eeadab72e71f39325c5eda69e9a3c3e6517d7d..deabcdc99f819851b2df9bb0c7b05a5b339568f3 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -18,6 +18,22 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +static inline framework::OpKernelType ExpectedKernelType( + const framework::ExecutionContext& ctx) { + auto* table_var = ctx.InputVar("W"); + if (table_var->IsType()) { + return framework::OpKernelType( + framework::ToDataType(table_var->Get().type()), + ctx.device_context()); + } else if (table_var->IsType()) { + return framework::OpKernelType( + framework::ToDataType(table_var->Get().value().type()), + ctx.device_context()); + } else { + PADDLE_THROW("W should be LoDTensor or SelectedRows"); + } +} + class LookupTableOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -51,9 +67,7 @@ class LookupTableOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("W")->type()), - ctx.device_context()); + return ExpectedKernelType(ctx); } }; @@ -84,7 +98,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "If the value is -1, it makes no effect to lookup. " "Otherwise the given value indicates padding the output " "with zeros whenever lookup encounters it in Ids.") - .SetDefault(-1); + .SetDefault(kNoPadding); AddComment(R"DOC( Lookup Table Operator. @@ -124,9 +138,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("W")->type()), - ctx.device_context()); + return ExpectedKernelType(ctx); } }; diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index c92ce78eeffb8f1517e61c6d6624d406e04d974d..fff5edda62d4b115605a4cab35ed5457b4db5f21 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -14,6 +14,9 @@ limitations under the License. 
*/ #pragma once +#include +#include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -25,16 +28,37 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +static constexpr int64_t kNoPadding = -1; + +inline size_t getIndex(const std::vector &rows, int64_t value) { + auto it = std::find(rows.begin(), rows.end(), value); + PADDLE_ENFORCE(it != rows.end(), "id should be in rows"); + return static_cast(std::distance(rows.begin(), it)); +} template class LookupTableKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* table_t = context.Input("W"); - auto* ids_var = context.InputVar("Ids"); - Tensor* output_t = context.Output("Out"); + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + auto *ids_var = context.InputVar("Ids"); + Tensor *output_t = context.Output("Out"); + int64_t padding_idx = context.Attr("padding_idx"); + + DDim table_dim; - int64_t* ids; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW("table only support LoDTensor and SelectedRows"); + } + + int64_t *ids; int64_t ids_numel; // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type @@ -42,39 +66,50 @@ class LookupTableKernel : public framework::OpKernel { // when Ids's type is SelectedRows, the rows of Ids contains the // ids to be looked up in W. if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); + auto *ids_t = context.Input("Ids"); + ids = const_cast(ids_t->data()); ids_numel = ids_t->numel(); } else if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().data()); + auto *ids_t = context.Input("Ids"); + ids = const_cast(ids_t->rows().data()); ids_numel = ids_t->rows().size(); - output_t->Resize({ids_numel, table_t->dims()[1]}); + output_t->Resize({ids_numel, table_dim[1]}); } else { PADDLE_THROW("Unsupported Variable Type of Ids"); } - int64_t padding_idx = context.Attr("padding_idx"); + if (table_var->IsType()) { + auto *table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; - int N = table_t->dims()[0]; - int D = table_t->dims()[1]; - auto* table = table_t->data(); - auto* output = output_t->mutable_data(context.GetPlace()); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); - if (padding_idx == -1) { for (int64_t i = 0; i < ids_numel; ++i) { - PADDLE_ENFORCE_LT(ids[i], N); - PADDLE_ENFORCE_GE(ids[i], 0); - memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], row_number); + PADDLE_ENFORCE_GE(ids[i], 0); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(T)); + } } - } else { + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); + int64_t row_width = table_t.value().dims()[1]; + const auto *table = table_t.value().data(); + auto *output = output_t->mutable_data(context.GetPlace()); + for (int64_t 
i = 0; i < ids_numel; ++i) { - if (ids[i] == padding_idx) { - memset(output + i * D, 0, D * sizeof(T)); + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); } else { - PADDLE_ENFORCE_LT(ids[i], N); PADDLE_ENFORCE_GE(ids[i], 0); - memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + auto id_index = getIndex(table_t.rows(), ids[i]); + memcpy(output + i * row_width, table + id_index * row_width, + row_width * sizeof(T)); } } } @@ -84,17 +119,27 @@ class LookupTableKernel : public framework::OpKernel { template class LookupTableGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW("table only support LoDTensor and SelectedRows"); + } + bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { - auto* ids = context.Input("Ids"); - auto* table = context.Input("W"); - auto* d_output = context.Input(framework::GradVarName("Out")); - auto* d_table = context.Output(framework::GradVarName("W")); + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); - auto* ids_data = ids->data(); + auto *ids_data = ids->data(); auto ids_dim = ids->dims(); framework::Vector new_rows; @@ -104,31 +149,30 @@ class LookupTableGradKernel : public framework::OpKernel { } d_table->set_rows(new_rows); - auto* d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_dim[0], table->dims()[1]}); + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_dim[0], table_dim[1]}); d_table_value->mutable_data(context.GetPlace()); - d_table->set_height(table->dims()[0]); + d_table->set_height(table_dim[0]); - auto* d_output_data = d_output->data(); - auto* d_table_data = d_table_value->data(); + auto *d_output_data = d_output->data(); + auto *d_table_data = d_table_value->data(); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); } else { - auto* ids = context.Input("Ids"); - auto* d_output = context.Input(framework::GradVarName("Out")); - auto* d_table = context.Output(framework::GradVarName("W")); - auto* table = context.Input("W"); + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); - auto* ids_data = ids->data(); + auto *ids_data = ids->data(); auto ids_dim = ids->dims(); - int N = table->dims()[0]; + int N = table_dim[0]; int D = d_output->dims()[1]; - auto* d_output_data = d_output->data(); - auto* d_table_data = d_table->mutable_data(context.GetPlace()); + auto *d_output_data = d_output->data(); + auto *d_table_data = d_table->mutable_data(context.GetPlace()); memset(d_table_data, 0, d_table->numel() * sizeof(T)); diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc index 
90f6f955cea51ded2dbb2bde459113458d7749a4..20b8a5c98ab16ac8121cb2fd01deb8ecc1966d44 100644 --- a/paddle/fluid/operators/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include #include -#include -#include +#include // NOLINT +#include // NOLINT #include #include "paddle/fluid/framework/init.h" @@ -43,7 +43,7 @@ const f::DDim kDims = {20, 20}; // nccl op common tester, init communicator. class NCCLTester : public ::testing::Test { public: - virtual void SetUp() override { + void SetUp() override { int count = p::GetCUDADeviceCount(); if (count <= 1) { LOG(WARNING) @@ -64,7 +64,7 @@ class NCCLTester : public ::testing::Test { NCCLInitOp(); } - virtual void TearDown() override { + void TearDown() override { for (auto &device_context : dev_ctxs_) { delete device_context; } @@ -137,6 +137,8 @@ class NCCLTester : public ::testing::Test { TEST_F(NCCLTester, ncclInitOp) {} // ncclAllReduceOp with desc +// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367 +/* TEST_F(NCCLTester, ncclAllReduceOp) { std::unique_ptr op2(new f::OpDesc); op2->SetType("ncclAllReduce"); @@ -184,6 +186,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { } } } +*/ // ncclReduceOp with desc TEST_F(NCCLTester, ncclReduceOp) { @@ -236,6 +239,8 @@ TEST_F(NCCLTester, ncclReduceOp) { } // ncclBcastOp with desc +// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540 +/* TEST_F(NCCLTester, ncclBcastOp) { std::unique_ptr op2(new f::OpDesc); const int kRoot = 0; @@ -281,3 +286,4 @@ TEST_F(NCCLTester, ncclBcastOp) { ASSERT_NEAR(ct[j], result, 1e-5); } } +*/ diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 141a3eb93555c32efabc2465dc6daadf41c9d659..96c0c1cbe6d588364416925a7ab1bc8f90ac6fd7 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include // NOLINT + #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" @@ -20,12 +21,29 @@ namespace paddle { namespace operators { namespace reader { -static constexpr size_t kDoubleBufferSize = 2; +// 'Double buffer' means we shall maintain two batches of input data at the same +// time. So the kCacheSize shoul be at least 2. +static constexpr size_t kCacheSize = 2; +// There will be two bacthes out of the channel during training: +// 1. the one waiting to be sent to the channel +// 2. the one just be received from the channel, which is also being used by +// subsequent operators. 
+// So the channel size should be kChacheSize - 2 +static constexpr size_t kChannelSize = 0; // kCacheSize - 2 class DoubleBufferReader : public framework::DecoratedReader { public: struct Item { Item() : ctx_(nullptr) {} + Item(Item&& b) { + payloads_ = std::move(b.payloads_); + ctx_ = std::move(b.ctx_); + } + Item& operator=(Item&& b) { + payloads_ = std::move(b.payloads_); + ctx_ = std::move(b.ctx_); + return *this; + } std::vector payloads_; platform::DeviceContext* ctx_; @@ -34,42 +52,44 @@ class DoubleBufferReader : public framework::DecoratedReader { explicit DoubleBufferReader( ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) : DecoratedReader(reader), place_(target_place) { - for (size_t i = 0; i < kDoubleBufferSize; ++i) { - if (platform::is_gpu_place(place_)) { #ifdef PADDLE_WITH_CUDA + for (size_t i = 0; i < kCacheSize; ++i) { + if (platform::is_gpu_place(place_)) { ctxs_.emplace_back(new platform::CUDADeviceContext( boost::get(place_))); -#endif } } - - start_thread(); - } - - void start_thread() { - buffer_ = framework::MakeChannel(kDoubleBufferSize); - prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); +#endif + StartPrefetcher(); } + bool HasNext() const override; void ReadNext(std::vector* out) override; void ReInit() override; - ~DoubleBufferReader() { - buffer_->Close(); - prefetcher_.join(); - delete buffer_; + ~DoubleBufferReader() { EndPrefetcher(); } + + private: + void StartPrefetcher() { + channel_ = framework::MakeChannel(kChannelSize); + prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); } - bool HasNext() const override; + void EndPrefetcher() { + channel_->Close(); + if (prefetcher_.joinable()) { + prefetcher_.join(); + } + delete channel_; + channel_ = nullptr; + } - private: void PrefetchThreadFunc(); std::thread prefetcher_; - framework::Channel* buffer_; + framework::Channel* channel_; platform::Place place_; std::vector> ctxs_; - mutable Item local_buffer_; }; class CreateDoubleBufferReaderOp : public framework::OperatorBase { @@ -123,70 +143,70 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { } }; +bool DoubleBufferReader::HasNext() const { + while (!channel_->IsClosed() && !channel_->CanReceive()) { + } + return channel_->CanReceive(); +} + void DoubleBufferReader::ReadNext(std::vector* out) { if (!HasNext()) { PADDLE_THROW("There is no next data!"); } - if (local_buffer_.payloads_.empty()) { - buffer_->Receive(&local_buffer_); - } - *out = local_buffer_.payloads_; - local_buffer_.payloads_.clear(); - if (local_buffer_.ctx_) { - local_buffer_.ctx_->Wait(); + Item batch; + channel_->Receive(&batch); + *out = batch.payloads_; + if (batch.ctx_) { + batch.ctx_->Wait(); } } void DoubleBufferReader::ReInit() { reader_->ReInit(); - buffer_->Close(); - prefetcher_.join(); - delete buffer_; - start_thread(); + EndPrefetcher(); + StartPrefetcher(); } void DoubleBufferReader::PrefetchThreadFunc() { VLOG(5) << "A new prefetch thread starts."; - size_t gpu_ctx_offset = 0; + std::vector> cpu_tensor_cache(kCacheSize); + std::vector> gpu_tensor_cache(kCacheSize); + size_t cached_tensor_id = 0; + while (reader_->HasNext()) { Item batch; - reader_->ReadNext(&batch.payloads_); + auto& cpu_batch = cpu_tensor_cache[cached_tensor_id]; + reader_->ReadNext(&cpu_batch); if (platform::is_gpu_place(place_)) { - std::vector gpu_batch; - auto& gpu_ctx = this->ctxs_[gpu_ctx_offset++]; - gpu_ctx_offset %= this->ctxs_.size(); - gpu_batch.resize(batch.payloads_.size()); - for (size_t i = 0; i < 
batch.payloads_.size(); ++i) { - framework::TensorCopy(batch.payloads_[i], place_, *gpu_ctx, - &gpu_batch[i]); - gpu_batch[i].set_lod(batch.payloads_[i].lod()); + auto& gpu_batch = gpu_tensor_cache[cached_tensor_id]; + auto* gpu_ctx = ctxs_[cached_tensor_id].get(); + gpu_batch.resize(cpu_batch.size()); + for (size_t i = 0; i < cpu_batch.size(); ++i) { + framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]); + gpu_batch[i].set_lod(cpu_batch[i].lod()); } - batch.ctx_ = gpu_ctx.get(); - std::swap(gpu_batch, batch.payloads_); + batch.payloads_ = gpu_batch; + batch.ctx_ = gpu_ctx; + } else { + // CPUPlace + batch.payloads_ = cpu_batch; } + ++cached_tensor_id; + cached_tensor_id %= kCacheSize; try { - buffer_->Send(&batch); + channel_->Send(&batch); } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The double buffer channel has been closed. The " "prefetch thread will terminate."; break; } } - buffer_->Close(); + channel_->Close(); VLOG(5) << "Prefetch thread terminates."; } -bool DoubleBufferReader::HasNext() const { - if (local_buffer_.payloads_.empty()) { - bool ok = buffer_->Receive(&local_buffer_); - return ok; - } else { - return true; - } -} - } // namespace reader } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 832509641cc3d5178ff090e05437484d395bfe51..93f9c74b809770136d3d3300e0e0700b1bc0459e 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -14,93 +14,72 @@ limitations under the License. */ #include "paddle/fluid/operators/reshape_op.h" +#include +#include + namespace paddle { namespace operators { -class ReshapeOp : public framework::OperatorWithKernel { - public: - ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - // input check - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReshapeOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReshapeOp should not be null."); - - auto shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty."); - auto x_dims = ctx->GetInputDim("X"); - - std::vector neg_dims_idx; - // set some dimension to -1 if it is unknown - const int unknown_size = -1; - for (size_t i = 0; i < shape.size(); ++i) { - PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size, - "Each dimension of Attr(shape) must be positive or %d.", - unknown_size); - if (shape[i] == unknown_size) { - neg_dims_idx.push_back(i); - PADDLE_ENFORCE(neg_dims_idx.size() <= 1, - "Only one dimension of Attr(shape) can be unknown."); - } - } - - int64_t capacity = - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - int64_t in_size = framework::product(x_dims); - if (neg_dims_idx.size() == 1) { - // dim infer - shape[neg_dims_idx[0]] = in_size / (-capacity); - // recalculate capacity - capacity = shape[neg_dims_idx[0]] * (-capacity); - } - // capacity check - PADDLE_ENFORCE(capacity == in_size, - "The size of Input(X) mismatches with Attr(shape)."); - // resize output - std::vector shape_int64(shape.size(), 0); - std::transform(shape.begin(), shape.end(), shape_int64.begin(), - [](int a) { return static_cast(a); }); - auto out_dims = framework::make_ddim(shape_int64); - ctx->SetOutputDim("Out", 
out_dims); - if (shape[0] == x_dims[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. - ctx->ShareLoD("X", /*->*/ "Out"); - } - } -}; - class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { public: ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input tensor of reshape operator."); - AddOutput("Out", "The output tensor of reshape operator."); - AddAttr>("shape", - "(vector) " - "Target shape of reshape operator."); + AddInput("X", "(Tensor). The input tensor of reshape operator."); + AddInput("Shape", + "(Tensor, optional). If provided, reshape according to " + "this given shape. That is to say it has a higher priority than " + "the shape attribute, while the shape attribute still should be " + "set correctly to gurantee shape inference in compile time.") + .AsDispensable(); + AddOutput("Out", "(Tensor). The output tensor of reshape operator."); + AddAttr>( + "shape", "(std::vector) Target shape of reshape operator."); AddAttr("inplace", - "Change the source tensor's shape without copy memory.") - .SetDefault(true); + "(default: false) Change the source tensor's shape without " + "memory copy. When Attr(inplace) is set true, the output " + "tensor shares memory with Input(X), otherwise, a new output " + "tensor is created, and its data are copied from Input(x).") + .SetDefault(false); AddComment(R"DOC( Reshape Operator. -Reshape Input(X) into the shape specified by Attr(shape). +Reshape Input(X) into the shape specified by Attr(shape) or Input(Shape). The +data in Input(X) are unchanged. -An example: -Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]] +Examples: -and target shape = [1, 4], the reshape operator will transform -the tensor X into a 2-D tensor: [[1, 2, 3, 4]] +1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [6, 8], the reshape operator will transform Input(X) +into a 2-D tensor with shape [6, 8] and leaving Input(X)'s data unchanged. + +2. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will transform +Input(X) into a 4-D tensor with shape [2, 3, 4, 2] and leaving Input(X)'s data +unchanged. In this case, one and only dimension of Attr(shape) can be set to -1, +the value of this dimension is inferred from the total element number of +Input(X) and remaining dimensions. + +3. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will transform +Input(X) into a 4-D tensor with shape [2, 4, 3, 2] and leaving Input(X)'s data +unchanged. In this case, besides -1, 0 means the actual dimension value is going +to be copied from the corresponding dimension of Input(X). + +Note: + +1. One and only one dimension in Attr(shape) can be set -1. In this case, +the actual dimension value will be infered from the total element number of +Input(X) and remaining dimensions. + +2. More than one dimensions in Attr(shape) can be set to 0, which means the real +dimension value will be copied from Input(X) at runtime. Note that the index of +0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape +[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. + +3. 
Input(Shape) has a higher priority than Attr(shape) if it is provided, while +Attr(shape) still should be set correctly to gurantee shape inference in +compile-time. -One dimension in the target shape can be set -1, representing that its -size is unknown. In this case, the real dimension will be infered from -the original shape of Input(X) and other dimensions in the target shape. )DOC"); } }; @@ -119,6 +98,14 @@ class ReshapeGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) shouldn't be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index eacb0a0cf21a60ffbdef5787434859ac549388bc..807e5ad951b893a4c027a96d743f0606b70cf160 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -14,23 +14,138 @@ limitations under the License. */ #pragma once +#include +#include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +class ReshapeOp : public framework::OperatorWithKernel { + public: + ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReshapeOp should not be null."); + + const std::vector &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE(!shape.empty(), + "The shape information must be set by Attr(shape)."); + + if (ctx->HasInput("Shape") && ctx->IsRuntime()) { + // If true, set the shape of Output(Out) according to Input(Shape) in + // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel. + ctx->ShareLoD("X", /*->*/ "Out"); + return; + } + + auto x_dims = ctx->GetInputDim("X"); + auto out_dims = ValidateShape(shape, x_dims); + ctx->SetOutputDim("Out", out_dims); + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + + static framework::DDim ValidateShape(const std::vector shape, + const framework::DDim &in_dims) { + const int64_t in_size = framework::product(in_dims); + // only one dimension canbe set to -1, whose size will be automatically + // infered. + const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE( + unk_dim_idx == -1, + "Only one input dimension of Attr(shape) can be unknown."); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE( + static_cast(i) < in_dims.size(), + "The index of dimension to copy from input shape must be less " + "than the size of input shape."); + } else { + PADDLE_ENFORCE( + shape[i] > 0, + "Each input dimension of Attr(shape) must not be negtive except " + "one unknown dimension."); + } + + capacity *= (shape[i] ? 
shape[i] : in_dims[i]); + output_shape[i] = + (shape[i] ? static_cast(shape[i]) : in_dims[i]); + } + + if (unk_dim_idx != -1) { + output_shape[unk_dim_idx] = -in_size / capacity; + PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size, + "Invalid shape is given."); + } else { + PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); + } + return framework::make_ddim(output_shape); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + template class ReshapeKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out = ctx.Output("Out"); - auto* in = ctx.Input("X"); + void Compute(const framework::ExecutionContext &ctx) const { + auto *out = ctx.Output("Out"); + auto *in = ctx.Input("X"); + auto *shape_tensor = ctx.Input("Shape"); + + framework::DDim out_dims = out->dims(); + if (shape_tensor) { + auto *shape_data = shape_tensor->data(); + if (platform::is_gpu_place(ctx.GetPlace())) { + framework::Tensor cpu_shape_tensor; + TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(), + &cpu_shape_tensor); + shape_data = cpu_shape_tensor.data(); + } + auto shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + out_dims = ReshapeOp::ValidateShape(shape, in->dims()); + } + if (!in->lod().empty()) { + PADDLE_ENFORCE_EQ( + out_dims[0], in->dims()[0], + "Reshape operator cannot reshape an input sequence batch " + "into an output sequence batch that has a different " + "number of time steps. Please consider using " + "sequence_reshape op."); + } + bool inplace = ctx.Attr("inplace"); - auto out_dims = out->dims(); + out->Resize(out_dims); if (!inplace) { out->mutable_data(ctx.GetPlace()); framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out); + // TensorCopy will resize to in_dims. 
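+      // (note) i.e. after the copy, Out temporarily carries X's dims, so the
+      // target dims computed above (from Attr(shape) or the optional Shape
+      // input) are restored on the next line.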
out->Resize(out_dims); } else { out->ShareDataWith(*in); @@ -42,9 +157,10 @@ class ReshapeKernel : public framework::OpKernel { template class ReshapeGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* d_x = ctx.Output(framework::GradVarName("X")); + void Compute(const framework::ExecutionContext &ctx) const { + auto *d_out = ctx.Input(framework::GradVarName("Out")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); bool inplace = ctx.Attr("inplace"); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index f5ae553c8571e21b351d0f5507afdf1539843a51..d074b0136d77fa5a1ce5c29cd52347d04475b029 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -81,6 +81,7 @@ if (WITH_TESTING) # enable v2 API unittest only when paddle swig api is compiled add_subdirectory(paddle/v2/tests) add_subdirectory(paddle/v2/plot/tests) + add_subdirectory(paddle/v2/reader/tests) endif() endif() add_subdirectory(paddle/fluid/tests) diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py index 1fdfd49f1c970d89bfde9d12a24076d38c54ba66..3315e826e82a33dfeb9c5223ce196cffb1ae7234 100644 --- a/python/paddle/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -37,7 +37,7 @@ __all__ = [ 'cifar', 'movielens', 'conll05', - 'sentiment' + 'sentiment', 'uci_housing', 'wmt14', 'wmt16', diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index 24297ffe33bc720ff7b4f2b0dbd82452dc7e0ae2..9311fc9904eb730aa56e94a4e45a1479a67df641 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -276,20 +276,25 @@ class DistributeTranspiler: suff_idx = v.name.find(".trainer_") if suff_idx >= 0: orig_var_name = v.name[:suff_idx] - pserver_program.global_block().create_var( + else: + orig_var_name = v.name + single_trainer_var = pserver_program.global_block().create_var( name=orig_var_name, persistable=True, type=v.type, dtype=v.dtype, shape=v.shape) - for trainer_id in xrange(self.trainers): - var = pserver_program.global_block().create_var( - name="%s.trainer_%d" % (orig_var_name, trainer_id), - persistable=False, - type=v.type, - dtype=v.dtype, - shape=v.shape) - recv_inputs.append(var) + if self.trainers > 1: + for trainer_id in xrange(self.trainers): + var = pserver_program.global_block().create_var( + name="%s.trainer_%d" % (orig_var_name, trainer_id), + persistable=False, + type=v.type, + dtype=v.dtype, + shape=v.shape) + recv_inputs.append(var) + else: + recv_inputs.append(single_trainer_var) # step3 optimize_block = pserver_program.create_block(0) @@ -511,8 +516,11 @@ class DistributeTranspiler: def _append_split_op(self, program, gradblocks): # Split variables that need to be split and append respective ops + add_suffix = False + if self.trainers > 1: + add_suffix = True var_mapping = self._create_vars_from_blocklist( - program, gradblocks, add_trainer_suffix=True) + program, gradblocks, add_trainer_suffix=add_suffix) for varname, splited_vars in var_mapping.iteritems(): # variable that don't need to split have empty splited_vars if len(splited_vars) <= 1: diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 2612fb1ae41986ae0d5c6e942cc3accebcb00e19..54d0a12bcdbb1b6c13e584dd1a3a5d73cddd4af7 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -48,8 
+48,7 @@ def as_numpy(tensor): assert isinstance(tensor, core.LoDTensor) lod = tensor.lod() if len(lod) > 0: - raise RuntimeError( - "Some of your featched tensors hold LoD information. \ + raise RuntimeError("Some of your fetched tensors hold LoD information. \ They can not be completely cast to Python ndarray. \ Please set the parameter 'return_numpy' as 'False' to \ return LoDTensor itself directly.") @@ -180,60 +179,24 @@ def get_program_cache_key(feed, fetch_list): class Executor(object): - def __init__(self, places): - if not isinstance(places, list) and not isinstance(places, tuple): - places = [places] - - act_places = [] - for each in places: - p = core.Place() - p.set_place(each) - act_places.append(p) - - # TODO(dzhwinter) : only use the first place - self.executor = core.Executor(act_places[0]) - self.places = places + def __init__(self, place): + self.place = place + p = core.Place() + p.set_place(place) + self.executor = core.Executor(p) self.program_caches = dict() - def aslodtensor(self, data): - def accumulate(data): - if not isinstance(data, list): - return 1 - return sum([accumulate(sub) for sub in data]) - - def parselod(data): - seq_lens = [accumulate(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - return lod - - assert len(self.places) != 0 - if not isinstance(data, list): - # pure tensor case - tensor = core.LoDTensor() - tensor.set(data, self.places[0]) - return tensor - else: - raise RuntimeError("Current implementation lacks unittests") - # lodtensor case - lod = [] - if not isinstance(data[0], list): - lod.append(parselod(data)) - flattened_data = np.concatenate(data, axis=0).astype("int64") - else: - while isinstance(data[0], list): - lod.append(parselod(seq)) - flattened_data = [item for seq in data for item in seq] - data = flattened_data - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - tensor = core.LoDTensor() - tensor.set(flattened_data, self.places[0]) - tensor.set_lod(lod) - return tensor + def as_lodtensor(self, data): + if isinstance(data, list): + raise RuntimeError("Some of your feed data hold LoD information. \ + They can not be completely cast from a list of Python \ + ndarray to LoDTensor. 
Please convert data to LoDTensor \ + directly before feeding the data.\ + ") + # single tensor case + tensor = core.LoDTensor() + tensor.set(data, self.place) + return tensor def _get_program_cache(self, program_cache_key): return self.program_caches.get(program_cache_key, None) @@ -293,7 +256,7 @@ class Executor(object): feed_target_name = op.desc.output('Out')[0] cur_feed = feed[feed_target_name] if not isinstance(cur_feed, core.LoDTensor): - cur_feed = self.aslodtensor(cur_feed) + cur_feed = self.as_lodtensor(cur_feed) idx = op.desc.attr('col') core.set_feed_variable(scope, cur_feed, feed_var_name, idx) else: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3e649dc5fd32c4ed8fa6ad273b7be04d552b51ae..a5938fe494265778ef7032c56a8d6d35acd729c5 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -19,7 +19,6 @@ from layer_function_generator import generate_layer_fn from layer_function_generator import autodoc from ..layer_helper import LayerHelper import tensor -import ops import nn import math @@ -58,7 +57,7 @@ def detection_output(loc, This operation is to get the detection results by performing following two steps: - + 1. Decode input bounding box predictions according to the prior boxes. 2. Get the final detection results by applying multi-class non maximum suppression (NMS). @@ -130,9 +129,9 @@ def detection_output(loc, target_box=loc, code_type='decode_center_size') old_shape = scores.shape - scores = ops.reshape(x=scores, shape=(-1, old_shape[-1])) + scores = nn.reshape(x=scores, shape=(-1, old_shape[-1])) scores = nn.softmax(input=scores) - scores = ops.reshape(x=scores, shape=old_shape) + scores = nn.reshape(x=scores, shape=old_shape) scores = nn.transpose(scores, perm=[0, 2, 1]) scores.stop_gradient = True nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) @@ -463,7 +462,7 @@ def ssd_loss(location, num, num_prior, num_class = confidence.shape def __reshape_to_2d(var): - return ops.reshape(x=var, shape=[-1, var.shape[-1]]) + return nn.reshape(x=var, shape=[-1, var.shape[-1]]) # 1. Find matched boundding box by prior box. # 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. @@ -474,7 +473,7 @@ def ssd_loss(location, # 2. Compute confidence for mining hard examples # 2.1. Get the target label based on matched indices - gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, )) + gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, )) gt_label.stop_gradient = True target_label, _ = target_assign( gt_label, matched_indices, mismatch_value=background_label) @@ -487,7 +486,7 @@ def ssd_loss(location, conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) # 3. Mining hard examples - conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior)) + conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior)) conf_loss.stop_gradient = True neg_indices = helper.create_tmp_variable(dtype='int32') dtype = matched_indices.dtype @@ -556,7 +555,7 @@ def ssd_loss(location, # 5.3 Compute overall weighted loss. loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss # reshape to [N, Np], N is the batch size and Np is the prior box number. 
- loss = ops.reshape(x=loss, shape=[-1, num_prior]) + loss = nn.reshape(x=loss, shape=[-1, num_prior]) loss = nn.reduce_sum(loss, dim=1, keep_dim=True) if normalize: normalizer = nn.reduce_sum(target_loc_weight) @@ -709,7 +708,7 @@ def multi_box_head(inputs, new_shape = [ -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)]) ] - out = ops.reshape(x=input, shape=new_shape) + out = nn.reshape(x=input, shape=new_shape) return out def _is_list_or_tuple_(data): @@ -803,7 +802,7 @@ def multi_box_head(inputs, mbox_loc.shape[0], mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4 ] - mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape) + mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape) mbox_locs.append(mbox_loc_flatten) # get conf @@ -819,7 +818,7 @@ def multi_box_head(inputs, conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] * conf_loc.shape[3] / num_classes, num_classes ] - conf_loc_flatten = ops.reshape(conf_loc, shape=new_shape) + conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape) mbox_confs.append(conf_loc_flatten) if len(box_results) == 1: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0332556f62c46b187bd79841e4969d9da08b57a5..e59ee251202d4985f0aa9ce46408fc7f0d197e1d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -73,6 +73,7 @@ __all__ = [ 'smooth_l1', 'one_hot', 'autoincreased_step_counter', + 'reshape', 'lod_reset', 'lrn', ] @@ -3265,6 +3266,8 @@ def one_hot(input, depth): The one-hot tensor or LodTensor, same as input. Examples: + .. code-block:: python + X is a LoDTensor: X.lod = [[0, 1, 4]] X.shape = [4, 1] @@ -3319,6 +3322,101 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): return counter +def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): + """ + Gives a new shape to the input Tensor without changing its data. + + The target shape can be given by :attr:`shape` or :attr:`actual_shape`. + :attr:`shape` is a list of integer while :attr:`actual_shape` is a tensor + variable. :attr:`actual_shape` has a higher priority than :attr:`shape` + if it is provided, while :attr:`shape` still should be set correctly to + gurantee shape inference in compile-time. + + Some tricks exist when specifying the target shape. + + 1. -1 means the value of this dimension is inferred from the total element + number of x and remaining dimensions. Thus one and only one dimension can + be set -1. + + 2. 0 means the actual dimension value is going to be copied from the + corresponding dimension of x. The indice of 0s in shape can not exceed + Rank(X). + + Here are some examples to explain it. + + 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + is [6, 8], the reshape operator will transform x into a 2-D tensor with + shape [6, 8] and leaving x's data unchanged. + + 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + specified is [2, 3, -1, 2], the reshape operator will transform x into a + 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this + case, one dimension of the target shape is set to -1, the value of this + dimension is inferred from the total element number of x and remaining + dimensions. + + 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor + with shape [2, 4, 3, 2] and leaving x's data unchanged. 
In this case, + besides -1, 0 means the actual dimension value is going to be copied from + the corresponding dimension of x. + + Args: + input(variable): The input tensor. + shape(list): The new shape. At most one dimension of the new shape can + be -1. + actual_shape(variable): An optional input. If provided, reshape + according to this given shape rather than + :attr:`shape` specifying shape. That is to + say :attr:`actual_shape` has a higher priority + than :attr:`shape`. + act (str): The non-linear activation to be applied to output variable. + inplace(bool): If this flag is set true, a new output tensor is created + whose data is copied from input x, otherwise the output + shares data with input without copying. + + Returns(variable): The output tensor. + + Examples: + .. code-block:: python + data = fluid.layers.data( + name='data', shape=[2, 4, 6], dtype='float32') + reshaped = fluid.layers.reshape( + x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True) + """ + + if not (isinstance(shape, list) or isinstance(shape, tuple)): + raise ValueError("Input shape must be a python lsit or tuple.") + + # Validate the shape + unk_dim_idx = -1 + for dim_idx, dim_size in enumerate(shape): + if dim_size == -1: + assert unk_dim_idx == -1, ( + "Only one dimension in shape can be unknown.") + unk_dim_idx = dim_idx + elif dim_size == 0: + assert dim_idx < len(x.shape), ( + "The indice of 0s in shape can not exceed Rank(X).") + else: + assert dim_size > 0, ( + "Each dimension size given in shape must not be negtive " + "except one unknown dimension.") + + helper = LayerHelper("reshape", **locals()) + reshaped = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type="reshape", + inputs={"X": x, + "Shape": actual_shape} + if isinstance(actual_shape, Variable) else {"X": x}, + attrs={"shape": shape, + "inplace": inplace}, + outputs={"Out": reshaped}) + + return helper.append_activation(reshaped) + + def lod_reset(x, y=None, target_lod=None): """ LoD Reset Operator. 
Set LoD of **x** to a new one specified by **y** or diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 0e5987ee598158d189db8bc956b7e7fea2517554..a9fe25744cc0b385479c9366af1b731ec221dd5a 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -49,7 +49,6 @@ __activations__ = [ __all__ = [ 'mean', 'mul', - 'reshape', 'scale', 'sigmoid_cross_entropy_with_logits', 'elementwise_add', diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 8393f7827b1c7d361ebea72f2cfc6033268772f0..299ab8e51f017e1980a8b40e3830fc42b1ff7ccc 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -334,7 +334,7 @@ class OpTest(unittest.TestCase): np.allclose( actual_t, expect_t, atol=atol), "Output (" + out_name + ") has diff at " + str(place) + - str(actual_t) + str(expect_t)) + str(actual_t) + "\n" + str(expect_t)) if isinstance(expect, tuple): self.assertListEqual(actual.lod(), expect[1], "Output (" + out_name + @@ -568,6 +568,6 @@ class OpTest(unittest.TestCase): fetch_list = [g for p, g in param_grad_list] executor = Executor(place) - return map( - np.array, - executor.run(prog, feed_dict, fetch_list, return_numpy=False)) + return map(np.array, + executor.run(prog, feed_dict, fetch_list, + return_numpy=False)) diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py index ed920ad388ff0e01887404e70fe82565b4cd28fa..3f739afd2516fdc2bdf3711d4780a1196c6f3f13 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py @@ -96,5 +96,47 @@ class TestLookupTableIdsIsSelectedRows(OpTest): self.check_with_place(place) +class TestLookupTableWIsSelectedRows(OpTest): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Id Variable + ids_tensor = scope.var('Ids').get_tensor() + ids_array = np.array([[0], [4], [3], [5]]).astype("int64") + ids_tensor.set(ids_array, place) + + # create and initialize W Variable + rows = [0, 1, 2, 3, 4, 5, 6] + row_numel = 12 + + w_selected_rows = scope.var('W').get_selected_rows() + w_selected_rows.set_height(len(rows)) + w_selected_rows.set_rows(rows) + w_array = np.ones((len(rows), row_numel)).astype("float32") + for i in range(len(rows)): + w_array[i] *= i + ids_tensor = w_selected_rows.get_tensor() + ids_tensor.set(w_array, place) + + # create Out Variable + Out_tensor = scope.var('Out').get_tensor() + + # create and run lookup_table operator + lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out') + lookup_table.run(scope, place) + + # get result from Out + result_array = np.array(Out_tensor) + # all(): return True if all elements of the iterable are true (or if the iterable is empty) + for idx, row in enumerate(ids_array): + assert (row[0] == result_array[idx]).all() + + def test_w_is_selected_rows(self): + places = [core.CPUPlace()] + # currently only support CPU + for place in places: + self.check_with_place(place) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py old mode 100755 new mode 100644 diff --git a/python/paddle/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_recv_op.py index 
854238c6279528d8f3adf173140a47e233134f43..2ebceca7e4b7b824194d94180462870e6cfe6d21 100644 --- a/python/paddle/fluid/tests/unittests/test_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_recv_op.py @@ -23,7 +23,7 @@ import time class TestRecvOp(unittest.TestCase): - def test_send(self): + def no_test_send(self): # Run init_serv in a thread place = fluid.CPUPlace() p = Process(target=self.init_serv, args=(place, )) diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 11f35c74d41146118525a5efa6c211d528e255fe..f51b5a7e9907294a5b91c920a363830d8b9a7137 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -14,15 +14,19 @@ import unittest import numpy as np + from op_test import OpTest class TestReshapeOp(OpTest): def setUp(self): + ori_shape = (2, 25) + new_shape = (5, 10) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [10 * 20]} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} def test_check_output(self): self.check_output() @@ -31,12 +35,33 @@ class TestReshapeOp(OpTest): self.check_grad(["X"], "Out") -class TestReshapeOpDimInfer(OpTest): +class TestReshapeOpDimInfer1(OpTest): def setUp(self): + ori_shape = (5, 10) + new_shape = (5, -1, 5) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [4, -1, 5]} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpDimInfer2(OpTest): + def setUp(self): + ori_shape = (2, 2, 6) + new_shape = (2, 0, 3, -1) + infered_shape = (2, 2, 3, -1) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} def test_check_output(self): self.check_output() @@ -47,10 +72,30 @@ class TestReshapeOpDimInfer(OpTest): class TestReshapeOpInplace(OpTest): def setUp(self): + ori_shape = (2, 25) + new_shape = (5, 10) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpDimInferInplace1(OpTest): + def setUp(self): + ori_shape = (5, 10) + new_shape = (5, -1, 5) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [10 * 20], 'inplace': True} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} def test_check_output(self): self.check_output() @@ 
-59,12 +104,38 @@ class TestReshapeOpInplace(OpTest): self.check_grad(["X"], "Out") -class TestReshapeOpDimInferInplace(OpTest): +class TestReshapeOpDimInferInplace2(OpTest): def setUp(self): + ori_shape = (2, 2, 6) + new_shape = (2, 0, 3, -1) + infered_shape = (2, 2, 3, -1) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpWithInputShape(OpTest): + def setUp(self): + ori_shape = (6, 5) + new_shape = (0, -1, 5) + actual_shape = (2, 3, 5) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [4, -1, 5], 'inplace': True} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = { + "X": np.random.random(ori_shape).astype("float32"), + "Shape": np.array( + actual_shape, dtype="int32") + } + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)} def test_check_output(self): self.check_output() @@ -73,5 +144,5 @@ class TestReshapeOpDimInferInplace(OpTest): self.check_grad(["X"], "Out") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py old mode 100755 new mode 100644 diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 02b0d077eefa431baed05c421a367ebe3581626c..df710c33d0c0ca16d358dac1eb42327e9cd4c7ae 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -22,13 +22,17 @@ import data_type import topology import networks import evaluator +from . import dataset +from . import reader from . import plot import attr import op import pooling import inference import networks +import minibatch import plot +import image import paddle.trainer.config_parser as cp __all__ = [ @@ -44,11 +48,14 @@ __all__ = [ 'data_type', 'attr', 'pooling', + 'dataset', + 'reader', 'topology', 'networks', 'infer', 'plot', 'evaluator', + 'image', 'master', ] @@ -146,3 +153,4 @@ def init(**kwargs): infer = inference.infer +batch = minibatch.batch diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..38056fe0a9496bcb5de76634bbab267e324dc2a4 --- /dev/null +++ b/python/paddle/v2/dataset/__init__.py @@ -0,0 +1,46 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Dataset package. 
+""" + +import mnist +import imikolov +import imdb +import cifar +import movielens +import conll05 +import uci_housing +import sentiment +import wmt14 +import wmt16 +import mq2007 +import flowers +import voc2012 + +__all__ = [ + 'mnist', + 'imikolov', + 'imdb', + 'cifar', + 'movielens', + 'conll05', + 'sentiment', + 'uci_housing', + 'wmt14', + 'wmt16', + 'mq2007', + 'flowers', + 'voc2012', +] diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py new file mode 100644 index 0000000000000000000000000000000000000000..0a2a1ced11ee5cb2fb407b229ce810d553c2fa46 --- /dev/null +++ b/python/paddle/v2/dataset/cifar.py @@ -0,0 +1,139 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +CIFAR dataset. + +This module will download dataset from +https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into +paddle reader creators. + +The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, +with 6000 images per class. There are 50000 training images and 10000 test +images. + +The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes +containing 600 images each. There are 500 training images and 100 testing +images per class. + +""" + +import cPickle +import itertools +import numpy +import paddle.v2.dataset.common +import tarfile + +__all__ = ['train100', 'test100', 'train10', 'test10', 'convert'] + +URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/' +CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz' +CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a' +CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz' +CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85' + + +def reader_creator(filename, sub_name): + def read_batch(batch): + data = batch['data'] + labels = batch.get('labels', batch.get('fine_labels', None)) + assert labels is not None + for sample, label in itertools.izip(data, labels): + yield (sample / 255.0).astype(numpy.float32), int(label) + + def reader(): + with tarfile.open(filename, mode='r') as f: + names = (each_item.name for each_item in f + if sub_name in each_item.name) + + for name in names: + batch = cPickle.load(f.extractfile(name)) + for item in read_batch(batch): + yield item + + return reader + + +def train100(): + """ + CIFAR-100 training set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 99]. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + 'train') + + +def test100(): + """ + CIFAR-100 test set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + 'test') + + +def train10(): + """ + CIFAR-10 training set creator. 
+ + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'data_batch') + + +def test10(): + """ + CIFAR-10 test set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'test_batch') + + +def fetch(): + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5) + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100") + paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100") + paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10") + paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10") diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py new file mode 100644 index 0000000000000000000000000000000000000000..c6ff09a1d1e3ca56877e986c3ed3ae9ecd0a7316 --- /dev/null +++ b/python/paddle/v2/dataset/common.py @@ -0,0 +1,236 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests +import hashlib +import os +import errno +import shutil +import sys +import importlib +import paddle.v2.dataset +import cPickle +import glob +import cPickle as pickle + +__all__ = [ + 'DATA_HOME', + 'download', + 'md5file', + 'split', + 'cluster_files_reader', + 'convert', +] + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset') + + +# When running unit tests, there could be multiple processes that +# trying to create DATA_HOME directory simultaneously, so we cannot +# use a if condition to check for the existence of the directory; +# instead, we use the filesystem as the synchronization mechanism by +# catching returned errors. 
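# --- Editor's annotation (illustrative sketch, not part of the patch) -------
# Every dataset module added in this change follows the same convention:
# train()/test() return a *reader creator*, a zero-argument callable that
# yields one sample per iteration, and downloaded archives are cached under
# DATA_HOME (~/.cache/paddle/dataset). A minimal usage sketch for the CIFAR
# readers defined above, assuming the legacy paddle.v2 package is installed;
# the variable names below are illustrative only.
import paddle.v2.dataset.cifar as cifar

reader_creator = cifar.train10()        # downloads and caches the archive on first use
for image, label in reader_creator():   # calling the creator yields one sample at a time
    # each sample is a flattened float32 image scaled to [0, 1] and an int label in [0, 9]
    assert image.shape == (3072,)
    assert 0 <= label <= 9
    break                               # inspect only the first sample
# -----------------------------------------------------------------------------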
+def must_mkdirs(path): + try: + os.makedirs(DATA_HOME) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + pass + + +must_mkdirs(DATA_HOME) + + +def md5file(fname): + hash_md5 = hashlib.md5() + f = open(fname, "rb") + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + f.close() + return hash_md5.hexdigest() + + +def download(url, module_name, md5sum, save_name=None): + dirname = os.path.join(DATA_HOME, module_name) + if not os.path.exists(dirname): + os.makedirs(dirname) + + filename = os.path.join(dirname, + url.split('/')[-1] + if save_name is None else save_name) + + retry = 0 + retry_limit = 3 + while not (os.path.exists(filename) and md5file(filename) == md5sum): + if os.path.exists(filename): + print "file md5", md5file(filename), md5sum + if retry < retry_limit: + retry += 1 + else: + raise RuntimeError("Cannot download {0} within retry limit {1}". + format(url, retry_limit)) + print "Cache file %s not found, downloading %s" % (filename, url) + r = requests.get(url, stream=True) + total_length = r.headers.get('content-length') + + if total_length is None: + with open(filename, 'w') as f: + shutil.copyfileobj(r.raw, f) + else: + with open(filename, 'w') as f: + dl = 0 + total_length = int(total_length) + for data in r.iter_content(chunk_size=4096): + dl += len(data) + f.write(data) + done = int(50 * dl / total_length) + sys.stdout.write("\r[%s%s]" % ('=' * done, + ' ' * (50 - done))) + sys.stdout.flush() + + return filename + + +def fetch_all(): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "fetch" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)): + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "fetch")() + + +def fetch_all_recordio(path): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "convert" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \ + not module_name == "common": + ds_path = os.path.join(path, module_name) + must_mkdirs(ds_path) + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "convert")(ds_path) + + +def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): + """ + you can call the function as: + + split(paddle.v2.dataset.cifar.train10(), line_count=1000, + suffix="imikolov-train-%05d.pickle") + + the output files as: + + |-imikolov-train-00000.pickle + |-imikolov-train-00001.pickle + |- ... + |-imikolov-train-00480.pickle + + :param reader: is a reader creator + :param line_count: line count for each file + :param suffix: the suffix for the output files, should contain "%d" + means the id for each file. Default is "%05d.pickle" + :param dumper: is a callable function that dump object to file, this + function will be called as dumper(obj, f) and obj is the object + will be dumped, f is a file object. Default is cPickle.dump. 
+ """ + if not callable(dumper): + raise TypeError("dumper should be callable.") + lines = [] + indx_f = 0 + for i, d in enumerate(reader()): + lines.append(d) + if i >= line_count and i % line_count == 0: + with open(suffix % indx_f, "w") as f: + dumper(lines, f) + lines = [] + indx_f += 1 + if lines: + with open(suffix % indx_f, "w") as f: + dumper(lines, f) + + +def cluster_files_reader(files_pattern, + trainer_count, + trainer_id, + loader=cPickle.load): + """ + Create a reader that yield element from the given files, select + a file set according trainer count and trainer_id + + :param files_pattern: the files which generating by split(...) + :param trainer_count: total trainer count + :param trainer_id: the trainer rank id + :param loader: is a callable function that load object from file, this + function will be called as loader(f) and f is a file object. + Default is cPickle.load + """ + + def reader(): + if not callable(loader): + raise TypeError("loader should be callable.") + file_list = glob.glob(files_pattern) + file_list.sort() + my_file_list = [] + for idx, fn in enumerate(file_list): + if idx % trainer_count == trainer_id: + print "append file: %s" % fn + my_file_list.append(fn) + for fn in my_file_list: + with open(fn, "r") as f: + lines = loader(f) + for line in lines: + yield line + + return reader + + +def convert(output_path, reader, line_count, name_prefix): + import recordio + """ + Convert data from reader to recordio format files. + + :param output_path: directory in which output files will be saved. + :param reader: a data reader, from which the convert program will read + data instances. + :param name_prefix: the name prefix of generated files. + :param max_lines_to_shuffle: the max lines numbers to shuffle before + writing. + """ + + assert line_count >= 1 + indx_f = 0 + + def write_data(indx_f, lines): + filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f) + writer = recordio.writer(filename) + for l in lines: + # FIXME(Yancey1989): + # dumps with protocol: pickle.HIGHEST_PROTOCOL + writer.write(cPickle.dumps(l)) + writer.close() + + lines = [] + for i, d in enumerate(reader()): + lines.append(d) + if i % line_count == 0 and i >= line_count: + write_data(indx_f, lines) + lines = [] + indx_f += 1 + continue + + write_data(indx_f, lines) diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py new file mode 100644 index 0000000000000000000000000000000000000000..0d544efac9cd20157f87b5cd3b68f97ab5ed2dbc --- /dev/null +++ b/python/paddle/v2/dataset/conll05.py @@ -0,0 +1,257 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Conll05 dataset. +Paddle semantic role labeling Book and demo use this dataset as an example. +Because Conll05 is not free in public, the default downloaded URL is test set +of Conll05 (which is public). Users can change URL and MD5 to their Conll +dataset. 
And a pre-trained word vector model based on Wikipedia corpus is used +to initialize SRL model. +""" + +import tarfile +import gzip +import itertools +import paddle.v2.dataset.common + +__all__ = ['test, get_dict', 'get_embedding', 'convert'] + +DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' +DATA_MD5 = '387719152ae52d60422c016e92a742fc' +WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' +VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' +TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' +EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' + +UNK_IDX = 0 + + +def load_label_dict(filename): + d = dict() + tag_dict = set() + with open(filename, 'r') as f: + for i, line in enumerate(f): + line = line.strip() + if line.startswith("B-"): + tag_dict.add(line[2:]) + elif line.startswith("I-"): + tag_dict.add(line[2:]) + index = 0 + for tag in tag_dict: + d["B-" + tag] = index + index += 1 + d["I-" + tag] = index + index += 1 + d["O"] = index + return d + + +def load_dict(filename): + d = dict() + with open(filename, 'r') as f: + for i, line in enumerate(f): + d[line.strip()] = i + return d + + +def corpus_reader(data_path, words_name, props_name): + """ + Read one corpus. It returns an iterator. Each element of + this iterator is a tuple including sentence and labels. The sentence is + consist of a list of word IDs. The labels include a list of label IDs. + :return: a iterator of data. + :rtype: iterator + """ + + def reader(): + tf = tarfile.open(data_path) + wf = tf.extractfile(words_name) + pf = tf.extractfile(props_name) + with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile( + fileobj=pf) as props_file: + sentences = [] + labels = [] + one_seg = [] + for word, label in itertools.izip(words_file, props_file): + word = word.strip() + label = label.strip().split() + + if len(label) == 0: # end of sentence + for i in xrange(len(one_seg[0])): + a_kind_lable = [x[i] for x in one_seg] + labels.append(a_kind_lable) + + if len(labels) >= 1: + verb_list = [] + for x in labels[0]: + if x != '-': + verb_list.append(x) + + for i, lbl in enumerate(labels[1:]): + cur_tag = 'O' + is_in_bracket = False + lbl_seq = [] + verb_word = '' + for l in lbl: + if l == '*' and is_in_bracket == False: + lbl_seq.append('O') + elif l == '*' and is_in_bracket == True: + lbl_seq.append('I-' + cur_tag) + elif l == '*)': + lbl_seq.append('I-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') != -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') == -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = True + else: + raise RuntimeError('Unexpected label: %s' % + l) + + yield sentences, verb_list[i], lbl_seq + + sentences = [] + labels = [] + one_seg = [] + else: + sentences.append(word) + one_seg.append(label) + + pf.close() + wf.close() + tf.close() + + return reader + + +def reader_creator(corpus_reader, + word_dict=None, + predicate_dict=None, + label_dict=None): + def reader(): + for sentence, predicate, labels in corpus_reader(): + + sen_len = len(sentence) + + verb_index = labels.index('B-V') + mark = [0] * 
len(labels) + if verb_index > 0: + mark[verb_index - 1] = 1 + ctx_n1 = sentence[verb_index - 1] + else: + ctx_n1 = 'bos' + + if verb_index > 1: + mark[verb_index - 2] = 1 + ctx_n2 = sentence[verb_index - 2] + else: + ctx_n2 = 'bos' + + mark[verb_index] = 1 + ctx_0 = sentence[verb_index] + + if verb_index < len(labels) - 1: + mark[verb_index + 1] = 1 + ctx_p1 = sentence[verb_index + 1] + else: + ctx_p1 = 'eos' + + if verb_index < len(labels) - 2: + mark[verb_index + 2] = 1 + ctx_p2 = sentence[verb_index + 2] + else: + ctx_p2 = 'eos' + + word_idx = [word_dict.get(w, UNK_IDX) for w in sentence] + + ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len + ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len + ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len + ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len + ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len + + pred_idx = [predicate_dict.get(predicate)] * sen_len + label_idx = [label_dict.get(w) for w in labels] + + yield word_idx, ctx_n2_idx, ctx_n1_idx, \ + ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx + + return reader + + +def get_dict(): + """ + Get the word, verb and label dictionary of Wikipedia corpus. + """ + word_dict = load_dict( + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', + WORDDICT_MD5)) + verb_dict = load_dict( + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', + VERBDICT_MD5)) + label_dict = load_label_dict( + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', + TRGDICT_MD5)) + return word_dict, verb_dict, label_dict + + +def get_embedding(): + """ + Get the trained word vector based on Wikipedia corpus. + """ + return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + + +def test(): + """ + Conll05 test set creator. + + Because the training dataset is not free, the test dataset is used for + training. It returns a reader creator, each sample in the reader is nine + features, including sentence sequence, predicate, predicate context, + predicate context flag and tagged sequence. + + :return: Training reader creator + :rtype: callable + """ + word_dict, verb_dict, label_dict = get_dict() + reader = corpus_reader( + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5), + words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', + props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') + return reader_creator(reader, word_dict, verb_dict, label_dict) + + +def fetch(): + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) + paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test") diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py new file mode 100644 index 0000000000000000000000000000000000000000..7bdddeaabec733ef26b3f766c6437f5c53d65044 --- /dev/null +++ b/python/paddle/v2/dataset/flowers.py @@ -0,0 +1,199 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module will download dataset from +http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html +and parse train/test set intopaddle reader creators. + +This set contains images of flowers belonging to 102 different categories. +The images were acquired by searching the web and taking pictures. There are a +minimum of 40 images for each category. + +The database was used in: + +Nilsback, M-E. and Zisserman, A. Automated flower classification over a large + number of classes.Proceedings of the Indian Conference on Computer Vision, +Graphics and Image Processing (2008) +http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}. + +""" +import cPickle +import itertools +import functools +from common import download +import tarfile +import scipy.io as scio +from paddle.v2.image import * +from paddle.v2.reader import * +import os +import numpy as np +from multiprocessing import cpu_count +__all__ = ['train', 'test', 'valid'] + +DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' +LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' +SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' +DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118' +LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' +SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' +# In official 'readme', tstid is the flag of test data +# and trnid is the flag of train data. But test data is more than train data. +# So we exchange the train data and test data. +TRAIN_FLAG = 'tstid' +TEST_FLAG = 'trnid' +VALID_FLAG = 'valid' + + +def default_mapper(is_train, sample): + ''' + map image bytes data to type needed by model input layer + ''' + img, label = sample + img = load_image_bytes(img) + img = simple_transform( + img, 256, 224, is_train, mean=[103.94, 116.78, 123.68]) + return img.flatten().astype('float32'), label + + +train_mapper = functools.partial(default_mapper, True) +test_mapper = functools.partial(default_mapper, False) + + +def reader_creator(data_file, + label_file, + setid_file, + dataset_name, + mapper, + buffered_size=1024, + use_xmap=True): + ''' + 1. read images from tar file and + merge images into batch files in 102flowers.tgz_batch/ + 2. 
get a reader to read sample from batch file + + :param data_file: downloaded data file + :type data_file: string + :param label_file: downloaded label file + :type label_file: string + :param setid_file: downloaded setid file containing information + about how to split dataset + :type setid_file: string + :param dataset_name: data set name (tstid|trnid|valid) + :type dataset_name: string + :param mapper: a function to map image bytes data to type + needed by model input layer + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: data reader + :rtype: callable + ''' + labels = scio.loadmat(label_file)['labels'][0] + indexes = scio.loadmat(setid_file)[dataset_name][0] + img2label = {} + for i in indexes: + img = "jpg/image_%05d.jpg" % i + img2label[img] = labels[i - 1] + file_list = batch_images_from_tar(data_file, dataset_name, img2label) + + def reader(): + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'r') as f: + batch = cPickle.load(f) + data = batch['data'] + labels = batch['label'] + for sample, label in itertools.izip(data, batch['label']): + yield sample, int(label) - 1 + + if use_xmap: + return xmap_readers(mapper, reader, cpu_count(), buffered_size) + else: + return map_readers(mapper, reader) + + +def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers training set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: train data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper, + buffered_size, use_xmap) + + +def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers test set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: test data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper, + buffered_size, use_xmap) + + +def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers validation set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. 
+ :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: test data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper, + buffered_size, use_xmap) + + +def fetch(): + download(DATA_URL, 'flowers', DATA_MD5) + download(LABEL_URL, 'flowers', LABEL_MD5) + download(SETID_URL, 'flowers', SETID_MD5) diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py new file mode 100644 index 0000000000000000000000000000000000000000..37c4296f9bcea7e16daa46f778934331513c30c4 --- /dev/null +++ b/python/paddle/v2/dataset/imdb.py @@ -0,0 +1,148 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +IMDB dataset. + +This module downloads IMDB dataset from +http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set +of 25,000 highly polar movie reviews for training, and 25,000 for testing. +Besides, this module also provides API for building dictionary. +""" + +import paddle.v2.dataset.common +import collections +import tarfile +import re +import string + +__all__ = ['build_dict', 'train', 'test', 'convert'] + +URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' +MD5 = '7c2ac02c03563afcf9b574c7e56c153a' + + +def tokenize(pattern): + """ + Read files that match the given pattern. Tokenize and yield each file. + """ + + with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb', + MD5)) as tarf: + # Note that we should use tarfile.next(), which does + # sequential access of member files, other than + # tarfile.extractfile, which does random access and might + # destroy hard disks. + tf = tarf.next() + while tf != None: + if bool(pattern.match(tf.name)): + # newline and punctuations removal and ad-hoc tokenization. + yield tarf.extractfile(tf).read().rstrip("\n\r").translate( + None, string.punctuation).lower().split() + tf = tarf.next() + + +def build_dict(pattern, cutoff): + """ + Build a word dictionary from the corpus. Keys of the dictionary are words, + and values are zero-based IDs of these words. + """ + word_freq = collections.defaultdict(int) + for doc in tokenize(pattern): + for word in doc: + word_freq[word] += 1 + + # Not sure if we should prune less-frequent words here. 
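# NOTE (editor's annotation, not part of the patch): the next few lines build
# the vocabulary. Words whose frequency is strictly greater than `cutoff` are
# kept (so with the cutoff of 150 used by word_dict() below, a word must occur
# at least 151 times), sorted by descending frequency with ties broken
# alphabetically, and assigned zero-based IDs in that order; a final catch-all
# entry -- the one reader_creator() later reads back as UNK -- is appended
# with ID len(words).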
+ word_freq = filter(lambda x: x[1] > cutoff, word_freq.items()) + + dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*dictionary)) + word_idx = dict(zip(words, xrange(len(words)))) + word_idx[''] = len(words) + return word_idx + + +def reader_creator(pos_pattern, neg_pattern, word_idx): + UNK = word_idx[''] + INS = [] + + def load(pattern, out, label): + for doc in tokenize(pattern): + out.append(([word_idx.get(w, UNK) for w in doc], label)) + + load(pos_pattern, INS, 0) + load(neg_pattern, INS, 1) + + def reader(): + for doc, label in INS: + yield doc, label + + return reader + + +def train(word_idx): + """ + IMDB training set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + re.compile("aclImdb/train/pos/.*\.txt$"), + re.compile("aclImdb/train/neg/.*\.txt$"), word_idx) + + +def test(word_idx): + """ + IMDB test set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + re.compile("aclImdb/test/pos/.*\.txt$"), + re.compile("aclImdb/test/neg/.*\.txt$"), word_idx) + + +def word_dict(): + """ + Build a word dictionary from the corpus. + + :return: Word dictionary + :rtype: dict + """ + return build_dict( + re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) + + +def fetch(): + paddle.v2.dataset.common.download(URL, 'imdb', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + w = word_dict() + paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") + paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py new file mode 100644 index 0000000000000000000000000000000000000000..617c722c4165cdfed9e650fc968d623ef6ed4391 --- /dev/null +++ b/python/paddle/v2/dataset/imikolov.py @@ -0,0 +1,161 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +imikolov's simple dataset. + +This module will download dataset from +http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set +into paddle reader creators. 
+""" +import paddle.v2.dataset.common +import collections +import tarfile + +__all__ = ['train', 'test', 'build_dict', 'convert'] + +URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' +MD5 = '30177ea32e27c525793142b6bf2c8e2d' + + +class DataType(object): + NGRAM = 1 + SEQ = 2 + + +def word_count(f, word_freq=None): + if word_freq is None: + word_freq = collections.defaultdict(int) + + for l in f: + for w in l.strip().split(): + word_freq[w] += 1 + word_freq[''] += 1 + word_freq[''] += 1 + + return word_freq + + +def build_dict(min_word_freq=50): + """ + Build a word dictionary from the corpus, Keys of the dictionary are words, + and values are zero-based IDs of these words. + """ + train_filename = './simple-examples/data/ptb.train.txt' + test_filename = './simple-examples/data/ptb.valid.txt' + with tarfile.open( + paddle.v2.dataset.common.download( + paddle.v2.dataset.imikolov.URL, 'imikolov', + paddle.v2.dataset.imikolov.MD5)) as tf: + trainf = tf.extractfile(train_filename) + testf = tf.extractfile(test_filename) + word_freq = word_count(testf, word_count(trainf)) + if '' in word_freq: + # remove for now, since we will set it as last index + del word_freq[''] + + word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items()) + + word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*word_freq_sorted)) + word_idx = dict(zip(words, xrange(len(words)))) + word_idx[''] = len(words) + + return word_idx + + +def reader_creator(filename, word_idx, n, data_type): + def reader(): + with tarfile.open( + paddle.v2.dataset.common.download( + paddle.v2.dataset.imikolov.URL, 'imikolov', + paddle.v2.dataset.imikolov.MD5)) as tf: + f = tf.extractfile(filename) + + UNK = word_idx[''] + for l in f: + if DataType.NGRAM == data_type: + assert n > -1, 'Invalid gram length' + l = [''] + l.strip().split() + [''] + if len(l) >= n: + l = [word_idx.get(w, UNK) for w in l] + for i in range(n, len(l) + 1): + yield tuple(l[i - n:i]) + elif DataType.SEQ == data_type: + l = l.strip().split() + l = [word_idx.get(w, UNK) for w in l] + src_seq = [word_idx['']] + l + trg_seq = l + [word_idx['']] + if n > 0 and len(src_seq) > n: continue + yield src_seq, trg_seq + else: + assert False, 'Unknow data type' + + return reader + + +def train(word_idx, n, data_type=DataType.NGRAM): + """ + imikolov training set creator. + + It returns a reader creator, each sample in the reader is a word ID + tuple. + + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size if type is ngram, otherwise max length of sequence + :type n: int + :param data_type: data type (ngram or sequence) + :type data_type: member variable of DataType (NGRAM or SEQ) + :return: Training reader creator + :rtype: callable + """ + return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n, + data_type) + + +def test(word_idx, n, data_type=DataType.NGRAM): + """ + imikolov test set creator. + + It returns a reader creator, each sample in the reader is a word ID + tuple. 
+ + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size if type is ngram, otherwise max length of sequence + :type n: int + :param data_type: data type (ngram or sequence) + :type data_type: member variable of DataType (NGRAM or SEQ) + :return: Test reader creator + :rtype: callable + """ + return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n, + data_type) + + +def fetch(): + paddle.v2.dataset.common.download(URL, "imikolov", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + N = 5 + word_dict = build_dict() + paddle.v2.dataset.common.convert(path, + train(word_dict, N), 1000, + "imikolov_train") + paddle.v2.dataset.common.convert(path, + test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..9f675bed895223e054cd3bb6e504fe1607f19858 --- /dev/null +++ b/python/paddle/v2/dataset/mnist.py @@ -0,0 +1,123 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MNIST dataset. + +This module will download dataset from http://yann.lecun.com/exdb/mnist/ and +parse training set and test set into paddle reader creators. +""" +import paddle.v2.dataset.common +import subprocess +import numpy +import platform +__all__ = ['train', 'test', 'convert'] + +URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/' +TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' +TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3' +TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz' +TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c' +TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz' +TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' +TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz' +TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' + + +def reader_creator(image_filename, label_filename, buffer_size): + def reader(): + if platform.system() == 'Darwin': + zcat_cmd = 'gzcat' + elif platform.system() == 'Linux': + zcat_cmd = 'zcat' + else: + raise NotImplementedError() + + # According to http://stackoverflow.com/a/38061619/724872, we + # cannot use standard package gzip here. + m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE) + m.stdout.read(16) # skip some magic bytes + + l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE) + l.stdout.read(8) # skip some magic bytes + + try: # reader could be break. + while True: + labels = numpy.fromfile( + l.stdout, 'ubyte', count=buffer_size).astype("int") + + if labels.size != buffer_size: + break # numpy.fromfile returns empty slice after EOF. 
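# NOTE (editor's annotation, not part of the patch): this loop streams the two
# gzip pipes in lock-step chunks of `buffer_size` samples -- each pass reads
# buffer_size one-byte labels from the label pipe (above) and
# buffer_size * 28 * 28 pixel bytes from the image pipe (below); a short read
# at EOF triggers the break above. The pixels are rescaled with
# `images / 255.0 * 2.0 - 1.0`, i.e. into [-1, 1], although the train()/test()
# docstrings further down describe the samples as lying in [0, 1]. Also note
# that fetch() below pairs TEST_LABEL_URL with TRAIN_LABEL_MD5, which looks
# like a copy-paste slip.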
+ + images = numpy.fromfile( + m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape( + (buffer_size, 28 * 28)).astype('float32') + + images = images / 255.0 * 2.0 - 1.0 + + for i in xrange(buffer_size): + yield images[i, :], int(labels[i]) + finally: + m.terminate() + l.terminate() + + return reader + + +def train(): + """ + MNIST training set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', + TRAIN_IMAGE_MD5), + paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', + TRAIN_LABEL_MD5), 100) + + +def test(): + """ + MNIST test set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', + TEST_IMAGE_MD5), + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', + TEST_LABEL_MD5), 100) + + +def fetch(): + paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) + paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py new file mode 100644 index 0000000000000000000000000000000000000000..5b61a9420af1bb81e1d826f8a7b69f34c306d382 --- /dev/null +++ b/python/paddle/v2/dataset/movielens.py @@ -0,0 +1,262 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Movielens 1-M dataset. + +Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 +movies, which was collected by GroupLens Research. This module will download +Movielens 1-M dataset from +http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training +set and test set into paddle reader creators. + +""" + +import zipfile +import paddle.v2.dataset.common +import re +import random +import functools + +__all__ = [ + 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', + 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info', + 'convert' +] + +age_table = [1, 18, 25, 35, 45, 50, 56] + +URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' +MD5 = 'c4d9eecfca2ab87c1945afe126590906' + + +class MovieInfo(object): + """ + Movie id, title and categories information are stored in MovieInfo. 
+ """ + + def __init__(self, index, categories, title): + self.index = int(index) + self.categories = categories + self.title = title + + def value(self): + """ + Get information from a movie. + """ + return [ + self.index, [CATEGORIES_DICT[c] for c in self.categories], + [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()] + ] + + def __str__(self): + return "" % ( + self.index, self.title, self.categories) + + def __repr__(self): + return self.__str__() + + +class UserInfo(object): + """ + User id, gender, age, and job information are stored in UserInfo. + """ + + def __init__(self, index, gender, age, job_id): + self.index = int(index) + self.is_male = gender == 'M' + self.age = age_table.index(int(age)) + self.job_id = int(job_id) + + def value(self): + """ + Get information from a user. + """ + return [self.index, 0 if self.is_male else 1, self.age, self.job_id] + + def __str__(self): + return "" % ( + self.index, "M" + if self.is_male else "F", age_table[self.age], self.job_id) + + def __repr__(self): + return str(self) + + +MOVIE_INFO = None +MOVIE_TITLE_DICT = None +CATEGORIES_DICT = None +USER_INFO = None + + +def __initialize_meta_info__(): + fn = paddle.v2.dataset.common.download(URL, "movielens", MD5) + global MOVIE_INFO + if MOVIE_INFO is None: + pattern = re.compile(r'^(.*)\((\d+)\)$') + with zipfile.ZipFile(file=fn) as package: + for info in package.infolist(): + assert isinstance(info, zipfile.ZipInfo) + MOVIE_INFO = dict() + title_word_set = set() + categories_set = set() + with package.open('ml-1m/movies.dat') as movie_file: + for i, line in enumerate(movie_file): + movie_id, title, categories = line.strip().split('::') + categories = categories.split('|') + for c in categories: + categories_set.add(c) + title = pattern.match(title).group(1) + MOVIE_INFO[int(movie_id)] = MovieInfo( + index=movie_id, categories=categories, title=title) + for w in title.split(): + title_word_set.add(w.lower()) + + global MOVIE_TITLE_DICT + MOVIE_TITLE_DICT = dict() + for i, w in enumerate(title_word_set): + MOVIE_TITLE_DICT[w] = i + + global CATEGORIES_DICT + CATEGORIES_DICT = dict() + for i, c in enumerate(categories_set): + CATEGORIES_DICT[c] = i + + global USER_INFO + USER_INFO = dict() + with package.open('ml-1m/users.dat') as user_file: + for line in user_file: + uid, gender, age, job, _ = line.strip().split("::") + USER_INFO[int(uid)] = UserInfo( + index=uid, gender=gender, age=age, job_id=job) + return fn + + +def __reader__(rand_seed=0, test_ratio=0.1, is_test=False): + fn = __initialize_meta_info__() + rand = random.Random(x=rand_seed) + with zipfile.ZipFile(file=fn) as package: + with package.open('ml-1m/ratings.dat') as rating: + for line in rating: + if (rand.random() < test_ratio) == is_test: + uid, mov_id, rating, _ = line.strip().split("::") + uid = int(uid) + mov_id = int(mov_id) + rating = float(rating) * 2 - 5.0 + + mov = MOVIE_INFO[mov_id] + usr = USER_INFO[uid] + yield usr.value() + mov.value() + [[rating]] + + +def __reader_creator__(**kwargs): + return lambda: __reader__(**kwargs) + + +train = functools.partial(__reader_creator__, is_test=False) +test = functools.partial(__reader_creator__, is_test=True) + + +def get_movie_title_dict(): + """ + Get movie title dictionary. + """ + __initialize_meta_info__() + return MOVIE_TITLE_DICT + + +def __max_index_info__(a, b): + if a.index > b.index: + return a + else: + return b + + +def max_movie_id(): + """ + Get the maximum value of movie id. 
+ """ + __initialize_meta_info__() + return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index + + +def max_user_id(): + """ + Get the maximum value of user id. + """ + __initialize_meta_info__() + return reduce(__max_index_info__, USER_INFO.viewvalues()).index + + +def __max_job_id_impl__(a, b): + if a.job_id > b.job_id: + return a + else: + return b + + +def max_job_id(): + """ + Get the maximum value of job id. + """ + __initialize_meta_info__() + return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id + + +def movie_categories(): + """ + Get movie categoriges dictionary. + """ + __initialize_meta_info__() + return CATEGORIES_DICT + + +def user_info(): + """ + Get user info dictionary. + """ + __initialize_meta_info__() + return USER_INFO + + +def movie_info(): + """ + Get movie info dictionary. + """ + __initialize_meta_info__() + return MOVIE_INFO + + +def unittest(): + for train_count, _ in enumerate(train()()): + pass + for test_count, _ in enumerate(test()()): + pass + + print train_count, test_count + + +def fetch(): + paddle.v2.dataset.common.download(URL, "movielens", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") + + +if __name__ == '__main__': + unittest() diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py new file mode 100644 index 0000000000000000000000000000000000000000..d3b3dd524c34be660c5f2d4fc5ce2fa0420efbc1 --- /dev/null +++ b/python/paddle/v2/dataset/mq2007.py @@ -0,0 +1,333 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MQ2007 dataset + +MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross +validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set, +validation set and testing set. + +MQ2007 dataset from website +http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar and parse training set and test set into paddle reader creators + +""" + +import os +import functools +import rarfile +from common import download +import numpy as np + +# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar" +URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar" +MD5 = "7be1640ae95c6408dab0ae7207bdc706" + + +def __initialize_meta_info__(): + """ + download and extract the MQ2007 dataset + """ + fn = fetch() + rar = rarfile.RarFile(fn) + dirpath = os.path.dirname(fn) + rar.extractall(path=dirpath) + return dirpath + + +class Query(object): + """ + queries used for learning to rank algorithms. 
It is created from relevance scores, query-document feature vectors + + Parameters: + ---------- + query_id : int + query_id in dataset, mapping from query to relevance documents + relevance_score : int + relevance score of query and document pair + feature_vector : array, dense feature + feature in vector format + description : string + comment section in query doc pair data + """ + + def __init__(self, + query_id=-1, + relevance_score=-1, + feature_vector=None, + description=""): + self.query_id = query_id + self.relevance_score = relevance_score + if feature_vector is None: + self.feature_vector = [] + else: + self.feature_vector = feature_vector + self.description = description + + def __str__(self): + string = "%s %s %s" % (str(self.relevance_score), str(self.query_id), + " ".join(str(f) for f in self.feature_vector)) + return string + + # @classmethod + def _parse_(self, text): + """ + parse line into Query + """ + comment_position = text.find('#') + line = text[:comment_position].strip() + self.description = text[comment_position + 1:].strip() + parts = line.split() + if len(parts) != 48: + sys.stdout.write("expect 48 space split parts, get %d" % + (len(parts))) + return None + # format : 0 qid:10 1:0.000272 2:0.000000 .... + self.relevance_score = int(parts[0]) + self.query_id = int(parts[1].split(':')[1]) + for p in parts[2:]: + pair = p.split(':') + self.feature_vector.append(float(pair[1])) + return self + + +class QueryList(object): + """ + group query into list, every item in list is a Query + """ + + def __init__(self, querylist=None): + self.query_id = -1 + if querylist is None: + self.querylist = [] + else: + self.querylist = querylist + for query in self.querylist: + if self.query_id == -1: + self.query_id = query.query_id + else: + if self.query_id != query.query_id: + raise ValueError("query in list must be same query_id") + + def __iter__(self): + for query in self.querylist: + yield query + + def __len__(self): + return len(self.querylist) + + def __getitem__(self, i): + return self.querylist[i] + + def _correct_ranking_(self): + if self.querylist is None: + return + self.querylist.sort(key=lambda x: x.relevance_score, reverse=True) + + def _add_query(self, query): + if self.query_id == -1: + self.query_id = query.query_id + else: + if self.query_id != query.query_id: + raise ValueError("query in list must be same query_id") + self.querylist.append(query) + + +def gen_plain_txt(querylist): + """ + gen plain text in list for other usage + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + + return : + ------ + query_id : np.array, shape=(samples_num, ) + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + for query in querylist: + yield querylist.query_id, query.relevance_score, np.array( + query.feature_vector) + + +def gen_point(querylist): + """ + gen item in list for point-wise learning to rank algorithm + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + + return : + ------ + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + for query in querylist: + yield query.relevance_score, np.array(query.feature_vector) + + +def 
gen_pair(querylist, partial_order="full"): + """ + gen pair for pair-wise learning to rank algorithm + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + pairtial_order : "full" or "neighbour" + there is redudant in all possiable pair combinations, which can be simplifed + gen pairs for neighbour items or the full partial order pairs + + return : + ------ + label : np.array, shape=(1) + query_left : np.array, shape=(1, feature_dimension) + query_right : same as left + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + labels = [] + docpairs = [] + + # C(n,2) + for i in range(len(querylist)): + query_left = querylist[i] + for j in range(i + 1, len(querylist)): + query_right = querylist[j] + if query_left.relevance_score > query_right.relevance_score: + labels.append([1]) + docpairs.append([ + np.array(query_left.feature_vector), + np.array(query_right.feature_vector) + ]) + elif query_left.relevance_score < query_right.relevance_score: + labels.append([1]) + docpairs.append([ + np.array(query_right.feature_vector), + np.array(query_left.feature_vector) + ]) + for label, pair in zip(labels, docpairs): + yield np.array(label), pair[0], pair[1] + + +def gen_list(querylist): + """ + gen item in list for list-wise learning to rank algorithm + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + + return : + ------ + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + relevance_score_list = [[query.relevance_score] for query in querylist] + feature_vector_list = [query.feature_vector for query in querylist] + yield np.array(relevance_score_list), np.array(feature_vector_list) + + +def query_filter(querylists): + """ + filter query get only document with label 0. + label 0, 1, 2 means the relevance score document with query + parameters : + querylist : QueyList list + + return : + querylist : QueyList list + """ + filter_query = [] + for querylist in querylists: + relevance_score_list = [query.relevance_score for query in querylist] + if sum(relevance_score_list) != .0: + filter_query.append(querylist) + return filter_query + + +def load_from_text(filepath, shuffle=False, fill_missing=-1): + """ + parse data file into querys + """ + prev_query_id = -1 + querylists = [] + querylist = None + fn = __initialize_meta_info__() + with open(os.path.join(fn, filepath)) as f: + for line in f: + query = Query() + query = query._parse_(line) + if query == None: + continue + if query.query_id != prev_query_id: + if querylist is not None: + querylists.append(querylist) + querylist = QueryList() + prev_query_id = query.query_id + querylist._add_query(query) + if querylist is not None: + querylists.append(querylist) + return querylists + + +def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1): + """ + Parameters + -------- + filename : string + fill_missing : fill the missing value. 
default in MQ2007 is -1 + + Returns + ------ + yield + label query_left, query_right # format = "pairwise" + label querylist # format = "listwise" + """ + querylists = query_filter( + load_from_text( + filepath, shuffle=shuffle, fill_missing=fill_missing)) + for querylist in querylists: + if format == "plain_txt": + yield next(gen_plain_txt(querylist)) + elif format == "pointwise": + yield next(gen_point(querylist)) + elif format == "pairwise": + for pair in gen_pair(querylist): + yield pair + elif format == "listwise": + yield next(gen_list(querylist)) + + +train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt") +test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt") + + +def fetch(): + return download(URL, "MQ2007", MD5) + + +if __name__ == "__main__": + fetch() + mytest = functools.partial( + __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise") + for label, query in mytest(): + print label, query diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b9757c1a75d215cf8945b5cedbb1239fd43af7 --- /dev/null +++ b/python/paddle/v2/dataset/sentiment.py @@ -0,0 +1,141 @@ +# /usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The script fetch and preprocess movie_reviews data set that provided by NLTK + +TODO(yuyang18): Complete dataset. +""" + +import collections +from itertools import chain + +import nltk +from nltk.corpus import movie_reviews + +import paddle.v2.dataset.common + +__all__ = ['train', 'test', 'get_word_dict', 'convert'] +NUM_TRAINING_INSTANCES = 1600 +NUM_TOTAL_INSTANCES = 2000 + + +def download_data_if_not_yet(): + """ + Download the data set, if the data set is not download. + """ + try: + # make sure that nltk can find the data + if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path: + nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME) + movie_reviews.categories() + except LookupError: + print "Downloading movie_reviews data set, please wait....." + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + print "Download data set success....." 
+ print "Path is " + nltk.data.find('corpora/movie_reviews').path + + +def get_word_dict(): + """ + Sorted the words by the frequency of words which occur in sample + :return: + words_freq_sorted + """ + words_freq_sorted = list() + word_freq_dict = collections.defaultdict(int) + download_data_if_not_yet() + + for category in movie_reviews.categories(): + for field in movie_reviews.fileids(category): + for words in movie_reviews.words(field): + word_freq_dict[words] += 1 + words_sort_list = word_freq_dict.items() + words_sort_list.sort(cmp=lambda a, b: b[1] - a[1]) + for index, word in enumerate(words_sort_list): + words_freq_sorted.append((word[0], index)) + return words_freq_sorted + + +def sort_files(): + """ + Sorted the sample for cross reading the sample + :return: + files_list + """ + files_list = list() + neg_file_list = movie_reviews.fileids('neg') + pos_file_list = movie_reviews.fileids('pos') + files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list))) + return files_list + + +def load_sentiment_data(): + """ + Load the data set + :return: + data_set + """ + data_set = list() + download_data_if_not_yet() + words_ids = dict(get_word_dict()) + for sample_file in sort_files(): + words_list = list() + category = 0 if 'neg' in sample_file else 1 + for word in movie_reviews.words(sample_file): + words_list.append(words_ids[word.lower()]) + data_set.append((words_list, category)) + return data_set + + +def reader_creator(data): + """ + Reader creator, generate an iterator for data set + :param data: + train data set or test data set + """ + for each in data: + yield each[0], each[1] + + +def train(): + """ + Default training set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[0:NUM_TRAINING_INSTANCES]) + + +def test(): + """ + Default test set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[NUM_TRAINING_INSTANCES:]) + + +def fetch(): + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") + paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e0e18229da7818be5752ee592e094a00da286ad9 --- /dev/null +++ b/python/paddle/v2/dataset/tests/cifar_test.py @@ -0,0 +1,56 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
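The sentiment module above exposes plain generator-based readers. As a rough usage sketch (assuming only the train()/test() creators shown above and that NLTK's movie_reviews corpus can be fetched), one could sanity-check a sample like this; the variable names are illustrative only:

# Peek at one (word_id_list, label) pair from the training reader above.
import paddle.v2.dataset.sentiment as st

first_ids, first_label = next(st.train())
assert first_label in (0, 1)                    # 0 = 'neg', 1 = 'pos'
assert all(isinstance(word_id, int) for word_id in first_ids)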
+ +import paddle.v2.dataset.cifar +import unittest + + +class TestCIFAR(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + for l in reader(): + self.assertEqual(l[0].size, 3072) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_test10(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.test10()) + self.assertEqual(instances, 10000) + self.assertEqual(max_label_value, 9) + + def test_train10(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.train10()) + self.assertEqual(instances, 50000) + self.assertEqual(max_label_value, 9) + + def test_test100(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.test100()) + self.assertEqual(instances, 10000) + self.assertEqual(max_label_value, 99) + + def test_train100(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.train100()) + self.assertEqual(instances, 50000) + self.assertEqual(max_label_value, 99) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py new file mode 100644 index 0000000000000000000000000000000000000000..cfa194eba38ea70311c4deeac2635dc0a0103576 --- /dev/null +++ b/python/paddle/v2/dataset/tests/common_test.py @@ -0,0 +1,94 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
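Each CIFAR-10 sample checked above is a flattened 3072-float vector plus a label in [0, 9]; since 3072 = 3 x 32 x 32, it can be viewed in the CHW layout that the image utilities later in this patch expect. A small, hypothetical sketch (assuming only the reader behaviour the test above verifies):

import numpy as np
import paddle.v2.dataset.cifar as cifar

reader = cifar.train10()                 # a callable reader, as used in the test above
image_flat, label = next(reader())
image_chw = np.asarray(image_flat).reshape(3, 32, 32)   # channel, height, width
assert image_chw.shape == (3, 32, 32) and 0 <= label <= 9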
+ +import paddle.v2.dataset.common +import unittest +import tempfile +import glob + + +class TestCommon(unittest.TestCase): + def test_md5file(self): + _, temp_path = tempfile.mkstemp() + with open(temp_path, 'w') as f: + f.write("Hello\n") + self.assertEqual('09f7e02f1290be211da707a266f153b3', + paddle.v2.dataset.common.md5file(temp_path)) + + def test_download(self): + yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460' + self.assertEqual( + paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460', + paddle.v2.dataset.common.download( + yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d')) + + def test_split(self): + def test_reader(): + def reader(): + for x in xrange(10): + yield x + + return reader + + _, temp_path = tempfile.mkstemp() + paddle.v2.dataset.common.split( + test_reader(), 4, suffix=temp_path + '/test-%05d.pickle') + files = glob.glob(temp_path + '/test-%05d.pickle') + self.assertEqual(len(files), 3) + + def test_cluster_file_reader(self): + _, temp_path = tempfile.mkstemp() + for x in xrange(5): + with open(temp_path + '/%05d.test' % x) as f: + f.write('%d\n' % x) + reader = paddle.v2.dataset.common.cluster_files_reader( + temp_path + '/*.test', 5, 0) + for idx, e in enumerate(reader()): + self.assertEqual(e, str("0")) + + def test_convert(self): + record_num = 10 + num_shards = 4 + + def test_reader(): + def reader(): + for x in xrange(record_num): + yield x + + return reader + + path = tempfile.mkdtemp() + paddle.v2.dataset.common.convert(path, + test_reader(), num_shards, + 'random_images') + + files = glob.glob(path + '/random_images-*') + self.assertEqual(len(files), num_shards) + + recs = [] + for i in range(0, num_shards): + n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1) + r = recordio.reader(n) + while True: + d = r.read() + if d is None: + break + recs.append(d) + + recs.sort() + self.assertEqual(total, record_num) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py new file mode 100644 index 0000000000000000000000000000000000000000..a8ae9a07acc22eb9d3c0cc5ebb07f8f11ed21233 --- /dev/null +++ b/python/paddle/v2/dataset/tests/flowers_test.py @@ -0,0 +1,51 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
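The split/cluster_files_reader pairing exercised above can shard a reader's output into pickle files and then let each trainer read back only its share. A hypothetical sketch; the directory layout, shard size, and sample reader are made up, and only the call signatures used in the tests above are assumed:

import tempfile
import paddle.v2.dataset.common as common


def sample_reader():
    for value in range(20):
        yield value


work_dir = tempfile.mkdtemp()
# Dump the stream into pickle shards of 5 samples each ...
common.split(sample_reader, 5, suffix=work_dir + '/shard-%05d.pickle')
# ... then trainer 0 of 4 re-reads only the shards assigned to it.
my_shard_reader = common.cluster_files_reader(work_dir + '/shard-*.pickle', 4, 0)
for item in my_shard_reader():
    pass  # consume this trainer's slice of the data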
+ +import paddle.v2.dataset.flowers +import unittest + + +class TestFlowers(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + size = 224 * 224 * 3 + for l in reader(): + self.assertEqual(l[0].size, size) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_train(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.train()) + self.assertEqual(instances, 6149) + self.assertEqual(max_label_value, 102) + + def test_test(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.test()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + def test_valid(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.valid()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c4d82f26895d77d05c6e936bd636b1239e1a0cd8 --- /dev/null +++ b/python/paddle/v2/dataset/tests/imdb_test.py @@ -0,0 +1,57 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.imdb +import unittest +import re + +TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$") +TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$") +TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$") + +TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$") +TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$") +TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$") + + +class TestIMDB(unittest.TestCase): + word_idx = None + + def test_build_dict(self): + if self.word_idx == None: + self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, + 150) + + self.assertEqual(len(self.word_idx), 7036) + + def check_dataset(self, dataset, expected_size): + if self.word_idx == None: + self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, + 150) + + sum = 0 + for l in dataset(self.word_idx): + self.assertEqual(l[1], sum % 2) + sum += 1 + self.assertEqual(sum, expected_size) + + def test_train(self): + self.check_dataset(paddle.v2.dataset.imdb.train, 25000) + + def test_test(self): + self.check_dataset(paddle.v2.dataset.imdb.test, 25000) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py new file mode 100644 index 0000000000000000000000000000000000000000..714a75d6f1ff31697eec2d893d350a726d6390fe --- /dev/null +++ b/python/paddle/v2/dataset/tests/imikolov_test.py @@ -0,0 +1,67 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.imikolov +import unittest + +WORD_DICT = paddle.v2.dataset.imikolov.build_dict() + + +class TestMikolov(unittest.TestCase): + def check_reader(self, reader, n): + for l in reader(): + self.assertEqual(len(l), n) + + def test_train(self): + n = 5 + self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n) + + first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\ + 'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\ + 'rake regatta rubens sim snack-food ssangyong swapo wachter' + first_line = [ + WORD_DICT.get(ch, WORD_DICT['']) + for ch in first_line.split(' ') + ] + for l in paddle.v2.dataset.imikolov.train( + WORD_DICT, n=-1, + data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): + read_line = l[0][1:] + break + self.assertEqual(first_line, read_line) + + def test_test(self): + n = 5 + self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n) + + first_line = 'consumers may want to move their telephones a little '\ + 'closer to the tv set' + first_line = [ + WORD_DICT.get(ch, WORD_DICT['']) + for ch in first_line.split(' ') + ] + for l in paddle.v2.dataset.imikolov.test( + WORD_DICT, n=-1, + data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): + read_line = l[0][1:] + break + self.assertEqual(first_line, read_line) + + def test_total(self): + _, idx = zip(*WORD_DICT.items()) + self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1d344cac3e7483a351033570fbec75a4d19f4a55 --- /dev/null +++ b/python/paddle/v2/dataset/tests/mnist_test.py @@ -0,0 +1,44 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
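The imikolov reader checked above produces fixed-length n-grams of word ids. A minimal sketch of that contract, assuming only build_dict() and train() as exercised by the test:

import paddle.v2.dataset.imikolov as imikolov

word_dict = imikolov.build_dict()
five_gram_reader = imikolov.train(word_dict, 5)   # returns a callable reader
first_ngram = next(five_gram_reader())
assert len(first_ngram) == 5    # four context word ids plus the next-word id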
+
+import paddle.v2.dataset.mnist
+import unittest
+
+
+class TestMNIST(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 784)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.train())
+        self.assertEqual(instances, 60000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.test())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mq2007_test.py b/python/paddle/v2/dataset/tests/mq2007_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..59847b6c18eadb12123cae824e8bce1051a69d4c
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/mq2007_test.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.mq2007
+import unittest
+
+
+class TestMQ2007(unittest.TestCase):
+    def test_pairwise(self):
+        for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
+                format="pairwise"):
+            self.assertEqual(query_left.shape, (46, ))
+            self.assertEqual(query_right.shape, (46, ))
+
+    def test_listwise(self):
+        for label_array, query_array in paddle.v2.dataset.mq2007.test(
+                format="listwise"):
+            self.assertEqual(len(label_array), len(query_array))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..407405290734609059c1767600748d530e8a13a6
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/test_sentiment.py
@@ -0,0 +1,55 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
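The pairwise MQ2007 reader exercised in mq2007_test.py above yields (label, left, right) records in which, per gen_pair earlier in this patch, the left document is the more relevant one and both sides are 46-dimensional feature vectors. A rough consumption sketch, assuming only that behaviour and that the MQ2007 data can be downloaded:

import paddle.v2.dataset.mq2007 as mq2007

for label, left_doc, right_doc in mq2007.test(format="pairwise"):
    assert left_doc.shape == (46, ) and right_doc.shape == (46, )
    assert label[0] == 1         # the left document always outranks the right
    break                        # inspect only the first pair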
+
+import unittest
+import nltk
+import paddle.v2.dataset.sentiment as st
+from nltk.corpus import movie_reviews
+
+
+class TestSentimentMethods(unittest.TestCase):
+    def test_get_word_dict(self):
+        word_dict = st.get_word_dict()[0:10]
+        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
+                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
+                          (u'is', 8), (u'in', 9)]
+        for idx, each in enumerate(word_dict):
+            self.assertEqual(each, test_word_list[idx])
+        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
+
+    def test_sort_files(self):
+        last_label = ''
+        for sample_file in st.sort_files():
+            current_label = sample_file.split("/")[0]
+            self.assertNotEqual(current_label, last_label)
+            last_label = current_label
+
+    def test_data_set(self):
+        data_set = st.load_sentiment_data()
+        last_label = -1
+        for each in st.test():
+            self.assertNotEqual(each[1], last_label)
+            last_label = each[1]
+        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
+        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
+        self.assertEqual(
+            len(list(st.test())),
+            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..31e72ebf5eac0508d12783f9ceaa6eef0fa6d353
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/voc2012_test.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.voc2012
+import unittest
+
+
+class TestVOC(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3 * l[1].size)
+            sum += 1
+        return sum
+
+    def test_train(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.train())
+        self.assertEqual(count, 2913)
+
+    def test_test(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.test())
+        self.assertEqual(count, 1464)
+
+    def test_val(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.val())
+        self.assertEqual(count, 1449)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/wmt16_test.py b/python/paddle/v2/dataset/tests/wmt16_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cef6c3216e7de8d9785a063976e63f88d90b24df
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/wmt16_test.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.wmt16 +import unittest + + +class TestWMT16(unittest.TestCase): + def checkout_one_sample(self, sample): + # train data has 3 field: source language word indices, + # target language word indices, and target next word indices. + self.assertEqual(len(sample), 3) + + # test start mark and end mark in source word indices. + self.assertEqual(sample[0][0], 0) + self.assertEqual(sample[0][-1], 1) + + # test start mask in target word indices + self.assertEqual(sample[1][0], 0) + + # test en mask in target next word indices + self.assertEqual(sample[2][-1], 1) + + def test_train(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.train( + src_dict_size=100000, trg_dict_size=100000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_test(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.test( + src_dict_size=1000, trg_dict_size=1000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_val(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.validation( + src_dict_size=1000, trg_dict_size=1000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_get_dict(self): + dict_size = 1000 + word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True) + self.assertEqual(len(word_dict), dict_size) + self.assertEqual(word_dict[0], "") + self.assertEqual(word_dict[1], "") + self.assertEqual(word_dict[2], "") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py new file mode 100644 index 0000000000000000000000000000000000000000..f10bf7e42a1ead09b3eba0d61e55701215e4360f --- /dev/null +++ b/python/paddle/v2/dataset/uci_housing.py @@ -0,0 +1,134 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +UCI Housing dataset. + +This module will download dataset from +https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and +parse training set and test set into paddle reader creators. 
+""" + +import numpy as np +import os +import paddle.v2.dataset.common +from paddle.v2.parameters import Parameters + +__all__ = ['train', 'test'] + +URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data' +MD5 = 'd4accdce7a25600298819f8e28e8d593' +feature_names = [ + 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', + 'PTRATIO', 'B', 'LSTAT', 'convert' +] + +UCI_TRAIN_DATA = None +UCI_TEST_DATA = None +URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar' +MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b' + + +def feature_range(maximums, minimums): + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + fig, ax = plt.subplots() + feature_num = len(maximums) + ax.bar(range(feature_num), maximums - minimums, color='r', align='center') + ax.set_title('feature scale') + plt.xticks(range(feature_num), feature_names) + plt.xlim([-1, feature_num]) + fig.set_figheight(6) + fig.set_figwidth(10) + if not os.path.exists('./image'): + os.makedirs('./image') + fig.savefig('image/ranges.png', dpi=48) + plt.close(fig) + + +def load_data(filename, feature_num=14, ratio=0.8): + global UCI_TRAIN_DATA, UCI_TEST_DATA + if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None: + return + + data = np.fromfile(filename, sep=' ') + data = data.reshape(data.shape[0] / feature_num, feature_num) + maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( + axis=0) / data.shape[0] + feature_range(maximums[:-1], minimums[:-1]) + for i in xrange(feature_num - 1): + data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) + offset = int(data.shape[0] * ratio) + UCI_TRAIN_DATA = data[:offset] + UCI_TEST_DATA = data[offset:] + + +def train(): + """ + UCI_HOUSING training set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Training reader creator + :rtype: callable + """ + global UCI_TRAIN_DATA + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TRAIN_DATA: + yield d[:-1], d[-1:] + + return reader + + +def test(): + """ + UCI_HOUSING test set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Test reader creator + :rtype: callable + """ + global UCI_TEST_DATA + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TEST_DATA: + yield d[:-1], d[-1:] + + return reader + + +def model(): + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', + MD5_MODEL) + with open(tar_file, 'r') as f: + parameters = Parameters.from_tar(f) + return parameters + + +def fetch(): + paddle.v2.dataset.common.download(URL, 'uci_housing', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py new file mode 100644 index 0000000000000000000000000000000000000000..617e212d67fbe37f9d9663e9c83c62045411fa77 --- /dev/null +++ b/python/paddle/v2/dataset/voc2012.py @@ -0,0 +1,85 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image dataset for segmentation. +The 2012 dataset contains images from 2008-2011 for which additional +segmentations have been prepared. As in previous years the assignment +to training/test sets has been maintained. The total number of images +with segmentation has been increased from 7,062 to 9,993. +""" + +import tarfile +import io +import numpy as np +from paddle.v2.dataset.common import download +from paddle.v2.image import * +from PIL import Image + +__all__ = ['train', 'test', 'val'] + +VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\ +VOCtrainval_11-May-2012.tar' + +VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd' +SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' +DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg' +LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png' + +CACHE_DIR = 'voc2012' + + +def reader_creator(filename, sub_name): + + tarobject = tarfile.open(filename) + name2mem = {} + for ele in tarobject.getmembers(): + name2mem[ele.name] = ele + + def reader(): + set_file = SET_FILE.format(sub_name) + sets = tarobject.extractfile(name2mem[set_file]) + for line in sets: + line = line.strip() + data_file = DATA_FILE.format(line) + label_file = LABEL_FILE.format(line) + data = tarobject.extractfile(name2mem[data_file]).read() + label = tarobject.extractfile(name2mem[label_file]).read() + data = Image.open(io.BytesIO(data)) + label = Image.open(io.BytesIO(label)) + data = np.array(data) + label = np.array(label) + yield data, label + + return reader + + +def train(): + """ + Create a train dataset reader containing 2913 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval') + + +def test(): + """ + Create a test dataset reader containing 1464 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train') + + +def val(): + """ + Create a val dataset reader containing 1449 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val') diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py new file mode 100644 index 0000000000000000000000000000000000000000..5104e29051e4480f3a7eb18421f1b519841b009b --- /dev/null +++ b/python/paddle/v2/dataset/wmt14.py @@ -0,0 +1,182 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +WMT14 dataset. +The original WMT14 dataset is too large and a small set of data for set is +provided. 
This module will download dataset from +http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and +parse training set and test set into paddle reader creators. + +""" +import tarfile +import gzip + +import paddle.v2.dataset.common +from paddle.v2.parameters import Parameters + +__all__ = [ + 'train', + 'test', + 'get_dict', + 'convert', +] + +URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/' + 'cslm_joint_paper/data/dev+test.tgz') +MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' +# this is a small set of data for test. The original data is too large and +# will be add later. +URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' + 'wmt_shrinked_data/wmt14.tgz') +MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' +# BLEU of this trained model is 26.92 +URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' + +START = "" +END = "" +UNK = "" +UNK_IDX = 2 + + +def __read_to_dict(tar_file, dict_size): + def __to_dict(fd, size): + out_dict = dict() + for line_count, line in enumerate(fd): + if line_count < size: + out_dict[line.strip()] = line_count + else: + break + return out_dict + + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith("src.dict") + ] + assert len(names) == 1 + src_dict = __to_dict(f.extractfile(names[0]), dict_size) + names = [ + each_item.name for each_item in f + if each_item.name.endswith("trg.dict") + ] + assert len(names) == 1 + trg_dict = __to_dict(f.extractfile(names[0]), dict_size) + return src_dict, trg_dict + + +def reader_creator(tar_file, file_name, dict_size): + def reader(): + src_dict, trg_dict = __read_to_dict(tar_file, dict_size) + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith(file_name) + ] + for name in names: + for line in f.extractfile(name): + line_split = line.strip().split('\t') + if len(line_split) != 2: + continue + src_seq = line_split[0] # one source sequence + src_words = src_seq.split() + src_ids = [ + src_dict.get(w, UNK_IDX) + for w in [START] + src_words + [END] + ] + + trg_seq = line_split[1] # one target sequence + trg_words = trg_seq.split() + trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words] + + # remove sequence whose length > 80 in training mode + if len(src_ids) > 80 or len(trg_ids) > 80: + continue + trg_ids_next = trg_ids + [trg_dict[END]] + trg_ids = [trg_dict[START]] + trg_ids + + yield src_ids, trg_ids, trg_ids_next + + return reader + + +def train(dict_size): + """ + WMT14 training set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'train/train', dict_size) + + +def test(dict_size): + """ + WMT14 test set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. 
+ + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'test/test', dict_size) + + +def gen(dict_size): + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'gen/gen', dict_size) + + +def model(): + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) + with gzip.open(tar_file, 'r') as f: + parameters = Parameters.from_tar(f) + return parameters + + +def get_dict(dict_size, reverse=True): + # if reverse = False, return dict = {'a':'001', 'b':'002', ...} + # else reverse = true, return dict = {'001':'a', '002':'b', ...} + tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + src_dict, trg_dict = __read_to_dict(tar_file, dict_size) + if reverse: + src_dict = {v: k for k, v in src_dict.items()} + trg_dict = {v: k for k, v in trg_dict.items()} + return src_dict, trg_dict + + +def fetch(): + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) + + +def convert(path): + """ + Converts dataset to recordio format + """ + dict_size = 30000 + paddle.v2.dataset.common.convert(path, + train(dict_size), 1000, "wmt14_train") + paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test") diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py new file mode 100644 index 0000000000000000000000000000000000000000..c8818f715beadd9499ae588f2c19a57fbf26f372 --- /dev/null +++ b/python/paddle/v2/dataset/wmt16.py @@ -0,0 +1,349 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +ACL2016 Multimodal Machine Translation. Please see this website for more +details: http://www.statmt.org/wmt16/multimodal-task.html#task1 + +If you use the dataset created for your task, please cite the following paper: +Multi30K: Multilingual English-German Image Descriptions. + +@article{elliott-EtAl:2016:VL16, + author = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. 
and {Specia}, L.}, + title = {Multi30K: Multilingual English-German Image Descriptions}, + booktitle = {Proceedings of the 6th Workshop on Vision and Language}, + year = {2016}, + pages = {70--74}, + year = 2016 +} +""" + +import os +import tarfile +import gzip +from collections import defaultdict + +import paddle.v2.dataset.common + +__all__ = [ + "train", + "test", + "validation", + "convert", + "fetch", + "get_dict", +] + +DATA_URL = ("http://cloud.dlnel.org/filepub/" + "?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed") +DATA_MD5 = "0c38be43600334966403524a40dcd81e" + +TOTAL_EN_WORDS = 11250 +TOTAL_DE_WORDS = 19220 + +START_MARK = "" +END_MARK = "" +UNK_MARK = "" + + +def __build_dict(tar_file, dict_size, save_path, lang): + word_dict = defaultdict(int) + with tarfile.open(tar_file, mode="r") as f: + for line in f.extractfile("wmt16/train"): + line_split = line.strip().split("\t") + if len(line_split) != 2: continue + sen = line_split[0] if lang == "en" else line_split[1] + for w in sen.split(): + word_dict[w] += 1 + + with open(save_path, "w") as fout: + fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)) + for idx, word in enumerate( + sorted( + word_dict.iteritems(), key=lambda x: x[1], reverse=True)): + if idx + 3 == dict_size: break + fout.write("%s\n" % (word[0])) + + +def __load_dict(tar_file, dict_size, lang, reverse=False): + dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME, + "wmt16/%s_%d.dict" % (lang, dict_size)) + if not os.path.exists(dict_path) or ( + len(open(dict_path, "r").readlines()) != dict_size): + __build_dict(tar_file, dict_size, dict_path, lang) + + word_dict = {} + with open(dict_path, "r") as fdict: + for idx, line in enumerate(fdict): + if reverse: + word_dict[idx] = line.strip() + else: + word_dict[line.strip()] = idx + return word_dict + + +def __get_dict_size(src_dict_size, trg_dict_size, src_lang): + src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else + TOTAL_DE_WORDS)) + trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else + TOTAL_ENG_WORDS)) + return src_dict_size, trg_dict_size + + +def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang): + def reader(): + src_dict = __load_dict(tar_file, src_dict_size, src_lang) + trg_dict = __load_dict(tar_file, trg_dict_size, + ("de" if src_lang == "en" else "en")) + + # the indice for start mark, end mark, and unk are the same in source + # language and target language. Here uses the source language + # dictionary to determine their indices. + start_id = src_dict[START_MARK] + end_id = src_dict[END_MARK] + unk_id = src_dict[UNK_MARK] + + src_col = 0 if src_lang == "en" else 1 + trg_col = 1 - src_col + + with tarfile.open(tar_file, mode="r") as f: + for line in f.extractfile(file_name): + line_split = line.strip().split("\t") + if len(line_split) != 2: + continue + src_words = line_split[src_col].split() + src_ids = [start_id] + [ + src_dict.get(w, unk_id) for w in src_words + ] + [end_id] + + trg_words = line_split[trg_col].split() + trg_ids = [trg_dict.get(w, unk_id) for w in trg_words] + + trg_ids_next = trg_ids + [end_id] + trg_ids = [start_id] + trg_ids + + yield src_ids, trg_ids, trg_ids_next + + return reader + + +def train(src_dict_size, trg_dict_size, src_lang="en"): + """ + WMT16 train set reader. + + This function returns the reader for train data. Each sample the reader + returns is made up of three fields: the source language word index sequence, + target language word index sequence and next word index sequence. 
+ + + NOTE: + The original like for training data is: + http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz + + paddle.dataset.wmt16 provides a tokenized version of the original dataset by + using moses's tokenization script: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl + + Args: + src_dict_size(int): Size of the source language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + trg_dict_size(int): Size of the target language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + src_lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + + Returns: + callable: The train reader. + """ + + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. Only support: " + "en (for English); de(for Germany).") + src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, + src_lang) + + return reader_creator( + tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), + file_name="wmt16/train", + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang) + + +def test(src_dict_size, trg_dict_size, src_lang="en"): + """ + WMT16 test set reader. + + This function returns the reader for test data. Each sample the reader + returns is made up of three fields: the source language word index sequence, + target language word index sequence and next word index sequence. + + NOTE: + The original like for test data is: + http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz + + paddle.dataset.wmt16 provides a tokenized version of the original dataset by + using moses's tokenization script: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl + + Args: + src_dict_size(int): Size of the source language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + trg_dict_size(int): Size of the target language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + src_lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + + Returns: + callable: The test reader. + """ + + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. " + "Only support: en (for English); de(for Germany).") + + src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, + src_lang) + + return reader_creator( + tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), + file_name="wmt16/test", + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang) + + +def validation(src_dict_size, trg_dict_size, src_lang="en"): + """ + WMT16 validation set reader. + + This function returns the reader for validation data. Each sample the reader + returns is made up of three fields: the source language word index sequence, + target language word index sequence and next word index sequence. 
+ + NOTE: + The original like for validation data is: + http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz + + paddle.dataset.wmt16 provides a tokenized version of the original dataset by + using moses's tokenization script: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl + + Args: + src_dict_size(int): Size of the source language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + trg_dict_size(int): Size of the target language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + src_lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + + Returns: + callable: The validation reader. + """ + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. " + "Only support: en (for English); de(for Germany).") + src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, + src_lang) + + return reader_creator( + tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), + file_name="wmt16/val", + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang) + + +def get_dict(lang, dict_size, reverse=False): + """ + return the word dictionary for the specified language. + + Args: + lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + dict_size(int): Size of the specified language dictionary. + reverse(bool): If reverse is set to False, the returned python + dictionary will use word as key and use index as value. + If reverse is set to True, the returned python + dictionary will use index as key and word as value. + + Returns: + dict: The word dictionary for the specific language. + """ + + if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS) + else: dict_size = min(dict_size, TOTAL_DE_WORDS) + + dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME, + "wmt16/%s_%d.dict" % (lang, dict_size)) + assert os.path.exists(dict_path), "Word dictionary does not exist. " + "Please invoke paddle.dataset.wmt16.train/test/validation first " + "to build the dictionary." + tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz") + return __load_dict(tar_file, dict_size, lang, reverse) + + +def fetch(): + """download the entire dataset. + """ + paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz") + + +def convert(path, src_dict_size, trg_dict_size, src_lang): + """Converts dataset to recordio format. + """ + + paddle.v2.dataset.common.convert( + path, + train( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang), + 1000, + "wmt16_train") + paddle.v2.dataset.common.convert( + path, + test( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang), + 1000, + "wmt16_test") + paddle.v2.dataset.common.convert( + path, + validation( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang), + 1000, + "wmt16_validation") diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py new file mode 100644 index 0000000000000000000000000000000000000000..9235c41e9eb95b25a0dc53a494a203e7a4525981 --- /dev/null +++ b/python/paddle/v2/image.py @@ -0,0 +1,381 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file contains some common interfaces for image preprocess. +Many users are confused about the image layout. We introduce +the image layout as follows. + +- CHW Layout + + - The abbreviations: C=channel, H=Height, W=Width + - The default layout of image opened by cv2 or PIL is HWC. + PaddlePaddle only supports the CHW layout. And CHW is simply + a transpose of HWC. It must transpose the input image. + +- Color format: RGB or BGR + + OpenCV use BGR color format. PIL use RGB color format. Both + formats can be used for training. Noted that, the format should + be keep consistent between the training and inference peroid. +""" +import numpy as np +try: + import cv2 +except ImportError: + cv2 = None +import os +import tarfile +import cPickle + +__all__ = [ + "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop", + "random_crop", "left_right_flip", "simple_transform", "load_and_transform", + "batch_images_from_tar" +] + + +def batch_images_from_tar(data_file, + dataset_name, + img2label, + num_per_batch=1024): + """ + Read images from tar file and batch them into batch file. + + :param data_file: path of image tar file + :type data_file: string + :param dataset_name: 'train','test' or 'valid' + :type dataset_name: string + :param img2label: a dic with image file name as key + and image's label as value + :type img2label: dic + :param num_per_batch: image number per batch file + :type num_per_batch: int + :return: path of list file containing paths of batch file + :rtype: string + """ + batch_dir = data_file + "_batch" + out_path = "%s/%s" % (batch_dir, dataset_name) + meta_file = "%s/%s.txt" % (batch_dir, dataset_name) + + if os.path.exists(out_path): + return meta_file + else: + os.makedirs(out_path) + + tf = tarfile.open(data_file) + mems = tf.getmembers() + data = [] + labels = [] + file_id = 0 + for mem in mems: + if mem.name in img2label: + data.append(tf.extractfile(mem).read()) + labels.append(img2label[mem.name]) + if len(data) == num_per_batch: + output = {} + output['label'] = labels + output['data'] = data + cPickle.dump( + output, + open('%s/batch_%d' % (out_path, file_id), 'w'), + protocol=cPickle.HIGHEST_PROTOCOL) + file_id += 1 + data = [] + labels = [] + if len(data) > 0: + output = {} + output['label'] = labels + output['data'] = data + cPickle.dump( + output, + open('%s/batch_%d' % (out_path, file_id), 'w'), + protocol=cPickle.HIGHEST_PROTOCOL) + + with open(meta_file, 'a') as meta: + for file in os.listdir(out_path): + meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n") + return meta_file + + +def load_image_bytes(bytes, is_color=True): + """ + Load an color or gray image from bytes array. + + Example usage: + + .. code-block:: python + + with open('cat.jpg') as f: + im = load_image_bytes(f.read()) + + :param bytes: the input image bytes array. + :type bytes: str + :param is_color: If set is_color True, it will load and + return a color image. 
Otherwise, it will + load and return a gray image. + :type is_color: bool + """ + flag = 1 if is_color else 0 + file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8) + img = cv2.imdecode(file_bytes, flag) + return img + + +def load_image(file, is_color=True): + """ + Load an color or gray image from the file path. + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + + :param file: the input image path. + :type file: string + :param is_color: If set is_color True, it will load and + return a color image. Otherwise, it will + load and return a gray image. + :type is_color: bool + """ + # cv2.IMAGE_COLOR for OpenCV3 + # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version + # cv2.IMAGE_GRAYSCALE for OpenCV3 + # cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV Version + # Here, use constant 1 and 0 + # 1: COLOR, 0: GRAYSCALE + flag = 1 if is_color else 0 + im = cv2.imread(file, flag) + return im + + +def resize_short(im, size): + """ + Resize an image so that the length of shorter edge is size. + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + im = resize_short(im, 256) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the shorter edge size of image after resizing. + :type size: int + """ + h, w = im.shape[:2] + h_new, w_new = size, size + if h > w: + h_new = size * h / w + else: + w_new = size * w / h + im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + return im + + +def to_chw(im, order=(2, 0, 1)): + """ + Transpose the input image order. The image layout is HWC format + opened by cv2 or PIL. Transpose the input image to CHW layout + according the order (2,0,1). + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + im = resize_short(im, 256) + im = to_chw(im) + + :param im: the input image with HWC layout. + :type im: ndarray + :param order: the transposed order. + :type order: tuple|list + """ + assert len(im.shape) == len(order) + im = im.transpose(order) + return im + + +def center_crop(im, size, is_color=True): + """ + Crop the center of image with size. + + Example usage: + + .. code-block:: python + + im = center_crop(im, 224) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the cropping size. + :type size: int + :param is_color: whether the image is color or not. + :type is_color: bool + """ + h, w = im.shape[:2] + h_start = (h - size) / 2 + w_start = (w - size) / 2 + h_end, w_end = h_start + size, w_start + size + if is_color: + im = im[h_start:h_end, w_start:w_end, :] + else: + im = im[h_start:h_end, w_start:w_end] + return im + + +def random_crop(im, size, is_color=True): + """ + Randomly crop input image with size. + + Example usage: + + .. code-block:: python + + im = random_crop(im, 224) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the cropping size. + :type size: int + :param is_color: whether the image is color or not. + :type is_color: bool + """ + h, w = im.shape[:2] + h_start = np.random.randint(0, h - size + 1) + w_start = np.random.randint(0, w - size + 1) + h_end, w_end = h_start + size, w_start + size + if is_color: + im = im[h_start:h_end, w_start:w_end, :] + else: + im = im[h_start:h_end, w_start:w_end] + return im + + +def left_right_flip(im, is_color=True): + """ + Flip an image along the horizontal direction. + Return the flipped image. + + Example usage: + + .. 
code-block:: python + + im = left_right_flip(im) + + :param im: input image with HWC layout or HW layout for gray image + :type im: ndarray + :param is_color: whether input image is color or not + :type is_color: bool + """ + if len(im.shape) == 3 and is_color: + return im[:, ::-1, :] + else: + return im[:, ::-1] + + +def simple_transform(im, + resize_size, + crop_size, + is_train, + is_color=True, + mean=None): + """ + Simply data argumentation for training. These operations include + resizing, croping and flipping. + + Example usage: + + .. code-block:: python + + im = simple_transform(im, 256, 224, True) + + :param im: The input image with HWC layout. + :type im: ndarray + :param resize_size: The shorter edge length of the resized image. + :type resize_size: int + :param crop_size: The cropping size. + :type crop_size: int + :param is_train: Whether it is training or not. + :type is_train: bool + :param is_color: whether the image is color or not. + :type is_color: bool + :param mean: the mean values, which can be element-wise mean values or + mean values per channel. + :type mean: numpy array | list + """ + im = resize_short(im, resize_size) + if is_train: + im = random_crop(im, crop_size, is_color=is_color) + if np.random.randint(2) == 0: + im = left_right_flip(im, is_color) + else: + im = center_crop(im, crop_size, is_color) + im = center_crop(im, crop_size, is_color=is_color) + if len(im.shape) == 3: + im = to_chw(im) + + im = im.astype('float32') + if mean is not None: + mean = np.array(mean, dtype=np.float32) + # mean value, may be one value per channel + if mean.ndim == 1 and is_color: + mean = mean[:, np.newaxis, np.newaxis] + elif mean.ndim == 1: + mean = mean + else: + # elementwise mean + assert len(mean.shape) == len(im) + im -= mean + + return im + + +def load_and_transform(filename, + resize_size, + crop_size, + is_train, + is_color=True, + mean=None): + """ + Load image from the input file `filename` and transform image for + data argumentation. Please refer to the `simple_transform` interface + for the transform operations. + + Example usage: + + .. code-block:: python + + im = load_and_transform('cat.jpg', 256, 224, True) + + :param filename: The file name of input image. + :type filename: string + :param resize_size: The shorter edge length of the resized image. + :type resize_size: int + :param crop_size: The cropping size. + :type crop_size: int + :param is_train: Whether it is training or not. + :type is_train: bool + :param is_color: whether the image is color or not. + :type is_color: bool + :param mean: the mean values, which can be element-wise mean values or + mean values per channel. + :type mean: numpy array | list + """ + im = load_image(filename, is_color) + im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean) + return im diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py new file mode 100644 index 0000000000000000000000000000000000000000..317cf037c69f8639e3760fbfce20565127794fcb --- /dev/null +++ b/python/paddle/v2/minibatch.py @@ -0,0 +1,41 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['batch'] + + +def batch(reader, batch_size): + """ + Create a batched reader. + + :param reader: the data reader to read from. + :type reader: callable + :param batch_size: size of each mini-batch + :type batch_size: int + :return: the batched reader. + :rtype: callable + """ + + def batch_reader(): + r = reader() + b = [] + for instance in r: + b.append(instance) + if len(b) == batch_size: + yield b + b = [] + if b: + yield b + + return batch_reader diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3b059735a924d58714cd88a761eb83143f1192d6 --- /dev/null +++ b/python/paddle/v2/reader/__init__.py @@ -0,0 +1,74 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +At training and testing time, PaddlePaddle programs need to read data. To ease +the users' work to write data reading code, we define that + +- A *reader* is a function that reads data (from file, network, random number + generator, etc) and yields data items. +- A *reader creator* is a function that returns a reader function. +- A *reader decorator* is a function, which accepts one or more readers, and + returns a reader. +- A *batch reader* is a function that reads data (from *reader*, file, network, + random number generator, etc) and yields a batch of data items. + +##################### +Data Reader Interface +##################### + +Indeed, *data reader* doesn't have to be a function that reads and yields data +items. It can be any function with no parameter that creates a iterable +(anything can be used in :code:`for x in iterable`)\: + +.. code-block:: python + + iterable = data_reader() + +Element produced from the iterable should be a **single** entry of data, +**not** a mini batch. That entry of data could be a single item, or a tuple of +items. +Item should be of `supported type `_ (e.g., numpy 1d +array of float32, int, list of int) + +An example implementation for single item data reader creator: + +.. code-block:: python + + def reader_creator_random_image(width, height): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height) + return reader + +An example implementation for multiple item data reader creator: + +.. code-block:: python + + def reader_creator_random_image_and_label(width, height, label): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height), label + return reader + + +TODO(yuyang18): Should we add whole design doc here? 
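An example of turning a single-item reader into a *batch reader* with the
:code:`batch` decorator from :code:`paddle.v2.minibatch` (a sketch with a toy
finite reader; real readers usually come from a reader creator as above):

.. code-block:: python

    from paddle.v2.minibatch import batch

    def reader():              # toy reader: yields 10 single data items
        for i in range(10):
            yield i

    batch_reader = batch(reader, 4)
    # list(batch_reader()) == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
    # the trailing partial batch is yielded as well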
+""" + +import decorator +from decorator import * + +import creator + +__all__ = decorator.__all__ + ['creator'] diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py new file mode 100644 index 0000000000000000000000000000000000000000..fda5246d74f598200b439774a25e80ec3e504077 --- /dev/null +++ b/python/paddle/v2/reader/creator.py @@ -0,0 +1,130 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Creator package contains some simple reader creator, which could +be used in user program. +""" + +__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader'] + + +def np_array(x): + """ + Creates a reader that yields elements of x, if it is a + numpy vector. Or rows of x, if it is a numpy matrix. + Or any sub-hyperplane indexed by the highest dimension. + + :param x: the numpy array to create reader from. + :returns: data reader created from x. + """ + + def reader(): + if x.ndim < 1: + yield x + + for e in x: + yield e + + return reader + + +def text_file(path): + """ + Creates a data reader that outputs text line by line from given text file. + Trailing new line ('\\\\n') of each line will be removed. + + :path: path of the text file. + :returns: data reader of text file + """ + + def reader(): + f = open(path, "r") + for l in f: + yield l.rstrip('\n') + f.close() + + return reader + + +def recordio(paths, buf_size=100): + """ + Creates a data reader from given RecordIO file paths separated by ",", + glob pattern is supported. + :path: path of recordio files, can be a string or a string list. + :returns: data reader of recordio files. + """ + + import recordio as rec + import paddle.v2.reader.decorator as dec + import cPickle as pickle + + def reader(): + if isinstance(paths, basestring): + path = paths + else: + path = ",".join(paths) + f = rec.reader(path) + while True: + r = f.read() + if r is None: + break + yield pickle.loads(r) + f.close() + + return dec.buffered(reader, buf_size) + + +pass_num = 0 + + +def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64): + """ + Create a data reader that yield a record one by one from + the paths: + :paths: path of recordio files, can be a string or a string list. + :etcd_endpoints: the endpoints for etcd cluster + :returns: data reader of recordio files. + + .. 
code-block:: python + from paddle.v2.reader.creator import cloud_reader + etcd_endpoints = "http://127.0.0.1:2379" + trainer.train.( + reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints), + ) + """ + import os + import cPickle as pickle + import paddle.v2.master as master + c = master.client(etcd_endpoints, timeout_sec, buf_size) + + if isinstance(paths, basestring): + path = [paths] + else: + path = paths + c.set_dataset(path) + + def reader(): + global pass_num + c.paddle_start_get_records(pass_num) + pass_num += 1 + + while True: + r, e = c.next_record() + if not r: + if e != -2: + print "get record error: ", e + break + yield pickle.loads(r) + + return reader diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py new file mode 100644 index 0000000000000000000000000000000000000000..44a6e344630bb35d28ee29078bf8727053a24bef --- /dev/null +++ b/python/paddle/v2/reader/decorator.py @@ -0,0 +1,405 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [ + 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' +] + +from threading import Thread +import subprocess + +from Queue import Queue +import itertools +import random +import zlib + + +def map_readers(func, *readers): + """ + Creates a data reader that outputs return value of function using + output of each data readers as arguments. + + :param func: function to use. The type of func should be (Sample) => Sample + :type: callable + :param readers: readers whose outputs will be used as arguments of func. + :return: the created data reader. + :rtype: callable + """ + + def reader(): + rs = [] + for r in readers: + rs.append(r()) + for e in itertools.imap(func, *rs): + yield e + + return reader + + +def shuffle(reader, buf_size): + """ + Creates a data reader whose data output is shuffled. + + Output from the iterator that created by original reader will be + buffered into shuffle buffer, and then shuffled. The size of shuffle buffer + is determined by argument buf_size. + + :param reader: the original reader whose output will be shuffled. + :type reader: callable + :param buf_size: shuffle buffer size. + :type buf_size: int + + :return: the new reader whose output is shuffled. + :rtype: callable + """ + + def data_reader(): + buf = [] + for e in reader(): + buf.append(e) + if len(buf) >= buf_size: + random.shuffle(buf) + for b in buf: + yield b + buf = [] + + if len(buf) > 0: + random.shuffle(buf) + for b in buf: + yield b + + return data_reader + + +def chain(*readers): + """ + Creates a data reader whose output is the outputs of input data + readers chained together. + + If input readers output following data entries: + [0, 0, 0] + [1, 1, 1] + [2, 2, 2] + The chained reader will output: + [0, 0, 0, 1, 1, 1, 2, 2, 2] + + :param readers: input readers. + :return: the new data reader. 
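A creator and a decorator are typically combined; a minimal sketch using
:code:`text_file` and :code:`shuffle` (the file name 'train.txt' is
illustrative only):

.. code-block:: python

    import paddle.v2.reader as reader
    import paddle.v2.reader.creator as creator

    # read lines from a text file, then shuffle them within a 100-item buffer
    lines = creator.text_file('train.txt')
    shuffled = reader.shuffle(lines, buf_size=100)

    for line in shuffled():
        pass  # lines arrive in a locally shuffled order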
+ :rtype: callable + """ + + def reader(): + rs = [] + for r in readers: + rs.append(r()) + + for e in itertools.chain(*rs): + yield e + + return reader + + +class ComposeNotAligned(ValueError): + pass + + +def compose(*readers, **kwargs): + """ + Creates a data reader whose output is the combination of input readers. + + If input readers output following data entries: + (1, 2) 3 (4, 5) + The composed reader will output: + (1, 2, 3, 4, 5) + + :param readers: readers that will be composed together. + :param check_alignment: if True, will check if input readers are aligned + correctly. If False, will not check alignment and trailing outputs + will be discarded. Defaults to True. + :type check_alignment: bool + + :return: the new data reader. + + :raises ComposeNotAligned: outputs of readers are not aligned. + Will not raise when check_alignment is set to False. + """ + check_alignment = kwargs.pop('check_alignment', True) + + def make_tuple(x): + if isinstance(x, tuple): + return x + else: + return (x, ) + + def reader(): + rs = [] + for r in readers: + rs.append(r()) + if not check_alignment: + for outputs in itertools.izip(*rs): + yield sum(map(make_tuple, outputs), ()) + else: + for outputs in itertools.izip_longest(*rs): + for o in outputs: + if o is None: + # None will be not be present if compose is aligned + raise ComposeNotAligned( + "outputs of readers are not aligned.") + yield sum(map(make_tuple, outputs), ()) + + return reader + + +def buffered(reader, size): + """ + Creates a buffered data reader. + + The buffered data reader will read and save data entries into a + buffer. Reading from the buffered data reader will proceed as long + as the buffer is not empty. + + :param reader: the data reader to read from. + :type reader: callable + :param size: max buffer size. + :type size: int + + :returns: the buffered data reader. + """ + + class EndSignal(): + pass + + end = EndSignal() + + def read_worker(r, q): + for d in r: + q.put(d) + q.put(end) + + def data_reader(): + r = reader() + q = Queue(maxsize=size) + t = Thread( + target=read_worker, args=( + r, + q, )) + t.daemon = True + t.start() + e = q.get() + while e != end: + yield e + e = q.get() + + return data_reader + + +def firstn(reader, n): + """ + Limit the max number of samples that reader could return. + + :param reader: the data reader to read from. + :type reader: callable + :param n: the max number of samples that return. + :type n: int + :return: the decorated reader. + :rtype: callable + """ + + # TODO(yuyang18): Check if just drop the reader, could clean the opened + # resource or not? + + def firstn_reader(): + for i, item in enumerate(reader()): + if i == n: + break + yield item + + return firstn_reader + + +class XmapEndSignal(): + pass + + +def xmap_readers(mapper, reader, process_num, buffer_size, order=False): + """ + Use multiprocess to map samples from reader by a mapper defined by user. + And this function contains a buffered decorator. + :param mapper: a function to map sample. 
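The remaining decorators follow the same reader-in, reader-out pattern; a
short sketch of :code:`compose`, :code:`buffered` and :code:`firstn` with toy
readers (for illustration only):

.. code-block:: python

    import paddle.v2.reader as reader

    def images():              # toy single-item readers
        for i in range(8):
            yield i

    def labels():
        for i in range(8):
            yield i % 2

    # compose pairs aligned items from both readers into tuples (image, label)
    pairs = reader.compose(images, labels)
    # buffered prefetches up to 16 items in a background thread
    prefetched = reader.buffered(pairs, 16)
    # firstn caps the sample count, which is handy for quick smoke tests
    for sample in reader.firstn(prefetched, 4)():
        print(sample)          # (0, 0), (1, 1), (2, 0), (3, 1)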
+ :type mapper: callable + :param reader: the data reader to read from + :type reader: callable + :param process_num: process number to handle original sample + :type process_num: int + :param buffer_size: max buffer size + :type buffer_size: int + :param order: keep the order of reader + :type order: bool + :return: the decarated reader + :rtype: callable + """ + end = XmapEndSignal() + + # define a worker to read samples from reader to in_queue + def read_worker(reader, in_queue): + for i in reader(): + in_queue.put(i) + in_queue.put(end) + + # define a worker to read samples from reader to in_queue with order flag + def order_read_worker(reader, in_queue): + in_order = 0 + for i in reader(): + in_queue.put((in_order, i)) + in_order += 1 + in_queue.put(end) + + # define a worker to handle samples from in_queue by mapper + # and put mapped samples into out_queue + def handle_worker(in_queue, out_queue, mapper): + sample = in_queue.get() + while not isinstance(sample, XmapEndSignal): + r = mapper(sample) + out_queue.put(r) + sample = in_queue.get() + in_queue.put(end) + out_queue.put(end) + + # define a worker to handle samples from in_queue by mapper + # and put mapped samples into out_queue by order + def order_handle_worker(in_queue, out_queue, mapper, out_order): + ins = in_queue.get() + while not isinstance(ins, XmapEndSignal): + order, sample = ins + r = mapper(sample) + while order != out_order[0]: + pass + out_queue.put(r) + out_order[0] += 1 + ins = in_queue.get() + in_queue.put(end) + out_queue.put(end) + + def xreader(): + in_queue = Queue(buffer_size) + out_queue = Queue(buffer_size) + out_order = [0] + # start a read worker in a thread + target = order_read_worker if order else read_worker + t = Thread(target=target, args=(reader, in_queue)) + t.daemon = True + t.start() + # start several handle_workers + target = order_handle_worker if order else handle_worker + args = (in_queue, out_queue, mapper, out_order) if order else ( + in_queue, out_queue, mapper) + workers = [] + for i in xrange(process_num): + worker = Thread(target=target, args=args) + worker.daemon = True + workers.append(worker) + for w in workers: + w.start() + + sample = out_queue.get() + while not isinstance(sample, XmapEndSignal): + yield sample + sample = out_queue.get() + finish = 1 + while finish < process_num: + sample = out_queue.get() + if isinstance(sample, XmapEndSignal): + finish += 1 + else: + yield sample + + return xreader + + +def _buf2lines(buf, line_break="\n"): + # FIXME: line_break should be automatically configured. + lines = buf.split(line_break) + return lines[:-1], lines[-1] + + +class PipeReader: + """ + PipeReader read data by stream from a command, take it's + stdout into a pipe buffer and redirect it to the parser to + parse, then yield data as your desired format. + + You can using standard linux command or call another program + to read data, from HDFS, Ceph, URL, AWS S3 etc: + + .. code-block:: python + cmd = "hadoop fs -cat /path/to/some/file" + cmd = "cat sample_file.tar.gz" + cmd = "curl http://someurl" + cmd = "python print_s3_bucket.py" + + An example: + + .. 
code-block:: python + + def example_reader(): + for f in myfiles: + pr = PipeReader("cat %s"%f) + for l in pr.get_line(): + sample = l.split(" ") + yield sample + """ + + def __init__(self, command, bufsize=8192, file_type="plain"): + if not isinstance(command, str): + raise TypeError("left_cmd must be a string") + if file_type == "gzip": + self.dec = zlib.decompressobj( + 32 + zlib.MAX_WBITS) # offset 32 to skip the header + self.file_type = file_type + self.bufsize = bufsize + self.process = subprocess.Popen( + command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + + def get_line(self, cut_lines=True, line_break="\n"): + """ + :param cut_lines: cut buffer to lines + :type cut_lines: bool + :param line_break: line break of the file, like \n or \r + :type line_break: string + + :return: one line or a buffer of bytes + :rtype: string + """ + remained = "" + while True: + buff = self.process.stdout.read(self.bufsize) + if buff: + if self.file_type == "gzip": + decomp_buff = self.dec.decompress(buff) + elif self.file_type == "plain": + decomp_buff = buff + else: + raise TypeError("file_type %s is not allowed" % + self.file_type) + + if cut_lines: + lines, remained = _buf2lines(''.join( + [remained, decomp_buff]), line_break) + for line in lines: + yield line + else: + yield decomp_buff + else: + break diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..107d5912e1567e0c8721987a281272c7feb51e63 --- /dev/null +++ b/python/paddle/v2/reader/tests/CMakeLists.txt @@ -0,0 +1,2 @@ +py_test(creator_test SRCS creator_test.py) +py_test(decorator_test SRCS decorator_test.py) diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eca2dce114b069bf9b455d77ce670d73b5047fd2 --- /dev/null +++ b/python/paddle/v2/reader/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7fe374e663607607cd0839eb6ca9c70c4d15eef8 --- /dev/null +++ b/python/paddle/v2/reader/tests/creator_test.py @@ -0,0 +1,74 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
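PipeReader can also decompress gzip streams on the fly via
:code:`file_type="gzip"`; a minimal sketch (the command and file name are
illustrative only):

.. code-block:: python

    from paddle.v2.reader.decorator import PipeReader

    # stream a gzip-compressed text file through `cat` and decompress in-process
    pr = PipeReader("cat train_data.txt.gz", file_type="gzip")
    for line in pr.get_line():
        fields = line.split("\t")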
+ +# Copyright PaddlePaddle contributors. All Rights Reservedd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest +import numpy as np +import paddle.v2.reader.creator + + +class TestNumpyArray(unittest.TestCase): + def test_numpy_array(self): + l = [[1, 2, 3], [4, 5, 6]] + x = np.array(l, np.int32) + reader = paddle.v2.reader.creator.np_array(x) + for idx, e in enumerate(reader()): + self.assertItemsEqual(e, l[idx]) + + +class TestTextFile(unittest.TestCase): + def test_text_file(self): + path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt") + reader = paddle.v2.reader.creator.text_file(path) + for idx, e in enumerate(reader()): + self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1)) + + +class TestRecordIO(unittest.TestCase): + def do_test(self, path): + reader = paddle.v2.reader.creator.recordio(path) + idx = 0 + for e in reader(): + if idx == 0: + self.assertEqual(e, (1, 2, 3)) + elif idx == 1: + self.assertEqual(e, (4, 5, 6)) + idx += 1 + self.assertEqual(idx, 2) + + def test_recordIO(self): + self.do_test( + os.path.join( + os.path.dirname(__file__), "test_reader_recordio.dat")) + self.do_test([ + os.path.join( + os.path.dirname(__file__), "test_reader_recordio.dat") + ]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..6b680e39f3fb299a14e7d8162470996d1d16b83d --- /dev/null +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -0,0 +1,178 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import unittest + +import paddle.v2.reader + + +def reader_creator_10(dur): + def reader(): + for i in range(10): + # this invocation helps testing paddle.reader.buffer + time.sleep(dur) + yield i + + return reader + + +class TestMap(unittest.TestCase): + def test_map(self): + d = {"h": 0, "i": 1} + + def tokenize(x): + return d[x] + + def read(): + yield "h" + yield "i" + + r = paddle.v2.reader.map_readers(tokenize, read) + for i, e in enumerate(r()): + self.assertEqual(e, i) + + +class TestBuffered(unittest.TestCase): + def test_read(self): + for size in range(20): + b = paddle.v2.reader.buffered(reader_creator_10(0), size) + c = 0 + for i in b(): + self.assertEqual(i, c) + c += 1 + self.assertEqual(c, 10) + + def test_buffering(self): + # read have 30ms delay. 
+ b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10) + last_time = time.time() + for idx, i in enumerate(b()): + elapsed_time = time.time() - last_time + if i == 0: + time.sleep(0.3) + else: + # read time should be short, meaning already buffered. + self.assertLess(elapsed_time, 0.05) + last_time = time.time() + + +class TestCompose(unittest.TestCase): + def test_compse(self): + reader = paddle.v2.reader.compose( + reader_creator_10(0), reader_creator_10(0)) + for idx, e in enumerate(reader()): + self.assertEqual(e, (idx, idx)) + + def test_compose_not_aligned(self): + total = 0 + reader = paddle.v2.reader.compose( + paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)), + reader_creator_10(0)) + with self.assertRaises(paddle.v2.reader.ComposeNotAligned): + for e in reader(): + total += 1 + # expecting 10, not 20 + self.assertEqual(total, 10) + + def test_compose_not_aligned_no_check(self): + total = 0 + reader = paddle.v2.reader.compose( + paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)), + reader_creator_10(0), + check_alignment=False) + for e in reader(): + total += 1 + # expecting 10, not 20 + self.assertEqual(total, 10) + + +class TestChain(unittest.TestCase): + def test_chain(self): + c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)) + idx = 0 + for e in c(): + self.assertEqual(e, idx % 10) + idx += 1 + self.assertEqual(idx, 20) + + +class TestShuffle(unittest.TestCase): + def test_shuffle(self): + case = [(0, True), (1, True), (10, False), (100, False)] + a = reader_creator_10(0) + for size, checkEq in case: + s = paddle.v2.reader.shuffle(a, size) + total = 0 + for idx, e in enumerate(s()): + if checkEq: + self.assertEqual(idx, e) + total += 1 + self.assertEqual(total, 10) + + +class TestXmap(unittest.TestCase): + def test_xmap(self): + def mapper(x): + return (x + 1) + + orders = (True, False) + thread_nums = (1, 2, 4, 8, 16) + buffered_size = (1, 2, 4, 8, 16) + for order in orders: + for tNum in thread_nums: + for size in buffered_size: + reader = paddle.v2.reader.xmap_readers(mapper, + reader_creator_10(0), + tNum, size, order) + for n in xrange(3): + result = [] + for i in reader(): + result.append(i) + if not order: + result.sort() + for idx, e in enumerate(result): + self.assertEqual(e, mapper(idx)) + + +class TestPipeReader(unittest.TestCase): + def test_pipe_reader(self): + def example_reader(myfiles): + for f in myfiles: + pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128) + for l in pr.get_line(): + yield l + + import tempfile + + records = [str(i) for i in xrange(5)] + temp = tempfile.NamedTemporaryFile() + try: + with open(temp.name, 'w') as f: + for r in records: + f.write('%s\n' % r) + + result = [] + for r in example_reader([temp.name]): + result.append(r) + + for idx, e in enumerate(records): + self.assertEqual(e, result[idx]) + finally: + # delete the temporary file + temp.close() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2a8d47d43868d369083808497697da79e620e31 --- /dev/null +++ b/python/paddle/v2/reader/tests/test_data_creator.txt @@ -0,0 +1,3 @@ +0 1 +2 3 +4 5 diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/v2/reader/tests/test_reader_recordio.dat new file mode 100644 index 0000000000000000000000000000000000000000..a99a35bb829e066c4845d0b85b96cd1eb3a12491 Binary files 
/dev/null and b/python/paddle/v2/reader/tests/test_reader_recordio.dat differ diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat new file mode 100644 index 0000000000000000000000000000000000000000..17aa89b6796184407e83246d3f342a55a66b4a69 Binary files /dev/null and b/python/paddle/v2/reader/tests/test_recordio_creator.dat differ diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt index 46e4feb8e1ce1d12f214f5c49b1b589a46110603..b4333ed530ce464095ec38d72706949cc464fbe4 100644 --- a/python/paddle/v2/tests/CMakeLists.txt +++ b/python/paddle/v2/tests/CMakeLists.txt @@ -1,4 +1,5 @@ py_test(test_op SRCS test_op.py) +py_test(test_image SRCS test_image.py) py_test(test_layer SRCS test_layer.py) py_test(test_topology SRCS test_topology.py) py_test(test_rnn_layer SRCS test_rnn_layer.py) diff --git a/python/paddle/v2/tests/cat.jpg b/python/paddle/v2/tests/cat.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bc1fbbd371216b9904b522ed302700c79d2e4876 Binary files /dev/null and b/python/paddle/v2/tests/cat.jpg differ diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/v2/tests/test_image.py new file mode 100644 index 0000000000000000000000000000000000000000..c78bbdc40a25878b21ba7e678afedf9d8f0a87cf --- /dev/null +++ b/python/paddle/v2/tests/test_image.py @@ -0,0 +1,43 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle.v2.image as image + + +class Image(unittest.TestCase): + def test_resize_flip_chw(self): + # resize + im = image.load_image('cat.jpg') + im = image.resize_short(im, 256) + self.assertEqual(256, min(im.shape[:2])) + self.assertEqual(3, im.shape[2]) + + # flip + im = image.left_right_flip(im) + im2 = np.flip(im, 1) + self.assertEqual(im.all(), im2.all()) + + # to_chw + h, w, c = im.shape + im = image.to_chw(im) + self.assertEqual(c, im.shape[0]) + self.assertEqual(h, im.shape[1]) + self.assertEqual(w, im.shape[2]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py index 8320217da2795da756cf12a80f39279182789eef..264442be182ea69c95b39b3bdb4c389d52eff66e 100644 --- a/python/paddle/v2/tests/test_paramconf_order.py +++ b/python/paddle/v2/tests/test_paramconf_order.py @@ -27,7 +27,6 @@ # limitations under the License. 
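Side note on the flip assertion in test_image.py: :code:`im.all()` reduces the
whole array to a single boolean, so the comparison there is weaker than an
element-wise check. A stricter round-trip check would look like this sketch
(assuming 'cat.jpg' is on the current path, as in the test):

.. code-block:: python

    import numpy as np
    import paddle.v2.image as image

    im = image.load_image('cat.jpg')
    flipped = image.left_right_flip(im)
    # flipping twice must recover the original array element for element
    assert np.array_equal(image.left_right_flip(flipped), im)
    # and a single flip must match numpy's reversal along the width axis
    assert np.array_equal(flipped, im[:, ::-1, :])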
import unittest import math -import paddle.dataset as dataset import paddle.v2 as paddle @@ -41,7 +40,7 @@ def wordemb(inlayer): def train(): - word_dict = dataset.imikolov.build_dict() + word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) # Every layer takes integer value of range [0, dict_size) firstword = paddle.layer.data( diff --git a/python/setup.py.in b/python/setup.py.in index d73a3a6a1c41b87efb9600ac59983bd16547ec6a..08a448934d3248b46618acdef9e1894f94a93893 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -77,6 +77,8 @@ if '${WITH_FLUID_ONLY}'== 'OFF': 'paddle.v2', 'paddle.v2.master', 'paddle.v2.plot', + 'paddle.v2.reader', + 'paddle.v2.dataset', 'py_paddle'] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
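With :code:`paddle.v2.reader` and :code:`paddle.v2.dataset` now registered as
packages, the pieces in this change compose into a typical input pipeline.
A rough sketch, assuming the uci_housing dataset module provides a
:code:`train()` reader creator (not shown in this diff):

.. code-block:: python

    import paddle.v2.dataset.uci_housing as uci_housing
    import paddle.v2.minibatch as minibatch
    import paddle.v2.reader as reader

    # shuffle within a 500-item buffer, then group into mini-batches of 128
    train_reader = minibatch.batch(
        reader.shuffle(uci_housing.train(), buf_size=500), batch_size=128)

    for batch_id, data in enumerate(train_reader()):
        pass  # each `data` is a list of up to 128 training samples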