diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
index b619613ea7a5b6e940ec735314e8e47338b2c600..64816098a524f064ec12474a736cd4c721227a70 100644
--- a/benchmark/cluster/README.md
+++ b/benchmark/cluster/README.md
@@ -36,11 +36,41 @@
- Trainer Count: 100
- Metrics: mini-batch / sec
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+
+
+
+
+Batch Size |
+ 32 |
+64 |
+128 |
+256 |
+
+
+
+
+ PaddlePaddle Fluid |
+- |
+- |
+- |
+- |
+
+
+PaddlePaddle v2 |
+- |
+- |
+- |
+- |
+
+
+TensorFlow |
+- |
+- |
+- |
+- |
+
+
+
### Measure the Performance for Different PServer Count
@@ -48,11 +78,41 @@
- Batch Size: 64
- Metrics: mini-batch / sec
-| PServer Count | 10 | 20 | 40 | 60 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+
+
+
+
+PServer Count |
+10 |
+20 |
+40 |
+60 |
+
+
+
+
+ PaddlePaddle Fluid |
+- |
+- |
+- |
+- |
+
+
+PaddlePaddle v2 |
+- |
+- |
+- |
+- |
+
+
+TensorFlow |
+- |
+- |
+- |
+- |
+
+
+
### Measure Parallel Efficiency By Increasing Trainer Count
@@ -67,11 +127,69 @@ The parallel efficiency is:
$E = \div(S, N)$
-| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - |
+
+
+
+Trainer Counter |
+1 |
+10 |
+20 |
+30 |
+40 |
+50 |
+60 |
+70 |
+80 |
+90 |
+100 |
+
+
+
+
+ PaddlePaddle Fluid |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+
+
+PaddlePaddle v2 |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+
+
+TensorFlow |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+- |
+
+
+
+
## Reproduce the benchmark
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index cd681a1a282d9a26eac1c267bfa26967f8c3c9fd..d56a912b9b03986e32693363f82df05a34b779e9 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -16,11 +16,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Metrics: samples / sec
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
+
+
+
+Batch Size |
+ 32 |
+64 |
+128 |
+256 |
+
+
+
+
+ PaddlePaddle Fluid |
+ 15.44 |
+ 16.32 |
+ 16.74 |
+ 16.79 |
+
+
+PaddlePaddle v2 |
+ 15.97 |
+ 17.04 |
+ 17.60 |
+ 17.83 |
+
+
+TensorFlow |
+ 9.09 |
+ 9.10 |
+ 9.24 |
+ 8.66 |
+
+
+
+
### Different Batch Size
@@ -28,12 +58,40 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Trainer Count: 20
- Metrics: samples / sec
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
+
+
+
+Batch Size |
+ 32 |
+64 |
+128 |
+256 |
+
+
+
+
+ PaddlePaddle Fluid |
+ 190.20 |
+ 222.15 |
+ 247.40 |
+ 258.18 |
+
+
+PaddlePaddle v2 |
+ 170.96 |
+ 233.71 |
+ 256.14 |
+ 329.23 |
+
+
+TensorFlow |
+ - |
+ - |
+ - |
+ - |
+
+
+
### Accelerate Rate
@@ -41,11 +99,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Batch Size: 128
- Metrics: samples / sec
-| Trainer Count | 20 | 40 | 80 | 100 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
+
+
+
+Trainer Count |
+20 |
+40 |
+80 |
+100 |
+
+
+
+
+ PaddlePaddle Fluid |
+ 263.29 (78.64%) |
+ 518.80 (77.47%) |
+ 836.26 (62.44%) |
+ 1019.29 (60.89%) |
+
+
+PaddlePaddle v2 (need more tests) |
+ 326.85 (92.85%) |
+ 534.58 (75.93%) |
+ 853.30 (60.60%) |
+ 1041.99 (59.20%) |
+
+
+TensorFlow |
+ - |
+ - |
+ - |
+ - |
+
+
+
+
### Different Pserver Count
@@ -53,11 +141,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Batch Size: 128
- Metrics: samples/ sec
-| PServer Count | 3 | 6 |10 | 20 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
+
+
+
+PServer Count |
+3 |
+6 |
+10 |
+20 |
+
+
+
+
+ PaddlePaddle Fluid(should fix in next PR) |
+ 589.1 |
+ 592.6 |
+ 656.4 |
+ 655.8 |
+
+
+PaddlePaddle v2 (need more tests) |
+ 593.4 |
+ 791.3 |
+ 729.7 |
+ 821.7 |
+
+
+TensorFlow |
+ - |
+ - |
+ - |
+ - |
+
+
+
+
*The performance gap between Fuild and v2 comes from the network interference.*
diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc31d098328bc237c018ebf8f158bdab5c37bff1
--- /dev/null
+++ b/benchmark/fluid/machine_translation.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""seq2seq model for fluid."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import distutils.util
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+from paddle.fluid.executor import Executor
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ "--embedding_dim",
+ type=int,
+ default=512,
+ help="The dimension of embedding table. (default: %(default)d)")
+parser.add_argument(
+ "--encoder_size",
+ type=int,
+ default=512,
+ help="The size of encoder bi-rnn unit. (default: %(default)d)")
+parser.add_argument(
+ "--decoder_size",
+ type=int,
+ default=512,
+ help="The size of decoder rnn unit. (default: %(default)d)")
+parser.add_argument(
+ "--batch_size",
+ type=int,
+ default=16,
+ help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+ "--dict_size",
+ type=int,
+ default=30000,
+ help="The dictionary capacity. Dictionaries of source sequence and "
+ "target dictionary have same capacity. (default: %(default)d)")
+parser.add_argument(
+ "--pass_num",
+ type=int,
+ default=2,
+ help="The pass number to train. (default: %(default)d)")
+parser.add_argument(
+ "--learning_rate",
+ type=float,
+ default=0.0002,
+ help="Learning rate used to train the model. (default: %(default)f)")
+parser.add_argument(
+ "--infer_only", action='store_true', help="If set, run forward only.")
+parser.add_argument(
+ "--beam_size",
+ type=int,
+ default=3,
+ help="The width for beam searching. (default: %(default)d)")
+parser.add_argument(
+ "--use_gpu",
+ type=distutils.util.strtobool,
+ default=True,
+ help="Whether to use gpu. (default: %(default)d)")
+parser.add_argument(
+ "--max_length",
+ type=int,
+ default=250,
+ help="The maximum length of sequence when doing generation. "
+ "(default: %(default)d)")
+
+
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+ def linear(inputs):
+ return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
+ forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+ cell_t = fluid.layers.sums(input=[
+ fluid.layers.elementwise_mul(
+ x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+ x=input_gate, y=cell_tilde)
+ ])
+
+ hidden_t = fluid.layers.elementwise_mul(
+ x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+ return hidden_t, cell_t
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+ target_dict_dim, is_generating, beam_size, max_length):
+ """Construct a seq2seq network."""
+
+ def bi_lstm_encoder(input_seq, gate_size):
+ # Linear transformation part for input gate, output gate, forget gate
+ # and cell activation vectors need be done outside of dynamic_lstm.
+ # So the output size is 4 times of gate_size.
+ input_forward_proj = fluid.layers.fc(input=input_seq,
+ size=gate_size * 4,
+ act=None,
+ bias_attr=False)
+ forward, _ = fluid.layers.dynamic_lstm(
+ input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
+ input_reversed_proj = fluid.layers.fc(input=input_seq,
+ size=gate_size * 4,
+ act=None,
+ bias_attr=False)
+ reversed, _ = fluid.layers.dynamic_lstm(
+ input=input_reversed_proj,
+ size=gate_size * 4,
+ is_reverse=True,
+ use_peepholes=False)
+ return forward, reversed
+
+ src_word_idx = fluid.layers.data(
+ name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+ src_embedding = fluid.layers.embedding(
+ input=src_word_idx,
+ size=[source_dict_dim, embedding_dim],
+ dtype='float32')
+
+ src_forward, src_reversed = bi_lstm_encoder(
+ input_seq=src_embedding, gate_size=encoder_size)
+
+ encoded_vector = fluid.layers.concat(
+ input=[src_forward, src_reversed], axis=1)
+
+ encoded_proj = fluid.layers.fc(input=encoded_vector,
+ size=decoder_size,
+ bias_attr=False)
+
+ backward_first = fluid.layers.sequence_pool(
+ input=src_reversed, pool_type='first')
+
+ decoder_boot = fluid.layers.fc(input=backward_first,
+ size=decoder_size,
+ bias_attr=False,
+ act='tanh')
+
+ def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
+ decoder_boot, decoder_size):
+ def simple_attention(encoder_vec, encoder_proj, decoder_state):
+ decoder_state_proj = fluid.layers.fc(input=decoder_state,
+ size=decoder_size,
+ bias_attr=False)
+ decoder_state_expand = fluid.layers.sequence_expand(
+ x=decoder_state_proj, y=encoder_proj)
+ concated = fluid.layers.concat(
+ input=[encoder_proj, decoder_state_expand], axis=1)
+ attention_weights = fluid.layers.fc(input=concated,
+ size=1,
+ act='tanh',
+ bias_attr=False)
+ attention_weights = fluid.layers.sequence_softmax(
+ input=attention_weights)
+ weigths_reshape = fluid.layers.reshape(
+ x=attention_weights, shape=[-1])
+ scaled = fluid.layers.elementwise_mul(
+ x=encoder_vec, y=weigths_reshape, axis=0)
+ context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
+ return context
+
+ rnn = fluid.layers.DynamicRNN()
+
+ cell_init = fluid.layers.fill_constant_batch_size_like(
+ input=decoder_boot,
+ value=0.0,
+ shape=[-1, decoder_size],
+ dtype='float32')
+ cell_init.stop_gradient = False
+
+ with rnn.block():
+ current_word = rnn.step_input(target_embedding)
+ encoder_vec = rnn.static_input(encoder_vec)
+ encoder_proj = rnn.static_input(encoder_proj)
+ hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+ cell_mem = rnn.memory(init=cell_init)
+ context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
+ decoder_inputs = fluid.layers.concat(
+ input=[context, current_word], axis=1)
+ h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+ rnn.update_memory(hidden_mem, h)
+ rnn.update_memory(cell_mem, c)
+ out = fluid.layers.fc(input=h,
+ size=target_dict_dim,
+ bias_attr=True,
+ act='softmax')
+ rnn.output(out)
+ return rnn()
+
+ if not is_generating:
+ trg_word_idx = fluid.layers.data(
+ name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+ trg_embedding = fluid.layers.embedding(
+ input=trg_word_idx,
+ size=[target_dict_dim, embedding_dim],
+ dtype='float32')
+
+ prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
+ encoded_proj, decoder_boot,
+ decoder_size)
+ label = fluid.layers.data(
+ name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+ cost = fluid.layers.cross_entropy(input=prediction, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
+
+ return avg_cost, feeding_list
+
+
+def to_lodtensor(data, place):
+ seq_lens = [len(seq) for seq in data]
+ cur_len = 0
+ lod = [cur_len]
+ for l in seq_lens:
+ cur_len += l
+ lod.append(cur_len)
+ flattened_data = np.concatenate(data, axis=0).astype("int64")
+ flattened_data = flattened_data.reshape([len(flattened_data), 1])
+ lod_t = core.LoDTensor()
+ lod_t.set(flattened_data, place)
+ lod_t.set_lod([lod])
+ return lod_t, lod[-1]
+
+
+def lodtensor_to_ndarray(lod_tensor):
+ dims = lod_tensor.get_dims()
+ ndarray = np.zeros(shape=dims).astype('float32')
+ for i in xrange(np.product(dims)):
+ ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+ return ndarray
+
+
+def train():
+ avg_cost, feeding_list = seq_to_seq_net(
+ args.embedding_dim,
+ args.encoder_size,
+ args.decoder_size,
+ args.dict_size,
+ args.dict_size,
+ False,
+ beam_size=args.beam_size,
+ max_length=args.max_length)
+
+ # clone from default main program
+ inference_program = fluid.default_main_program().clone()
+
+ optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+ optimizer.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ train_batch_generator = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+ batch_size=args.batch_size)
+
+ test_batch_generator = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+ batch_size=args.batch_size)
+
+ place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+ exe = Executor(place)
+ exe.run(framework.default_startup_program())
+
+ def do_validation():
+ total_loss = 0.0
+ count = 0
+ for batch_id, data in enumerate(test_batch_generator()):
+ src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0]
+ trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0]
+ lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0]
+
+ fetch_outs = exe.run(inference_program,
+ feed={
+ feeding_list[0]: src_seq,
+ feeding_list[1]: trg_seq,
+ feeding_list[2]: lbl_seq
+ },
+ fetch_list=[avg_cost],
+ return_numpy=False)
+
+ total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
+ count += 1
+
+ return total_loss / count
+
+ for pass_id in xrange(args.pass_num):
+ pass_start_time = time.time()
+ words_seen = 0
+ for batch_id, data in enumerate(train_batch_generator()):
+ src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
+ words_seen += word_num
+ trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
+ words_seen += word_num
+ lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
+
+ fetch_outs = exe.run(framework.default_main_program(),
+ feed={
+ feeding_list[0]: src_seq,
+ feeding_list[1]: trg_seq,
+ feeding_list[2]: lbl_seq
+ },
+ fetch_list=[avg_cost])
+
+ avg_cost_val = np.array(fetch_outs[0])
+ print('pass_id=%d, batch_id=%d, train_loss: %f' %
+ (pass_id, batch_id, avg_cost_val))
+
+ pass_end_time = time.time()
+ test_loss = do_validation()
+ time_consumed = pass_end_time - pass_start_time
+ words_per_sec = words_seen / time_consumed
+ print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
+ (pass_id, test_loss, words_per_sec, time_consumed))
+
+
+def infer():
+ pass
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ if args.infer_only:
+ infer()
+ else:
+ train()
diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f7afaeb11447d936b65a1d83701b0176ecbc111
--- /dev/null
+++ b/benchmark/fluid/mnist.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 1
+DTYPE = "float32"
+
+# random seed must set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("mnist model benchmark.")
+ parser.add_argument(
+ '--batch_size', type=int, default=128, help='The minibatch size.')
+ parser.add_argument(
+ '--iterations', type=int, default=35, help='The number of minibatches.')
+ parser.add_argument(
+ '--pass_num', type=int, default=5, help='The number of passes.')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ parser.add_argument(
+ '--infer_only', action='store_true', help='If set, run forward only.')
+ parser.add_argument(
+ '--use_cprof', action='store_true', help='If set, use cProfile.')
+ parser.add_argument(
+ '--use_nvprof',
+ action='store_true',
+ help='If set, use nvprof for CUDA.')
+ args = parser.parse_args()
+ return args
+
+
+def print_arguments(args):
+ vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+ vars(args)['device'] == 'GPU')
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+def cnn_model(data):
+ conv_pool_1 = fluid.nets.simple_img_conv_pool(
+ input=data,
+ filter_size=5,
+ num_filters=20,
+ pool_size=2,
+ pool_stride=2,
+ act="relu")
+ conv_pool_2 = fluid.nets.simple_img_conv_pool(
+ input=conv_pool_1,
+ filter_size=5,
+ num_filters=50,
+ pool_size=2,
+ pool_stride=2,
+ act="relu")
+
+ # TODO(dzhwinter) : refine the initializer and random seed settting
+ SIZE = 10
+ input_shape = conv_pool_2.shape
+ param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+ scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+ predict = fluid.layers.fc(
+ input=conv_pool_2,
+ size=SIZE,
+ act="softmax",
+ param_attr=fluid.param_attr.ParamAttr(
+ initializer=fluid.initializer.NormalInitializer(
+ loc=0.0, scale=scale)))
+ return predict
+
+
+def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
+ test_reader = paddle.batch(
+ paddle.dataset.mnist.test(), batch_size=args.batch_size)
+ test_pass_acc = fluid.average.WeightedAverage()
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
+ data)).astype(DTYPE)
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([len(y_data), 1])
+
+ acc, weight = exe.run(inference_program,
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[batch_acc, batch_size_tensor])
+ test_pass_acc.add(value=acc, weight=weight)
+ pass_acc = test_pass_acc.eval()
+ return pass_acc
+
+
+def run_benchmark(model, args):
+ if args.use_cprof:
+ pr = cProfile.Profile()
+ pr.enable()
+ start_time = time.time()
+ # Input data
+ images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ # Train program
+ predict = model(images)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=predict, label=label, total=batch_size_tensor)
+
+ # inference program
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ # Optimization
+ opt = fluid.optimizer.AdamOptimizer(
+ learning_rate=0.001, beta1=0.9, beta2=0.999)
+ opt.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ # Initialize executor
+ place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+ exe = fluid.Executor(place)
+
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # Reader
+ train_reader = paddle.batch(
+ paddle.dataset.mnist.train(), batch_size=args.batch_size)
+
+ accuracy = fluid.average.WeightedAverage()
+ for pass_id in range(args.pass_num):
+ accuracy.reset()
+ pass_start = time.time()
+ for batch_id, data in enumerate(train_reader()):
+ img_data = np.array(
+ map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([len(y_data), 1])
+
+ start = time.time()
+ outs = exe.run(
+ fluid.default_main_program(),
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[avg_cost, batch_acc, batch_size_tensor]
+ ) # The accuracy is the accumulation of batches, but not the current batch.
+ accuracy.add(value=outs[1], weight=outs[2])
+ end = time.time()
+ loss = np.array(outs[0])
+ acc = np.array(outs[1])
+ print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
+ (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
+
+ pass_end = time.time()
+
+ train_avg_acc = accuracy.eval()
+ test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+ inference_program)
+
+ print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
+ (pass_id, train_avg_acc, test_avg_acc,
+ (pass_end - pass_start) / 1000))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ print_arguments(args)
+ if args.use_nvprof and args.device == 'GPU':
+ with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+ run_benchmark(cnn_model, args)
+ else:
+ run_benchmark(cnn_model, args)
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0f1db979fa7fb640679beacafd66dfbe1f62ab8
--- /dev/null
+++ b/benchmark/fluid/resnet.py
@@ -0,0 +1,323 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import functools
+import numpy as np
+import time
+
+import cProfile, pstats, StringIO
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+
+
+def parse_args():
+ parser = argparse.ArgumentParser('Convolution model benchmark.')
+ parser.add_argument(
+ '--model',
+ type=str,
+ choices=['resnet_imagenet', 'resnet_cifar10'],
+ default='resnet_imagenet',
+ help='The model architecture.')
+ parser.add_argument(
+ '--batch_size', type=int, default=32, help='The minibatch size.')
+ parser.add_argument(
+ '--use_fake_data',
+ action='store_true',
+ help='use real data or fake data')
+ parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+ help='The first num of minibatch num to skip, for better performance test'
+ )
+ parser.add_argument(
+ '--iterations', type=int, default=80, help='The number of minibatches.')
+ parser.add_argument(
+ '--pass_num', type=int, default=100, help='The number of passes.')
+ parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NCHW',
+ choices=['NCHW', 'NHWC'],
+ help='The data data_format, now only support NCHW.')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ parser.add_argument(
+ '--data_set',
+ type=str,
+ default='flowers',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+ parser.add_argument(
+ '--infer_only', action='store_true', help='If set, run forward only.')
+ parser.add_argument(
+ '--use_cprof', action='store_true', help='If set, use cProfile.')
+ parser.add_argument(
+ '--use_nvprof',
+ action='store_true',
+ help='If set, use nvprof for CUDA.')
+ parser.add_argument(
+ '--with_test',
+ action='store_true',
+ help='If set, test the testset during training.')
+ args = parser.parse_args()
+ return args
+
+
+def print_arguments(args):
+ vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+ vars(args)['device'] == 'GPU')
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+ conv1 = fluid.layers.conv2d(
+ input=input,
+ filter_size=filter_size,
+ num_filters=ch_out,
+ stride=stride,
+ padding=padding,
+ act=None,
+ bias_attr=False)
+ return fluid.layers.batch_norm(input=conv1, act=act)
+
+
+def shortcut(input, ch_out, stride):
+ ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1]
+ if ch_in != ch_out:
+ return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+ else:
+ return input
+
+
+def basicblock(input, ch_out, stride):
+ short = shortcut(input, ch_out, stride)
+ conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
+ conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+ return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def bottleneck(input, ch_out, stride):
+ short = shortcut(input, ch_out * 4, stride)
+ conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
+ conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
+ conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+ return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+ res_out = block_func(input, ch_out, stride)
+ for i in range(1, count):
+ res_out = block_func(res_out, ch_out, 1)
+ return res_out
+
+
+def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+
+ cfg = {
+ 18: ([2, 2, 2, 1], basicblock),
+ 34: ([3, 4, 6, 3], basicblock),
+ 50: ([3, 4, 6, 3], bottleneck),
+ 101: ([3, 4, 23, 3], bottleneck),
+ 152: ([3, 8, 36, 3], bottleneck)
+ }
+ stages, block_func = cfg[depth]
+ conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+ pool1 = fluid.layers.pool2d(
+ input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+ res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+ res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+ res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+ res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+ pool2 = fluid.layers.pool2d(
+ input=res4,
+ pool_size=7,
+ pool_type='avg',
+ pool_stride=1,
+ global_pooling=True)
+ out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+ return out
+
+
+def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
+ assert (depth - 2) % 6 == 0
+
+ n = (depth - 2) // 6
+
+ conv1 = conv_bn_layer(
+ input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+ res1 = layer_warp(basicblock, conv1, 16, n, 1)
+ res2 = layer_warp(basicblock, res1, 32, n, 2)
+ res3 = layer_warp(basicblock, res2, 64, n, 2)
+ pool = fluid.layers.pool2d(
+ input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+ out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
+ return out
+
+
+def run_benchmark(model, args):
+ if args.use_cprof:
+ pr = cProfile.Profile()
+ pr.enable()
+
+ if args.data_set == "cifar10":
+ class_dim = 10
+ if args.data_format == 'NCHW':
+ dshape = [3, 32, 32]
+ else:
+ dshape = [32, 32, 3]
+ else:
+ class_dim = 102
+ if args.data_format == 'NCHW':
+ dshape = [3, 224, 224]
+ else:
+ dshape = [224, 224, 3]
+
+ input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ predict = model(input, class_dim)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=predict, label=label, total=batch_size_tensor)
+
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+ opts = optimizer.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+
+ def test(exe):
+ test_accuracy = fluid.average.WeightedAverage()
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape(dshape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ acc, weight = exe.run(inference_program,
+ feed={"data": img_data,
+ "label": y_data},
+ fetch_list=[batch_acc, batch_size_tensor])
+ test_accuracy.add(value=acc, weight=weight)
+
+ return test_accuracy.eval()
+
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ accuracy = fluid.average.WeightedAverage()
+ if args.use_fake_data:
+ data = train_reader().next()
+ image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
+ 'float32')
+ label = np.array(map(lambda x: x[1], data)).astype('int64')
+ label = label.reshape([-1, 1])
+
+ iters, num_samples, start_time = 0, 0, time.time()
+ for pass_id in range(args.pass_num):
+ accuracy.reset()
+ train_accs = []
+ train_losses = []
+ for batch_id, data in enumerate(train_reader()):
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if iters == args.iterations:
+ break
+ if not args.use_fake_data:
+ image = np.array(map(lambda x: x[0].reshape(dshape),
+ data)).astype('float32')
+ label = np.array(map(lambda x: x[1], data)).astype('int64')
+ label = label.reshape([-1, 1])
+ loss, acc, weight = exe.run(
+ fluid.default_main_program(),
+ feed={'data': image,
+ 'label': label},
+ fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+ iters += 1
+ num_samples += label[0]
+ accuracy.add(value=acc, weight=weight)
+ train_losses.append(loss)
+ train_accs.append(acc)
+ print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+ (pass_id, iters, loss, acc))
+ pass_train_acc = accuracy.eval()
+ # evaluation
+ if args.with_test:
+ pass_test_acc = test(exe)
+ train_elapsed = time.time() - start_time
+ print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+ (pass_id, np.mean(train_losses), np.mean(train_accs)))
+
+ examples_per_sec = num_samples / train_elapsed
+
+ print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+ (num_samples, train_elapsed, examples_per_sec))
+
+ if args.use_cprof:
+ pr.disable()
+ s = StringIO.StringIO()
+ sortby = 'cumulative'
+ ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
+ ps.print_stats()
+ print(s.getvalue())
+
+
+if __name__ == '__main__':
+ model_map = {
+ 'resnet_imagenet': resnet_imagenet,
+ 'resnet_cifar10': resnet_cifar10
+ }
+ args = parse_args()
+ print_arguments(args)
+ if args.data_format == 'NHWC':
+ raise ValueError('Only support NCHW data_format now.')
+ if args.use_nvprof and args.device == 'GPU':
+ with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+ run_benchmark(model_map[args.model], args)
+ else:
+ run_benchmark(model_map[args.model], args)
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..663e2efd5392a6cd1a71f51fa0d017070b489341
--- /dev/null
+++ b/benchmark/fluid/run.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# This script benchmarking the PaddlePaddle Fluid on
+# single thread single GPU.
+export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
+
+# disable openmp and mkl parallel
+#https://github.com/PaddlePaddle/Paddle/issues/7199
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+if [ $ht -eq 1 ]; then # HT is OFF
+ if [ -z "$KMP_AFFINITY" ]; then
+ export KMP_AFFINITY="granularity=fine,compact,0,0"
+ fi
+ if [ -z "$OMP_DYNAMIC" ]; then
+ export OMP_DYNAMIC="FALSE"
+ fi
+else # HT is ON
+ if [ -z "$KMP_AFFINITY" ]; then
+ export KMP_AFFINITY="granularity=fine,compact,1,0"
+ fi
+fi
+# disable multi-gpu if have more than one
+export CUDA_VISIBLE_DEVICES=0
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
+
+
+# vgg16
+# cifar10 gpu cifar10 128
+FLAGS_benchmark=true python fluid/vgg.py \
+ --device=GPU \
+ --batch_size=128 \
+ --skip_batch_num=5 \
+ --iterations=30 \
+ 2>&1 > vgg16_gpu_128.log
+
+# resnet50
+# resnet50 gpu cifar10 128
+FLAGS_benchmark=true python fluid/resnet.py \
+ --device=GPU \
+ --batch_size=128 \
+ --data_set=cifar10 \
+ --model=resnet_cifar10 \
+ --skip_batch_num=5 \
+ --iterations=30 \
+ 2>&1 > resnet50_gpu_128.log
+
+# lstm
diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e063549e0239abf9d946ed8735f0306203509d0
--- /dev/null
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import cPickle
+import os
+import random
+import time
+
+import numpy
+import paddle.v2 as paddle
+import paddle.v2.dataset.imdb as imdb
+import paddle.fluid as fluid
+from paddle.v2 import batch
+import paddle.fluid.profiler as profiler
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ default=32,
+ help='The sequence number of a batch data. (default: %(default)d)')
+ parser.add_argument(
+ '--emb_dim',
+ type=int,
+ default=512,
+ help='Dimension of embedding table. (default: %(default)d)')
+ parser.add_argument(
+ '--hidden_dim',
+ type=int,
+ default=512,
+ help='Hidden size of lstm unit. (default: %(default)d)')
+ parser.add_argument(
+ '--pass_num',
+ type=int,
+ default=100,
+ help='Epoch number to train. (default: %(default)d)')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='CPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ parser.add_argument(
+ '--crop_size',
+ type=int,
+ default=int(os.environ.get('CROP_SIZE', '1500')),
+ help='The max sentence length of input. Since this model use plain RNN,'
+ ' Gradient could be explored if sentence is too long')
+ args = parser.parse_args()
+ return args
+
+
+word_dict = imdb.word_dict()
+
+
+def crop_sentence(reader, crop_size):
+ unk_value = word_dict['']
+
+ def __impl__():
+ for item in reader():
+ if len([x for x in item[0] if x != unk_value]) < crop_size:
+ yield item
+
+ return __impl__
+
+
+def main():
+ args = parse_args()
+ lstm_size = args.hidden_dim
+
+ data = fluid.layers.data(
+ name="words", shape=[1], lod_level=1, dtype='int64')
+ sentence = fluid.layers.embedding(
+ input=data, size=[len(word_dict), args.emb_dim])
+
+ sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
+
+ rnn = fluid.layers.DynamicRNN()
+ with rnn.block():
+ word = rnn.step_input(sentence)
+ prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+ prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+ def gate_common(
+ ipt,
+ hidden,
+ size, ):
+ gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+ gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+ gate = fluid.layers.sums(input=[gate0, gate1])
+ return gate
+
+ forget_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ input_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ output_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ cell_gate = fluid.layers.tanh(
+ x=gate_common(word, prev_hidden, lstm_size))
+
+ cell = fluid.layers.sums(input=[
+ fluid.layers.elementwise_mul(
+ x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
+ x=input_gate, y=cell_gate)
+ ])
+
+ hidden = fluid.layers.elementwise_mul(
+ x=output_gate, y=fluid.layers.tanh(x=cell))
+
+ rnn.update_memory(prev_cell, cell)
+ rnn.update_memory(prev_hidden, hidden)
+ rnn.output(hidden)
+
+ last = fluid.layers.sequence_pool(rnn(), 'last')
+ logit = fluid.layers.fc(input=last, size=2, act='softmax')
+ loss = fluid.layers.cross_entropy(
+ input=logit,
+ label=fluid.layers.data(
+ name='label', shape=[1], dtype='int64'))
+ loss = fluid.layers.mean(x=loss)
+
+ # add acc
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+ shape=[1], dtype='int64'), total=batch_size_tensor)
+
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ adam = fluid.optimizer.Adam()
+ adam.minimize(loss)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
+ def train_loop(pass_num, crop_size):
+ with profiler.profiler(args.device, 'total') as prof:
+ for pass_id in range(pass_num):
+ train_reader = batch(
+ paddle.reader.shuffle(
+ crop_sentence(imdb.train(word_dict), crop_size),
+ buf_size=25000),
+ batch_size=args.batch_size)
+ word_nums = 0
+ pass_start_time = time.time()
+ for batch_id, data in enumerate(train_reader()):
+ tensor_words = to_lodtensor([x[0] for x in data], place)
+ for x in data:
+ word_nums += len(x[0])
+ label = numpy.array([x[1] for x in data]).astype("int64")
+ label = label.reshape((-1, 1))
+ loss_np, acc, weight = exe.run(
+ fluid.default_main_program(),
+ feed={"words": tensor_words,
+ "label": label},
+ fetch_list=[loss, batch_acc, batch_size_tensor])
+ print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
+ (pass_id, batch_id, loss_np, acc))
+
+ pass_end_time = time.time()
+ time_consumed = pass_end_time - pass_start_time
+ words_per_sec = word_nums / time_consumed
+ print("pass_id=%d, sec/pass: %f, words/s: %f" %
+ (pass_id, time_consumed, words_per_sec))
+
+ train_loop(args.pass_num, args.crop_size)
+
+
+def to_lodtensor(data, place):
+ seq_lens = [len(seq) for seq in data]
+ cur_len = 0
+ lod = [cur_len]
+ for l in seq_lens:
+ cur_len += l
+ lod.append(cur_len)
+ flattened_data = numpy.concatenate(data, axis=0).astype("int64")
+ flattened_data = flattened_data.reshape([len(flattened_data), 1])
+ res = fluid.LoDTensor()
+ res.set(flattened_data, place)
+ res.set_lod([lod])
+ return res
+
+
+if __name__ == '__main__':
+ main()
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bf78e4cf08d43127a05c740fa30ca6d2bc416b0
--- /dev/null
+++ b/benchmark/fluid/vgg.py
@@ -0,0 +1,220 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+ help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+ '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+ '--learning_rate',
+ type=float,
+ default=1e-3,
+ help="Learning rate for training.")
+parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help="The device type.")
+parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NCHW',
+ choices=['NCHW', 'NHWC'],
+ help='The data order, now only support NCHW.')
+parser.add_argument(
+ '--data_set',
+ type=str,
+ default='cifar10',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+parser.add_argument(
+ '--with_test',
+ action='store_true',
+ help='If set, test the testset during training.')
+args = parser.parse_args()
+
+
+def vgg16_bn_drop(input):
+ def conv_block(input, num_filter, groups, dropouts):
+ return fluid.nets.img_conv_group(
+ input=input,
+ pool_size=2,
+ pool_stride=2,
+ conv_num_filter=[num_filter] * groups,
+ conv_filter_size=3,
+ conv_act='relu',
+ conv_with_batchnorm=True,
+ conv_batchnorm_drop_rate=dropouts,
+ pool_type='max')
+
+ conv1 = conv_block(input, 64, 2, [0.3, 0])
+ conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+ conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+ conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+ conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+ drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+ fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+ bn = fluid.layers.batch_norm(input=fc1, act='relu')
+ drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+ fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+ return fc2
+
+
+def main():
+ if args.data_set == "cifar10":
+ classdim = 10
+ if args.data_format == 'NCHW':
+ data_shape = [3, 32, 32]
+ else:
+ data_shape = [32, 32, 3]
+ else:
+ classdim = 102
+ if args.data_format == 'NCHW':
+ data_shape = [3, 224, 224]
+ else:
+ data_shape = [224, 224, 3]
+
+ # Input data
+ images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ # Train program
+ net = vgg16_bn_drop(images)
+ predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=predict, label=label, total=batch_size_tensor)
+
+ # inference program
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ # Optimization
+ optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+ opts = optimizer.minimize(avg_cost)
+
+ fluid.memory_optimize(fluid.default_main_program())
+
+ # Initialize executor
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ exe = fluid.Executor(place)
+
+ # Parameter initialization
+ exe.run(fluid.default_startup_program())
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+
+ # test
+ def test(exe):
+ test_accuracy = fluid.average.WeightedAverage()
+ for batch_id, data in enumerate(test_reader()):
+ img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ acc, weight = exe.run(inference_program,
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[batch_acc, batch_size_tensor])
+ test_accuracy.add(value=acc, weight=weight)
+ return test_accuracy.eval()
+
+ iters, num_samples, start_time = 0, 0, time.time()
+ accuracy = fluid.average.WeightedAverage()
+ for pass_id in range(args.pass_num):
+ accuracy.reset()
+ train_accs = []
+ train_losses = []
+ for batch_id, data in enumerate(train_reader()):
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if iters == args.iterations:
+ break
+ img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+ data)).astype("float32")
+ y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+ y_data = y_data.reshape([-1, 1])
+
+ loss, acc, weight = exe.run(
+ fluid.default_main_program(),
+ feed={"pixel": img_data,
+ "label": y_data},
+ fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+ accuracy.add(value=acc, weight=weight)
+ iters += 1
+ num_samples += len(data)
+ print(
+ "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+ (pass_id, iters, loss, acc)
+ ) # The accuracy is the accumulation of batches, but not the current batch.
+
+ pass_train_acc = accuracy.eval()
+ train_losses.append(loss)
+ train_accs.append(acc)
+ # evaluation
+ if args.with_test:
+ pass_test_acc = test(exe)
+ train_elapsed = time.time() - start_time
+ print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+ (pass_id, np.mean(train_losses), np.mean(train_accs)))
+
+
+def print_arguments():
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+if __name__ == "__main__":
+ print_arguments()
+ main()
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index ae35d8c53476b34cb18331364267dd7c8b94dd64..22e6fb13d7320986a60bc1ef5530187e0970c767 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -494,6 +494,12 @@ reshape
.. autofunction:: paddle.fluid.layers.reshape
:noindex:
+pad
+---
+
+.. autofunction:: paddle.fluid.layers.pad
+ :noindex:
+
scale
-----
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
index 2c4edee9fe31d502ea62b9fe5c8757c0a4c5e79f..53d601d3a9a37e8adad519833bb6fa2dc48023a0 100644
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -7,7 +7,7 @@ Polyak and Juditsky (1992) showed that the test performance of simple average of
Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for
. The averaging is done as follows:
-
+![](./images/asgd.gif)
We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md
index ed3f5aab2882c16ca6ac1446b4c4d4d27a373af7..8ded0ad22f4013a521bf3bee260565dc5cf855ae 100644
--- a/doc/fluid/design/concepts/README.md
+++ b/doc/fluid/design/concepts/README.md
@@ -6,11 +6,33 @@ Here are some initial thoughts. Your comments are welcome!
I think we need only the following few CMake functions to make a project description mean and clean:
-| C++ | CUDA C++ | Go |
-|---|---|---|
-| cc_library | nv_library | go_library |
-| cc_binary | nv_binary | go_binary |
-| cc_test | nv_test | go_test |
+
+
+
+C++ |
+CUDA C++ |
+Go |
+
+
+
+
+cc_library |
+nv_library |
+go_library |
+
+
+cc_binary |
+nv_binary |
+go_binary |
+
+
+ cc_test |
+ nv_test |
+ go_test |
+
+
+
+
- The `_library` functions generate .a files from source code.
- The `_binary` functions generate executable binary files.
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md
index 907a2def557fd472ac4d679c73447bd9107d1190..3b626bd89cd83a9428997abccfeeebbbbdbb3d38 100644
--- a/doc/fluid/design/concepts/block.md
+++ b/doc/fluid/design/concepts/block.md
@@ -14,11 +14,29 @@ In programming languages, a block is a pair of curly braces that includes local
Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
-| programming languages | PaddlePaddle |
-|-----------------------|-----------------------|
-| for, while loop | RNN, WhileOp |
-| if, if-else, switch | IfElseOp, SwitchOp |
-| sequential execution | a sequence of layers |
+
+
+
+programming languages |
+PaddlePaddle |
+
+
+
+
+for, while loop |
+RNN, WhileOp |
+
+
+if, if-else, switch |
+IfElseOp, SwitchOp |
+
+
+sequential execution |
+a sequence of layers |
+
+
+
+
A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
@@ -26,12 +44,33 @@ A key difference is that a C++ program describes a one pass computation, whereas
The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
-| programming languages | PaddlePaddle |
-|-----------------------|---------------------------------|
-| stack | scope hierarchy |
-| stack frame | scope |
-| push at entering block| push at entering block |
-| pop at leaving block | destroy when minibatch completes|
+
+
+
+programming languages |
+PaddlePaddle |
+
+
+
+
+stack |
+scope hierarchy |
+
+
+stack frame |
+scope |
+
+
+push at entering block |
+push at entering block |
+
+
+pop at leaving block |
+destroy when minibatch completes |
+
+
+
+
1. In traditional programs:
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
index 984b59f4c6971dfb6f46dfe342f2751f392c0e88..30bc488a18a28d349645d9d2502aae6691a69931 100644
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -86,12 +86,40 @@ def layer.fc(X):
We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in above illustrative example:
-
-| C++ functions/functors | mul | add | | |
-|------------------------|--------------|--------------|-------------|----------|
-| C++ operator class | mulOp | addOp | FCOp | |
-| Python binding | operator.mul | operator.add | operator.fc | |
-| Python function | | | | layer.fc |
+
+
+
+C++ functions/functors |
+mul |
+add |
+ |
+ |
+
+
+
+
+C++ operator class |
+mulOp |
+addOp |
+FCOp |
+ |
+
+
+Python binding |
+operator.mul |
+ operator.add |
+operator.fc |
+ |
+
+
+Python function |
+ |
+ |
+ |
+layer.fc |
+
+
+
This is how we differentiate layer and operators in PaddlePaddle:
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
index 10a8a7867fbf072f585fe3bfb1243e4e6bef4ec8..a88292e7888d0ebc64ee89ca315dfea38a12c71d 100644
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -2,12 +2,38 @@
Like other deep learning systems, PaddlePaddle supports training models from sequence data. Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor. What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros.
-| | TensorFlow | PaddlePaddle |
-|-----------------------|------------|--------------|
-| RNN | Support | Support |
-| recursive RNN | Support | Support |
-| padding zeros | Must | No need |
-| blob data type | Tensor | LoDTensor |
+
+
+
+ |
+TensorFlow |
+PaddlePaddle |
+
+
+
+
+RNN |
+Support |
+Support |
+
+
+recursive RNN |
+Support |
+Support |
+
+
+padding zeros |
+ Must |
+No need |
+
+
+ blob data type |
+ Tensor |
+ LoDTensor |
+
+
+
+
PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators. The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences. This document presents the design of LoD and LoDTensor.
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
index fcba08c07f40177d54a91048cb616198402a9d5d..6750323c0167bf1efbde6ef4fd670e88a5aa502a 100644
--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -10,10 +10,27 @@ PaddlePaddle uses proto message to describe compile time program because :
The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`) and `Operations`. The concept to represent them is in the table below.
-| |compile time|runtime|
-|---|---|---|
-|Data|VarDesc(proto)|Variable(cpp)|
-|Operation|OpDesc(proto)|Operator(cpp)|
+
+
+
+ |
+compile time |
+runtime |
+
+
+
+
+Data |
+VarDesc(proto) |
+Variable(cpp) |
+
+
+Operation |
+OpDesc(proto) |
+Operator(cpp) |
+
+
+
## Definition of VarType
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
index f022e67fd3a048cd7e53c91d9a1fd0506487b665..64602166065af28309d7a01fdeb7076a9b0a081a 100644
--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -10,12 +10,38 @@ The answer relies on the fact that a `ProgramDesc` is similar to an abstract syn
The following table compares concepts in Fluid and Go
-| Go | Fluid |
-|----|-------|
-|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
-| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
-| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
-| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
+
+
## An Example Concurrent Program
@@ -77,11 +103,11 @@ message ProgramDesc {
read(output = X)
kube_get_workers_addrs(output = L)
Y = tensor_array(len(L))
- parallel_for(input = X, output = Y,
+ parallel_for(input = X, output = Y,
attrs = {L, block_id(1)}) # referring to block 1
]
}
-
+
block[1] = Block {
parent = 0,
vars = [x, y, index],
@@ -102,7 +128,7 @@ func main() { //// block 0
X = fluid.read(...)
L = fluid.k8s.get_worker_addrs()
Y = fluid.tensor_array(len(L))
- fluid.parallel_for(X, L,
+ fluid.parallel_for(X, L,
func(index int) { //// block 1
x = X[index]
fluid.send(L[index], x)
@@ -116,7 +142,7 @@ An explanation of the above program:
- `fluid.k8s` is a package that provides access to Kubernetes API.
- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).
-- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread
diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md
index 10d936860fab7e09241e968a63526c7d86d3e568..66d19f44baf861c7847e81ca83f61024ec877faf 100644
--- a/doc/fluid/design/concurrent/csp.md
+++ b/doc/fluid/design/concurrent/csp.md
@@ -13,14 +13,41 @@ Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously exe
There were many concurrent programming models, implemented in various forms:
-| concurrent programming model | implementation |
-|-----|-----|
-| mutex | types and functions in standard libraries |
-| semaphore | types and functions in standard libraries |
-| communicating sequential processes (CSP) | Go programming language |
-| actor model | Erlang programming language |
-| message passing | MPI |
-| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+
+
+
+concurrent programming model |
+implementation |
+
+
+
+
+mutex |
+types and functions in standard libraries |
+
+
+semaphore |
+ types and functions in standard libraries |
+
+
+ communicating sequential processes (CSP) |
+ Go programming language |
+
+
+ actor model |
+ Erlang programming language |
+
+
+ message passing |
+ MPI |
+
+
+ bulk synchronous parallel (BSP) |
+ Pregel distributed programming framework |
+
+
+
+
Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
@@ -118,9 +145,9 @@ There are four types of actions with a channel:
```go
close(ch)
```
-
+
Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
-
+
There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
1. A send to a nil channel blocks forever
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
index 73f6d7b90c7dca0d48109cf3d28d5f7cd56b5c0b..f83ad3b6a4e8b4d82d8fe8d4154a2739a9b9628b 100644
--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@@ -2,12 +2,33 @@
Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
-| Python classes | Protobuf messages |
-| --- | --- |
-| Program | ProgramDesc |
-| Block | BlockDesc |
-| Operator | OpDesc |
-| Variable | VarDesc |
+
+
+
+Python classes |
+Protobuf messages |
+
+
+
+
+Program |
+ProgramDesc |
+
+
+Block |
+BlockDesc |
+
+
+Operator |
+OpDesc |
+
+
+Variable |
+VarDesc |
+
+
+
+
Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
index 110b7d78bf12ac8328fb3a913e4386e75d63c995..5e147f8263e685a4665b5793f7127178cbc3cfdd 100644
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
@@ -10,11 +10,37 @@ Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution
Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
-| Existed since | model as sequence of layers | model as graph of operators | No model |
-|--|--|--|--|
-| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
-| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
-| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+
+
+Existed since |
+model as sequence of layers |
+model as graph of operators |
+No model |
+
+
+
+
+2013 |
+Caffe, Theano, Torch, PaddlePaddle |
+ |
+ |
+
+
+2015 |
+ |
+TensorFlow, MxNet, Caffe2, ONNX, n-graph |
+ |
+
+
+2016 |
+ |
+ |
+ PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+
+
+
From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
index 7c39fabcc6df76afdb6a77b4cbc2edf0bf3ef780..f199cc892f5e84f0a12abe3b8e5cace9849e7fa8 100644
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -36,11 +36,37 @@ At compile time, the Python program generates a protobuf message representation
At runtime, the C++ program realizes the graph and runs it.
-| | Representation (protobuf messages) | Realization (C++ class objects) |
-|---|---|---|
-|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
-|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
-|Block|BlockDesc|Block|
+
+
+
+ |
+Representation (protobuf messages) |
+Realization (C++ class objects) |
+
+
+
+
+Data |
+
+VarDesc |
+
+Variable |
+
+
+Operation |
+
+OpDesc |
+
+Operator |
+
+
+Block |
+BlockDesc |
+Block |
+
+
+
+
The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).
diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md
index af0c6ef36feba9e0239e7a5f81a8dc9108b2471a..7f5dcf55f9f2a0fd27ffde100510dd8fee305381 100644
--- a/doc/fluid/design/network/deep_speech_2.md
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -1,4 +1,4 @@
-# DeepSpeech2 on PaddlePaddle: Design Doc
+# DeepSpeech2 on PaddlePaddle: Design Doc
We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine, on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
@@ -68,11 +68,33 @@ We roughly break down the project into 14 tasks:
Tasks parallelizable within phases:
-Roadmap | Description | Parallelizable Tasks
------------ | :------------------------------------ | :--------------------
-Phase I | Simplified model & components | *Task 1* ~ *Task 8*
-Phase II | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
-Phase III | Documentations | *Task13* ~ *Task14*
+
+
+
+Roadmap |
+Description |
+ Parallelizable Tasks |
+
+
+
+
+Phase I |
+Simplified model & components |
+Task 1 ~ Task 8 |
+
+
+Phase II |
+ Standard model & benchmarking & profiling |
+Task 9 ~ Task 12 |
+
+
+Phase III |
+ Documentations |
+ Task13 ~ Task14 |
+
+
+
+
Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
@@ -102,37 +124,82 @@ We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar
Key ingredients about the layers:
-- **Data Layers**:
+- **Data Layers**:
- Frame sequences data of audio **spectrogram** (with FFT).
- - Token sequences data of **transcription** text (labels).
+ - Token sequences data of **transcription** text (labels).
- These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required.
-- **2D Convolution Layers**:
+- **2D Convolution Layers**:
- Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
- With striding for only the first convlution layer.
- No pooling for all convolution layers.
-- **Uni-directional RNNs**
+- **Uni-directional RNNs**
- Uni-directional + row convolution: for low-latency inference.
- Bi-direcitional + without row convolution: if we don't care about the inference latency.
- **Row convolution**:
- For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
- - Not nessesary if with bi-direcitional RNNs.
+ - Not nessesary if with bi-direcitional RNNs.
- "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across.
- **Batch Normalization Layers**:
- Added to all above layers (except for data and loss layer).
- Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
-
-
-Required Components | PaddlePaddle Support | Need to Develop
-:------------------------------------- | :-------------------------------------- | :-----------------------
-Data Layer I (Spectrogram) | Not supported yet. | TBD (Task 3)
-Data Layer II (Transcription) | `paddle.data_type.integer_value_sequence` | -
-2D Convolution Layer | `paddle.layer.image_conv_layer` | -
-DataType Converter (vec2seq) | `paddle.layer.block_expand` | -
-Bi-/Uni-directional RNNs | `paddle.layer.recurrent_group` | -
-Row Convolution Layer | Not supported yet. | TBD (Task 4)
-CTC-loss Layer | `paddle.layer.warp_ctc` | -
-Batch Normalization Layer | `paddle.layer.batch_norm` | -
-CTC-Beam search | Not supported yet. | TBD (Task 6)
+
+
+
+
+Required Components |
+ PaddlePaddle Support |
+ Need to Develop |
+
+
+
+
+Data Layer I (Spectrogram) |
+Not supported yet. |
+TBD (Task 3) |
+
+
+Data Layer II (Transcription) |
+ paddle.data_type.integer_value_sequence |
+ - |
+
+
+2D Convolution Layer |
+ paddle.layer.image_conv_layer |
+ - |
+
+
+DataType Converter (vec2seq) |
+ paddle.layer.block_expand |
+ - |
+
+
+Bi-/Uni-directional RNNs |
+paddle.layer.recurrent_group |
+ - |
+
+
+Row Convolution Layer |
+Not supported yet. |
+TBD (Task 4) |
+
+
+CTC-loss Layer |
+paddle.layer.warp_ctc |
+ - |
+
+
+Batch Normalization Layer |
+paddle.layer.batch_norm |
+ - |
+
+
+CTC-Beam search |
+Not supported yet. |
+ TBD (Task 6) |
+
+
+
+
### Row Convolution
@@ -145,14 +212,14 @@ TODO by Assignees
Figure 2. Algorithm for CTC Beam Search Decoder.
-- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
- - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths;
+- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
+ - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths;
- 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
- Such external scorer consists of language model, word count or any other custom scorers.
- The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
-- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
-
+- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
+
## Future Work
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index e70bf5dff3849f2ff82315f7eba4a92c93539843..f627437f354a12c79cad25c959409db29ecbd874 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -4,9 +4,9 @@
.. toctree::
:maxdepth: 1
- new_op_en.md
- new_op_kernel_en.md
- use_eigen_en.md
+ new_op_cn.md
+ new_op_kernel.md
+ use_eigen_cn.md
name_convention.md
support_new_device.md
releasing_process.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index f0e9afcfcc9edfb9a91f58375cd415ea414f8f82..0b65fed67ad45eb399b624184485a99a082d79e9 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -5,7 +5,7 @@ Development
:maxdepth: 1
new_op_en.md
- new_op_kernel_en.md
+ new_op_kernel.md
use_eigen_en.md
name_convention.md
support_new_device.md
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
index 92996585674b46f45549b972b9f295503b1c7f8c..0c3f88d9c31e05bec399c64bf6ade56e62e01f68 100644
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -26,13 +26,32 @@
依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
-
- 内容 | 定义位置
--------------- | :----------------------
-OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake
-Op定义 | `.cc`文件
-Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。
-注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中
+
+
+
+内容 |
+定义位置 |
+
+
+
+
+OpProtoMake定义 |
+`.cc`文件,Backward Op不需要定义OpProtoMake |
+
+
+Op定义 |
+ `.cc`文件 |
+
+
+Kernel实现 |
+ CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 |
+
+
+注册Op |
+ Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 |
+
+
+
实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
index da8b1bdd1082e439456daf25e9b3a1e8eb534375..a566a09131f86251b70d5435d0a483aa2a705b35 100644
--- a/doc/fluid/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
@@ -33,6 +33,33 @@ Op definition | `.cc` files
Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
+
+
+
+Information |
+ Where is it defined |
+
+
+
+
+OpProtoMake definition |
+ `.cc`files, Backward Op does not need an OpProtoMake interface. |
+
+
+Op definition |
+ `.cc` files |
+
+
+Kernel implementation |
+ The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files. |
+
+
+Registering the Op |
+ Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation. |
+
+
+
+
New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
@@ -279,7 +306,7 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass
def test_check_output(self):
self.check_output()
-
+
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
diff --git a/doc/fluid/dev/new_op_kernel_en.md b/doc/fluid/dev/new_op_kernel.md
similarity index 100%
rename from doc/fluid/dev/new_op_kernel_en.md
rename to doc/fluid/dev/new_op_kernel.md
diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process.md
index b9787261092f1f27377886152cb1596d9ff54188..0810765b85f73d9dba876e66fb43bb1ad476d6d2 100644
--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process.md
@@ -66,7 +66,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
* 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
* 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。
* 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。
- * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。
+ * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。
* BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。
@@ -78,13 +78,116 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
-| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| API.V2 + Docker + GPU | | | | | | | | |
-| API.V2 + Docker + CPU | | | | | | | | |
-| `paddle_trainer` + Docker + GPU | | | | | | | | |
-| `paddle_trainer` + Docker + CPU | | | | | | | | |
-| API.V2 + Ubuntu + GPU | | | | | | | | |
-| API.V2 + Ubuntu + CPU | | | | | | | | |
-| `paddle_trainer` + Ubuntu + GPU | | | | | | | | |
-| `paddle_trainer` + Ubuntu + CPU | | | | | | | | |
+
+
+
+ |
+新手入门章节 |
+ 识别数字 |
+ 图像分类 |
+词向量 |
+ 情感分析 |
+语意角色标注 |
+ 机器翻译 |
+个性化推荐 |
+
+
+
+
+
+API.V2 + Docker + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ API.V2 + Docker + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+`paddle_trainer` + Docker + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+`paddle_trainer` + Docker + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ API.V2 + Ubuntu + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+API.V2 + Ubuntu + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ `paddle_trainer` + Ubuntu + GPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
+ `paddle_trainer` + Ubuntu + CPU |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+
+
+
diff --git a/doc/fluid/getstarted/concepts/save_model/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md
index e29129fddf775939c9f7a8b49d850d523e6e5a45..1f12ba0497369eacc6a2db7984781b5672f45ea1 100644
--- a/doc/fluid/getstarted/concepts/save_model/model_format.md
+++ b/doc/fluid/getstarted/concepts/save_model/model_format.md
@@ -4,30 +4,70 @@
A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
-As a result, In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters.
+As a result, In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters.
## Implementation
-The topology is saved as a plain text in a detailed self-contain protobuf file.
+The topology is saved as a plain text in a detailed self-contain protobuf file.
The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task.
-As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is,
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is,
The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
-|field name | type | description |
-| --- | --- | --- |
-| version | uint32_t | Version of saved file. Always 0 now. |
-| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
-| tensor desc | void* | TensorDesc protobuf binary message |
-| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
-| lod_level | uint64_t | Level of LoD |
-| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
-| data of lod[0] | uint64_t* | [Optional] lod[0].data() |
-| ... | ... | ... |
-
+
+
+
+field name |
+type |
+description |
+
+
+
+
+ version |
+ uint32_t |
+ Version of saved file. Always 0 now. |
+
+
+ tensor desc length |
+ uint32_t |
+ TensorDesc(Protobuf message) length in bytes. |
+
+
+tensor desc |
+ void* |
+ TensorDesc protobuf binary message |
+
+
+ tensor data |
+ void* |
+ Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
+
+
+ lod_level |
+ uint64_t |
+ Level of LoD |
+
+
+ length of lod[0] |
+ uint64_t |
+ [Optional] length of lod[0] in bytes. |
+
+
+ data of lod[0] |
+ uint64_t* |
+ [Optional] lod[0].data() |
+
+
+... |
+ ... |
+ ... |
+
+
+
## Summary
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
index 1b6f767869aaa800c122c8e7a06a1413e48e10e0..b99b90056b0a2e51f2668a6d27d94857bdc09c37 100644
--- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -65,10 +65,10 @@ exit(1)
**因此,在分布式的Fluid环境中,我们有两个角色需要创建,分别是Parameter Server和Trainer。**
-### 分布式训练
+### 分布式训练
Fliud专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数,将他们分隔为两部分,通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
```python
-optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
```
将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下:
```python
@@ -99,15 +99,51 @@ for pass_id in range(100):
### 分布式训练脚本运行说明
分布式任务的运行需要将表格中说明的多个参数进行赋值:
-| 参数名 | 值类型 | 说明 | 示例 |
-|:-------------|:------|:---------------------------------------|:-------------|
-| trainer_id | int | 当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值 | 0/1/2/3 |
-| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
-| trainers | int | 训练节点的总个数,>0的数字 | 4 |
-| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
-| training_role | str | 节点角色, TRAINER/PSERVER | PSERVER |
-
-**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下:
+
+
+
+参数名 |
+ 值类型 |
+说明 |
+ 示例 |
+
+
+
+
+trainer_id |
+ int |
+ 当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值 |
+ 0/1/2/3 |
+
+
+pservers |
+ str |
+ parameter server 列表 |
+ 127.0.0.1:6710,127.0.0.1:6711 |
+
+
+trainers |
+int |
+ 训练节点的总个数,>0的数字 |
+ 4 |
+
+
+ server_endpoint |
+ str |
+ 当前所起的服务节点的IP:PORT |
+ 127.0.0.1:8789 |
+
+
+ training_role |
+str |
+ 节点角色, TRAINER/PSERVER |
+ PSERVER |
+
+
+
+
+
+**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下:
```python
t = fluid.DistributeTranspiler()
diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md
index 17f895573a65731db34b2addddaa22e7f32157ec..8266dec3c6125a09b90ac0ccd4aa5464f5c7db31 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -42,14 +42,40 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
每一列的含义是:
-| 列名 | 含义 |
-| --- | --- |
-| ncalls | 函数的调用次数 |
-| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
-| percall | tottime的每次调用平均时间 |
-| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
-| percall | cumtime的每次调用平均时间 |
-| filename:lineno(function) | 文件名, 行号,函数名 |
+
+
+
+列名 |
+含义 |
+
+
+
+
+ ncalls |
+ 函数的调用次数 |
+
+
+tottime |
+ 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+
+
+ percall |
+ tottime的每次调用平均时间 |
+
+
+ cumtime |
+ 函数总时间。包含这个函数调用其他函数的时间 |
+
+
+ percall |
+ cumtime的每次调用平均时间 |
+
+
+ filename:lineno(function) |
+ 文件名, 行号,函数名 |
+
+
+
### 寻找性能瓶颈
diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
index abe4493c175fb4ee57f1acf45931e2890620d9c1..e95556dd608b7ff0a3eb18873df0015a2da94e7c 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -57,14 +57,40 @@ port, we will see the output like the following:
where each line corresponds to Python function, and the meaning of
each column is as follows:
-| column | meaning |
-| --- | --- |
-| ncalls | the number of calls into a function |
-| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
-| percall | tottime divided by ncalls |
-| cumtime | the total execution time of the function, including the execution time of other functions being called |
-| percall | cumtime divided by ncalls |
-| filename:lineno(function) | where the function is defined |
+
+
+
+column |
+meaning |
+
+
+
+
+ ncalls |
+ the number of calls into a function |
+
+
+tottime |
+ the total execution time of the function, not including the execution time of other functions called by the function |
+
+
+ percall |
+ tottime divided by ncalls |
+
+
+ cumtime |
+ the total execution time of the function, including the execution time of other functions being called |
+
+
+ percall |
+ cumtime divided by ncalls |
+
+
+ filename:lineno(function) |
+ where the function is define |
+
+
+
### Identify Performance Bottlenecks
diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst
index 614db457d715665073cec1a495d4d7df6887532f..7488ed8137d57785f36b9f1e1ed1269f864960bc 100644
--- a/doc/v2/faq/build_and_install/index_en.rst
+++ b/doc/v2/faq/build_and_install/index_en.rst
@@ -1,5 +1,143 @@
-############################
-Install, Build and Unit test
-############################
+.. _install_faq:
-TBD
+###############################
+Compile, Install, and Unit Test
+###############################
+
+.. contents::
+
+1. Insufficient CUDA driver version
+----------------------------------------------------------------
+
+Many users usually face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is that you may not map the local CUDA driver to a container directory.
+You can solve the issue by running the following commands:
+
+.. code-block:: bash
+
+ $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+ $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+ $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+For more infomation about Docker's installation and usage, please refer to `PaddlePaddle Docker documentation `_ .
+
+
+2. Version mismatch between PythonLibs and PythonInterpreter
+----------------------------------------------------------------
+
+It is a common bug when CMake looks up Python. If you install multiple versions of Python, Cmake may find the version mismatch between PythonLibs and PythonInterpreter . You are forced to specify a Python version, as follows.
+
+ .. code-block:: bash
+
+ cmake .. -DPYTHON_EXECUTABLE= -DPYTHON_LIBRARY= -DPYTHON_INCLUDE_DIR=
+
+You should specify ````, ````, ```` to your local paths.
+
+3. PaddlePaddle version is 0.0.0
+------------------------------------------------
+This issue would happen when you run the code `paddle version` or `cmake ..`
+
+.. code-block:: bash
+
+ CMake Warning at cmake/version.cmake:20 (message):
+ Cannot add paddle version from git tag
+
+You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then run :code:`cmake`
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+The primary cause for this issue is that it can not find the correct PaddlePaddle installation package that matches your current system.The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12 os including Python2.7 and Pip 9.0.1.
+
+You can upgrade Pip with the following command\:
+
+.. code-block:: bash
+
+ pip install --upgrade pip
+
+If it does not work for you, you can run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the suffix of Python package which your system may support and then compare it with the suffix of your installation.
+
+If the system supports :code:`linux_x86_64` and the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest
+
+if the system supports :code:`manylinux_x86_64` and the local installation package is :code:`linux1_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again.
+
+
+5. ImportError: No module named v2
+----------------------------------
+Please uninstall Paddle V1 if you have installed it before.
+
+.. code-block:: bash
+
+ pip uninstall py_paddle paddle
+
+Then install Python for PaddlePaddle , enter the build directory and run the following commands
+
+pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
+
+6. Illegal instruction
+-----------------------
+This issue may be caused by the wrong usage of PaddlePaddle binary version which uses avx SIMD instructions to increase the performance of cpu. Please choose the correct version.
+
+7. Python unittest fails
+--------------------------------
+
+If the following python unittest testcases fail:
+
+.. code-block:: bash
+
+ 24 - test_PyDataProvider (Failed)
+ 26 - test_RecurrentGradientMachine (Failed)
+ 27 - test_NetworkCompare (Failed)
+ 28 - test_PyDataProvider2 (Failed)
+ 32 - test_Prediction (Failed)
+ 33 - test_Compare (Failed)
+ 34 - test_Trainer (Failed)
+ 35 - test_TrainerOnePass (Failed)
+ 36 - test_CompareTwoNets (Failed)
+ 37 - test_CompareTwoOpts (Failed)
+ 38 - test_CompareSparse (Failed)
+ 39 - test_recurrent_machine_generation (Failed)
+ 40 - test_PyDataProviderWrapper (Failed)
+ 41 - test_config_parser (Failed)
+ 42 - test_swig_api (Failed)
+ 43 - layers_test (Failed)
+
+Please check the PaddlePaddle unittest logs which may suggest the following:
+
+.. code-block:: bash
+
+ paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+ Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+The solution is:
+
+* Remove old PaddlePaddle to make a clean environment for the unit tests. If PaddlePaddle package is already in Python's site-packages, unit tests would refer Python package in site-packages instead of Python package in the :code:`/python` directory of the source directory. Setting :code:`PYTHONPATH` to :code:`/python` is also useless because Python's search path would give the priority to the installed Python package.
+
+
+8. Failed to download the MKLML library
+----------------------------------------------
+
+.. code-block:: bash
+
+ make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4
+ make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2
+ make[1]: *** waiting for the unfinished jobs....
+
+Cause: The network speed or SSL link causes the MKLML library to download unsuccessfully.
+
+The solution is: manually download and install, the specific steps are as follows.
+
+.. code-block:: bash
+
+ // 1. enter the directory
+ cd build/third_party/mklml/src/extern_mklml
+
+ // 2. check the size of the package, normally 75M, if less than 75M, the download fails
+ du -sh mklml_lnx_2018.0.1.20171007.tgz
+
+ // 3. manually download and unzip and make the download success tag:
+ wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz
+ tar zxf mklml_lnx_2018.0.1.20171007.tgz
+ touch ../extern_mklml-stamp/extern_mklml-download
+
+ // 4. then compile
+
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index a1b913a863cc1853ea3a786d22e6e8baa8c98a02..c277bd7cb69bba899296efe64107ee538c4aa847 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -55,6 +55,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const {
auto graph = new SSAGraph();
SSAGraph &result = *graph;
+ std::unordered_set og_has_been_broadcast;
result.vars_.resize(places_.size());
bool is_forwarding = true;
@@ -122,9 +123,15 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build(
if (!is_forwarding) {
auto var_names = op->OutputArgumentNames();
+ // Currently, we assume that once gradient is generated, it can be
+ // broadcast, and each gradient is only broadcast once. But there are no
+ // other cases, for example, we need to adjust the gradient according to
+ // the input when we get the gradient, which is not considered at present.
for (auto &og : var_names) {
- if (grad_names_.count(og) != 0) { // is param grad
- // Insert NCCL AllReduce Op
+ if (grad_names_.count(og) != 0 &&
+ og_has_been_broadcast.count(og) == 0) { // is param grad
+ // Insert NCCL AllReduce Op
+ og_has_been_broadcast.insert(og);
#ifdef PADDLE_WITH_CUDA
result.ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index 5ddf331cfca39a4e81a42d9ff8efd5af7bcf6829..55b5f113589e090386d287e228349f22fb94a7ab 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -76,7 +76,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
}
-std::string NCCLAllReduceOpHandle::Name() const { return "NCCL AllReduce"; }
+std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; }
} // namespace details
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
index 045070bb6a97e90600cd24d9f43cd2a10a4bc1f5..ad14a3c5cb4625fa121cad2daed389c441e78771 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -14,6 +14,9 @@
#pragma once
+#include
+#include
+
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
@@ -34,6 +37,10 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
std::string Name() const override;
+ // Delay and buffer nccl_all_reduce together can significantly increase
+ // performance. Disable this feature by returning false.
+ bool IsMultiDeviceTransfer() override { return true; };
+
protected:
void RunImpl() override;
};
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 71672fd24c65ee654fb9f703ea5808c31ee8fbb0..d7a541ac4bb83625060db337446d03a1afda3ed0 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -13,6 +13,8 @@
// limitations under the License.
#pragma once
+#include
+#include
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/platform/device_context.h"
@@ -53,6 +55,10 @@ class OpHandleBase {
void AddOutput(VarHandleBase *out);
+ // If the Op involves data transfer of multiple devices that
+ // will likely block other computations.
+ virtual bool IsMultiDeviceTransfer() { return false; }
+
protected:
virtual void RunImpl() = 0;
};
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 3f8655147b688239509dea98925df310a46cbef8..1f96b9dc6235a18f7566c98cca60baa964e6aa56 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -23,22 +23,36 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
size_t num_threads, bool use_event,
const std::vector &local_scopes,
const std::vector &places,
- std::unique_ptr &&graph)
+ std::unique_ptr &&graph, bool allow_op_delay)
: SSAGraphExecutor(std::move(graph)),
pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr),
local_scopes_(local_scopes),
places_(places),
fetch_ctxs_(places),
- use_event_(use_event) {}
+ use_event_(use_event),
+ running_ops_(0),
+ allow_op_delay_(allow_op_delay) {}
+
+void ThreadedSSAGraphExecutor::RunDelayedOps(
+ const std::unordered_set &delayed_ops) {
+ for (auto op : delayed_ops) {
+ op->Run(use_event_);
+ }
+}
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector &fetch_tensors) {
std::unordered_map pending_ops;
std::unordered_set pending_vars;
-
BlockingQueue ready_vars;
-
std::unordered_set ready_ops;
+ // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
+ // streams from multiple GPUs, it's faster to buffer them and schedule
+ // together since we currently cannot overlap computation and memcpy streams.
+ // Should revisit it if overlapping is available.
+ std::unordered_set delayed_ops;
+ std::unordered_set blocked_by_delayed_ops;
+ std::unordered_set delayed_vars;
auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
pending_vars.insert(&var);
@@ -106,7 +120,14 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto run_all_ready_ops = [&] {
for (auto *op : ready_ops) {
- RunOp(ready_vars, op);
+ if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
+ delayed_ops.insert(op);
+ delayed_vars.insert(op->outputs_.begin(), op->outputs_.end());
+ ready_vars.Extend(op->outputs_);
+ continue;
+ }
+ running_ops_++;
+ RunOp(&ready_vars, op);
}
ready_ops.clear();
};
@@ -118,13 +139,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
// Step 3. Execution
- while (!pending_vars.empty()) {
+ while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
// 1. Run All Ready ops
run_all_ready_ops();
// 2. Find ready variable
bool timeout;
- auto cur_ready_vars = ready_vars.PopAll(1000, &timeout);
+ auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) {
if (exception_) {
@@ -141,13 +162,29 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto &deps = pending_ops[op];
--deps;
if (deps == 0) {
- ready_ops.insert(op);
+ if (delayed_vars.find(ready_var) != delayed_vars.end()) {
+ blocked_by_delayed_ops.insert(op);
+ } else {
+ ready_ops.insert(op);
+ }
}
}
}
+ // When there are no other ops to schedule, schedule buffered delayed
+ // ops and unblock other ops.
+ if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) {
+ RunDelayedOps(delayed_ops);
+ delayed_ops.clear();
+ for (auto *op : blocked_by_delayed_ops) {
+ ready_ops.insert(op);
+ }
+ blocked_by_delayed_ops.clear();
+ }
// Keep loop until all vars are ready.
}
-
+ PADDLE_ENFORCE(ready_ops.empty());
+ PADDLE_ENFORCE(delayed_ops.empty());
+ PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
++computation_count_;
auto sync_computation = [&] {
@@ -182,12 +219,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
void ThreadedSSAGraphExecutor::RunOp(
- BlockingQueue &ready_var_q, details::OpHandleBase *op) {
- auto op_run = [&ready_var_q, op, this] {
+ BlockingQueue *ready_var_q, details::OpHandleBase *op) {
+ auto op_run = [ready_var_q, op, this] {
try {
VLOG(10) << op->Name() << " : " << op->DebugString();
op->Run(use_event_);
- ready_var_q.Extend(op->outputs_);
+ running_ops_--;
+ ready_var_q->Extend(op->outputs_);
} catch (platform::EnforceNotMet ex) {
exception_.reset(new platform::EnforceNotMet(ex));
} catch (...) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 2ea57ac8f96bc9c2b5c98bcd25d9ce921c3683cd..79cfc26b461a39811a9a125e5aeac3492d967386 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -14,7 +14,12 @@
#pragma once
-#include
+#include
+#include
+#include
+#include
+#include
+
#include
#include "ThreadPool.h" // ThreadPool in thrird party
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
@@ -70,7 +75,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
ThreadedSSAGraphExecutor(size_t num_threads, bool use_event,
const std::vector &local_scopes,
const std::vector &places,
- std::unique_ptr &&graph);
+ std::unique_ptr &&graph,
+ bool allow_op_delay);
// Run a SSAGraph by a thread pool
// Use topological sort algorithm
@@ -79,9 +85,11 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
~ThreadedSSAGraphExecutor() {}
private:
- void RunOp(BlockingQueue &ready_var_q,
+ void RunOp(BlockingQueue *ready_var_q,
details::OpHandleBase *op);
+ void RunDelayedOps(const std::unordered_set &delayed_ops);
+
private:
std::unique_ptr<::ThreadPool> pool_;
std::vector local_scopes_;
@@ -89,6 +97,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
platform::DeviceContextPool fetch_ctxs_;
const bool use_event_;
std::unique_ptr exception_;
+ std::atomic running_ops_;
+ bool allow_op_delay_;
size_t computation_count_{0};
size_t max_async_computation{100};
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 893cc15f6c8b34fcfc33554f8ef48ffeb00cd75c..569dda17c6e91d5658c4f8b9ba0b8c8fbd966832 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -22,7 +22,7 @@
namespace paddle {
namespace framework {
namespace details {
-struct OpHandleBase;
+class OpHandleBase;
// VarHandleBase is the var node in the dependency graph.
// A variable can only be generated by a single operator. i.e.
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 91f2db9354c2a00ec7e51ea4595c7cfa00da23ea..17885143247f0e0db8f12931e3c3412e7114ef3d 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
-#include
+#include "paddle/fluid/platform/profiler.h"
-#include "ThreadPool.h"
+#include
+#include
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
@@ -47,7 +48,7 @@ ParallelExecutor::ParallelExecutor(
const std::vector &places,
const std::unordered_set ¶ms,
const ProgramDesc &startup_program, const ProgramDesc &main_program,
- const std::string &loss_var_name, Scope *scope)
+ const std::string &loss_var_name, Scope *scope, bool allow_op_delay)
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
@@ -82,8 +83,8 @@ ParallelExecutor::ParallelExecutor(
auto graph = builder.Build(main_program);
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
- num_threads, use_event, member_->local_scopes_, places,
- std::move(graph)));
+ num_threads, use_event, member_->local_scopes_, places, std::move(graph),
+ allow_op_delay));
// Step 3. Create vars in each scope;
for (auto *scope : member_->local_scopes_) {
@@ -151,6 +152,7 @@ void ParallelExecutor::BCastParamsToGPUs(
void ParallelExecutor::Run(const std::vector &fetch_tensors,
const std::string &fetched_var_name) {
+ platform::RecordBlock b(0);
auto fetch_data = member_->executor_->Run(fetch_tensors);
*member_->global_scope_->Var(fetched_var_name)->GetMutable() =
fetch_data;
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 503efa2e447b0ac70f6302aa0a89cc55e5afcb81..964b476234e622cae934d41bc3793bc3114a5f1a 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,8 +14,9 @@ limitations under the License. */
#pragma once
-#include
+#include
#include
+#include
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
@@ -37,7 +38,8 @@ class ParallelExecutor {
const std::unordered_set& params,
const ProgramDesc& startup_program,
const ProgramDesc& main_program,
- const std::string& loss_var_name, Scope* scope);
+ const std::string& loss_var_name, Scope* scope,
+ bool allow_op_delay);
void Run(const std::vector& fetch_tensors,
const std::string& fetched_var_name = "fetched_var");
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index c9c2c1bb721f2c527fa52f45cc54883f639f4ef8..9458d56a01df432aea573d796456b9be31350038 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -10,6 +10,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
+
+#include
+
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor.h"
@@ -52,7 +55,7 @@ class SelectedRows {
private:
// Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here.
- // SelectedRows are simplely concated when adding together. Until a
+ // SelectedRows are simply concated when adding together. Until a
// SelectedRows add a Tensor, will the duplicate rows be handled.
Vector rows_;
std::unique_ptr value_{nullptr};
diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
index d59411dfb9122537e99f483478fdac06fc8275db..3adeeda90645ca983d9d9229b4cc1c4c90302206 100644
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@@ -2,7 +2,8 @@ if(WITH_DISTRIBUTE)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
- set_source_files_properties(serde_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+ set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
cares zlib protobuf sendrecvop_grpc)
+ cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
endif()
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index 9652bb888b5937390cc183a96ff7ebf5a4fa2426..d79ba6d291950e1f089eb11713bd1c3e4d154b27 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "grpc_client.h"
-#include
+#include "paddle/fluid/operators/detail/grpc_client.h"
+
+#include
+
#include "paddle/fluid/framework/threadpool.h"
namespace paddle {
@@ -52,7 +54,7 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
call->StartCall();
- call->Finish(&s->reply_, &s->status_, (void*)s);
+ call->Finish(&s->reply_, &s->status_, static_cast(s));
});
req_count_++;
@@ -70,8 +72,7 @@ void ProcGetResponse(const VarHandle& var_h,
template
void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
::grpc::Slice slice(proto.ByteSizeLong());
- proto.SerializeWithCachedSizesToArray(
- const_cast(reinterpret_cast(slice.begin())));
+ proto.SerializeWithCachedSizesToArray(const_cast(slice.begin()));
::grpc::ByteBuffer tmp(&slice, 1);
result->Swap(&tmp);
}
@@ -109,7 +110,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
call->StartCall();
- call->Finish(&s->reply_, &s->status_, (void*)s);
+ call->Finish(&s->reply_, &s->status_, static_cast(s));
});
req_count_++;
@@ -150,9 +151,10 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
s->response_call_back_ = ProcGetResponse;
auto call = s->stub_g_.PrepareUnaryCall(
- s->context_.get(), "/sendrecv.SendRecvService/GetVariable", req, &cq_);
+ s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
+ &cq_);
call->StartCall();
- call->Finish(&s->reply_, &s->status_, (void*)s);
+ call->Finish(&s->reply_, &s->status_, static_cast(s));
});
req_count_++;
@@ -168,7 +170,7 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
- rpc->Finish(&s->reply_, &s->status_, (void*)s);
+ rpc->Finish(&s->reply_, &s->status_, static_cast(s));
req_count_++;
}
@@ -180,7 +182,7 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
- rpc->Finish(&s->reply_, &s->status_, (void*)s);
+ rpc->Finish(&s->reply_, &s->status_, static_cast(s));
req_count_++;
}
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 109c762e74906440fe1d5091270ac878a054c9f2..7c978b28b6873d05afb435de4caf7f4ce5d33193 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -14,6 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_server.h"
+#include
+#include
+
using ::grpc::ServerAsyncResponseWriter;
namespace paddle {
@@ -128,6 +131,49 @@ class RequestGet final : public RequestBase {
SimpleBlockQueue* queue_;
};
+class RequestPrefetch final : public RequestBase {
+ public:
+ explicit RequestPrefetch(GrpcService::AsyncService* service,
+ ::grpc::ServerCompletionQueue* cq,
+ framework::Scope* scope,
+ const platform::DeviceContext* dev_ctx,
+ framework::Executor* executor,
+ framework::ProgramDesc* program, int blkid)
+ : RequestBase(service, cq, dev_ctx),
+ responder_(&ctx_),
+ scope_(scope),
+ executor_(executor),
+ program_(program),
+ blkid_(blkid) {
+ int method_id = static_cast(detail::GrpcMethod::kPrefetchVariable);
+ service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
+ cq_, this);
+ }
+
+ virtual ~RequestPrefetch() {}
+
+ virtual std::string GetReqName() { return request_.varname(); }
+
+ virtual void Process() {
+ // prefetch process...
+ ::grpc::ByteBuffer reply;
+ // TODO(Yancey1989): execute the Block which containers prefetch ops
+
+ VLOG(3) << "RequestPrefetch Process in";
+
+ responder_.Finish(reply, ::grpc::Status::OK, this);
+ status_ = FINISH;
+ }
+
+ protected:
+ sendrecv::VariableMessage request_;
+ ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+ framework::Scope* scope_;
+ framework::Executor* executor_;
+ framework::ProgramDesc* program_;
+ int blkid_;
+};
+
void AsyncGRPCServer::WaitClientGet(int count) {
int fetch_barriers = 0;
while (fetch_barriers < count) {
@@ -147,6 +193,7 @@ void AsyncGRPCServer::RunSyncUpdate() {
cq_send_ = builder.AddCompletionQueue();
cq_get_ = builder.AddCompletionQueue();
+ cq_prefetch_ = builder.AddCompletionQueue();
server_ = builder.BuildAndStart();
LOG(INFO) << "Server listening on " << address_ << std::endl;
@@ -155,6 +202,8 @@ void AsyncGRPCServer::RunSyncUpdate() {
std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
std::function get_register =
std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
+ std::function prefetch_register =
+ std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
t_send_.reset(
new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
@@ -163,17 +212,21 @@ void AsyncGRPCServer::RunSyncUpdate() {
t_get_.reset(
new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
cq_get_.get(), "cq_get", get_register)));
-
+ t_prefetch_.reset(new std::thread(
+ std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
+ "cq_prefetch", prefetch_register)));
// wait server
server_->Wait();
t_send_->join();
t_get_->join();
+ t_prefetch_->join();
}
void AsyncGRPCServer::ShutdownQueue() {
std::unique_lock lock(cq_mutex_);
cq_send_->Shutdown();
cq_get_->Shutdown();
+ cq_prefetch_->Shutdown();
}
// This URL explains why shutdown is complicate:
@@ -186,6 +239,7 @@ void AsyncGRPCServer::ShutDown() {
void AsyncGRPCServer::TryToRegisterNewSendOne() {
std::unique_lock lock(cq_mutex_);
if (is_shut_down_) {
+ VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
return;
}
RequestSend* send = new RequestSend(&service_, cq_send_.get(), scope_,
@@ -196,6 +250,7 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() {
void AsyncGRPCServer::TryToRegisterNewGetOne() {
std::unique_lock lock(cq_mutex_);
if (is_shut_down_) {
+ VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
return;
}
RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_,
@@ -203,6 +258,19 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
VLOG(4) << "Create RequestGet status:" << get->Status();
}
+void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
+ std::unique_lock lock(cq_mutex_);
+ if (is_shut_down_) {
+ VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
+ return;
+ }
+ RequestPrefetch* prefetch =
+ new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_,
+ executor_, program_, prefetch_blk_id_);
+
+ VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
+}
+
// FIXME(typhoonzero): change cq_name to enum.
void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
std::string cq_name,
@@ -211,25 +279,28 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
void* tag = NULL;
bool ok = false;
+
while (true) {
+ VLOG(3) << "HandleRequest for " << cq_name << " while in";
if (!cq->Next(&tag, &ok)) {
LOG(INFO) << cq_name << " CompletionQueue shutdown!";
break;
}
+ VLOG(3) << "HandleRequest for " << cq_name << " while after Next";
PADDLE_ENFORCE(tag);
// FIXME(typhoonzero): de-couple the barriers with recv_op
if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
- RequestBase* base = (RequestBase*)tag;
+ RequestBase* base = reinterpret_cast(tag);
// reference:
// https://github.com/tensorflow/tensorflow/issues/5596
// https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
// https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
if (!ok) {
- LOG(WARNING) << cq_name << " recv no regular event:argument name"
- << base->GetReqName();
+ LOG(WARNING) << cq_name << " recv no regular event:argument name["
+ << base->GetReqName() << "]";
TryToRegisterNewOne();
delete base;
continue;
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
index 10e6dd45a901d36de4a6577db4da05551645eb73..b0596d3cd1e108f28e8f1485d6b5c989c55be7e9 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -15,9 +15,12 @@ limitations under the License. */
#pragma once
#include
-#include
+#include
+#include
+#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
@@ -53,6 +56,12 @@ class AsyncGRPCServer final {
void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; }
+ void SetProgram(framework::ProgramDesc *program) { program_ = program; }
+
+ void SetPrefetchBlkdId(int blkid) { prefetch_blk_id_ = blkid; }
+
+ void SetExecutor(framework::Executor *executor) { executor_ = executor; }
+
const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
void Push(const std::string &msg_name) {
@@ -66,6 +75,7 @@ class AsyncGRPCServer final {
std::function TryToRegisterNewOne);
void TryToRegisterNewSendOne();
void TryToRegisterNewGetOne();
+ void TryToRegisterNewPrefetchOne();
void ShutdownQueue();
private:
@@ -73,6 +83,7 @@ class AsyncGRPCServer final {
volatile bool is_shut_down_ = false;
std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_;
std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_;
+ std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_;
GrpcService::AsyncService service_;
std::unique_ptr<::grpc::Server> server_;
@@ -83,6 +94,7 @@ class AsyncGRPCServer final {
// received variable from RPC, operators fetch variable from this queue.
SimpleBlockQueue var_get_queue_;
+ // client send variable to this queue.
ReceivedQueue var_recv_queue_;
// condition of the sub program
@@ -92,6 +104,11 @@ class AsyncGRPCServer final {
std::unique_ptr t_send_;
std::unique_ptr t_get_;
+ std::unique_ptr t_prefetch_;
+
+ int prefetch_blk_id_;
+ framework::ProgramDesc *program_;
+ framework::Executor *executor_;
};
}; // namespace detail
diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1ad62863a1a98c28cb08f47dfa8a5bfae463ba91
--- /dev/null
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+#include
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/grpc_server.h"
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace detail = paddle::operators::detail;
+
+std::unique_ptr rpc_service_;
+
+void StartServer(const std::string& endpoint) {
+ rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+ rpc_service_->RunSyncUpdate();
+}
+
+TEST(PREFETCH, CPU) {
+ // start up a server instance backend
+ // TODO(Yancey1989): Need to start a server with optimize blocks and
+ // prefetch blocks.
+ std::thread server_thread(StartServer, "127.0.0.1:8889");
+ framework::Scope scope;
+ platform::CPUPlace place;
+ platform::CPUDeviceContext ctx(place);
+ // create var on local scope
+ std::string in_var_name("in");
+ std::string out_var_name("out");
+ auto* in_var = scope.Var(in_var_name);
+ auto* in_tensor = in_var->GetMutable();
+ in_tensor->Resize({10, 10});
+ VLOG(3) << "before mutable_data";
+ in_tensor->mutable_data(place);
+
+ scope.Var(out_var_name);
+
+ VLOG(3) << "before fetch";
+ detail::RPCClient client;
+ client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name,
+ out_var_name);
+ client.Wait();
+
+ rpc_service_->ShutDown();
+ server_thread.join();
+ rpc_service_.reset(nullptr);
+}
diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h
index ae6f9db3bd31a4b4839b34e8e53dd87f1ecf4b1d..e6dab2f5a3a4280f3979417c3ca2d884a0b8ff2f 100644
--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
@@ -76,10 +76,11 @@ namespace detail {
enum class GrpcMethod {
kSendVariable,
kGetVariable,
+ kPrefetchVariable,
};
static const int kGrpcNumMethods =
- static_cast(GrpcMethod::kGetVariable) + 1;
+ static_cast(GrpcMethod::kPrefetchVariable) + 1;
inline const char* GrpcMethodName(GrpcMethod id) {
switch (id) {
@@ -87,6 +88,8 @@ inline const char* GrpcMethodName(GrpcMethod id) {
return "/sendrecv.SendRecvService/SendVariable";
case GrpcMethod::kGetVariable:
return "/sendrecv.SendRecvService/GetVariable";
+ case GrpcMethod::kPrefetchVariable:
+ return "/sendrecv.SendRecvService/PrefetchVariable";
}
// Shouldn't be reached.
@@ -114,5 +117,5 @@ class GrpcService final {
};
} // namespace detail
-} // namespace operator
+} // namespace operators
} // namespace paddle
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
index 2d33f026e45c51d9a3812b2391381f74d6fddb29..fc12e82a7e6bd10262092d1ca367980df64e91c2 100644
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -21,6 +21,8 @@ service SendRecvService {
rpc SendVariable(VariableMessage) returns (VoidMessage) {}
// Argument VariableMessage for GetVariable should only contain varname.
rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+ // Prefetch variable by Ids
+ rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
}
// VariableMessage is serialized paddle variable message.
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 11eab6f78f983460c719e144f071da3919cf69ea..91a1f226cd0c96f675bdd59dca809c43b0cedd4f 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -13,22 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include
-#include
#include
-#include
-
-#include
#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/proto_desc.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/grpc_server.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
-#include "paddle/fluid/operators/detail/simple_block_queue.h"
-#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace operators {
@@ -121,6 +112,11 @@ class ListenAndServOp : public framework::OperatorBase {
prepared.begin(),
std::shared_ptr(nullptr));
+ // TODO(qiao) set proper fields for table lookup and update
+ rpc_service_->SetExecutor(&executor);
+ rpc_service_->SetPrefetchBlkdId(0);
+ rpc_service_->SetProgram(program);
+
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false;
// Record received sparse variables, so that
@@ -184,7 +180,8 @@ class ListenAndServOp : public framework::OperatorBase {
ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
&recv_scope);
- VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts;
+ VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts
+ << "(ms)";
// Reset the received sparse variables, the sum operator would not
// sum the input sparse variables which rows is empty at the next
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 50eeadab72e71f39325c5eda69e9a3c3e6517d7d..deabcdc99f819851b2df9bb0c7b05a5b339568f3 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -18,6 +18,22 @@ limitations under the License. */
namespace paddle {
namespace operators {
+static inline framework::OpKernelType ExpectedKernelType(
+ const framework::ExecutionContext& ctx) {
+ auto* table_var = ctx.InputVar("W");
+ if (table_var->IsType()) {
+ return framework::OpKernelType(
+ framework::ToDataType(table_var->Get().type()),
+ ctx.device_context());
+ } else if (table_var->IsType()) {
+ return framework::OpKernelType(
+ framework::ToDataType(table_var->Get().value().type()),
+ ctx.device_context());
+ } else {
+ PADDLE_THROW("W should be LoDTensor or SelectedRows");
+ }
+}
+
class LookupTableOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
@@ -51,9 +67,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
- return framework::OpKernelType(
- framework::ToDataType(ctx.Input("W")->type()),
- ctx.device_context());
+ return ExpectedKernelType(ctx);
}
};
@@ -84,7 +98,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
"If the value is -1, it makes no effect to lookup. "
"Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids.")
- .SetDefault(-1);
+ .SetDefault(kNoPadding);
AddComment(R"DOC(
Lookup Table Operator.
@@ -124,9 +138,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
- return framework::OpKernelType(
- framework::ToDataType(ctx.Input("W")->type()),
- ctx.device_context());
+ return ExpectedKernelType(ctx);
}
};
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index c92ce78eeffb8f1517e61c6d6624d406e04d974d..fff5edda62d4b115605a4cab35ed5457b4db5f21 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
+#include
+#include
+
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
@@ -25,16 +28,37 @@ namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+
+static constexpr int64_t kNoPadding = -1;
+
+inline size_t getIndex(const std::vector &rows, int64_t value) {
+ auto it = std::find(rows.begin(), rows.end(), value);
+ PADDLE_ENFORCE(it != rows.end(), "id should be in rows");
+ return static_cast(std::distance(rows.begin(), it));
+}
template
class LookupTableKernel : public framework::OpKernel {
public:
- void Compute(const framework::ExecutionContext& context) const override {
- auto* table_t = context.Input("W");
- auto* ids_var = context.InputVar("Ids");
- Tensor* output_t = context.Output("Out");
+ void Compute(const framework::ExecutionContext &context) const override {
+ auto *table_var = context.InputVar("W");
+ auto *ids_var = context.InputVar("Ids");
+ Tensor *output_t = context.Output("Out");
+ int64_t padding_idx = context.Attr("padding_idx");
+
+ DDim table_dim;
- int64_t* ids;
+ if (table_var->IsType()) {
+ table_dim = context.Input("W")->dims();
+ } else if (table_var->IsType()) {
+ auto *table_t = context.Input("W");
+ table_dim = table_t->value().dims();
+ } else {
+ PADDLE_THROW("table only support LoDTensor and SelectedRows");
+ }
+
+ int64_t *ids;
int64_t ids_numel;
// The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
@@ -42,39 +66,50 @@ class LookupTableKernel : public framework::OpKernel {
// when Ids's type is SelectedRows, the rows of Ids contains the
// ids to be looked up in W.
if (ids_var->IsType()) {
- auto* ids_t = context.Input("Ids");
- ids = const_cast(ids_t->data());
+ auto *ids_t = context.Input("Ids");
+ ids = const_cast(ids_t->data());
ids_numel = ids_t->numel();
} else if (ids_var->IsType()) {
- auto* ids_t = context.Input("Ids");
- ids = const_cast(ids_t->rows().data());
+ auto *ids_t = context.Input("Ids");
+ ids = const_cast(ids_t->rows().data());
ids_numel = ids_t->rows().size();
- output_t->Resize({ids_numel, table_t->dims()[1]});
+ output_t->Resize({ids_numel, table_dim[1]});
} else {
PADDLE_THROW("Unsupported Variable Type of Ids");
}
- int64_t padding_idx = context.Attr("padding_idx");
+ if (table_var->IsType()) {
+ auto *table_t = context.Input("W");
+ int64_t row_number = table_t->dims()[0];
+ int64_t row_width = table_t->dims()[1];
- int N = table_t->dims()[0];
- int D = table_t->dims()[1];
- auto* table = table_t->data();
- auto* output = output_t->mutable_data(context.GetPlace());
+ auto *table = table_t->data();
+ auto *output = output_t->mutable_data(context.GetPlace());
- if (padding_idx == -1) {
for (int64_t i = 0; i < ids_numel; ++i) {
- PADDLE_ENFORCE_LT(ids[i], N);
- PADDLE_ENFORCE_GE(ids[i], 0);
- memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+ if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+ memset(output + i * row_width, 0, row_width * sizeof(T));
+ } else {
+ PADDLE_ENFORCE_LT(ids[i], row_number);
+ PADDLE_ENFORCE_GE(ids[i], 0);
+ memcpy(output + i * row_width, table + ids[i] * row_width,
+ row_width * sizeof(T));
+ }
}
- } else {
+ } else if (table_var->IsType()) {
+ const auto &table_t = table_var->Get();
+ int64_t row_width = table_t.value().dims()[1];
+ const auto *table = table_t.value().data();
+ auto *output = output_t->mutable_data(context.GetPlace());
+
for (int64_t i = 0; i < ids_numel; ++i) {
- if (ids[i] == padding_idx) {
- memset(output + i * D, 0, D * sizeof(T));
+ if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+ memset(output + i * row_width, 0, row_width * sizeof(T));
} else {
- PADDLE_ENFORCE_LT(ids[i], N);
PADDLE_ENFORCE_GE(ids[i], 0);
- memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+ auto id_index = getIndex(table_t.rows(), ids[i]);
+ memcpy(output + i * row_width, table + id_index * row_width,
+ row_width * sizeof(T));
}
}
}
@@ -84,17 +119,27 @@ class LookupTableKernel : public framework::OpKernel {
template
class LookupTableGradKernel : public framework::OpKernel {
public:
- void Compute(const framework::ExecutionContext& context) const override {
+ void Compute(const framework::ExecutionContext &context) const override {
+ auto *table_var = context.InputVar("W");
+ DDim table_dim;
+ if (table_var->IsType()) {
+ table_dim = context.Input("W")->dims();
+ } else if (table_var->IsType()) {
+ auto *table_t = context.Input("W");
+ table_dim = table_t->value().dims();
+ } else {
+ PADDLE_THROW("table only support LoDTensor and SelectedRows");
+ }
+
bool is_sparse = context.Attr("is_sparse");
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
if (is_sparse) {
- auto* ids = context.Input("Ids");
- auto* table = context.Input("W");
- auto* d_output = context.Input(framework::GradVarName("Out"));
- auto* d_table = context.Output(framework::GradVarName("W"));
+ auto *ids = context.Input("Ids");
+ auto *d_output = context.Input(framework::GradVarName("Out"));
+ auto *d_table = context.Output(framework::GradVarName("W"));
- auto* ids_data = ids->data();
+ auto *ids_data = ids->data();
auto ids_dim = ids->dims();
framework::Vector new_rows;
@@ -104,31 +149,30 @@ class LookupTableGradKernel : public framework::OpKernel {
}
d_table->set_rows(new_rows);
- auto* d_table_value = d_table->mutable_value();
- d_table_value->Resize({ids_dim[0], table->dims()[1]});
+ auto *d_table_value = d_table->mutable_value();
+ d_table_value->Resize({ids_dim[0], table_dim[1]});
d_table_value->mutable_data(context.GetPlace());
- d_table->set_height(table->dims()[0]);
+ d_table->set_height(table_dim[0]);
- auto* d_output_data = d_output->data();
- auto* d_table_data = d_table_value->data();
+ auto *d_output_data = d_output->data();
+ auto *d_table_data = d_table_value->data();
PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
} else {
- auto* ids = context.Input("Ids");
- auto* d_output = context.Input(framework::GradVarName("Out"));
- auto* d_table = context.Output(framework::GradVarName("W"));
- auto* table = context.Input("W");
+ auto *ids = context.Input("Ids");
+ auto *d_output = context.Input(framework::GradVarName("Out"));
+ auto *d_table = context.Output(framework::GradVarName("W"));
- auto* ids_data = ids->data();
+ auto *ids_data = ids->data();
auto ids_dim = ids->dims();
- int N = table->dims()[0];
+ int N = table_dim[0];
int D = d_output->dims()[1];
- auto* d_output_data = d_output->data();
- auto* d_table_data = d_table->mutable_data(context.GetPlace());
+ auto *d_output_data = d_output->data();
+ auto *d_table_data = d_table->mutable_data(context.GetPlace());
memset(d_table_data, 0, d_table->numel() * sizeof(T));
diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc
index 90f6f955cea51ded2dbb2bde459113458d7749a4..a31d64e899df33f16f707e96d7ff7b85eca8d6ea 100644
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
@@ -137,6 +137,8 @@ class NCCLTester : public ::testing::Test {
TEST_F(NCCLTester, ncclInitOp) {}
// ncclAllReduceOp with desc
+// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
+/*
TEST_F(NCCLTester, ncclAllReduceOp) {
std::unique_ptr op2(new f::OpDesc);
op2->SetType("ncclAllReduce");
@@ -184,6 +186,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
}
}
}
+*/
// ncclReduceOp with desc
TEST_F(NCCLTester, ncclReduceOp) {
@@ -236,6 +239,8 @@ TEST_F(NCCLTester, ncclReduceOp) {
}
// ncclBcastOp with desc
+// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
+/*
TEST_F(NCCLTester, ncclBcastOp) {
std::unique_ptr op2(new f::OpDesc);
const int kRoot = 0;
@@ -281,3 +286,4 @@ TEST_F(NCCLTester, ncclBcastOp) {
ASSERT_NEAR(ct[j], result, 1e-5);
}
}
+*/
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 141a3eb93555c32efabc2465dc6daadf41c9d659..f9a8058f2a32b6736d6513b017b761a31ddc2e37 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -20,12 +20,29 @@ namespace paddle {
namespace operators {
namespace reader {
-static constexpr size_t kDoubleBufferSize = 2;
+// 'Double buffer' means we shall maintain two batches of input data at the same
+// time. So the kCacheSize shoul be at least 2.
+static constexpr size_t kCacheSize = 2;
+// There will be two bacthes out of the channel during training:
+// 1. the one waiting to be sent to the channel
+// 2. the one just be received from the channel, which is also being used by
+// subsequent operators.
+// So the channel size should be kChacheSize - 2
+static constexpr size_t kChannelSize = 0; // kCacheSize - 2
class DoubleBufferReader : public framework::DecoratedReader {
public:
struct Item {
Item() : ctx_(nullptr) {}
+ Item(Item&& b) {
+ payloads_ = std::move(b.payloads_);
+ ctx_ = std::move(b.ctx_);
+ }
+ Item& operator=(Item&& b) {
+ payloads_ = std::move(b.payloads_);
+ ctx_ = std::move(b.ctx_);
+ return *this;
+ }
std::vector payloads_;
platform::DeviceContext* ctx_;
@@ -34,42 +51,44 @@ class DoubleBufferReader : public framework::DecoratedReader {
explicit DoubleBufferReader(
ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
: DecoratedReader(reader), place_(target_place) {
- for (size_t i = 0; i < kDoubleBufferSize; ++i) {
- if (platform::is_gpu_place(place_)) {
#ifdef PADDLE_WITH_CUDA
+ for (size_t i = 0; i < kCacheSize; ++i) {
+ if (platform::is_gpu_place(place_)) {
ctxs_.emplace_back(new platform::CUDADeviceContext(
boost::get(place_)));
-#endif
}
}
-
- start_thread();
- }
-
- void start_thread() {
- buffer_ = framework::MakeChannel- (kDoubleBufferSize);
- prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
+#endif
+ StartPrefetcher();
}
+ bool HasNext() const override;
void ReadNext(std::vector* out) override;
void ReInit() override;
- ~DoubleBufferReader() {
- buffer_->Close();
- prefetcher_.join();
- delete buffer_;
+ ~DoubleBufferReader() { EndPrefetcher(); }
+
+ private:
+ void StartPrefetcher() {
+ channel_ = framework::MakeChannel
- (kChannelSize);
+ prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
}
- bool HasNext() const override;
+ void EndPrefetcher() {
+ channel_->Close();
+ if (prefetcher_.joinable()) {
+ prefetcher_.join();
+ }
+ delete channel_;
+ channel_ = nullptr;
+ }
- private:
void PrefetchThreadFunc();
std::thread prefetcher_;
- framework::Channel
- * buffer_;
+ framework::Channel
- * channel_;
platform::Place place_;
std::vector> ctxs_;
- mutable Item local_buffer_;
};
class CreateDoubleBufferReaderOp : public framework::OperatorBase {
@@ -123,70 +142,70 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
}
};
+bool DoubleBufferReader::HasNext() const {
+ while (!channel_->IsClosed() && !channel_->CanReceive()) {
+ }
+ return channel_->CanReceive();
+}
+
void DoubleBufferReader::ReadNext(std::vector* out) {
if (!HasNext()) {
PADDLE_THROW("There is no next data!");
}
- if (local_buffer_.payloads_.empty()) {
- buffer_->Receive(&local_buffer_);
- }
- *out = local_buffer_.payloads_;
- local_buffer_.payloads_.clear();
- if (local_buffer_.ctx_) {
- local_buffer_.ctx_->Wait();
+ Item batch;
+ channel_->Receive(&batch);
+ *out = batch.payloads_;
+ if (batch.ctx_) {
+ batch.ctx_->Wait();
}
}
void DoubleBufferReader::ReInit() {
reader_->ReInit();
- buffer_->Close();
- prefetcher_.join();
- delete buffer_;
- start_thread();
+ EndPrefetcher();
+ StartPrefetcher();
}
void DoubleBufferReader::PrefetchThreadFunc() {
VLOG(5) << "A new prefetch thread starts.";
- size_t gpu_ctx_offset = 0;
+ std::vector> cpu_tensor_cache(kCacheSize);
+ std::vector> gpu_tensor_cache(kCacheSize);
+ size_t cached_tensor_id = 0;
+
while (reader_->HasNext()) {
Item batch;
- reader_->ReadNext(&batch.payloads_);
+ auto& cpu_batch = cpu_tensor_cache[cached_tensor_id];
+ reader_->ReadNext(&cpu_batch);
if (platform::is_gpu_place(place_)) {
- std::vector gpu_batch;
- auto& gpu_ctx = this->ctxs_[gpu_ctx_offset++];
- gpu_ctx_offset %= this->ctxs_.size();
- gpu_batch.resize(batch.payloads_.size());
- for (size_t i = 0; i < batch.payloads_.size(); ++i) {
- framework::TensorCopy(batch.payloads_[i], place_, *gpu_ctx,
- &gpu_batch[i]);
- gpu_batch[i].set_lod(batch.payloads_[i].lod());
+ auto& gpu_batch = gpu_tensor_cache[cached_tensor_id];
+ auto* gpu_ctx = ctxs_[cached_tensor_id].get();
+ gpu_batch.resize(cpu_batch.size());
+ for (size_t i = 0; i < cpu_batch.size(); ++i) {
+ framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]);
+ gpu_batch[i].set_lod(cpu_batch[i].lod());
}
- batch.ctx_ = gpu_ctx.get();
- std::swap(gpu_batch, batch.payloads_);
+ batch.payloads_ = gpu_batch;
+ batch.ctx_ = gpu_ctx;
+ } else {
+ // CPUPlace
+ batch.payloads_ = cpu_batch;
}
+ ++cached_tensor_id;
+ cached_tensor_id %= kCacheSize;
try {
- buffer_->Send(&batch);
+ channel_->Send(&batch);
} catch (paddle::platform::EnforceNotMet e) {
VLOG(5) << "WARNING: The double buffer channel has been closed. The "
"prefetch thread will terminate.";
break;
}
}
- buffer_->Close();
+ channel_->Close();
VLOG(5) << "Prefetch thread terminates.";
}
-bool DoubleBufferReader::HasNext() const {
- if (local_buffer_.payloads_.empty()) {
- bool ok = buffer_->Receive(&local_buffer_);
- return ok;
- } else {
- return true;
- }
-}
-
} // namespace reader
} // namespace operators
} // namespace paddle
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 832509641cc3d5178ff090e05437484d395bfe51..b87b8e6b26cdeb017e700870998a53c1b295988c 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -17,90 +17,66 @@ limitations under the License. */
namespace paddle {
namespace operators {
-class ReshapeOp : public framework::OperatorWithKernel {
- public:
- ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
- const framework::VariableNameMap &outputs,
- const framework::AttributeMap &attrs)
- : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
- void InferShape(framework::InferShapeContext *ctx) const override {
- // input check
- PADDLE_ENFORCE(ctx->HasInput("X"),
- "Input(X) of ReshapeOp should not be null.");
- PADDLE_ENFORCE(ctx->HasOutput("Out"),
- "Output(Out) of ReshapeOp should not be null.");
-
- auto shape = ctx->Attrs().Get>("shape");
- PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
- auto x_dims = ctx->GetInputDim("X");
-
- std::vector neg_dims_idx;
- // set some dimension to -1 if it is unknown
- const int unknown_size = -1;
- for (size_t i = 0; i < shape.size(); ++i) {
- PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size,
- "Each dimension of Attr(shape) must be positive or %d.",
- unknown_size);
- if (shape[i] == unknown_size) {
- neg_dims_idx.push_back(i);
- PADDLE_ENFORCE(neg_dims_idx.size() <= 1,
- "Only one dimension of Attr(shape) can be unknown.");
- }
- }
-
- int64_t capacity =
- std::accumulate(shape.begin(), shape.end(), 1, std::multiplies());
- int64_t in_size = framework::product(x_dims);
- if (neg_dims_idx.size() == 1) {
- // dim infer
- shape[neg_dims_idx[0]] = in_size / (-capacity);
- // recalculate capacity
- capacity = shape[neg_dims_idx[0]] * (-capacity);
- }
- // capacity check
- PADDLE_ENFORCE(capacity == in_size,
- "The size of Input(X) mismatches with Attr(shape).");
- // resize output
- std::vector shape_int64(shape.size(), 0);
- std::transform(shape.begin(), shape.end(), shape_int64.begin(),
- [](int a) { return static_cast(a); });
- auto out_dims = framework::make_ddim(shape_int64);
- ctx->SetOutputDim("Out", out_dims);
- if (shape[0] == x_dims[0]) {
- // Only pass LoD when the first dimension is equal between
- // output and input.
- ctx->ShareLoD("X", /*->*/ "Out");
- }
- }
-};
-
class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
- AddInput("X", "The input tensor of reshape operator.");
- AddOutput("Out", "The output tensor of reshape operator.");
- AddAttr>("shape",
- "(vector) "
- "Target shape of reshape operator.");
+ AddInput("X", "(Tensor). The input tensor of reshape operator.");
+ AddInput("Shape",
+ "(Tensor, optional). If provided, reshape according to "
+ "this given shape. That is to say it has a higher priority than "
+ "the shape attribute, while the shape attribute still should be "
+ "set correctly to gurantee shape inference in compile time.")
+ .AsDispensable();
+ AddOutput("Out", "(Tensor). The output tensor of reshape operator.");
+ AddAttr>(
+ "shape", "(std::vector) Target shape of reshape operator.");
AddAttr("inplace",
- "Change the source tensor's shape without copy memory.")
- .SetDefault(true);
+ "(default: false) Change the source tensor's shape without "
+ "memory copy. When Attr(inplace) is set true, the output "
+ "tensor shares memory with Input(X), otherwise, a new output "
+ "tensor is created, and its data are copied from Input(x).")
+ .SetDefault(false);
AddComment(R"DOC(
Reshape Operator.
-Reshape Input(X) into the shape specified by Attr(shape).
+Reshape Input(X) into the shape specified by Attr(shape) or Input(Shape). The
+data in Input(X) are unchanged.
+
+Examples:
-An example:
-Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]]
+1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [6, 8], the reshape operator will transform Input(X)
+into a 2-D tensor with shape [6, 8] and leaving Input(X)'s data unchanged.
-and target shape = [1, 4], the reshape operator will transform
-the tensor X into a 2-D tensor: [[1, 2, 3, 4]]
+2. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will transform
+Input(X) into a 4-D tensor with shape [2, 3, 4, 2] and leaving Input(X)'s data
+unchanged. In this case, one and only dimension of Attr(shape) can be set to -1,
+the value of this dimension is inferred from the total element number of
+Input(X) and remaining dimensions.
+
+3. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will transform
+Input(X) into a 4-D tensor with shape [2, 4, 3, 2] and leaving Input(X)'s data
+unchanged. In this case, besides -1, 0 means the actual dimension value is going
+to be copied from the corresponding dimension of Input(X).
+
+Note:
+
+1. One and only one dimension in Attr(shape) can be set -1. In this case,
+the actual dimension value will be infered from the total element number of
+Input(X) and remaining dimensions.
+
+2. More than one dimensions in Attr(shape) can be set to 0, which means the real
+dimension value will be copied from Input(X) at runtime. Note that the index of
+0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape
+[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
+
+3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
+Attr(shape) still should be set correctly to gurantee shape inference in
+compile-time.
-One dimension in the target shape can be set -1, representing that its
-size is unknown. In this case, the real dimension will be infered from
-the original shape of Input(X) and other dimensions in the target shape.
)DOC");
}
};
@@ -119,6 +95,14 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
"Input(Out@GRAD) shouldn't be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
+
+ protected:
+ framework::OpKernelType GetExpectedKernelType(
+ const framework::ExecutionContext &ctx) const override {
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input("X")->type()),
+ ctx.device_context());
+ }
};
} // namespace operators
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index eacb0a0cf21a60ffbdef5787434859ac549388bc..871b4d38d56f10f3c0c178caa566508ab75f316c 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -20,17 +20,129 @@ limitations under the License. */
namespace paddle {
namespace operators {
+class ReshapeOp : public framework::OperatorWithKernel {
+ public:
+ ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+ void InferShape(framework::InferShapeContext *ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"),
+ "Input(X) of ReshapeOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Out"),
+ "Output(Out) of ReshapeOp should not be null.");
+
+ const std::vector &shape = ctx->Attrs().Get>("shape");
+ PADDLE_ENFORCE(!shape.empty(),
+ "The shape information must be set by Attr(shape).");
+
+ if (ctx->HasInput("Shape") && ctx->IsRuntime()) {
+ // If true, set the shape of Output(Out) according to Input(Shape) in
+ // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel.
+ ctx->ShareLoD("X", /*->*/ "Out");
+ return;
+ }
+
+ auto x_dims = ctx->GetInputDim("X");
+ auto out_dims = ValidateShape(shape, x_dims);
+ ctx->SetOutputDim("Out", out_dims);
+ if (x_dims[0] == out_dims[0]) {
+ // Only pass LoD when the first dimension of output and Input(X)
+ // are the same.
+ ctx->ShareLoD("X", /*->*/ "Out");
+ }
+ }
+
+ static framework::DDim ValidateShape(const std::vector shape,
+ const framework::DDim &in_dims) {
+ const int64_t in_size = framework::product(in_dims);
+ // only one dimension canbe set to -1, whose size will be automatically
+ // infered.
+ const int64_t unk_dim_val = -1;
+ const int64_t copy_dim_val = 0;
+
+ std::vector output_shape(shape.size(), 0);
+ int64_t capacity = 1;
+ int unk_dim_idx = -1;
+ for (size_t i = 0; i < shape.size(); ++i) {
+ if (shape[i] == unk_dim_val) {
+ PADDLE_ENFORCE(
+ unk_dim_idx == -1,
+ "Only one input dimension of Attr(shape) can be unknown.");
+ unk_dim_idx = i;
+ } else if (shape[i] == copy_dim_val) {
+ PADDLE_ENFORCE(
+ static_cast(i) < in_dims.size(),
+ "The index of dimension to copy from input shape must be less "
+ "than the size of input shape.");
+ } else {
+ PADDLE_ENFORCE(
+ shape[i] > 0,
+ "Each input dimension of Attr(shape) must not be negtive except "
+ "one unknown dimension.");
+ }
+
+ capacity *= (shape[i] ? shape[i] : in_dims[i]);
+ output_shape[i] =
+ (shape[i] ? static_cast(shape[i]) : in_dims[i]);
+ }
+
+ if (unk_dim_idx != -1) {
+ output_shape[unk_dim_idx] = -in_size / capacity;
+ PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
+ "Invalid shape is given.");
+ } else {
+ PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
+ }
+ return framework::make_ddim(output_shape);
+ }
+
+ protected:
+ framework::OpKernelType GetExpectedKernelType(
+ const framework::ExecutionContext &ctx) const override {
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input("X")->type()),
+ ctx.device_context());
+ }
+};
+
template
class ReshapeKernel : public framework::OpKernel {
public:
- void Compute(const framework::ExecutionContext& ctx) const {
- auto* out = ctx.Output("Out");
- auto* in = ctx.Input("X");
+ void Compute(const framework::ExecutionContext &ctx) const {
+ auto *out = ctx.Output("Out");
+ auto *in = ctx.Input("X");
+ auto *shape_tensor = ctx.Input("Shape");
+
+ framework::DDim out_dims = out->dims();
+ if (shape_tensor) {
+ auto *shape_data = shape_tensor->data();
+ if (platform::is_gpu_place(ctx.GetPlace())) {
+ framework::Tensor cpu_shape_tensor;
+ TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(),
+ &cpu_shape_tensor);
+ shape_data = cpu_shape_tensor.data();
+ }
+ auto shape =
+ std::vector(shape_data, shape_data + shape_tensor->numel());
+ out_dims = ReshapeOp::ValidateShape(shape, in->dims());
+ }
+ if (!in->lod().empty()) {
+ PADDLE_ENFORCE_EQ(
+ out_dims[0], in->dims()[0],
+ "Reshape operator cannot reshape an input sequence batch "
+ "into an output sequence batch that has a different "
+ "number of time steps. Please consider using "
+ "sequence_reshape op.");
+ }
+
bool inplace = ctx.Attr("inplace");
- auto out_dims = out->dims();
+ out->Resize(out_dims);
if (!inplace) {
out->mutable_data(ctx.GetPlace());
framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
+ // TensorCopy will resize to in_dims.
out->Resize(out_dims);
} else {
out->ShareDataWith(*in);
@@ -42,9 +154,10 @@ class ReshapeKernel : public framework::OpKernel {
template
class ReshapeGradKernel : public framework::OpKernel {
public:
- void Compute(const framework::ExecutionContext& ctx) const {
- auto* d_out = ctx.Input(framework::GradVarName("Out"));
- auto* d_x = ctx.Output(framework::GradVarName("X"));
+ void Compute(const framework::ExecutionContext &ctx) const {
+ auto *d_out = ctx.Input(framework::GradVarName("Out"));
+ auto *d_x = ctx.Output(framework::GradVarName("X"));
+
d_x->mutable_data(ctx.GetPlace());
bool inplace = ctx.Attr("inplace");
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index e1b1bbec97985aa839c62a0a82b81b020faf0008..b0a3f06a8871b1dc8c6c9d7231dfe2c9764ade3f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -504,10 +504,10 @@ All parameter, weight, gradient are variables in Paddle.
const std::unordered_set ¶ms,
const ProgramDesc &startup_program,
const ProgramDesc &main_program, const std::string &loss_var_name,
- Scope *scope) {
+ Scope *scope, bool allow_op_delay) {
new (&self) ParallelExecutor(num_threads, use_event, places,
params, startup_program, main_program,
- loss_var_name, scope);
+ loss_var_name, scope, allow_op_delay);
})
.def("run", &ParallelExecutor::Run);
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index f916295cd7bc762e2052553b321344845f504648..4885b74e6c6644704cff01dbf49975d6e87ce0c4 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -104,7 +104,9 @@ EOF
# make install should also be test when unittest
make install -j `nproc`
pip install /usr/local/opt/paddle/share/wheels/*.whl
- paddle version
+ if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
+ paddle version
+ fi
fi
}
@@ -183,6 +185,14 @@ EOF
NCCL_DEPS=""
fi
+ if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
+ PADDLE_VERSION="paddle version"
+ CMD='"paddle", "version"'
+ else
+ PADDLE_VERSION="true"
+ CMD='"true"'
+ fi
+
cat >> /paddle/build/Dockerfile <= 0:
orig_var_name = v.name[:suff_idx]
- pserver_program.global_block().create_var(
+ else:
+ orig_var_name = v.name
+ single_trainer_var = pserver_program.global_block().create_var(
name=orig_var_name,
persistable=True,
type=v.type,
dtype=v.dtype,
shape=v.shape)
- for trainer_id in xrange(self.trainers):
- var = pserver_program.global_block().create_var(
- name="%s.trainer_%d" % (orig_var_name, trainer_id),
- persistable=False,
- type=v.type,
- dtype=v.dtype,
- shape=v.shape)
- recv_inputs.append(var)
+ if self.trainers > 1:
+ for trainer_id in xrange(self.trainers):
+ var = pserver_program.global_block().create_var(
+ name="%s.trainer_%d" % (orig_var_name, trainer_id),
+ persistable=False,
+ type=v.type,
+ dtype=v.dtype,
+ shape=v.shape)
+ recv_inputs.append(var)
+ else:
+ recv_inputs.append(single_trainer_var)
# step3
optimize_block = pserver_program.create_block(0)
@@ -511,8 +516,11 @@ class DistributeTranspiler:
def _append_split_op(self, program, gradblocks):
# Split variables that need to be split and append respective ops
+ add_suffix = False
+ if self.trainers > 1:
+ add_suffix = True
var_mapping = self._create_vars_from_blocklist(
- program, gradblocks, add_trainer_suffix=True)
+ program, gradblocks, add_trainer_suffix=add_suffix)
for varname, splited_vars in var_mapping.iteritems():
# variable that don't need to split have empty splited_vars
if len(splited_vars) <= 1:
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 2612fb1ae41986ae0d5c6e942cc3accebcb00e19..54d0a12bcdbb1b6c13e584dd1a3a5d73cddd4af7 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -48,8 +48,7 @@ def as_numpy(tensor):
assert isinstance(tensor, core.LoDTensor)
lod = tensor.lod()
if len(lod) > 0:
- raise RuntimeError(
- "Some of your featched tensors hold LoD information. \
+ raise RuntimeError("Some of your fetched tensors hold LoD information. \
They can not be completely cast to Python ndarray. \
Please set the parameter 'return_numpy' as 'False' to \
return LoDTensor itself directly.")
@@ -180,60 +179,24 @@ def get_program_cache_key(feed, fetch_list):
class Executor(object):
- def __init__(self, places):
- if not isinstance(places, list) and not isinstance(places, tuple):
- places = [places]
-
- act_places = []
- for each in places:
- p = core.Place()
- p.set_place(each)
- act_places.append(p)
-
- # TODO(dzhwinter) : only use the first place
- self.executor = core.Executor(act_places[0])
- self.places = places
+ def __init__(self, place):
+ self.place = place
+ p = core.Place()
+ p.set_place(place)
+ self.executor = core.Executor(p)
self.program_caches = dict()
- def aslodtensor(self, data):
- def accumulate(data):
- if not isinstance(data, list):
- return 1
- return sum([accumulate(sub) for sub in data])
-
- def parselod(data):
- seq_lens = [accumulate(seq) for seq in data]
- cur_len = 0
- lod = [cur_len]
- for l in seq_lens:
- cur_len += l
- lod.append(cur_len)
- return lod
-
- assert len(self.places) != 0
- if not isinstance(data, list):
- # pure tensor case
- tensor = core.LoDTensor()
- tensor.set(data, self.places[0])
- return tensor
- else:
- raise RuntimeError("Current implementation lacks unittests")
- # lodtensor case
- lod = []
- if not isinstance(data[0], list):
- lod.append(parselod(data))
- flattened_data = np.concatenate(data, axis=0).astype("int64")
- else:
- while isinstance(data[0], list):
- lod.append(parselod(seq))
- flattened_data = [item for seq in data for item in seq]
- data = flattened_data
- flattened_data = np.concatenate(data, axis=0).astype("int64")
- flattened_data = flattened_data.reshape([len(flattened_data), 1])
- tensor = core.LoDTensor()
- tensor.set(flattened_data, self.places[0])
- tensor.set_lod(lod)
- return tensor
+ def as_lodtensor(self, data):
+ if isinstance(data, list):
+ raise RuntimeError("Some of your feed data hold LoD information. \
+ They can not be completely cast from a list of Python \
+ ndarray to LoDTensor. Please convert data to LoDTensor \
+ directly before feeding the data.\
+ ")
+ # single tensor case
+ tensor = core.LoDTensor()
+ tensor.set(data, self.place)
+ return tensor
def _get_program_cache(self, program_cache_key):
return self.program_caches.get(program_cache_key, None)
@@ -293,7 +256,7 @@ class Executor(object):
feed_target_name = op.desc.output('Out')[0]
cur_feed = feed[feed_target_name]
if not isinstance(cur_feed, core.LoDTensor):
- cur_feed = self.aslodtensor(cur_feed)
+ cur_feed = self.as_lodtensor(cur_feed)
idx = op.desc.attr('col')
core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
else:
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3e649dc5fd32c4ed8fa6ad273b7be04d552b51ae..a5938fe494265778ef7032c56a8d6d35acd729c5 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -19,7 +19,6 @@ from layer_function_generator import generate_layer_fn
from layer_function_generator import autodoc
from ..layer_helper import LayerHelper
import tensor
-import ops
import nn
import math
@@ -58,7 +57,7 @@ def detection_output(loc,
This operation is to get the detection results by performing following
two steps:
-
+
1. Decode input bounding box predictions according to the prior boxes.
2. Get the final detection results by applying multi-class non maximum
suppression (NMS).
@@ -130,9 +129,9 @@ def detection_output(loc,
target_box=loc,
code_type='decode_center_size')
old_shape = scores.shape
- scores = ops.reshape(x=scores, shape=(-1, old_shape[-1]))
+ scores = nn.reshape(x=scores, shape=(-1, old_shape[-1]))
scores = nn.softmax(input=scores)
- scores = ops.reshape(x=scores, shape=old_shape)
+ scores = nn.reshape(x=scores, shape=old_shape)
scores = nn.transpose(scores, perm=[0, 2, 1])
scores.stop_gradient = True
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
@@ -463,7 +462,7 @@ def ssd_loss(location,
num, num_prior, num_class = confidence.shape
def __reshape_to_2d(var):
- return ops.reshape(x=var, shape=[-1, var.shape[-1]])
+ return nn.reshape(x=var, shape=[-1, var.shape[-1]])
# 1. Find matched boundding box by prior box.
# 1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
@@ -474,7 +473,7 @@ def ssd_loss(location,
# 2. Compute confidence for mining hard examples
# 2.1. Get the target label based on matched indices
- gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, ))
+ gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, ))
gt_label.stop_gradient = True
target_label, _ = target_assign(
gt_label, matched_indices, mismatch_value=background_label)
@@ -487,7 +486,7 @@ def ssd_loss(location,
conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
# 3. Mining hard examples
- conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior))
+ conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior))
conf_loss.stop_gradient = True
neg_indices = helper.create_tmp_variable(dtype='int32')
dtype = matched_indices.dtype
@@ -556,7 +555,7 @@ def ssd_loss(location,
# 5.3 Compute overall weighted loss.
loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
# reshape to [N, Np], N is the batch size and Np is the prior box number.
- loss = ops.reshape(x=loss, shape=[-1, num_prior])
+ loss = nn.reshape(x=loss, shape=[-1, num_prior])
loss = nn.reduce_sum(loss, dim=1, keep_dim=True)
if normalize:
normalizer = nn.reduce_sum(target_loc_weight)
@@ -709,7 +708,7 @@ def multi_box_head(inputs,
new_shape = [
-1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)])
]
- out = ops.reshape(x=input, shape=new_shape)
+ out = nn.reshape(x=input, shape=new_shape)
return out
def _is_list_or_tuple_(data):
@@ -803,7 +802,7 @@ def multi_box_head(inputs,
mbox_loc.shape[0],
mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4
]
- mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape)
+ mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape)
mbox_locs.append(mbox_loc_flatten)
# get conf
@@ -819,7 +818,7 @@ def multi_box_head(inputs,
conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
conf_loc.shape[3] / num_classes, num_classes
]
- conf_loc_flatten = ops.reshape(conf_loc, shape=new_shape)
+ conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape)
mbox_confs.append(conf_loc_flatten)
if len(box_results) == 1:
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0332556f62c46b187bd79841e4969d9da08b57a5..3d13133bf25aa3f538f6f574bd2ae682e1bc7e39 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -73,8 +73,10 @@ __all__ = [
'smooth_l1',
'one_hot',
'autoincreased_step_counter',
+ 'reshape',
'lod_reset',
'lrn',
+ 'pad',
]
@@ -3265,6 +3267,8 @@ def one_hot(input, depth):
The one-hot tensor or LodTensor, same as input.
Examples:
+ .. code-block:: python
+
X is a LoDTensor:
X.lod = [[0, 1, 4]]
X.shape = [4, 1]
@@ -3319,6 +3323,102 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
return counter
+def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
+ """
+ Gives a new shape to the input Tensor without changing its data.
+
+ The target shape can be given by :attr:`shape` or :attr:`actual_shape`.
+ :attr:`shape` is a list of integer while :attr:`actual_shape` is a tensor
+ variable. :attr:`actual_shape` has a higher priority than :attr:`shape`
+ if it is provided, while :attr:`shape` still should be set correctly to
+ gurantee shape inference in compile-time.
+
+ Some tricks exist when specifying the target shape.
+
+ 1. -1 means the value of this dimension is inferred from the total element
+ number of x and remaining dimensions. Thus one and only one dimension can
+ be set -1.
+
+ 2. 0 means the actual dimension value is going to be copied from the
+ corresponding dimension of x. The indice of 0s in shape can not exceed
+ Rank(X).
+
+ Here are some examples to explain it.
+
+ 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+ is [6, 8], the reshape operator will transform x into a 2-D tensor with
+ shape [6, 8] and leaving x's data unchanged.
+
+ 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+ specified is [2, 3, -1, 2], the reshape operator will transform x into a
+ 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
+ case, one dimension of the target shape is set to -1, the value of this
+ dimension is inferred from the total element number of x and remaining
+ dimensions.
+
+ 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+ is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor
+ with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case,
+ besides -1, 0 means the actual dimension value is going to be copied from
+ the corresponding dimension of x.
+
+ Args:
+ input(variable): The input tensor.
+ shape(list): The new shape. At most one dimension of the new shape can
+ be -1.
+ actual_shape(variable): An optional input. If provided, reshape
+ according to this given shape rather than
+ :attr:`shape` specifying shape. That is to
+ say :attr:`actual_shape` has a higher priority
+ than :attr:`shape`.
+ act (str): The non-linear activation to be applied to output variable.
+ inplace(bool): If this flag is set true, a new output tensor is created
+ whose data is copied from input x, otherwise the output
+ shares data with input without copying.
+
+ Returns(variable): The output tensor.
+
+ Examples:
+ .. code-block:: python
+
+ data = fluid.layers.data(
+ name='data', shape=[2, 4, 6], dtype='float32')
+ reshaped = fluid.layers.reshape(
+ x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True)
+ """
+
+ if not (isinstance(shape, list) or isinstance(shape, tuple)):
+ raise ValueError("Input shape must be a python lsit or tuple.")
+
+ # Validate the shape
+ unk_dim_idx = -1
+ for dim_idx, dim_size in enumerate(shape):
+ if dim_size == -1:
+ assert unk_dim_idx == -1, (
+ "Only one dimension in shape can be unknown.")
+ unk_dim_idx = dim_idx
+ elif dim_size == 0:
+ assert dim_idx < len(x.shape), (
+ "The indice of 0s in shape can not exceed Rank(X).")
+ else:
+ assert dim_size > 0, (
+ "Each dimension size given in shape must not be negtive "
+ "except one unknown dimension.")
+
+ helper = LayerHelper("reshape", **locals())
+ reshaped = helper.create_tmp_variable(dtype=x.dtype)
+ helper.append_op(
+ type="reshape",
+ inputs={"X": x,
+ "Shape": actual_shape}
+ if isinstance(actual_shape, Variable) else {"X": x},
+ attrs={"shape": shape,
+ "inplace": inplace},
+ outputs={"Out": reshaped})
+
+ return helper.append_activation(reshaped)
+
+
def lod_reset(x, y=None, target_lod=None):
"""
LoD Reset Operator. Set LoD of **x** to a new one specified by **y** or
@@ -3482,3 +3582,62 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
"beta": beta})
return lrn_out
+
+
+def pad(x, paddings, pad_value=0., name=None):
+ """
+ Pads a tensor with a constant value given by :attr:`pad_value`, and the
+ padded width is specified by :attr:`paddings`.
+
+ Specifically, the number of values padded before the contents of :attr:`x`
+ in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
+ of values padded after the contents of :attr:`x` in dimension :attr:`i` is
+ indicated by :attr:`paddings[i+1]`.
+
+ See below for an example.
+
+ .. code-block:: text
+
+ Given:
+ x = [[1, 2], [3, 4]]
+
+ paddings = [0, 1, 1, 2]
+
+ pad_value = 0
+
+ Return:
+
+ out = [[0, 1, 2, 0, 0]
+ [0, 3, 4, 0, 0]
+ [0, 0, 0, 0, 0]]
+
+ Args:
+ x (Variable): The input tensor variable.
+ paddings (list): A list of integers. Its elements specify the padded
+ width before and after for each dimension in turn.
+ The length of :attr:paddings must be
+ :math:`rank(x) \\times 2`.
+ pad_value (float): The constant value used to pad.
+ name(str|None): A name for this layer(optional). If set None, the layer
+ will be named automatically.
+
+ Returns:
+ Variable: The padded tensor variable.
+
+ Examples:
+ .. code-block:: python
+
+ # x is a rank 2 tensor variable.
+ out = fluid.layers.pad(
+ x=x, paddings=[0, 1, 1, 2], pad_value=0.)
+ """
+ helper = LayerHelper('pad', input=x, **locals())
+ dtype = helper.input_dtype()
+ out = helper.create_tmp_variable(dtype)
+ helper.append_op(
+ type='pad',
+ inputs={'X': x},
+ outputs={'Out': out},
+ attrs={'paddings': paddings,
+ 'pad_value': float(pad_value)})
+ return out
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 0e5987ee598158d189db8bc956b7e7fea2517554..a9fe25744cc0b385479c9366af1b731ec221dd5a 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -49,7 +49,6 @@ __activations__ = [
__all__ = [
'mean',
'mul',
- 'reshape',
'scale',
'sigmoid_cross_entropy_with_logits',
'elementwise_add',
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 5e0588fa73241a8752e1b3195a123820165f070d..a2c830b3c943b114f3024f23f73f78bf87e1da34 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -21,7 +21,11 @@ __all__ = ['ParallelExecutor']
class ParallelExecutor(object):
- def __init__(self, loss_name, use_cuda, num_threads=None):
+ def __init__(self,
+ loss_name,
+ use_cuda,
+ num_threads=None,
+ allow_op_delay=False):
places = []
if use_cuda:
for i in xrange(core.get_cuda_device_count()):
@@ -35,7 +39,12 @@ class ParallelExecutor(object):
places.append(p)
if num_threads is None:
- num_threads = min(len(places) * 2, multiprocessing.cpu_count())
+ if use_cuda:
+ # Experiments on se-resnext shows that too many threads hurt
+ # performance. Worth tunning for other models in the future.
+ num_threads = len(places)
+ else:
+ min(len(places) * 2, multiprocessing.cpu_count())
startup = framework.default_startup_program()
main = framework.default_main_program()
@@ -52,7 +61,8 @@ class ParallelExecutor(object):
startup.desc,
main.desc,
loss_name,
- scope)
+ scope,
+ allow_op_delay)
self.scope = scope
def run(self, fetch_list):
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 0ad273c7161977e18f91f952fd3a9dc144bf73f0..1b2d29a47fd050e40f83443432f8194984c71214 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -29,6 +29,7 @@ function(py_test_modules TARGET_NAME)
endfunction()
# test time consuming OPs in a separate process for expliot parallism
+list(REMOVE_ITEM TEST_OPS test_parallel_executor)
list(REMOVE_ITEM TEST_OPS test_warpctc_op)
list(REMOVE_ITEM TEST_OPS test_dyn_rnn)
list(REMOVE_ITEM TEST_OPS test_mul_op)
@@ -64,6 +65,7 @@ else()
endif(WITH_FAST_BUNDLE_TEST)
# tests with high overhead
+py_test_modules(test_parallel_executor MODULES test_parallel_executor)
py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn)
py_test_modules(test_mul_op MODULES test_mul_op)
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 8393f7827b1c7d361ebea72f2cfc6033268772f0..299ab8e51f017e1980a8b40e3830fc42b1ff7ccc 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -334,7 +334,7 @@ class OpTest(unittest.TestCase):
np.allclose(
actual_t, expect_t, atol=atol),
"Output (" + out_name + ") has diff at " + str(place) +
- str(actual_t) + str(expect_t))
+ str(actual_t) + "\n" + str(expect_t))
if isinstance(expect, tuple):
self.assertListEqual(actual.lod(), expect[1],
"Output (" + out_name +
@@ -568,6 +568,6 @@ class OpTest(unittest.TestCase):
fetch_list = [g for p, g in param_grad_list]
executor = Executor(place)
- return map(
- np.array,
- executor.run(prog, feed_dict, fetch_list, return_numpy=False))
+ return map(np.array,
+ executor.run(prog, feed_dict, fetch_list,
+ return_numpy=False))
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
index ed920ad388ff0e01887404e70fe82565b4cd28fa..3f739afd2516fdc2bdf3711d4780a1196c6f3f13 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
@@ -96,5 +96,47 @@ class TestLookupTableIdsIsSelectedRows(OpTest):
self.check_with_place(place)
+class TestLookupTableWIsSelectedRows(OpTest):
+ def check_with_place(self, place):
+ scope = core.Scope()
+
+ # create and initialize Id Variable
+ ids_tensor = scope.var('Ids').get_tensor()
+ ids_array = np.array([[0], [4], [3], [5]]).astype("int64")
+ ids_tensor.set(ids_array, place)
+
+ # create and initialize W Variable
+ rows = [0, 1, 2, 3, 4, 5, 6]
+ row_numel = 12
+
+ w_selected_rows = scope.var('W').get_selected_rows()
+ w_selected_rows.set_height(len(rows))
+ w_selected_rows.set_rows(rows)
+ w_array = np.ones((len(rows), row_numel)).astype("float32")
+ for i in range(len(rows)):
+ w_array[i] *= i
+ ids_tensor = w_selected_rows.get_tensor()
+ ids_tensor.set(w_array, place)
+
+ # create Out Variable
+ Out_tensor = scope.var('Out').get_tensor()
+
+ # create and run lookup_table operator
+ lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
+ lookup_table.run(scope, place)
+
+ # get result from Out
+ result_array = np.array(Out_tensor)
+ # all(): return True if all elements of the iterable are true (or if the iterable is empty)
+ for idx, row in enumerate(ids_array):
+ assert (row[0] == result_array[idx]).all()
+
+ def test_w_is_selected_rows(self):
+ places = [core.CPUPlace()]
+ # currently only support CPU
+ for place in places:
+ self.check_with_place(place)
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
old mode 100755
new mode 100644
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index 95d0f9da47e97e94ff97eb3647ac5244d5409ca3..a79e4b3e183eaef06be27a724893799923e84ac1 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -135,18 +135,18 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
-def SE_ResNeXt152(batch_size=4):
+def SE_ResNeXt152Small(batch_size=2):
img = fluid.layers.fill_constant(
shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
label = fluid.layers.fill_constant(
shape=[batch_size, 1], dtype='int64', value=0.0)
conv = conv_bn_layer(
- input=img, num_filters=64, filter_size=3, stride=2, act='relu')
+ input=img, num_filters=16, filter_size=3, stride=2, act='relu')
conv = conv_bn_layer(
- input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
+ input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
conv = conv_bn_layer(
- input=conv, num_filters=128, filter_size=3, stride=1, act='relu')
+ input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
@@ -184,7 +184,8 @@ class TestParallelExecutorBase(unittest.TestCase):
method,
memory_opt=True,
iter=10,
- batch_size=None):
+ batch_size=None,
+ allow_op_delay=False):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
@@ -194,7 +195,10 @@ class TestParallelExecutorBase(unittest.TestCase):
if memory_opt:
fluid.memory_optimize(main)
- exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
+ exe = fluid.ParallelExecutor(
+ loss_name=loss.name,
+ use_cuda=True,
+ allow_op_delay=allow_op_delay)
if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count()
begin = time.time()
@@ -222,7 +226,7 @@ class TestMNIST(TestParallelExecutorBase):
def setUpClass(cls):
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
- reader = paddle.batch(mnist.train(), batch_size=32)
+ reader = paddle.batch(mnist.train(), batch_size=4)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
@@ -236,9 +240,11 @@ class TestMNIST(TestParallelExecutorBase):
def test_simple_fc(self):
self.check_network_convergence(simple_fc_net)
+ self.check_network_convergence(simple_fc_net, allow_op_delay=True)
def test_batchnorm_fc(self):
self.check_network_convergence(fc_with_batchnorm)
+ self.check_network_convergence(fc_with_batchnorm, allow_op_delay=True)
class TestResnet(TestParallelExecutorBase):
@@ -262,10 +268,10 @@ class TestResnet(TestParallelExecutorBase):
def test_resnet(self):
import functools
- batch_size = 4
+ batch_size = 2
self.check_network_convergence(
functools.partial(
- SE_ResNeXt152, batch_size=batch_size),
+ SE_ResNeXt152Small, batch_size=batch_size),
iter=20,
batch_size=batch_size)
diff --git a/python/paddle/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_recv_op.py
index 854238c6279528d8f3adf173140a47e233134f43..2ebceca7e4b7b824194d94180462870e6cfe6d21 100644
--- a/python/paddle/fluid/tests/unittests/test_recv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recv_op.py
@@ -23,7 +23,7 @@ import time
class TestRecvOp(unittest.TestCase):
- def test_send(self):
+ def no_test_send(self):
# Run init_serv in a thread
place = fluid.CPUPlace()
p = Process(target=self.init_serv, args=(place, ))
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 11f35c74d41146118525a5efa6c211d528e255fe..f51b5a7e9907294a5b91c920a363830d8b9a7137 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -14,15 +14,19 @@
import unittest
import numpy as np
+
from op_test import OpTest
class TestReshapeOp(OpTest):
def setUp(self):
+ ori_shape = (2, 25)
+ new_shape = (5, 10)
+
self.op_type = "reshape"
- self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
- self.attrs = {'shape': [10 * 20]}
- self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+ self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+ self.attrs = {"shape": new_shape, "inplace": False}
+ self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
def test_check_output(self):
self.check_output()
@@ -31,12 +35,33 @@ class TestReshapeOp(OpTest):
self.check_grad(["X"], "Out")
-class TestReshapeOpDimInfer(OpTest):
+class TestReshapeOpDimInfer1(OpTest):
def setUp(self):
+ ori_shape = (5, 10)
+ new_shape = (5, -1, 5)
+
self.op_type = "reshape"
- self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
- self.attrs = {'shape': [4, -1, 5]}
- self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+ self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+ self.attrs = {"shape": new_shape, "inplace": False}
+ self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
+
+ def test_check_output(self):
+ self.check_output()
+
+ def test_check_grad(self):
+ self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpDimInfer2(OpTest):
+ def setUp(self):
+ ori_shape = (2, 2, 6)
+ new_shape = (2, 0, 3, -1)
+ infered_shape = (2, 2, 3, -1)
+
+ self.op_type = "reshape"
+ self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+ self.attrs = {"shape": new_shape, "inplace": False}
+ self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
def test_check_output(self):
self.check_output()
@@ -47,10 +72,30 @@ class TestReshapeOpDimInfer(OpTest):
class TestReshapeOpInplace(OpTest):
def setUp(self):
+ ori_shape = (2, 25)
+ new_shape = (5, 10)
+
+ self.op_type = "reshape"
+ self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+ self.attrs = {"shape": new_shape}
+ self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+ def test_check_output(self):
+ self.check_output()
+
+ def test_check_grad(self):
+ self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpDimInferInplace1(OpTest):
+ def setUp(self):
+ ori_shape = (5, 10)
+ new_shape = (5, -1, 5)
+
self.op_type = "reshape"
- self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
- self.attrs = {'shape': [10 * 20], 'inplace': True}
- self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+ self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+ self.attrs = {"shape": new_shape}
+ self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
def test_check_output(self):
self.check_output()
@@ -59,12 +104,38 @@ class TestReshapeOpInplace(OpTest):
self.check_grad(["X"], "Out")
-class TestReshapeOpDimInferInplace(OpTest):
+class TestReshapeOpDimInferInplace2(OpTest):
def setUp(self):
+ ori_shape = (2, 2, 6)
+ new_shape = (2, 0, 3, -1)
+ infered_shape = (2, 2, 3, -1)
+
+ self.op_type = "reshape"
+ self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+ self.attrs = {"shape": new_shape}
+ self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
+
+ def test_check_output(self):
+ self.check_output()
+
+ def test_check_grad(self):
+ self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpWithInputShape(OpTest):
+ def setUp(self):
+ ori_shape = (6, 5)
+ new_shape = (0, -1, 5)
+ actual_shape = (2, 3, 5)
+
self.op_type = "reshape"
- self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
- self.attrs = {'shape': [4, -1, 5], 'inplace': True}
- self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+ self.inputs = {
+ "X": np.random.random(ori_shape).astype("float32"),
+ "Shape": np.array(
+ actual_shape, dtype="int32")
+ }
+ self.attrs = {"shape": new_shape}
+ self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)}
def test_check_output(self):
self.check_output()
@@ -73,5 +144,5 @@ class TestReshapeOpDimInferInplace(OpTest):
self.check_grad(["X"], "Out")
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
old mode 100755
new mode 100644
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 02b0d077eefa431baed05c421a367ebe3581626c..df710c33d0c0ca16d358dac1eb42327e9cd4c7ae 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -22,13 +22,17 @@ import data_type
import topology
import networks
import evaluator
+from . import dataset
+from . import reader
from . import plot
import attr
import op
import pooling
import inference
import networks
+import minibatch
import plot
+import image
import paddle.trainer.config_parser as cp
__all__ = [
@@ -44,11 +48,14 @@ __all__ = [
'data_type',
'attr',
'pooling',
+ 'dataset',
+ 'reader',
'topology',
'networks',
'infer',
'plot',
'evaluator',
+ 'image',
'master',
]
@@ -146,3 +153,4 @@ def init(**kwargs):
infer = inference.infer
+batch = minibatch.batch
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38056fe0a9496bcb5de76634bbab267e324dc2a4
--- /dev/null
+++ b/python/paddle/v2/dataset/__init__.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Dataset package.
+"""
+
+import mnist
+import imikolov
+import imdb
+import cifar
+import movielens
+import conll05
+import uci_housing
+import sentiment
+import wmt14
+import wmt16
+import mq2007
+import flowers
+import voc2012
+
+__all__ = [
+ 'mnist',
+ 'imikolov',
+ 'imdb',
+ 'cifar',
+ 'movielens',
+ 'conll05',
+ 'sentiment',
+ 'uci_housing',
+ 'wmt14',
+ 'wmt16',
+ 'mq2007',
+ 'flowers',
+ 'voc2012',
+]
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a2a1ced11ee5cb2fb407b229ce810d553c2fa46
--- /dev/null
+++ b/python/paddle/v2/dataset/cifar.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CIFAR dataset.
+
+This module will download dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
+paddle reader creators.
+
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
+
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
+
+"""
+
+import cPickle
+import itertools
+import numpy
+import paddle.v2.dataset.common
+import tarfile
+
+__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
+
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+
+
+def reader_creator(filename, sub_name):
+ def read_batch(batch):
+ data = batch['data']
+ labels = batch.get('labels', batch.get('fine_labels', None))
+ assert labels is not None
+ for sample, label in itertools.izip(data, labels):
+ yield (sample / 255.0).astype(numpy.float32), int(label)
+
+ def reader():
+ with tarfile.open(filename, mode='r') as f:
+ names = (each_item.name for each_item in f
+ if sub_name in each_item.name)
+
+ for name in names:
+ batch = cPickle.load(f.extractfile(name))
+ for item in read_batch(batch):
+ yield item
+
+ return reader
+
+
+def train100():
+ """
+ CIFAR-100 training set creator.
+
+ It returns a reader creator, each sample in the reader is image pixels in
+ [0, 1] and label in [0, 99].
+
+ :return: Training reader creator
+ :rtype: callable
+ """
+ return reader_creator(
+ paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+ 'train')
+
+
+def test100():
+ """
+ CIFAR-100 test set creator.
+
+ It returns a reader creator, each sample in the reader is image pixels in
+ [0, 1] and label in [0, 9].
+
+ :return: Test reader creator.
+ :rtype: callable
+ """
+ return reader_creator(
+ paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+ 'test')
+
+
+def train10():
+ """
+ CIFAR-10 training set creator.
+
+ It returns a reader creator, each sample in the reader is image pixels in
+ [0, 1] and label in [0, 9].
+
+ :return: Training reader creator
+ :rtype: callable
+ """
+ return reader_creator(
+ paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+ 'data_batch')
+
+
+def test10():
+ """
+ CIFAR-10 test set creator.
+
+ It returns a reader creator, each sample in the reader is image pixels in
+ [0, 1] and label in [0, 9].
+
+ :return: Test reader creator.
+ :rtype: callable
+ """
+ return reader_creator(
+ paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+ 'test_batch')
+
+
+def fetch():
+ paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+ paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+
+
+def convert(path):
+ """
+ Converts dataset to recordio format
+ """
+ paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
+ paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
+ paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
+ paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6ff09a1d1e3ca56877e986c3ed3ae9ecd0a7316
--- /dev/null
+++ b/python/paddle/v2/dataset/common.py
@@ -0,0 +1,236 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import hashlib
+import os
+import errno
+import shutil
+import sys
+import importlib
+import paddle.v2.dataset
+import cPickle
+import glob
+import cPickle as pickle
+
+__all__ = [
+ 'DATA_HOME',
+ 'download',
+ 'md5file',
+ 'split',
+ 'cluster_files_reader',
+ 'convert',
+]
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
+
+
+# When running unit tests, there could be multiple processes that
+# trying to create DATA_HOME directory simultaneously, so we cannot
+# use a if condition to check for the existence of the directory;
+# instead, we use the filesystem as the synchronization mechanism by
+# catching returned errors.
+def must_mkdirs(path):
+ try:
+ os.makedirs(DATA_HOME)
+ except OSError as exc:
+ if exc.errno != errno.EEXIST:
+ raise
+ pass
+
+
+must_mkdirs(DATA_HOME)
+
+
+def md5file(fname):
+ hash_md5 = hashlib.md5()
+ f = open(fname, "rb")
+ for chunk in iter(lambda: f.read(4096), b""):
+ hash_md5.update(chunk)
+ f.close()
+ return hash_md5.hexdigest()
+
+
+def download(url, module_name, md5sum, save_name=None):
+ dirname = os.path.join(DATA_HOME, module_name)
+ if not os.path.exists(dirname):
+ os.makedirs(dirname)
+
+ filename = os.path.join(dirname,
+ url.split('/')[-1]
+ if save_name is None else save_name)
+
+ retry = 0
+ retry_limit = 3
+ while not (os.path.exists(filename) and md5file(filename) == md5sum):
+ if os.path.exists(filename):
+ print "file md5", md5file(filename), md5sum
+ if retry < retry_limit:
+ retry += 1
+ else:
+ raise RuntimeError("Cannot download {0} within retry limit {1}".
+ format(url, retry_limit))
+ print "Cache file %s not found, downloading %s" % (filename, url)
+ r = requests.get(url, stream=True)
+ total_length = r.headers.get('content-length')
+
+ if total_length is None:
+ with open(filename, 'w') as f:
+ shutil.copyfileobj(r.raw, f)
+ else:
+ with open(filename, 'w') as f:
+ dl = 0
+ total_length = int(total_length)
+ for data in r.iter_content(chunk_size=4096):
+ dl += len(data)
+ f.write(data)
+ done = int(50 * dl / total_length)
+ sys.stdout.write("\r[%s%s]" % ('=' * done,
+ ' ' * (50 - done)))
+ sys.stdout.flush()
+
+ return filename
+
+
+def fetch_all():
+ for module_name in filter(lambda x: not x.startswith("__"),
+ dir(paddle.v2.dataset)):
+ if "fetch" in dir(
+ importlib.import_module("paddle.v2.dataset.%s" % module_name)):
+ getattr(
+ importlib.import_module("paddle.v2.dataset.%s" % module_name),
+ "fetch")()
+
+
+def fetch_all_recordio(path):
+ for module_name in filter(lambda x: not x.startswith("__"),
+ dir(paddle.v2.dataset)):
+ if "convert" in dir(
+ importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
+ not module_name == "common":
+ ds_path = os.path.join(path, module_name)
+ must_mkdirs(ds_path)
+ getattr(
+ importlib.import_module("paddle.v2.dataset.%s" % module_name),
+ "convert")(ds_path)
+
+
+def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
+ """
+ you can call the function as:
+
+ split(paddle.v2.dataset.cifar.train10(), line_count=1000,
+ suffix="imikolov-train-%05d.pickle")
+
+ the output files as:
+
+ |-imikolov-train-00000.pickle
+ |-imikolov-train-00001.pickle
+ |- ...
+ |-imikolov-train-00480.pickle
+
+ :param reader: is a reader creator
+ :param line_count: line count for each file
+ :param suffix: the suffix for the output files, should contain "%d"
+ means the id for each file. Default is "%05d.pickle"
+ :param dumper: is a callable function that dump object to file, this
+ function will be called as dumper(obj, f) and obj is the object
+ will be dumped, f is a file object. Default is cPickle.dump.
+ """
+ if not callable(dumper):
+ raise TypeError("dumper should be callable.")
+ lines = []
+ indx_f = 0
+ for i, d in enumerate(reader()):
+ lines.append(d)
+ if i >= line_count and i % line_count == 0:
+ with open(suffix % indx_f, "w") as f:
+ dumper(lines, f)
+ lines = []
+ indx_f += 1
+ if lines:
+ with open(suffix % indx_f, "w") as f:
+ dumper(lines, f)
+
+
+def cluster_files_reader(files_pattern,
+ trainer_count,
+ trainer_id,
+ loader=cPickle.load):
+ """
+ Create a reader that yield element from the given files, select
+ a file set according trainer count and trainer_id
+
+ :param files_pattern: the files which generating by split(...)
+ :param trainer_count: total trainer count
+ :param trainer_id: the trainer rank id
+ :param loader: is a callable function that load object from file, this
+ function will be called as loader(f) and f is a file object.
+ Default is cPickle.load
+ """
+
+ def reader():
+ if not callable(loader):
+ raise TypeError("loader should be callable.")
+ file_list = glob.glob(files_pattern)
+ file_list.sort()
+ my_file_list = []
+ for idx, fn in enumerate(file_list):
+ if idx % trainer_count == trainer_id:
+ print "append file: %s" % fn
+ my_file_list.append(fn)
+ for fn in my_file_list:
+ with open(fn, "r") as f:
+ lines = loader(f)
+ for line in lines:
+ yield line
+
+ return reader
+
+
+def convert(output_path, reader, line_count, name_prefix):
+ import recordio
+ """
+ Convert data from reader to recordio format files.
+
+ :param output_path: directory in which output files will be saved.
+ :param reader: a data reader, from which the convert program will read
+ data instances.
+ :param name_prefix: the name prefix of generated files.
+ :param max_lines_to_shuffle: the max lines numbers to shuffle before
+ writing.
+ """
+
+ assert line_count >= 1
+ indx_f = 0
+
+ def write_data(indx_f, lines):
+ filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
+ writer = recordio.writer(filename)
+ for l in lines:
+ # FIXME(Yancey1989):
+ # dumps with protocol: pickle.HIGHEST_PROTOCOL
+ writer.write(cPickle.dumps(l))
+ writer.close()
+
+ lines = []
+ for i, d in enumerate(reader()):
+ lines.append(d)
+ if i % line_count == 0 and i >= line_count:
+ write_data(indx_f, lines)
+ lines = []
+ indx_f += 1
+ continue
+
+ write_data(indx_f, lines)
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d544efac9cd20157f87b5cd3b68f97ab5ed2dbc
--- /dev/null
+++ b/python/paddle/v2/dataset/conll05.py
@@ -0,0 +1,257 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Conll05 dataset.
+Paddle semantic role labeling Book and demo use this dataset as an example.
+Because Conll05 is not free in public, the default downloaded URL is test set
+of Conll05 (which is public). Users can change URL and MD5 to their Conll
+dataset. And a pre-trained word vector model based on Wikipedia corpus is used
+to initialize SRL model.
+"""
+
+import tarfile
+import gzip
+import itertools
+import paddle.v2.dataset.common
+
+__all__ = ['test, get_dict', 'get_embedding', 'convert']
+
+DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
+DATA_MD5 = '387719152ae52d60422c016e92a742fc'
+WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
+WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
+VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
+VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
+TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
+TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
+EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
+EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
+
+UNK_IDX = 0
+
+
+def load_label_dict(filename):
+ d = dict()
+ tag_dict = set()
+ with open(filename, 'r') as f:
+ for i, line in enumerate(f):
+ line = line.strip()
+ if line.startswith("B-"):
+ tag_dict.add(line[2:])
+ elif line.startswith("I-"):
+ tag_dict.add(line[2:])
+ index = 0
+ for tag in tag_dict:
+ d["B-" + tag] = index
+ index += 1
+ d["I-" + tag] = index
+ index += 1
+ d["O"] = index
+ return d
+
+
+def load_dict(filename):
+ d = dict()
+ with open(filename, 'r') as f:
+ for i, line in enumerate(f):
+ d[line.strip()] = i
+ return d
+
+
+def corpus_reader(data_path, words_name, props_name):
+ """
+ Read one corpus. It returns an iterator. Each element of
+ this iterator is a tuple including sentence and labels. The sentence is
+ consist of a list of word IDs. The labels include a list of label IDs.
+ :return: a iterator of data.
+ :rtype: iterator
+ """
+
+ def reader():
+ tf = tarfile.open(data_path)
+ wf = tf.extractfile(words_name)
+ pf = tf.extractfile(props_name)
+ with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
+ fileobj=pf) as props_file:
+ sentences = []
+ labels = []
+ one_seg = []
+ for word, label in itertools.izip(words_file, props_file):
+ word = word.strip()
+ label = label.strip().split()
+
+ if len(label) == 0: # end of sentence
+ for i in xrange(len(one_seg[0])):
+ a_kind_lable = [x[i] for x in one_seg]
+ labels.append(a_kind_lable)
+
+ if len(labels) >= 1:
+ verb_list = []
+ for x in labels[0]:
+ if x != '-':
+ verb_list.append(x)
+
+ for i, lbl in enumerate(labels[1:]):
+ cur_tag = 'O'
+ is_in_bracket = False
+ lbl_seq = []
+ verb_word = ''
+ for l in lbl:
+ if l == '*' and is_in_bracket == False:
+ lbl_seq.append('O')
+ elif l == '*' and is_in_bracket == True:
+ lbl_seq.append('I-' + cur_tag)
+ elif l == '*)':
+ lbl_seq.append('I-' + cur_tag)
+ is_in_bracket = False
+ elif l.find('(') != -1 and l.find(')') != -1:
+ cur_tag = l[1:l.find('*')]
+ lbl_seq.append('B-' + cur_tag)
+ is_in_bracket = False
+ elif l.find('(') != -1 and l.find(')') == -1:
+ cur_tag = l[1:l.find('*')]
+ lbl_seq.append('B-' + cur_tag)
+ is_in_bracket = True
+ else:
+ raise RuntimeError('Unexpected label: %s' %
+ l)
+
+ yield sentences, verb_list[i], lbl_seq
+
+ sentences = []
+ labels = []
+ one_seg = []
+ else:
+ sentences.append(word)
+ one_seg.append(label)
+
+ pf.close()
+ wf.close()
+ tf.close()
+
+ return reader
+
+
+def reader_creator(corpus_reader,
+ word_dict=None,
+ predicate_dict=None,
+ label_dict=None):
+ def reader():
+ for sentence, predicate, labels in corpus_reader():
+
+ sen_len = len(sentence)
+
+ verb_index = labels.index('B-V')
+ mark = [0] * len(labels)
+ if verb_index > 0:
+ mark[verb_index - 1] = 1
+ ctx_n1 = sentence[verb_index - 1]
+ else:
+ ctx_n1 = 'bos'
+
+ if verb_index > 1:
+ mark[verb_index - 2] = 1
+ ctx_n2 = sentence[verb_index - 2]
+ else:
+ ctx_n2 = 'bos'
+
+ mark[verb_index] = 1
+ ctx_0 = sentence[verb_index]
+
+ if verb_index < len(labels) - 1:
+ mark[verb_index + 1] = 1
+ ctx_p1 = sentence[verb_index + 1]
+ else:
+ ctx_p1 = 'eos'
+
+ if verb_index < len(labels) - 2:
+ mark[verb_index + 2] = 1
+ ctx_p2 = sentence[verb_index + 2]
+ else:
+ ctx_p2 = 'eos'
+
+ word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
+
+ ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+ ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+ ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
+ ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+ pred_idx = [predicate_dict.get(predicate)] * sen_len
+ label_idx = [label_dict.get(w) for w in labels]
+
+ yield word_idx, ctx_n2_idx, ctx_n1_idx, \
+ ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx
+
+ return reader
+
+
+def get_dict():
+ """
+ Get the word, verb and label dictionary of Wikipedia corpus.
+ """
+ word_dict = load_dict(
+ paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
+ WORDDICT_MD5))
+ verb_dict = load_dict(
+ paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
+ VERBDICT_MD5))
+ label_dict = load_label_dict(
+ paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
+ TRGDICT_MD5))
+ return word_dict, verb_dict, label_dict
+
+
+def get_embedding():
+ """
+ Get the trained word vector based on Wikipedia corpus.
+ """
+ return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+
+
+def test():
+ """
+ Conll05 test set creator.
+
+ Because the training dataset is not free, the test dataset is used for
+ training. It returns a reader creator, each sample in the reader is nine
+ features, including sentence sequence, predicate, predicate context,
+ predicate context flag and tagged sequence.
+
+ :return: Training reader creator
+ :rtype: callable
+ """
+ word_dict, verb_dict, label_dict = get_dict()
+ reader = corpus_reader(
+ paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
+ words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
+ props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
+ return reader_creator(reader, word_dict, verb_dict, label_dict)
+
+
+def fetch():
+ paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+ paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+ paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+ paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+ paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
+
+
+def convert(path):
+ """
+ Converts dataset to recordio format
+ """
+ paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train")
+ paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test")
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bdddeaabec733ef26b3f766c6437f5c53d65044
--- /dev/null
+++ b/python/paddle/v2/dataset/flowers.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module will download dataset from
+http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
+and parse train/test set intopaddle reader creators.
+
+This set contains images of flowers belonging to 102 different categories.
+The images were acquired by searching the web and taking pictures. There are a
+minimum of 40 images for each category.
+
+The database was used in:
+
+Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
+ number of classes.Proceedings of the Indian Conference on Computer Vision,
+Graphics and Image Processing (2008)
+http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
+
+"""
+import cPickle
+import itertools
+import functools
+from common import download
+import tarfile
+import scipy.io as scio
+from paddle.v2.image import *
+from paddle.v2.reader import *
+import os
+import numpy as np
+from multiprocessing import cpu_count
+__all__ = ['train', 'test', 'valid']
+
+DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
+LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
+SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
+DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
+LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
+SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
+# In official 'readme', tstid is the flag of test data
+# and trnid is the flag of train data. But test data is more than train data.
+# So we exchange the train data and test data.
+TRAIN_FLAG = 'tstid'
+TEST_FLAG = 'trnid'
+VALID_FLAG = 'valid'
+
+
+def default_mapper(is_train, sample):
+ '''
+ map image bytes data to type needed by model input layer
+ '''
+ img, label = sample
+ img = load_image_bytes(img)
+ img = simple_transform(
+ img, 256, 224, is_train, mean=[103.94, 116.78, 123.68])
+ return img.flatten().astype('float32'), label
+
+
+train_mapper = functools.partial(default_mapper, True)
+test_mapper = functools.partial(default_mapper, False)
+
+
+def reader_creator(data_file,
+ label_file,
+ setid_file,
+ dataset_name,
+ mapper,
+ buffered_size=1024,
+ use_xmap=True):
+ '''
+ 1. read images from tar file and
+ merge images into batch files in 102flowers.tgz_batch/
+ 2. get a reader to read sample from batch file
+
+ :param data_file: downloaded data file
+ :type data_file: string
+ :param label_file: downloaded label file
+ :type label_file: string
+ :param setid_file: downloaded setid file containing information
+ about how to split dataset
+ :type setid_file: string
+ :param dataset_name: data set name (tstid|trnid|valid)
+ :type dataset_name: string
+ :param mapper: a function to map image bytes data to type
+ needed by model input layer
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: data reader
+ :rtype: callable
+ '''
+ labels = scio.loadmat(label_file)['labels'][0]
+ indexes = scio.loadmat(setid_file)[dataset_name][0]
+ img2label = {}
+ for i in indexes:
+ img = "jpg/image_%05d.jpg" % i
+ img2label[img] = labels[i - 1]
+ file_list = batch_images_from_tar(data_file, dataset_name, img2label)
+
+ def reader():
+ for file in open(file_list):
+ file = file.strip()
+ batch = None
+ with open(file, 'r') as f:
+ batch = cPickle.load(f)
+ data = batch['data']
+ labels = batch['label']
+ for sample, label in itertools.izip(data, batch['label']):
+ yield sample, int(label) - 1
+
+ if use_xmap:
+ return xmap_readers(mapper, reader, cpu_count(), buffered_size)
+ else:
+ return map_readers(mapper, reader)
+
+
+def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
+ '''
+ Create flowers training set reader.
+ It returns a reader, each sample in the reader is
+ image pixels in [0, 1] and label in [1, 102]
+ translated from original color image by steps:
+ 1. resize to 256*256
+ 2. random crop to 224*224
+ 3. flatten
+ :param mapper: a function to map sample.
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: train data reader
+ :rtype: callable
+ '''
+ return reader_creator(
+ download(DATA_URL, 'flowers', DATA_MD5),
+ download(LABEL_URL, 'flowers', LABEL_MD5),
+ download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
+ buffered_size, use_xmap)
+
+
+def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
+ '''
+ Create flowers test set reader.
+ It returns a reader, each sample in the reader is
+ image pixels in [0, 1] and label in [1, 102]
+ translated from original color image by steps:
+ 1. resize to 256*256
+ 2. random crop to 224*224
+ 3. flatten
+ :param mapper: a function to map sample.
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: test data reader
+ :rtype: callable
+ '''
+ return reader_creator(
+ download(DATA_URL, 'flowers', DATA_MD5),
+ download(LABEL_URL, 'flowers', LABEL_MD5),
+ download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
+ buffered_size, use_xmap)
+
+
+def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
+ '''
+ Create flowers validation set reader.
+ It returns a reader, each sample in the reader is
+ image pixels in [0, 1] and label in [1, 102]
+ translated from original color image by steps:
+ 1. resize to 256*256
+ 2. random crop to 224*224
+ 3. flatten
+ :param mapper: a function to map sample.
+ :type mapper: callable
+ :param buffered_size: the size of buffer used to process images
+ :type buffered_size: int
+ :return: test data reader
+ :rtype: callable
+ '''
+ return reader_creator(
+ download(DATA_URL, 'flowers', DATA_MD5),
+ download(LABEL_URL, 'flowers', LABEL_MD5),
+ download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
+ buffered_size, use_xmap)
+
+
+def fetch():
+ download(DATA_URL, 'flowers', DATA_MD5)
+ download(LABEL_URL, 'flowers', LABEL_MD5)
+ download(SETID_URL, 'flowers', SETID_MD5)
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..37c4296f9bcea7e16daa46f778934331513c30c4
--- /dev/null
+++ b/python/paddle/v2/dataset/imdb.py
@@ -0,0 +1,148 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+IMDB dataset.
+
+This module downloads IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
+of 25,000 highly polar movie reviews for training, and 25,000 for testing.
+Besides, this module also provides API for building dictionary.
+"""
+
+import paddle.v2.dataset.common
+import collections
+import tarfile
+import re
+import string
+
+__all__ = ['build_dict', 'train', 'test', 'convert']
+
+URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
+MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+
+
+def tokenize(pattern):
+ """
+ Read files that match the given pattern. Tokenize and yield each file.
+ """
+
+ with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
+ MD5)) as tarf:
+ # Note that we should use tarfile.next(), which does
+ # sequential access of member files, other than
+ # tarfile.extractfile, which does random access and might
+ # destroy hard disks.
+ tf = tarf.next()
+ while tf != None:
+ if bool(pattern.match(tf.name)):
+ # newline and punctuations removal and ad-hoc tokenization.
+ yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
+ None, string.punctuation).lower().split()
+ tf = tarf.next()
+
+
+def build_dict(pattern, cutoff):
+ """
+ Build a word dictionary from the corpus. Keys of the dictionary are words,
+ and values are zero-based IDs of these words.
+ """
+ word_freq = collections.defaultdict(int)
+ for doc in tokenize(pattern):
+ for word in doc:
+ word_freq[word] += 1
+
+ # Not sure if we should prune less-frequent words here.
+ word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
+
+ dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+ words, _ = list(zip(*dictionary))
+ word_idx = dict(zip(words, xrange(len(words))))
+ word_idx[''] = len(words)
+ return word_idx
+
+
+def reader_creator(pos_pattern, neg_pattern, word_idx):
+ UNK = word_idx['']
+ INS = []
+
+ def load(pattern, out, label):
+ for doc in tokenize(pattern):
+ out.append(([word_idx.get(w, UNK) for w in doc], label))
+
+ load(pos_pattern, INS, 0)
+ load(neg_pattern, INS, 1)
+
+ def reader():
+ for doc, label in INS:
+ yield doc, label
+
+ return reader
+
+
+def train(word_idx):
+ """
+ IMDB training set creator.
+
+ It returns a reader creator, each sample in the reader is an zero-based ID
+ sequence and label in [0, 1].
+
+ :param word_idx: word dictionary
+ :type word_idx: dict
+ :return: Training reader creator
+ :rtype: callable
+ """
+ return reader_creator(
+ re.compile("aclImdb/train/pos/.*\.txt$"),
+ re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)
+
+
+def test(word_idx):
+ """
+ IMDB test set creator.
+
+ It returns a reader creator, each sample in the reader is an zero-based ID
+ sequence and label in [0, 1].
+
+ :param word_idx: word dictionary
+ :type word_idx: dict
+ :return: Test reader creator
+ :rtype: callable
+ """
+ return reader_creator(
+ re.compile("aclImdb/test/pos/.*\.txt$"),
+ re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
+
+
+def word_dict():
+ """
+ Build a word dictionary from the corpus.
+
+ :return: Word dictionary
+ :rtype: dict
+ """
+ return build_dict(
+ re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
+
+
+def fetch():
+ paddle.v2.dataset.common.download(URL, 'imdb', MD5)
+
+
+def convert(path):
+ """
+ Converts dataset to recordio format
+ """
+ w = word_dict()
+ paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
+ paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
new file mode 100644
index 0000000000000000000000000000000000000000..617c722c4165cdfed9e650fc968d623ef6ed4391
--- /dev/null
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+imikolov's simple dataset.
+
+This module will download dataset from
+http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
+into paddle reader creators.
+"""
+import paddle.v2.dataset.common
+import collections
+import tarfile
+
+__all__ = ['train', 'test', 'build_dict', 'convert']
+
+URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
+MD5 = '30177ea32e27c525793142b6bf2c8e2d'
+
+
+class DataType(object):
+ NGRAM = 1
+ SEQ = 2
+
+
+def word_count(f, word_freq=None):
+ if word_freq is None:
+ word_freq = collections.defaultdict(int)
+
+ for l in f:
+ for w in l.strip().split():
+ word_freq[w] += 1
+ word_freq['
'] += 1
+ word_freq[''] += 1
+
+ return word_freq
+
+
+def build_dict(min_word_freq=50):
+ """
+ Build a word dictionary from the corpus, Keys of the dictionary are words,
+ and values are zero-based IDs of these words.
+ """
+ train_filename = './simple-examples/data/ptb.train.txt'
+ test_filename = './simple-examples/data/ptb.valid.txt'
+ with tarfile.open(
+ paddle.v2.dataset.common.download(
+ paddle.v2.dataset.imikolov.URL, 'imikolov',
+ paddle.v2.dataset.imikolov.MD5)) as tf:
+ trainf = tf.extractfile(train_filename)
+ testf = tf.extractfile(test_filename)
+ word_freq = word_count(testf, word_count(trainf))
+ if '' in word_freq:
+ # remove for now, since we will set it as last index
+ del word_freq['']
+
+ word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
+
+ word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+ words, _ = list(zip(*word_freq_sorted))
+ word_idx = dict(zip(words, xrange(len(words))))
+ word_idx[''] = len(words)
+
+ return word_idx
+
+
+def reader_creator(filename, word_idx, n, data_type):
+ def reader():
+ with tarfile.open(
+ paddle.v2.dataset.common.download(
+ paddle.v2.dataset.imikolov.URL, 'imikolov',
+ paddle.v2.dataset.imikolov.MD5)) as tf:
+ f = tf.extractfile(filename)
+
+ UNK = word_idx['