diff --git a/benchmark/tensorflow/machine_translation.py b/benchmark/tensorflow/machine_translation.py new file mode 100644 index 0000000000000000000000000000000000000000..8f77dce98353af53803246be8dc61063836b7867 --- /dev/null +++ b/benchmark/tensorflow/machine_translation.py @@ -0,0 +1,626 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.python.framework import dtypes +from tensorflow.python.layers.core import Dense +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.framework import ops +from tensorflow.python.ops import rnn_cell_impl +from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell +from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple +from tensorflow.contrib.rnn.python.ops import core_rnn_cell +from tensorflow.python.ops import array_ops +from tensorflow.python.util import nest +import tensorflow.contrib.seq2seq as seq2seq +from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder +import numpy as np +import os +import argparse +import time + +import paddle.v2 as paddle + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--embedding_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") +parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") +parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") +parser.add_argument( + "--batch_size", + type=int, + default=128, + help="The sequence number of a mini-batch data. (default: %(default)d)") +parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") +parser.add_argument( + "--max_time_steps", + type=int, + default=81, + help="Max number of time steps for sequence. (default: %(default)d)") +parser.add_argument( + "--pass_num", + type=int, + default=10, + help="The pass number to train. (default: %(default)d)") +parser.add_argument( + "--learning_rate", + type=float, + default=0.0002, + help="Learning rate used to train the model. (default: %(default)f)") +parser.add_argument( + "--infer_only", action='store_true', help="If set, run forward only.") +parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") +parser.add_argument( + "--max_generation_length", + type=int, + default=250, + help="The maximum length of sequence when doing generation. " + "(default: %(default)d)") +parser.add_argument( + "--save_freq", + type=int, + default=500, + help="Save model checkpoint every this interation. (default: %(default)d)") +parser.add_argument( + "--model_dir", + type=str, + default='./checkpoint', + help="Path to save model checkpoints. (default: %(default)d)") + +_Linear = core_rnn_cell._Linear # pylint: disable=invalid-name + +START_TOKEN_IDX = 0 +END_TOKEN_IDX = 1 + + +class LSTMCellWithSimpleAttention(RNNCell): + """Add attention mechanism to BasicLSTMCell. + This class is a wrapper based on tensorflow's `BasicLSTMCell`. + """ + + def __init__(self, + num_units, + encoder_vector, + encoder_proj, + source_sequence_length, + forget_bias=1.0, + state_is_tuple=True, + activation=None, + reuse=None): + super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse) + if not state_is_tuple: + logging.warn("%s: Using a concatenated state is slower and will " + "soon be deprecated. Use state_is_tuple=True.", self) + self._num_units = num_units + # set padding part to 0 + self._encoder_vector = self._reset_padding(encoder_vector, + source_sequence_length) + self._encoder_proj = self._reset_padding(encoder_proj, + source_sequence_length) + self._forget_bias = forget_bias + self._state_is_tuple = state_is_tuple + self._activation = activation or math_ops.tanh + self._linear = None + + @property + def state_size(self): + return (LSTMStateTuple(self._num_units, self._num_units) \ + if self._state_is_tuple else 2 * self._num_units) + + @property + def output_size(self): + return self._num_units + + def zero_state(self, batch_size, dtype): + state_size = self.state_size + if hasattr(self, "_last_zero_state"): + (last_state_size, last_batch_size, last_dtype, + last_output) = getattr(self, "_last_zero_state") + if (last_batch_size == batch_size and last_dtype == dtype and + last_state_size == state_size): + return last_output + with ops.name_scope( + type(self).__name__ + "ZeroState", values=[batch_size]): + output = _zero_state_tensors(state_size, batch_size, dtype) + self._last_zero_state = (state_size, batch_size, dtype, output) + return output + + def call(self, inputs, state): + sigmoid = math_ops.sigmoid + # Parameters of gates are concatenated into one multiply for efficiency. + if self._state_is_tuple: + c, h = state + else: + c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1) + + # get context from encoder outputs + context = self._simple_attention(self._encoder_vector, + self._encoder_proj, h) + + if self._linear is None: + self._linear = _Linear([inputs, context, h], 4 * self._num_units, + True) + # i = input_gate, j = new_input, f = forget_gate, o = output_gate + i, j, f, o = array_ops.split( + value=self._linear([inputs, context, h]), + num_or_size_splits=4, + axis=1) + + new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) * + self._activation(j)) + new_h = self._activation(new_c) * sigmoid(o) + + if self._state_is_tuple: + new_state = LSTMStateTuple(new_c, new_h) + else: + new_state = array_ops.concat([new_c, new_h], 1) + return new_h, new_state + + def _simple_attention(self, encoder_vec, encoder_proj, decoder_state): + """Implement the attention function. + The implementation has the same logic to the fluid decoder. + """ + decoder_state_proj = tf.contrib.layers.fully_connected( + inputs=decoder_state, + num_outputs=self._num_units, + activation_fn=None, + biases_initializer=None) + decoder_state_expand = tf.tile( + tf.expand_dims( + input=decoder_state_proj, axis=1), + [1, tf.shape(encoder_proj)[1], 1]) + concated = tf.concat([decoder_state_expand, encoder_proj], axis=2) + # need reduce the first dimension + attention_weights = tf.contrib.layers.fully_connected( + inputs=tf.reshape( + concated, shape=[-1, self._num_units * 2]), + num_outputs=1, + activation_fn=tf.nn.tanh, + biases_initializer=None) + attention_weights_reshaped = tf.reshape( + attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1]) + # normalize the attention weights using softmax + attention_weights_normed = tf.nn.softmax( + attention_weights_reshaped, dim=1) + scaled = tf.multiply(attention_weights_normed, encoder_vec) + context = tf.reduce_sum(scaled, axis=1) + return context + + def _reset_padding(self, + memory, + memory_sequence_length, + check_inner_dims_defined=True): + """Reset the padding part for encoder inputs. + This funtion comes from tensorflow's `_prepare_memory` function. + """ + memory = nest.map_structure( + lambda m: ops.convert_to_tensor(m, name="memory"), memory) + if memory_sequence_length is not None: + memory_sequence_length = ops.convert_to_tensor( + memory_sequence_length, name="memory_sequence_length") + if check_inner_dims_defined: + + def _check_dims(m): + if not m.get_shape()[2:].is_fully_defined(): + raise ValueError( + "Expected memory %s to have fully defined inner dims, " + "but saw shape: %s" % (m.name, m.get_shape())) + + nest.map_structure(_check_dims, memory) + if memory_sequence_length is None: + seq_len_mask = None + else: + seq_len_mask = array_ops.sequence_mask( + memory_sequence_length, + maxlen=array_ops.shape(nest.flatten(memory)[0])[1], + dtype=nest.flatten(memory)[0].dtype) + seq_len_batch_size = (memory_sequence_length.shape[0].value or + array_ops.shape(memory_sequence_length)[0]) + + def _maybe_mask(m, seq_len_mask): + rank = m.get_shape().ndims + rank = rank if rank is not None else array_ops.rank(m) + extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32) + m_batch_size = m.shape[0].value or array_ops.shape(m)[0] + if memory_sequence_length is not None: + message = ("memory_sequence_length and memory tensor " + "batch sizes do not match.") + with ops.control_dependencies([ + check_ops.assert_equal( + seq_len_batch_size, m_batch_size, message=message) + ]): + seq_len_mask = array_ops.reshape( + seq_len_mask, + array_ops.concat( + (array_ops.shape(seq_len_mask), extra_ones), 0)) + return m * seq_len_mask + else: + return m + + return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), + memory) + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, + max_generation_length): + src_word_idx = tf.placeholder(tf.int32, shape=[None, None]) + src_sequence_length = tf.placeholder(tf.int32, shape=[None, ]) + + src_embedding_weights = tf.get_variable("source_word_embeddings", + [source_dict_dim, embedding_dim]) + src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx) + + src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size) + src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size) + # no peephole + encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn( + cell_fw=src_forward_cell, + cell_bw=src_reversed_cell, + inputs=src_embedding, + sequence_length=src_sequence_length, + dtype=tf.float32) + + # concat the forward outputs and backward outputs + encoded_vec = tf.concat(encoder_outputs, axis=2) + + # project the encoder outputs to size of decoder lstm + encoded_proj = tf.contrib.layers.fully_connected( + inputs=tf.reshape( + encoded_vec, shape=[-1, embedding_dim * 2]), + num_outputs=decoder_size, + activation_fn=None, + biases_initializer=None) + encoded_proj_reshape = tf.reshape( + encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size]) + + # get init state for decoder lstm's H + backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1]) + decoder_boot = tf.contrib.layers.fully_connected( + inputs=tf.reshape( + backword_first, shape=[-1, embedding_dim]), + num_outputs=decoder_size, + activation_fn=tf.nn.tanh, + biases_initializer=None) + + # prepare the initial state for decoder lstm + cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32) + initial_state = LSTMStateTuple(cell_init, decoder_boot) + + # create decoder lstm cell + decoder_cell = LSTMCellWithSimpleAttention( + decoder_size, + encoded_vec + if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size), + encoded_proj_reshape if not is_generating else + seq2seq.tile_batch(encoded_proj_reshape, beam_size), + src_sequence_length if not is_generating else + seq2seq.tile_batch(src_sequence_length, beam_size), + forget_bias=0.0) + + output_layer = Dense(target_dict_dim, name='output_projection') + + if not is_generating: + trg_word_idx = tf.placeholder(tf.int32, shape=[None, None]) + trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ]) + trg_embedding_weights = tf.get_variable( + "target_word_embeddings", [target_dict_dim, embedding_dim]) + trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights, + trg_word_idx) + + training_helper = seq2seq.TrainingHelper( + inputs=trg_embedding, + sequence_length=trg_sequence_length, + time_major=False, + name='training_helper') + + training_decoder = seq2seq.BasicDecoder( + cell=decoder_cell, + helper=training_helper, + initial_state=initial_state, + output_layer=output_layer) + + # get the max length of target sequence + max_decoder_length = tf.reduce_max(trg_sequence_length) + + decoder_outputs_train, _, _ = seq2seq.dynamic_decode( + decoder=training_decoder, + output_time_major=False, + impute_finished=True, + maximum_iterations=max_decoder_length) + + decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output) + decoder_pred_train = tf.argmax( + decoder_logits_train, axis=-1, name='decoder_pred_train') + masks = tf.sequence_mask( + lengths=trg_sequence_length, + maxlen=max_decoder_length, + dtype=tf.float32, + name='masks') + + # place holder of label sequence + lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None]) + + # compute the loss + loss = seq2seq.sequence_loss( + logits=decoder_logits_train, + targets=lbl_word_idx, + weights=masks, + average_across_timesteps=True, + average_across_batch=True) + + # return feeding list and loss operator + return { + 'src_word_idx': src_word_idx, + 'src_sequence_length': src_sequence_length, + 'trg_word_idx': trg_word_idx, + 'trg_sequence_length': trg_sequence_length, + 'lbl_word_idx': lbl_word_idx + }, loss + else: + start_tokens = tf.ones([tf.shape(src_word_idx)[0], ], + tf.int32) * START_TOKEN_IDX + # share the same embedding weights with target word + trg_embedding_weights = tf.get_variable( + "target_word_embeddings", [target_dict_dim, embedding_dim]) + + inference_decoder = beam_search_decoder.BeamSearchDecoder( + cell=decoder_cell, + embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens), + start_tokens=start_tokens, + end_token=END_TOKEN_IDX, + initial_state=tf.nn.rnn_cell.LSTMStateTuple( + tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size), + tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)), + beam_width=beam_size, + output_layer=output_layer) + + decoder_outputs_decode, _, _ = seq2seq.dynamic_decode( + decoder=inference_decoder, + output_time_major=False, + #impute_finished=True,# error occurs + maximum_iterations=max_generation_length) + + predicted_ids = decoder_outputs_decode.predicted_ids + + return { + 'src_word_idx': src_word_idx, + 'src_sequence_length': src_sequence_length + }, predicted_ids + + +def print_arguments(args): + print('----------- Configuration Arguments -----------') + for arg, value in vars(args).iteritems(): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def padding_data(data, padding_size, value): + data = data + [value] * padding_size + return data[:padding_size] + + +def save(sess, path, var_list=None, global_step=None): + saver = tf.train.Saver(var_list) + save_path = saver.save(sess, save_path=path, global_step=global_step) + print('Model save at %s' % save_path) + + +def restore(sess, path, var_list=None): + # var_list = None returns the list of all saveable variables + saver = tf.train.Saver(var_list) + saver.restore(sess, save_path=path) + print('model restored from %s' % path) + + +def adapt_batch_data(data): + src_seq = map(lambda x: x[0], data) + trg_seq = map(lambda x: x[1], data) + lbl_seq = map(lambda x: x[2], data) + + src_sequence_length = np.array( + [len(seq) for seq in src_seq]).astype('int32') + src_seq_maxlen = np.max(src_sequence_length) + + trg_sequence_length = np.array( + [len(seq) for seq in trg_seq]).astype('int32') + trg_seq_maxlen = np.max(trg_sequence_length) + + src_seq = np.array( + [padding_data(seq, src_seq_maxlen, END_TOKEN_IDX) + for seq in src_seq]).astype('int32') + + trg_seq = np.array( + [padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX) + for seq in trg_seq]).astype('int32') + + lbl_seq = np.array( + [padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX) + for seq in lbl_seq]).astype('int32') + + return { + 'src_word_idx': src_seq, + 'src_sequence_length': src_sequence_length, + 'trg_word_idx': trg_seq, + 'trg_sequence_length': trg_sequence_length, + 'lbl_word_idx': lbl_seq + } + + +def train(): + feeding_dict, loss = seq_to_seq_net( + embedding_dim=args.embedding_dim, + encoder_size=args.encoder_size, + decoder_size=args.decoder_size, + source_dict_dim=args.dict_size, + target_dict_dim=args.dict_size, + is_generating=False, + beam_size=args.beam_size, + max_generation_length=args.max_generation_length) + + global_step = tf.Variable(0, trainable=False, name='global_step') + trainable_params = tf.trainable_variables() + optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + + gradients = tf.gradients(loss, trainable_params) + # may clip the parameters + clip_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) + + updates = optimizer.apply_gradients( + zip(gradients, trainable_params), global_step=global_step) + + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size) + + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + def do_validataion(): + total_loss = 0.0 + count = 0 + for batch_id, data in enumerate(test_batch_generator()): + adapted_batch_data = adapt_batch_data(data) + outputs = sess.run([loss], + feed_dict={ + item[1]: adapted_batch_data[item[0]] + for item in feeding_dict.items() + }) + total_loss += outputs[0] + count += 1 + return total_loss / count + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_l) + sess.run(init_g) + for pass_id in xrange(args.pass_num): + pass_start_time = time.time() + words_seen = 0 + for batch_id, data in enumerate(train_batch_generator()): + adapted_batch_data = adapt_batch_data(data) + words_seen += np.sum(adapted_batch_data['src_sequence_length']) + words_seen += np.sum(adapted_batch_data['trg_sequence_length']) + outputs = sess.run([updates, loss], + feed_dict={ + item[1]: adapted_batch_data[item[0]] + for item in feeding_dict.items() + }) + print("pass_id=%d, batch_id=%d, train_loss: %f" % + (pass_id, batch_id, outputs[1])) + pass_end_time = time.time() + test_loss = do_validataion() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % + (pass_id, test_loss, words_per_sec, time_consumed)) + + +def infer(): + feeding_dict, predicted_ids = seq_to_seq_net( + embedding_dim=args.embedding_dim, + encoder_size=args.encoder_size, + decoder_size=args.decoder_size, + source_dict_dim=args.dict_size, + target_dict_dim=args.dict_size, + is_generating=True, + beam_size=args.beam_size, + max_generation_length=args.max_generation_length) + + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size) + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + with tf.Session(config=config) as sess: + restore(sess, './checkpoint/tf_seq2seq-1500') + for batch_id, data in enumerate(test_batch_generator()): + src_seq = map(lambda x: x[0], data) + + source_language_seq = [ + src_dict[item] for seq in src_seq for item in seq + ] + + src_sequence_length = np.array( + [len(seq) for seq in src_seq]).astype('int32') + src_seq_maxlen = np.max(src_sequence_length) + src_seq = np.array([ + padding_data(seq, src_seq_maxlen, END_TOKEN_IDX) + for seq in src_seq + ]).astype('int32') + + outputs = sess.run([predicted_ids], + feed_dict={ + feeding_dict['src_word_idx']: src_seq, + feeding_dict['src_sequence_length']: + src_sequence_length + }) + + print("\nDecoder result comparison: ") + source_language_seq = ' '.join(source_language_seq).lstrip( + '').rstrip('').strip() + inference_seq = '' + print(" --> source: " + source_language_seq) + for item in outputs[0][0]: + if item[0] == END_TOKEN_IDX: break + inference_seq += ' ' + trg_dict.get(item[0], '') + print(" --> inference: " + inference_seq) + + +if __name__ == '__main__': + args = parser.parse_args() + print_arguments(args) + if args.infer_only: + infer() + else: + train() diff --git a/benchmark/tensorflow/mnist.py b/benchmark/tensorflow/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..7140eed6eaff49b5c65f9ccb2e38f113a4cdbdbf --- /dev/null +++ b/benchmark/tensorflow/mnist.py @@ -0,0 +1,180 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import time +import numpy as np + +import tensorflow as tf +import paddle.v2 as paddle + +DTYPE = tf.float32 + + +def parse_args(): + parser = argparse.ArgumentParser("mnist model benchmark.") + parser.add_argument( + '--batch_size', type=int, default=128, help='The minibatch size.') + parser.add_argument( + '--iterations', type=int, default=35, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=5, help='The number of passes.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + args = parser.parse_args() + return args + + +def run_benchmark(args): + def weight_variable(dtype, shape): + initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype) + return tf.Variable(initial) + + def bias_variable(dtype, shape): + initial = tf.constant(0.1, shape=shape, dtype=dtype) + return tf.Variable(initial) + + device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0' + with tf.device(device): + images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1)) + labels = tf.placeholder(tf.int64, shape=(None, )) + + # conv1, relu, pool1 + conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20]) + conv1_bias = bias_variable(DTYPE, [20]) + conv1 = tf.nn.conv2d( + images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID") + relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias)) + pool1 = tf.nn.max_pool( + relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID") + + # conv2, relu, pool2 + conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50]) + conv2_bias = bias_variable(DTYPE, [50]) + conv2 = tf.nn.conv2d( + pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID") + relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias)) + pool2 = tf.nn.max_pool( + relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID") + + # FC + pool_shape = pool2.get_shape().as_list() + hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1) + reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim)) + fc_weights = weight_variable(DTYPE, [hidden_dim, 10]) + fc_bias = bias_variable(DTYPE, [10]) + logits = tf.matmul(reshape, fc_weights) + fc_bias + + # Get prediction + prediction = tf.nn.softmax(logits) + + # Loss + one_hot_labels = tf.one_hot(labels, depth=10) + cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1]) + avg_cost = tf.reduce_mean(cost) + + # Get accuracy + correct = tf.equal(tf.argmax(prediction, 1), labels) + accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) + + # metrics, g_accuracy + with tf.variable_scope("reset_metrics_accuracy_scope") as scope: + g_accuracy = tf.metrics.accuracy( + labels, tf.argmax( + prediction, axis=1)) + vars = tf.contrib.framework.get_variables( + scope, collection=tf.GraphKeys.LOCAL_VARIABLES) + g_accuracy_reset_op = tf.variables_initializer(vars) + + # Optimizer + opt = tf.train.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + train_op = opt.minimize(avg_cost) + # train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost) + + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=args.batch_size) + + def eval_test(): + sess.run(g_accuracy_reset_op) + for batch_id, data in enumerate(test_reader()): + images_data = np.array( + map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32") + labels_data = np.array(map(lambda x: x[1], data)).astype("int64") + + loss, acc, g_acc = sess.run( + [avg_cost, accuracy, g_accuracy], + feed_dict={images: images_data, + labels: labels_data}) + return g_acc[1] + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_g) + sess.run(init_l) + for pass_id in range(args.pass_num): + sess.run(g_accuracy_reset_op) + + pass_start = time.time() + for batch_id, data in enumerate(train_reader()): + images_data = np.array( + map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32") + labels_data = np.array(map(lambda x: x[1], data)).astype( + "int64") + + start = time.time() + _, loss, acc, g_acc = sess.run( + [train_op, avg_cost, accuracy, g_accuracy], + feed_dict={images: images_data, + labels: labels_data}) + end = time.time() + + print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" % + (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000)) + + pass_end = time.time() + test_avg_acc = eval_test() + + print( + "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f" + % (pass_id, g_acc[1], test_avg_acc, + (pass_end - pass_start) / 1000)) + + +def print_arguments(args): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == '__main__': + args = parse_args() + print_arguments(args) + run_benchmark(args) diff --git a/benchmark/tensorflow/resnet.py b/benchmark/tensorflow/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..c432fa8d59571e128b9ff9e3ffa1949b792ef3a4 --- /dev/null +++ b/benchmark/tensorflow/resnet.py @@ -0,0 +1,504 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py + +Get help: python resnet.py --help +See performance on flowers: python resnet.py +Train on cifar10: python resnet.py --data=cifar10 --with_test +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import time +import numpy as np + +import paddle.v2 as paddle +import tensorflow as tf + +DTYPE = tf.float32 + + +def parse_args(): + parser = argparse.ArgumentParser('Convolution model benchmark.') + parser.add_argument( + '--model', + type=str, + choices=['resnet'], + default='resnet', + help='The model architecture.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='use real data or fake data') + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', + type=int, + default=105, + help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=300, help='The number of passes.') + parser.add_argument( + '--order', + type=str, + default='NHWC', + choices=['NCHW', 'NHWC'], + help='The data order, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--data', + type=str, + default='flowers102', + choices=['flowers102', 'cifar10'], + help='The kinds of data.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + args = parser.parse_args() + return args + + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[ + 'with_test'] else vars(args)['iterations'] + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def fixed_padding(inputs, kernel_size, data_format): + """Pads the input along the spatial dimensions independently of input size. + Args: + inputs: A tensor of size [batch, channels, height_in, width_in] or + [batch, height_in, width_in, channels] depending on data_format. + kernel_size: The kernel to be used in the conv2d or max_pool2d operation. + Should be a positive integer. + data_format: The input format ('channels_last' or 'channels_first'). + Returns: + A tensor with the same format as the input with the data either intact + (if kernel_size == 1) or padded (if kernel_size > 1). + """ + pad_total = kernel_size - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + + if data_format == 'channels_first': + padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end], + [pad_beg, pad_end]]) + else: + padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end], + [pad_beg, pad_end], [0, 0]]) + return padded_inputs + + +def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format): + """Strided 2-D convolution with explicit padding.""" + # The padding is consistent and is based only on `kernel_size`, not on the + # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone). + # This is consistent with PaddlePaddle. + # In addition, the calculation for output size in TensorFlow can refer: + # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc + if strides > 1: + inputs = fixed_padding(inputs, kernel_size, data_format) + + return tf.layers.conv2d( + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=('SAME' if strides == 1 else 'VALID'), + use_bias=False, + kernel_initializer=tf.variance_scaling_initializer(), + data_format=data_format) + + +def conv_bn(inputs, + filters, + kernel_size, + strides, + is_training, + data_format, + act=True): + # def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format): + # set fused=True for a significant performance boost. See + # https://www.tensorflow.org/performance/performance_guide#common_fused_ops + inputs = conv2d_fixed_padding( + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + data_format=data_format) + inputs = tf.layers.batch_normalization( + inputs=inputs, + axis=1 if data_format == 'channels_first' else 3, + momentum=0.9, + epsilon=1e-05, + center=True, + scale=True, + training=is_training, + fused=True) + if act: + inputs = tf.nn.relu(inputs) + return inputs + + +def basicblock(inputs, filters, is_training, projection_shortcut, strides, + data_format): + shortcut = inputs + if projection_shortcut is not None: + shortcut = projection_shortcut(inputs) + inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format) + inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False) + inputs = inputs + shortcut + inputs = tf.nn.relu(inputs) + return inputs + + +def bottleneck(inputs, filters, is_training, projection_shortcut, strides, + data_format): + shortcut = inputs + if projection_shortcut is not None: + shortcut = projection_shortcut(inputs) + inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format) + inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False) + inputs = conv_bn( + inputs, filters * 4, 1, 1, is_training, data_format, act=False) + inputs = inputs + shortcut + inputs = tf.nn.relu(inputs) + return inputs + + +def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name, + data_format): + # Bottleneck blocks end with 4x the number of filters as they start with + filters_out = 4 * filters if block_fn is bottleneck else filters + + def projection_shortcut(inputs): + return conv2d_fixed_padding( + inputs=inputs, + filters=filters_out, + kernel_size=1, + strides=strides, + data_format=data_format) + + # Only the first block per block_layer uses projection_shortcut and strides + inputs = block_fn(inputs, filters, is_training, projection_shortcut, + strides, data_format) + + for _ in range(1, blocks): + inputs = block_fn(inputs, filters, is_training, None, 1, data_format) + + return tf.identity(inputs, name) + + +def resnet_imagenet(depth, class_dim, data_format): + """Returns the ResNet model for a given size and number of output classes.""" + + def resnet_generator(block_fn, + layers, + num_classes, + data_format='channels_last'): + if data_format is None: + data_format = ('channels_first' + if tf.test.is_built_with_cuda() else 'channels_last') + + def model(inputs, is_training): + """Constructs the ResNet model given the inputs.""" + if data_format == 'channels_first': + # Convert the inputs from channels_last (NHWC) to channels_first (NCHW). + # This provides a large performance boost on GPU. See + # https://www.tensorflow.org/performance/performance_guide#data_formats + inputs = tf.transpose(inputs, [0, 3, 1, 2]) + + inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format) + inputs = tf.identity(inputs, 'initial_conv') + inputs = tf.layers.max_pooling2d( + inputs=inputs, + pool_size=3, + strides=2, + padding='SAME', + data_format=data_format) + inputs = tf.identity(inputs, 'initial_max_pool') + inputs = block_layer(inputs, 64, block_fn, layers[0], 1, + is_training, 'block_layer1', data_format) + inputs = block_layer(inputs, 128, block_fn, layers[1], 2, + is_training, 'block_layer2', data_format) + inputs = block_layer(inputs, 256, block_fn, layers[2], 2, + is_training, 'block_layer3', data_format) + inputs = block_layer(inputs, 512, block_fn, layers[3], 2, + is_training, 'block_layer4', data_format) + inputs = tf.layers.average_pooling2d( + inputs=inputs, + pool_size=7, + strides=1, + padding='VALID', + data_format=data_format) + inputs = tf.identity(inputs, 'final_avg_pool') + inputs = tf.reshape(inputs, + [-1, 512 if block_fn is basicblock else 2048]) + inputs = tf.layers.dense(inputs=inputs, units=num_classes) + inputs = tf.identity(inputs, 'final_dense') + return inputs + + return model + + model_params = { + 18: { + 'block': basicblock, + 'layers': [2, 2, 2, 2] + }, + 34: { + 'block': basicblock, + 'layers': [3, 4, 6, 3] + }, + 50: { + 'block': bottleneck, + 'layers': [3, 4, 6, 3] + }, + 101: { + 'block': bottleneck, + 'layers': [3, 4, 23, 3] + }, + 152: { + 'block': bottleneck, + 'layers': [3, 8, 36, 3] + }, + 200: { + 'block': bottleneck, + 'layers': [3, 24, 36, 3] + } + } + if depth not in model_params: + raise ValueError('Not a valid depth:', depth) + params = model_params[depth] + return resnet_generator(params['block'], params['layers'], class_dim, + data_format) + + +def resnet_cifar10(depth, num_classes, data_format): + if depth % 6 != 2: + raise ValueError('depth must be 6n + 2:', depth) + + num_blocks = (depth - 2) // 6 + + if data_format is None: + data_format = ('channels_first' + if tf.test.is_built_with_cuda() else 'channels_last') + + def model(inputs, is_training): + inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format) + inputs = tf.identity(inputs, 'initial_conv') + inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training, + 'block_layer1', data_format) + inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training, + 'block_layer2', data_format) + inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training, + 'block_layer3', data_format) + inputs = tf.layers.average_pooling2d( + inputs=inputs, + pool_size=8, + strides=1, + padding='VALID', + data_format=data_format) + inputs = tf.identity(inputs, 'final_avg_pool') + inputs = tf.reshape(inputs, [-1, 64]) + inputs = tf.layers.dense(inputs=inputs, units=num_classes) + inputs = tf.identity(inputs, 'final_dense') + return inputs + + return model + + +def run_benchmark(args, data_format='channels_last', device='/cpu:0'): + """Our model_fn for ResNet to be used with our Estimator.""" + + class_dim = 1000 + dshape = (None, 224, 224, 3) + + pdshape = (3, 224, 224) + if args.data == 'flowers102': + class_dim = 102 + dshape = (None, 224, 224, 3) + pdshape = (3, 224, 224) + elif args.data == 'cifar10': + class_dim = 10 + dshape = (None, 32, 32, 3) + pdshape = (3, 32, 32) + + with tf.device(device): + images = tf.placeholder(DTYPE, shape=dshape) + labels = tf.placeholder(tf.int64, shape=(None, )) + is_training = tf.placeholder('bool') + onehot_labels = tf.one_hot(labels, depth=class_dim) + + network = resnet_cifar10( + 32, class_dim, + data_format) if args.data == 'cifar10' else resnet_imagenet( + 50, class_dim, data_format) + + logits = network(inputs=images, is_training=is_training) + + cross_entropy = tf.losses.softmax_cross_entropy( + logits=logits, onehot_labels=onehot_labels) + avg_cost = tf.reduce_mean(cross_entropy) + + correct = tf.equal(tf.argmax(logits, 1), labels) + accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) + + lr = 0.1 if args.data == 'cifar10' else 0.01 + optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9) + + # Batch norm requires update_ops to be added as a train_op dependency. + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=100) + + def test(): + test_accs = [] + for batch_id, data in enumerate(test_reader()): + test_images = np.array( + map(lambda x: np.transpose(x[0].reshape(pdshape), + axes=[1, 2, 0]), data)).astype("float32") + test_labels = np.array(map(lambda x: x[1], data)).astype('int64') + test_accs.append( + accuracy.eval(feed_dict={ + images: test_images, + labels: test_labels, + is_training: False + })) + print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" % + (pass_id, num_samples / train_elapsed, np.mean(test_accs))) + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_g) + sess.run(init_l) + + if args.use_fake_data: + data = train_reader().next() + images_data = np.array( + map(lambda x: np.transpose(x[0].reshape(pdshape), + axes=[1, 2, 0]), data)).astype("float32") + labels_data = np.array(map(lambda x: x[1], data)).astype('int64') + iters, num_samples, start_time = 0, 0, 0.0 + for pass_id in range(args.pass_num): + if iters == args.iterations: + break + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + if not args.use_fake_data: + images_data = np.array( + map(lambda x: np.transpose(x[0].reshape(pdshape), + axes=[1, 2, 0]), data)).astype("float32") + labels_data = np.array(map(lambda x: x[1], data)).astype( + 'int64') + _, loss, acc = sess.run([train_op, avg_cost, accuracy], + feed_dict={ + images: images_data, + labels: labels_data, + is_training: True + }) + iters += 1 + train_accs.append(acc) + train_losses.append(loss) + num_samples += len(data) + print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" % + (pass_id, iters, loss, acc)) + + train_elapsed = time.time() - start_time + print("Pass=%d, Loss=%f, Accuray=%f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + + # evaluation + if args.with_test: + test() + + if not args.with_test: + duration = time.time() - start_time + examples_per_sec = num_samples / duration + sec_per_batch = duration / (iters - args.skip_batch_num) + + print('Total examples: %d, total time: %.5f' % + (num_samples, duration)) + print('%.5f examples/sec, %.5f sec/batch' % + (examples_per_sec, sec_per_batch)) + + +if __name__ == '__main__': + args = parse_args() + print_arguments(args) + if tf.test.is_built_with_cuda(): + device = '/device:GPU:0' + if args.order == 'NHWC': + data_format = 'channels_last' + else: + data_format = 'channels_first' + else: + device = '/cpu:0' + if args.order == 'NHWC': + data_format = 'channels_last' + else: + raise ValueError('Only support NHWC order in CPU mode') + + run_benchmark(args, data_format, device) diff --git a/benchmark/tensorflow/stacked_dynamic_lstm.py b/benchmark/tensorflow/stacked_dynamic_lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..5285033005044d907d0b7e91eb66ee7281c4f27a --- /dev/null +++ b/benchmark/tensorflow/stacked_dynamic_lstm.py @@ -0,0 +1,220 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import tensorflow as tf + +import paddle.v2 as paddle + + +def parse_args(): + parser = argparse.ArgumentParser("LSTM model benchmark.") + parser.add_argument( + '--batch_size', + type=int, + default=32, + help='The sequence number of a batch data. (default: %(default)d)') + parser.add_argument( + '--stacked_num', + type=int, + default=5, + help='Number of lstm layers to stack. (default: %(default)d)') + parser.add_argument( + '--embedding_dim', + type=int, + default=512, + help='Dimension of embedding table. (default: %(default)d)') + parser.add_argument( + '--hidden_dim', + type=int, + default=512, + help='Hidden size of lstm unit. (default: %(default)d)') + parser.add_argument( + '--pass_num', + type=int, + default=10, + help='Epoch number to train. (default: %(default)d)') + parser.add_argument( + '--learning_rate', + type=float, + default=0.0002, + help='Learning rate used to train. (default: %(default)f)') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + args = parser.parse_args() + return args + + +def print_arguments(args): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def dynamic_lstm_model(dict_size, + embedding_dim, + hidden_dim, + stacked_num, + class_num=2, + is_train=True): + word_idx = tf.placeholder(tf.int64, shape=[None, None]) + sequence_length = tf.placeholder(tf.int64, shape=[None, ]) + + embedding_weights = tf.get_variable('word_embeddings', + [dict_size, embedding_dim]) + embedding = tf.nn.embedding_lookup(embedding_weights, word_idx) + + lstm_cell = tf.nn.rnn_cell.LSTMCell( + num_units=hidden_dim, use_peepholes=False) + stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num) + + # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples + _, final_state = tf.nn.dynamic_rnn( + cell=stacked_cell, + inputs=embedding, + dtype=tf.float32, + sequence_length=sequence_length) + + w = tf.Variable( + tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32) + bias = tf.Variable( + tf.constant( + value=0.0, shape=[class_num], dtype=tf.float32)) + prediction = tf.matmul(final_state[-1][1], w) + bias + + if not is_train: + return (word_idx, sequence_length), tf.nn.softmax(prediction) + + label = tf.placeholder(tf.int64, shape=[None, ]) + loss = tf.nn.softmax_cross_entropy_with_logits( + labels=tf.one_hot(label, 2), logits=prediction) + avg_loss = tf.reduce_mean(loss) + + correct_count = tf.equal(tf.argmax(prediction, 1), label) + acc = tf.reduce_mean(tf.cast(correct_count, tf.float32)) + + with tf.variable_scope("reset_metrics_accuracy_scope") as scope: + g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1)) + vars = tf.contrib.framework.get_variables( + scope, collection=tf.GraphKeys.LOCAL_VARIABLES) + reset_op = tf.variables_initializer(vars) + + return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op + + +def padding_data(data, padding_size, value): + data = data + [value] * padding_size + return data[:padding_size] + + +def train(args): + word_dict = paddle.dataset.imdb.word_dict() + dict_size = len(word_dict) + + feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model( + dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num) + + adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + train_op = adam_optimizer.minimize(avg_loss) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=25000), + batch_size=args.batch_size) + + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.test(word_dict), buf_size=25000), + batch_size=args.batch_size) + + def do_validation(sess): + sess.run(reset_op) + for batch_id, data in enumerate(test_reader()): + word_idx = map(lambda x: x[0], data) + sequence_length = np.array( + [len(seq) for seq in word_idx]).astype('int64') + maxlen = np.max(sequence_length) + word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx] + word_idx = np.array(word_idx).astype('int64') + label = np.array(map(lambda x: x[1], data)).astype('int64') + + _, loss, fetch_acc, fetch_g_acc = sess.run( + [train_op, avg_loss, acc, g_acc], + feed_dict={ + feeding_list[0]: word_idx, + feeding_list[1]: sequence_length, + feeding_list[2]: label + }) + + return fetch_g_acc[1] + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_l) + sess.run(init_g) + + for pass_id in xrange(args.pass_num): + # clear accuracy local variable + sess.run(reset_op) + pass_start_time = time.time() + words_seen = 0 + + for batch_id, data in enumerate(train_reader()): + word_idx = map(lambda x: x[0], data) + sequence_length = np.array( + [len(seq) for seq in word_idx]).astype('int64') + words_seen += np.sum(sequence_length) + maxlen = np.max(sequence_length) + word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx] + word_idx = np.array(word_idx).astype('int64') + label = np.array(map(lambda x: x[1], data)).astype('int64') + + _, loss, fetch_acc, fetch_g_acc = sess.run( + [train_op, avg_loss, acc, g_acc], + feed_dict={ + feeding_list[0]: word_idx, + feeding_list[1]: sequence_length, + feeding_list[2]: label + }) + + print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f" + % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1])) + + pass_end_time = time.time() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + test_acc = do_validation(sess) + print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" % + (pass_id, test_acc, words_per_sec, time_consumed)) + + +if __name__ == '__main__': + args = parse_args() + print_arguments(args) + + if args.infer_only: + pass + else: + train(args) diff --git a/benchmark/tensorflow/vgg.py b/benchmark/tensorflow/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..fba5ec71a46b3ac8b2e1244424c39fd5192e5458 --- /dev/null +++ b/benchmark/tensorflow/vgg.py @@ -0,0 +1,324 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in TensorFlow""" +import tensorflow as tf +import paddle.v2 as paddle +import numpy as np +import argparse +import time + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test') +parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') +parser.add_argument( + '--learning_rate', + type=float, + default=1e-3, + help="Learning rate for training.") +parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.") +parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument( + '--data_format', + type=str, + default='NHWC', + choices=['NCHW', 'NHWC'], + help='The data order, NCHW=[batch, channels, height, width].' + 'Only support NHWC right now.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') +args = parser.parse_args() + + +class VGG16Model(object): + def __init__(self): + self.parameters = [] + + def batch_norm_relu(self, inputs, is_training): + """Performs a batch normalization followed by a ReLU.""" + # We set fused=True for a significant speed boost. See + # https://www.tensorflow.org/speed/speed_guide#common_fused_ops + inputs = tf.layers.batch_normalization( + inputs=inputs, + axis=1 if args.data_format == 'NCHW' else -1, + momentum=0.9, + epsilon=1e-05, + center=True, + scale=True, + training=is_training, + fused=True) + inputs = tf.nn.relu(inputs) + return inputs + + def conv_bn_layer(self, + name, + images, + kernel_shape, + is_training, + drop_rate=0.0): + with tf.name_scope(name) as scope: + kernel = tf.Variable( + tf.truncated_normal( + kernel_shape, dtype=tf.float32, stddev=1e-1), + name='weights') + conv = tf.nn.conv2d( + images, + kernel, [1, 1, 1, 1], + data_format=args.data_format, + padding='SAME') + biases = tf.Variable( + tf.constant( + 0.0, shape=[kernel_shape[-1]], dtype=tf.float32), + trainable=True, + name='biases') + out = tf.nn.bias_add(conv, biases) + out = self.batch_norm_relu(out, is_training) + out = tf.layers.dropout(out, rate=drop_rate, training=is_training) + return out + + def fc_layer(self, name, inputs, shape): + with tf.name_scope(name) as scope: + fc_w = tf.Variable( + tf.truncated_normal( + shape, dtype=tf.float32, stddev=1e-1), + name='weights') + fc_b = tf.Variable( + tf.constant( + 0.0, shape=[shape[-1]], dtype=tf.float32), + trainable=True, + name='biases') + out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b) + return out + + def network(self, images, class_dim, is_training): + """ VGG16 model structure. + + TODO(kuke): enable this network to support the 'NCHW' data format + """ + + # conv1 + conv1_1 = self.conv_bn_layer( + 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3) + conv1_2 = self.conv_bn_layer( + 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0) + # pool1 + pool1 = tf.nn.max_pool( + conv1_2, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool1') + # conv2 + conv2_1 = self.conv_bn_layer( + 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4) + conv2_2 = self.conv_bn_layer( + 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0) + # pool2 + pool2 = tf.nn.max_pool( + conv2_2, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool2') + # conv3 + conv3_1 = self.conv_bn_layer( + 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4) + conv3_2 = self.conv_bn_layer( + 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4) + conv3_3 = self.conv_bn_layer( + 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0) + # pool3 + pool3 = tf.nn.max_pool( + conv3_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool3') + # conv4 + conv4_1 = self.conv_bn_layer( + 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4) + conv4_2 = self.conv_bn_layer( + 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv4_3 = self.conv_bn_layer( + 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0) + # pool4 + pool4 = tf.nn.max_pool( + conv4_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool4') + # conv5 + conv5_1 = self.conv_bn_layer( + 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv5_2 = self.conv_bn_layer( + 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv5_3 = self.conv_bn_layer( + 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0) + # pool5 + pool5 = tf.nn.max_pool( + conv5_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool4') + # flatten + shape = int(np.prod(pool5.get_shape()[1:])) + pool5_flat = tf.reshape(pool5, [-1, shape]) + # fc1 + drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training) + fc1 = self.fc_layer('fc1', drop, [shape, 512]) + # fc2 + bn = self.batch_norm_relu(fc1, is_training) + drop = tf.layers.dropout(bn, rate=0.5, training=is_training) + fc2 = self.fc_layer('fc2', drop, [512, 512]) + + fc3 = self.fc_layer('fc3', fc2, [512, class_dim]) + + return fc3 + + +def run_benchmark(): + """Run benchmark on cifar10 or flowers.""" + + if args.data_set == "cifar10": + class_dim = 10 + raw_shape = (3, 32, 32) + dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else ( + None, 3, 32, 32) + else: + class_dim = 102 + raw_shape = (3, 224, 224) + dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else ( + None, 3, 224, 224) + + device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0' + + with tf.device(device): + images = tf.placeholder(tf.float32, shape=dat_shape) + labels = tf.placeholder(tf.int64, shape=(None, )) + is_training = tf.placeholder('bool') + onehot_labels = tf.one_hot(labels, depth=class_dim) + + vgg16 = VGG16Model() + logits = vgg16.network(images, class_dim, is_training) + loss = tf.losses.softmax_cross_entropy( + onehot_labels=onehot_labels, logits=logits) + avg_loss = tf.reduce_mean(loss) + + correct = tf.equal(tf.argmax(logits, 1), labels) + accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) + + optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(avg_loss) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + buf_size=5120), + batch_size=args.batch_size) + + # test + def test(): + test_accs = [] + for batch_id, data in enumerate(test_reader()): + test_images = np.array( + map(lambda x: np.transpose(x[0].reshape(raw_shape), + axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") + test_labels = np.array(map(lambda x: x[1], data)).astype('int64') + test_accs.append( + accuracy.eval(feed_dict={ + images: test_images, + labels: test_labels, + is_training: False + })) + return np.mean(test_accs) + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_g) + sess.run(init_l) + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.num_passes): + # train + num_samples = 0 + start_time = time.time() + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + train_images = np.array( + map(lambda x: np.transpose(x[0].reshape(raw_shape), + axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") + train_labels = np.array(map(lambda x: x[1], data)).astype( + 'int64') + _, loss, acc = sess.run([train_op, avg_loss, accuracy], + feed_dict={ + images: train_images, + labels: train_labels, + is_training: True + }) + iters += 1 + num_samples += len(data) + print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" % + (pass_id, iters, loss, acc)) + train_elapsed = time.time() - start_time + # test + pass_test_acc = test() + print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" % + (pass_id, num_samples / train_elapsed, pass_test_acc)) + + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == '__main__': + print_arguments() + run_benchmark() diff --git a/paddle/fluid/platform/call_once.h b/paddle/fluid/platform/call_once.h deleted file mode 100644 index fa34972c38d6e7f77a7e178d68592f9886748fa1..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/call_once.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -namespace paddle { -namespace platform { - -/* - The current implementation of std::call_once has a bug described in - https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call. - This is likely caused by a deeper bug of pthread_once, which is discussed in - https://patchwork.ozlabs.org/patch/482350/ - - This wrap is a hack to avoid this bug. -*/ -template -inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) { - bool good = true; - std::exception ex; - try { - std::call_once(flag, - [&](Args&&... args) { - try { - f(args...); - } catch (const std::exception& e) { - ex = e; - good = false; - } catch (...) { - ex = std::runtime_error("excption caught in call_once"); - good = false; - } - }, - args...); - } catch (std::system_error& x) { - throw std::runtime_error("call once failed"); - } - if (!good) { - throw std::exception(ex); - } -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index d21e29df3cf9b2d78920d8bac41209d200b5ba3a..c5a10a78a4f432b431680c089f255fea777277cb 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -18,7 +18,6 @@ limitations under the License. */ #include // NOLINT -#include "paddle/fluid/platform/call_once.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" namespace paddle {