提交 9f462e43 编写于 作者: _青葱's avatar _青葱

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into initialize

Merge branch develop
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.framework import dtypes
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell
from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple
from tensorflow.contrib.rnn.python.ops import core_rnn_cell
from tensorflow.python.ops import array_ops
from tensorflow.python.util import nest
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
import numpy as np
import os
import argparse
import time
import paddle.v2 as paddle
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--embedding_dim",
type=int,
default=512,
help="The dimension of embedding table. (default: %(default)d)")
parser.add_argument(
"--encoder_size",
type=int,
default=512,
help="The size of encoder bi-rnn unit. (default: %(default)d)")
parser.add_argument(
"--decoder_size",
type=int,
default=512,
help="The size of decoder rnn unit. (default: %(default)d)")
parser.add_argument(
"--batch_size",
type=int,
default=128,
help="The sequence number of a mini-batch data. (default: %(default)d)")
parser.add_argument(
"--dict_size",
type=int,
default=30000,
help="The dictionary capacity. Dictionaries of source sequence and "
"target dictionary have same capacity. (default: %(default)d)")
parser.add_argument(
"--max_time_steps",
type=int,
default=81,
help="Max number of time steps for sequence. (default: %(default)d)")
parser.add_argument(
"--pass_num",
type=int,
default=10,
help="The pass number to train. (default: %(default)d)")
parser.add_argument(
"--learning_rate",
type=float,
default=0.0002,
help="Learning rate used to train the model. (default: %(default)f)")
parser.add_argument(
"--infer_only", action='store_true', help="If set, run forward only.")
parser.add_argument(
"--beam_size",
type=int,
default=3,
help="The width for beam searching. (default: %(default)d)")
parser.add_argument(
"--max_generation_length",
type=int,
default=250,
help="The maximum length of sequence when doing generation. "
"(default: %(default)d)")
parser.add_argument(
"--save_freq",
type=int,
default=500,
help="Save model checkpoint every this interation. (default: %(default)d)")
parser.add_argument(
"--model_dir",
type=str,
default='./checkpoint',
help="Path to save model checkpoints. (default: %(default)d)")
_Linear = core_rnn_cell._Linear # pylint: disable=invalid-name
START_TOKEN_IDX = 0
END_TOKEN_IDX = 1
class LSTMCellWithSimpleAttention(RNNCell):
"""Add attention mechanism to BasicLSTMCell.
This class is a wrapper based on tensorflow's `BasicLSTMCell`.
"""
def __init__(self,
num_units,
encoder_vector,
encoder_proj,
source_sequence_length,
forget_bias=1.0,
state_is_tuple=True,
activation=None,
reuse=None):
super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse)
if not state_is_tuple:
logging.warn("%s: Using a concatenated state is slower and will "
"soon be deprecated. Use state_is_tuple=True.", self)
self._num_units = num_units
# set padding part to 0
self._encoder_vector = self._reset_padding(encoder_vector,
source_sequence_length)
self._encoder_proj = self._reset_padding(encoder_proj,
source_sequence_length)
self._forget_bias = forget_bias
self._state_is_tuple = state_is_tuple
self._activation = activation or math_ops.tanh
self._linear = None
@property
def state_size(self):
return (LSTMStateTuple(self._num_units, self._num_units) \
if self._state_is_tuple else 2 * self._num_units)
@property
def output_size(self):
return self._num_units
def zero_state(self, batch_size, dtype):
state_size = self.state_size
if hasattr(self, "_last_zero_state"):
(last_state_size, last_batch_size, last_dtype,
last_output) = getattr(self, "_last_zero_state")
if (last_batch_size == batch_size and last_dtype == dtype and
last_state_size == state_size):
return last_output
with ops.name_scope(
type(self).__name__ + "ZeroState", values=[batch_size]):
output = _zero_state_tensors(state_size, batch_size, dtype)
self._last_zero_state = (state_size, batch_size, dtype, output)
return output
def call(self, inputs, state):
sigmoid = math_ops.sigmoid
# Parameters of gates are concatenated into one multiply for efficiency.
if self._state_is_tuple:
c, h = state
else:
c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
# get context from encoder outputs
context = self._simple_attention(self._encoder_vector,
self._encoder_proj, h)
if self._linear is None:
self._linear = _Linear([inputs, context, h], 4 * self._num_units,
True)
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
i, j, f, o = array_ops.split(
value=self._linear([inputs, context, h]),
num_or_size_splits=4,
axis=1)
new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
self._activation(j))
new_h = self._activation(new_c) * sigmoid(o)
if self._state_is_tuple:
new_state = LSTMStateTuple(new_c, new_h)
else:
new_state = array_ops.concat([new_c, new_h], 1)
return new_h, new_state
def _simple_attention(self, encoder_vec, encoder_proj, decoder_state):
"""Implement the attention function.
The implementation has the same logic to the fluid decoder.
"""
decoder_state_proj = tf.contrib.layers.fully_connected(
inputs=decoder_state,
num_outputs=self._num_units,
activation_fn=None,
biases_initializer=None)
decoder_state_expand = tf.tile(
tf.expand_dims(
input=decoder_state_proj, axis=1),
[1, tf.shape(encoder_proj)[1], 1])
concated = tf.concat([decoder_state_expand, encoder_proj], axis=2)
# need reduce the first dimension
attention_weights = tf.contrib.layers.fully_connected(
inputs=tf.reshape(
concated, shape=[-1, self._num_units * 2]),
num_outputs=1,
activation_fn=tf.nn.tanh,
biases_initializer=None)
attention_weights_reshaped = tf.reshape(
attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1])
# normalize the attention weights using softmax
attention_weights_normed = tf.nn.softmax(
attention_weights_reshaped, dim=1)
scaled = tf.multiply(attention_weights_normed, encoder_vec)
context = tf.reduce_sum(scaled, axis=1)
return context
def _reset_padding(self,
memory,
memory_sequence_length,
check_inner_dims_defined=True):
"""Reset the padding part for encoder inputs.
This funtion comes from tensorflow's `_prepare_memory` function.
"""
memory = nest.map_structure(
lambda m: ops.convert_to_tensor(m, name="memory"), memory)
if memory_sequence_length is not None:
memory_sequence_length = ops.convert_to_tensor(
memory_sequence_length, name="memory_sequence_length")
if check_inner_dims_defined:
def _check_dims(m):
if not m.get_shape()[2:].is_fully_defined():
raise ValueError(
"Expected memory %s to have fully defined inner dims, "
"but saw shape: %s" % (m.name, m.get_shape()))
nest.map_structure(_check_dims, memory)
if memory_sequence_length is None:
seq_len_mask = None
else:
seq_len_mask = array_ops.sequence_mask(
memory_sequence_length,
maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
dtype=nest.flatten(memory)[0].dtype)
seq_len_batch_size = (memory_sequence_length.shape[0].value or
array_ops.shape(memory_sequence_length)[0])
def _maybe_mask(m, seq_len_mask):
rank = m.get_shape().ndims
rank = rank if rank is not None else array_ops.rank(m)
extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
m_batch_size = m.shape[0].value or array_ops.shape(m)[0]
if memory_sequence_length is not None:
message = ("memory_sequence_length and memory tensor "
"batch sizes do not match.")
with ops.control_dependencies([
check_ops.assert_equal(
seq_len_batch_size, m_batch_size, message=message)
]):
seq_len_mask = array_ops.reshape(
seq_len_mask,
array_ops.concat(
(array_ops.shape(seq_len_mask), extra_ones), 0))
return m * seq_len_mask
else:
return m
return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask),
memory)
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size,
max_generation_length):
src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
src_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
src_embedding_weights = tf.get_variable("source_word_embeddings",
[source_dict_dim, embedding_dim])
src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)
src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
# no peephole
encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
cell_fw=src_forward_cell,
cell_bw=src_reversed_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
dtype=tf.float32)
# concat the forward outputs and backward outputs
encoded_vec = tf.concat(encoder_outputs, axis=2)
# project the encoder outputs to size of decoder lstm
encoded_proj = tf.contrib.layers.fully_connected(
inputs=tf.reshape(
encoded_vec, shape=[-1, embedding_dim * 2]),
num_outputs=decoder_size,
activation_fn=None,
biases_initializer=None)
encoded_proj_reshape = tf.reshape(
encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])
# get init state for decoder lstm's H
backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
decoder_boot = tf.contrib.layers.fully_connected(
inputs=tf.reshape(
backword_first, shape=[-1, embedding_dim]),
num_outputs=decoder_size,
activation_fn=tf.nn.tanh,
biases_initializer=None)
# prepare the initial state for decoder lstm
cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
initial_state = LSTMStateTuple(cell_init, decoder_boot)
# create decoder lstm cell
decoder_cell = LSTMCellWithSimpleAttention(
decoder_size,
encoded_vec
if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
encoded_proj_reshape if not is_generating else
seq2seq.tile_batch(encoded_proj_reshape, beam_size),
src_sequence_length if not is_generating else
seq2seq.tile_batch(src_sequence_length, beam_size),
forget_bias=0.0)
output_layer = Dense(target_dict_dim, name='output_projection')
if not is_generating:
trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
trg_embedding_weights = tf.get_variable(
"target_word_embeddings", [target_dict_dim, embedding_dim])
trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
trg_word_idx)
training_helper = seq2seq.TrainingHelper(
inputs=trg_embedding,
sequence_length=trg_sequence_length,
time_major=False,
name='training_helper')
training_decoder = seq2seq.BasicDecoder(
cell=decoder_cell,
helper=training_helper,
initial_state=initial_state,
output_layer=output_layer)
# get the max length of target sequence
max_decoder_length = tf.reduce_max(trg_sequence_length)
decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
decoder=training_decoder,
output_time_major=False,
impute_finished=True,
maximum_iterations=max_decoder_length)
decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
decoder_pred_train = tf.argmax(
decoder_logits_train, axis=-1, name='decoder_pred_train')
masks = tf.sequence_mask(
lengths=trg_sequence_length,
maxlen=max_decoder_length,
dtype=tf.float32,
name='masks')
# place holder of label sequence
lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])
# compute the loss
loss = seq2seq.sequence_loss(
logits=decoder_logits_train,
targets=lbl_word_idx,
weights=masks,
average_across_timesteps=True,
average_across_batch=True)
# return feeding list and loss operator
return {
'src_word_idx': src_word_idx,
'src_sequence_length': src_sequence_length,
'trg_word_idx': trg_word_idx,
'trg_sequence_length': trg_sequence_length,
'lbl_word_idx': lbl_word_idx
}, loss
else:
start_tokens = tf.ones([tf.shape(src_word_idx)[0], ],
tf.int32) * START_TOKEN_IDX
# share the same embedding weights with target word
trg_embedding_weights = tf.get_variable(
"target_word_embeddings", [target_dict_dim, embedding_dim])
inference_decoder = beam_search_decoder.BeamSearchDecoder(
cell=decoder_cell,
embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens),
start_tokens=start_tokens,
end_token=END_TOKEN_IDX,
initial_state=tf.nn.rnn_cell.LSTMStateTuple(
tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
beam_width=beam_size,
output_layer=output_layer)
decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
decoder=inference_decoder,
output_time_major=False,
#impute_finished=True,# error occurs
maximum_iterations=max_generation_length)
predicted_ids = decoder_outputs_decode.predicted_ids
return {
'src_word_idx': src_word_idx,
'src_sequence_length': src_sequence_length
}, predicted_ids
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in vars(args).iteritems():
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def padding_data(data, padding_size, value):
data = data + [value] * padding_size
return data[:padding_size]
def save(sess, path, var_list=None, global_step=None):
saver = tf.train.Saver(var_list)
save_path = saver.save(sess, save_path=path, global_step=global_step)
print('Model save at %s' % save_path)
def restore(sess, path, var_list=None):
# var_list = None returns the list of all saveable variables
saver = tf.train.Saver(var_list)
saver.restore(sess, save_path=path)
print('model restored from %s' % path)
def adapt_batch_data(data):
src_seq = map(lambda x: x[0], data)
trg_seq = map(lambda x: x[1], data)
lbl_seq = map(lambda x: x[2], data)
src_sequence_length = np.array(
[len(seq) for seq in src_seq]).astype('int32')
src_seq_maxlen = np.max(src_sequence_length)
trg_sequence_length = np.array(
[len(seq) for seq in trg_seq]).astype('int32')
trg_seq_maxlen = np.max(trg_sequence_length)
src_seq = np.array(
[padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
for seq in src_seq]).astype('int32')
trg_seq = np.array(
[padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
for seq in trg_seq]).astype('int32')
lbl_seq = np.array(
[padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
for seq in lbl_seq]).astype('int32')
return {
'src_word_idx': src_seq,
'src_sequence_length': src_sequence_length,
'trg_word_idx': trg_seq,
'trg_sequence_length': trg_sequence_length,
'lbl_word_idx': lbl_seq
}
def train():
feeding_dict, loss = seq_to_seq_net(
embedding_dim=args.embedding_dim,
encoder_size=args.encoder_size,
decoder_size=args.decoder_size,
source_dict_dim=args.dict_size,
target_dict_dim=args.dict_size,
is_generating=False,
beam_size=args.beam_size,
max_generation_length=args.max_generation_length)
global_step = tf.Variable(0, trainable=False, name='global_step')
trainable_params = tf.trainable_variables()
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
gradients = tf.gradients(loss, trainable_params)
# may clip the parameters
clip_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
updates = optimizer.apply_gradients(
zip(gradients, trainable_params), global_step=global_step)
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
train_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
def do_validataion():
total_loss = 0.0
count = 0
for batch_id, data in enumerate(test_batch_generator()):
adapted_batch_data = adapt_batch_data(data)
outputs = sess.run([loss],
feed_dict={
item[1]: adapted_batch_data[item[0]]
for item in feeding_dict.items()
})
total_loss += outputs[0]
count += 1
return total_loss / count
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_batch_generator()):
adapted_batch_data = adapt_batch_data(data)
words_seen += np.sum(adapted_batch_data['src_sequence_length'])
words_seen += np.sum(adapted_batch_data['trg_sequence_length'])
outputs = sess.run([updates, loss],
feed_dict={
item[1]: adapted_batch_data[item[0]]
for item in feeding_dict.items()
})
print("pass_id=%d, batch_id=%d, train_loss: %f" %
(pass_id, batch_id, outputs[1]))
pass_end_time = time.time()
test_loss = do_validataion()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_loss, words_per_sec, time_consumed))
def infer():
feeding_dict, predicted_ids = seq_to_seq_net(
embedding_dim=args.embedding_dim,
encoder_size=args.encoder_size,
decoder_size=args.decoder_size,
source_dict_dim=args.dict_size,
target_dict_dim=args.dict_size,
is_generating=True,
beam_size=args.beam_size,
max_generation_length=args.max_generation_length)
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
with tf.Session(config=config) as sess:
restore(sess, './checkpoint/tf_seq2seq-1500')
for batch_id, data in enumerate(test_batch_generator()):
src_seq = map(lambda x: x[0], data)
source_language_seq = [
src_dict[item] for seq in src_seq for item in seq
]
src_sequence_length = np.array(
[len(seq) for seq in src_seq]).astype('int32')
src_seq_maxlen = np.max(src_sequence_length)
src_seq = np.array([
padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
for seq in src_seq
]).astype('int32')
outputs = sess.run([predicted_ids],
feed_dict={
feeding_dict['src_word_idx']: src_seq,
feeding_dict['src_sequence_length']:
src_sequence_length
})
print("\nDecoder result comparison: ")
source_language_seq = ' '.join(source_language_seq).lstrip(
'<s>').rstrip('<e>').strip()
inference_seq = ''
print(" --> source: " + source_language_seq)
for item in outputs[0][0]:
if item[0] == END_TOKEN_IDX: break
inference_seq += ' ' + trg_dict.get(item[0], '<unk>')
print(" --> inference: " + inference_seq)
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
if args.infer_only:
infer()
else:
train()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import tensorflow as tf
import paddle.v2 as paddle
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser("mnist model benchmark.")
parser.add_argument(
'--batch_size', type=int, default=128, help='The minibatch size.')
parser.add_argument(
'--iterations', type=int, default=35, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=5, help='The number of passes.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
args = parser.parse_args()
return args
def run_benchmark(args):
def weight_variable(dtype, shape):
initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
return tf.Variable(initial)
def bias_variable(dtype, shape):
initial = tf.constant(0.1, shape=shape, dtype=dtype)
return tf.Variable(initial)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
labels = tf.placeholder(tf.int64, shape=(None, ))
# conv1, relu, pool1
conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
conv1_bias = bias_variable(DTYPE, [20])
conv1 = tf.nn.conv2d(
images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
pool1 = tf.nn.max_pool(
relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# conv2, relu, pool2
conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
conv2_bias = bias_variable(DTYPE, [50])
conv2 = tf.nn.conv2d(
pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
pool2 = tf.nn.max_pool(
relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# FC
pool_shape = pool2.get_shape().as_list()
hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
fc_bias = bias_variable(DTYPE, [10])
logits = tf.matmul(reshape, fc_weights) + fc_bias
# Get prediction
prediction = tf.nn.softmax(logits)
# Loss
one_hot_labels = tf.one_hot(labels, depth=10)
cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
avg_cost = tf.reduce_mean(cost)
# Get accuracy
correct = tf.equal(tf.argmax(prediction, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# metrics, g_accuracy
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_accuracy = tf.metrics.accuracy(
labels, tf.argmax(
prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
g_accuracy_reset_op = tf.variables_initializer(vars)
# Optimizer
opt = tf.train.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
train_op = opt.minimize(avg_cost)
# train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=args.batch_size)
def eval_test():
sess.run(g_accuracy_reset_op)
for batch_id, data in enumerate(test_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
loss, acc, g_acc = sess.run(
[avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
return g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
for pass_id in range(args.pass_num):
sess.run(g_accuracy_reset_op)
pass_start = time.time()
for batch_id, data in enumerate(train_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
"int64")
start = time.time()
_, loss, acc, g_acc = sess.run(
[train_op, avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
end = time.time()
print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
(pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
pass_end = time.time()
test_avg_acc = eval_test()
print(
"pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
% (pass_id, g_acc[1], test_avg_acc,
(pass_end - pass_start) / 1000))
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
run_benchmark(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py
Get help: python resnet.py --help
See performance on flowers: python resnet.py
Train on cifar10: python resnet.py --data=cifar10 --with_test
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import paddle.v2 as paddle
import tensorflow as tf
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser('Convolution model benchmark.')
parser.add_argument(
'--model',
type=str,
choices=['resnet'],
default='resnet',
help='The model architecture.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='use real data or fake data')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test'
)
parser.add_argument(
'--iterations',
type=int,
default=105,
help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=300, help='The number of passes.')
parser.add_argument(
'--order',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
help='The data order, now only support NCHW.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--data',
type=str,
default='flowers102',
choices=['flowers102', 'cifar10'],
help='The kinds of data.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
args = parser.parse_args()
return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[
'with_test'] else vars(args)['iterations']
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def fixed_padding(inputs, kernel_size, data_format):
"""Pads the input along the spatial dimensions independently of input size.
Args:
inputs: A tensor of size [batch, channels, height_in, width_in] or
[batch, height_in, width_in, channels] depending on data_format.
kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
Should be a positive integer.
data_format: The input format ('channels_last' or 'channels_first').
Returns:
A tensor with the same format as the input with the data either intact
(if kernel_size == 1) or padded (if kernel_size > 1).
"""
pad_total = kernel_size - 1
pad_beg = pad_total // 2
pad_end = pad_total - pad_beg
if data_format == 'channels_first':
padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end],
[pad_beg, pad_end]])
else:
padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
[pad_beg, pad_end], [0, 0]])
return padded_inputs
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
"""Strided 2-D convolution with explicit padding."""
# The padding is consistent and is based only on `kernel_size`, not on the
# dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
# This is consistent with PaddlePaddle.
# In addition, the calculation for output size in TensorFlow can refer:
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc
if strides > 1:
inputs = fixed_padding(inputs, kernel_size, data_format)
return tf.layers.conv2d(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
padding=('SAME' if strides == 1 else 'VALID'),
use_bias=False,
kernel_initializer=tf.variance_scaling_initializer(),
data_format=data_format)
def conv_bn(inputs,
filters,
kernel_size,
strides,
is_training,
data_format,
act=True):
# def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
# set fused=True for a significant performance boost. See
# https://www.tensorflow.org/performance/performance_guide#common_fused_ops
inputs = conv2d_fixed_padding(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
data_format=data_format)
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if data_format == 'channels_first' else 3,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
if act:
inputs = tf.nn.relu(inputs)
return inputs
def basicblock(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def bottleneck(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = conv_bn(
inputs, filters * 4, 1, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name,
data_format):
# Bottleneck blocks end with 4x the number of filters as they start with
filters_out = 4 * filters if block_fn is bottleneck else filters
def projection_shortcut(inputs):
return conv2d_fixed_padding(
inputs=inputs,
filters=filters_out,
kernel_size=1,
strides=strides,
data_format=data_format)
# Only the first block per block_layer uses projection_shortcut and strides
inputs = block_fn(inputs, filters, is_training, projection_shortcut,
strides, data_format)
for _ in range(1, blocks):
inputs = block_fn(inputs, filters, is_training, None, 1, data_format)
return tf.identity(inputs, name)
def resnet_imagenet(depth, class_dim, data_format):
"""Returns the ResNet model for a given size and number of output classes."""
def resnet_generator(block_fn,
layers,
num_classes,
data_format='channels_last'):
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
"""Constructs the ResNet model given the inputs."""
if data_format == 'channels_first':
# Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
# This provides a large performance boost on GPU. See
# https://www.tensorflow.org/performance/performance_guide#data_formats
inputs = tf.transpose(inputs, [0, 3, 1, 2])
inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = tf.layers.max_pooling2d(
inputs=inputs,
pool_size=3,
strides=2,
padding='SAME',
data_format=data_format)
inputs = tf.identity(inputs, 'initial_max_pool')
inputs = block_layer(inputs, 64, block_fn, layers[0], 1,
is_training, 'block_layer1', data_format)
inputs = block_layer(inputs, 128, block_fn, layers[1], 2,
is_training, 'block_layer2', data_format)
inputs = block_layer(inputs, 256, block_fn, layers[2], 2,
is_training, 'block_layer3', data_format)
inputs = block_layer(inputs, 512, block_fn, layers[3], 2,
is_training, 'block_layer4', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=7,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs,
[-1, 512 if block_fn is basicblock else 2048])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
model_params = {
18: {
'block': basicblock,
'layers': [2, 2, 2, 2]
},
34: {
'block': basicblock,
'layers': [3, 4, 6, 3]
},
50: {
'block': bottleneck,
'layers': [3, 4, 6, 3]
},
101: {
'block': bottleneck,
'layers': [3, 4, 23, 3]
},
152: {
'block': bottleneck,
'layers': [3, 8, 36, 3]
},
200: {
'block': bottleneck,
'layers': [3, 24, 36, 3]
}
}
if depth not in model_params:
raise ValueError('Not a valid depth:', depth)
params = model_params[depth]
return resnet_generator(params['block'], params['layers'], class_dim,
data_format)
def resnet_cifar10(depth, num_classes, data_format):
if depth % 6 != 2:
raise ValueError('depth must be 6n + 2:', depth)
num_blocks = (depth - 2) // 6
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training,
'block_layer1', data_format)
inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training,
'block_layer2', data_format)
inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training,
'block_layer3', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=8,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs, [-1, 64])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
def run_benchmark(args, data_format='channels_last', device='/cpu:0'):
"""Our model_fn for ResNet to be used with our Estimator."""
class_dim = 1000
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
if args.data == 'flowers102':
class_dim = 102
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
elif args.data == 'cifar10':
class_dim = 10
dshape = (None, 32, 32, 3)
pdshape = (3, 32, 32)
with tf.device(device):
images = tf.placeholder(DTYPE, shape=dshape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
network = resnet_cifar10(
32, class_dim,
data_format) if args.data == 'cifar10' else resnet_imagenet(
50, class_dim, data_format)
logits = network(inputs=images, is_training=is_training)
cross_entropy = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=onehot_labels)
avg_cost = tf.reduce_mean(cross_entropy)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
lr = 0.1 if args.data == 'cifar10' else 0.01
optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
# Batch norm requires update_ops to be added as a train_op dependency.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=100)
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, np.mean(test_accs)))
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
if args.use_fake_data:
data = train_reader().next()
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype('int64')
iters, num_samples, start_time = 0, 0, 0.0
for pass_id in range(args.pass_num):
if iters == args.iterations:
break
train_accs = []
train_losses = []
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
if not args.use_fake_data:
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_cost, accuracy],
feed_dict={
images: images_data,
labels: labels_data,
is_training: True
})
iters += 1
train_accs.append(acc)
train_losses.append(loss)
num_samples += len(data)
print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
print("Pass=%d, Loss=%f, Accuray=%f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
# evaluation
if args.with_test:
test()
if not args.with_test:
duration = time.time() - start_time
examples_per_sec = num_samples / duration
sec_per_batch = duration / (iters - args.skip_batch_num)
print('Total examples: %d, total time: %.5f' %
(num_samples, duration))
print('%.5f examples/sec, %.5f sec/batch' %
(examples_per_sec, sec_per_batch))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if tf.test.is_built_with_cuda():
device = '/device:GPU:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
data_format = 'channels_first'
else:
device = '/cpu:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
raise ValueError('Only support NHWC order in CPU mode')
run_benchmark(args, data_format, device)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import tensorflow as tf
import paddle.v2 as paddle
def parse_args():
parser = argparse.ArgumentParser("LSTM model benchmark.")
parser.add_argument(
'--batch_size',
type=int,
default=32,
help='The sequence number of a batch data. (default: %(default)d)')
parser.add_argument(
'--stacked_num',
type=int,
default=5,
help='Number of lstm layers to stack. (default: %(default)d)')
parser.add_argument(
'--embedding_dim',
type=int,
default=512,
help='Dimension of embedding table. (default: %(default)d)')
parser.add_argument(
'--hidden_dim',
type=int,
default=512,
help='Hidden size of lstm unit. (default: %(default)d)')
parser.add_argument(
'--pass_num',
type=int,
default=10,
help='Epoch number to train. (default: %(default)d)')
parser.add_argument(
'--learning_rate',
type=float,
default=0.0002,
help='Learning rate used to train. (default: %(default)f)')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
args = parser.parse_args()
return args
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def dynamic_lstm_model(dict_size,
embedding_dim,
hidden_dim,
stacked_num,
class_num=2,
is_train=True):
word_idx = tf.placeholder(tf.int64, shape=[None, None])
sequence_length = tf.placeholder(tf.int64, shape=[None, ])
embedding_weights = tf.get_variable('word_embeddings',
[dict_size, embedding_dim])
embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
lstm_cell = tf.nn.rnn_cell.LSTMCell(
num_units=hidden_dim, use_peepholes=False)
stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num)
# final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
_, final_state = tf.nn.dynamic_rnn(
cell=stacked_cell,
inputs=embedding,
dtype=tf.float32,
sequence_length=sequence_length)
w = tf.Variable(
tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
bias = tf.Variable(
tf.constant(
value=0.0, shape=[class_num], dtype=tf.float32))
prediction = tf.matmul(final_state[-1][1], w) + bias
if not is_train:
return (word_idx, sequence_length), tf.nn.softmax(prediction)
label = tf.placeholder(tf.int64, shape=[None, ])
loss = tf.nn.softmax_cross_entropy_with_logits(
labels=tf.one_hot(label, 2), logits=prediction)
avg_loss = tf.reduce_mean(loss)
correct_count = tf.equal(tf.argmax(prediction, 1), label)
acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
reset_op = tf.variables_initializer(vars)
return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
def padding_data(data, padding_size, value):
data = data + [value] * padding_size
return data[:padding_size]
def train(args):
word_dict = paddle.dataset.imdb.word_dict()
dict_size = len(word_dict)
feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
train_op = adam_optimizer.minimize(avg_loss)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=25000),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.test(word_dict), buf_size=25000),
batch_size=args.batch_size)
def do_validation(sess):
sess.run(reset_op)
for batch_id, data in enumerate(test_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
_, loss, fetch_acc, fetch_g_acc = sess.run(
[train_op, avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
return fetch_g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
# clear accuracy local variable
sess.run(reset_op)
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
words_seen += np.sum(sequence_length)
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
_, loss, fetch_acc, fetch_g_acc = sess.run(
[train_op, avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
% (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
test_acc = do_validation(sess)
print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_acc, words_per_sec, time_consumed))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if args.infer_only:
pass
else:
train(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in TensorFlow"""
import tensorflow as tf
import paddle.v2 as paddle
import numpy as np
import argparse
import time
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test')
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
'--data_format',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
help='The data order, NCHW=[batch, channels, height, width].'
'Only support NHWC right now.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
args = parser.parse_args()
class VGG16Model(object):
def __init__(self):
self.parameters = []
def batch_norm_relu(self, inputs, is_training):
"""Performs a batch normalization followed by a ReLU."""
# We set fused=True for a significant speed boost. See
# https://www.tensorflow.org/speed/speed_guide#common_fused_ops
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if args.data_format == 'NCHW' else -1,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
inputs = tf.nn.relu(inputs)
return inputs
def conv_bn_layer(self,
name,
images,
kernel_shape,
is_training,
drop_rate=0.0):
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
kernel_shape, dtype=tf.float32, stddev=1e-1),
name='weights')
conv = tf.nn.conv2d(
images,
kernel, [1, 1, 1, 1],
data_format=args.data_format,
padding='SAME')
biases = tf.Variable(
tf.constant(
0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(conv, biases)
out = self.batch_norm_relu(out, is_training)
out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
return out
def fc_layer(self, name, inputs, shape):
with tf.name_scope(name) as scope:
fc_w = tf.Variable(
tf.truncated_normal(
shape, dtype=tf.float32, stddev=1e-1),
name='weights')
fc_b = tf.Variable(
tf.constant(
0.0, shape=[shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
return out
def network(self, images, class_dim, is_training):
""" VGG16 model structure.
TODO(kuke): enable this network to support the 'NCHW' data format
"""
# conv1
conv1_1 = self.conv_bn_layer(
'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
conv1_2 = self.conv_bn_layer(
'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
# pool1
pool1 = tf.nn.max_pool(
conv1_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool1')
# conv2
conv2_1 = self.conv_bn_layer(
'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
conv2_2 = self.conv_bn_layer(
'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
# pool2
pool2 = tf.nn.max_pool(
conv2_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool2')
# conv3
conv3_1 = self.conv_bn_layer(
'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
conv3_2 = self.conv_bn_layer(
'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
conv3_3 = self.conv_bn_layer(
'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
# pool3
pool3 = tf.nn.max_pool(
conv3_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool3')
# conv4
conv4_1 = self.conv_bn_layer(
'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
conv4_2 = self.conv_bn_layer(
'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv4_3 = self.conv_bn_layer(
'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool4
pool4 = tf.nn.max_pool(
conv4_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# conv5
conv5_1 = self.conv_bn_layer(
'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_2 = self.conv_bn_layer(
'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_3 = self.conv_bn_layer(
'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool5
pool5 = tf.nn.max_pool(
conv5_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# flatten
shape = int(np.prod(pool5.get_shape()[1:]))
pool5_flat = tf.reshape(pool5, [-1, shape])
# fc1
drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
fc1 = self.fc_layer('fc1', drop, [shape, 512])
# fc2
bn = self.batch_norm_relu(fc1, is_training)
drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
fc2 = self.fc_layer('fc2', drop, [512, 512])
fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
return fc3
def run_benchmark():
"""Run benchmark on cifar10 or flowers."""
if args.data_set == "cifar10":
class_dim = 10
raw_shape = (3, 32, 32)
dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
None, 3, 32, 32)
else:
class_dim = 102
raw_shape = (3, 224, 224)
dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
None, 3, 224, 224)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(tf.float32, shape=dat_shape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
vgg16 = VGG16Model()
logits = vgg16.network(images, class_dim, is_training)
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
avg_loss = tf.reduce_mean(loss)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_loss)
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
buf_size=5120),
batch_size=args.batch_size)
# test
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
return np.mean(test_accs)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.num_passes):
# train
num_samples = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
train_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
train_labels = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_loss, accuracy],
feed_dict={
images: train_images,
labels: train_labels,
is_training: True
})
iters += 1
num_samples += len(data)
print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
# test
pass_test_acc = test()
print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, pass_test_acc))
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
print_arguments()
run_benchmark()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mutex>
namespace paddle {
namespace platform {
/*
The current implementation of std::call_once has a bug described in
https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call.
This is likely caused by a deeper bug of pthread_once, which is discussed in
https://patchwork.ozlabs.org/patch/482350/
This wrap is a hack to avoid this bug.
*/
template <typename Callable, typename... Args>
inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
bool good = true;
std::exception ex;
try {
std::call_once(flag,
[&](Args&&... args) {
try {
f(args...);
} catch (const std::exception& e) {
ex = e;
good = false;
} catch (...) {
ex = std::runtime_error("excption caught in call_once");
good = false;
}
},
args...);
} catch (std::system_error& x) {
throw std::runtime_error("call once failed");
}
if (!good) {
throw std::exception(ex);
}
}
} // namespace platform
} // namespace paddle
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/call_once.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册