Commit 75a2e505 authored by: J JiayiFeng

Merge branch 'modify_dev' into modify_readers_to_fit_parallel_executor

@@ -48,6 +48,13 @@ parser.add_argument(
     type=int,
     default=16,
     help="The number of sequences in a mini-batch. (default: %(default)d)")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The number of initial minibatches to skip, for a fairer performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
 parser.add_argument(
     "--dict_size",
     type=int,
@@ -72,16 +79,21 @@ parser.add_argument(
     default=3,
     help="The width for beam searching. (default: %(default)d)")
 parser.add_argument(
-    "--use_gpu",
-    type=distutils.util.strtobool,
-    default=True,
-    help="Whether to use gpu. (default: %(default)d)")
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
 parser.add_argument(
     "--max_length",
     type=int,
     default=250,
     help="The maximum length of sequence when doing generation. "
     "(default: %(default)d)")
+parser.add_argument(
+    '--with_test',
+    action='store_true',
+    help='If set, test the testset during training.')


 def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
@@ -281,7 +293,7 @@ def train():
             paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
         batch_size=args.batch_size)

-    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
     exe = Executor(place)
     exe.run(framework.default_startup_program())
@@ -307,14 +319,20 @@ def train():
         return total_loss / count

+    iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in xrange(args.pass_num):
-        pass_start_time = time.time()
-        words_seen = 0
+        train_accs = []
+        train_losses = []
         for batch_id, data in enumerate(train_batch_generator()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
             src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
-            words_seen += word_num
+            num_samples += word_num
             trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
-            words_seen += word_num
+            num_samples += word_num
             lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)

             fetch_outs = exe.run(framework.default_main_program(),
@@ -325,24 +343,36 @@ def train():
                                  },
                                  fetch_list=[avg_cost])

-            avg_cost_val = np.array(fetch_outs[0])
-            print('pass_id=%d, batch_id=%d, train_loss: %f' %
-                  (pass_id, batch_id, avg_cost_val))
+            iters += 1
+            loss = np.array(fetch_outs[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
+            )  # The accuracy is the accumulation of batches, but not the current batch.

-        pass_end_time = time.time()
-        test_loss = do_validation()
-        time_consumed = pass_end_time - pass_start_time
-        words_per_sec = words_seen / time_consumed
-        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
-              (pass_id, test_loss, words_per_sec, time_consumed))
+    train_elapsed = time.time() - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+    # evaluation
+    if args.with_test:
+        test_loss = do_validation()
+    exit(0)


 def infer():
     pass


+def print_arguments(args):
+    print('----------- seq2seq Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
     args = parser.parse_args()
+    print_arguments(args)
     if args.infer_only:
         infer()
     else:
...
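The same warm-up pattern recurs in every benchmark this commit touches: the first `skip_batch_num` iterations run with the clock already started, then the clock and the sample counter are reset so that one-off startup costs (cuDNN autotuning, allocator growth, reader warm-up) do not distort throughput, and the loop stops after a fixed `iterations` count. A minimal, self-contained sketch of the pattern (names are illustrative, not from the diff):

import time

def timed_loop(batches, skip_batch_num=5, iterations=80):
    iters, num_samples, start_time = 0, 0, time.time()
    for batch in batches:
        if iters == skip_batch_num:
            # Restart the clock once the warm-up batches are done.
            start_time = time.time()
            num_samples = 0
        if iters == iterations:
            break
        num_samples += len(batch)  # stand-in for the real training step
        iters += 1
    train_elapsed = time.time() - start_time
    return num_samples / train_elapsed  # examples per second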
@@ -35,6 +35,12 @@ def parse_args():
     parser = argparse.ArgumentParser("mnist model benchmark.")
     parser.add_argument(
         '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The number of initial minibatches to skip, for a fairer performance test'
+    )
     parser.add_argument(
         '--iterations', type=int, default=35, help='The number of minibatches.')
     parser.add_argument(
@@ -53,19 +59,14 @@ def parse_args():
         '--use_nvprof',
         action='store_true',
         help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
     args = parser.parse_args()
     return args


-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')


 def cnn_model(data):
     conv_pool_1 = fluid.nets.simple_img_conv_pool(
         input=data,
@@ -161,16 +162,22 @@ def run_benchmark(model, args):
         paddle.dataset.mnist.train(), batch_size=args.batch_size)

     accuracy = fluid.average.WeightedAverage()
+    iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in range(args.pass_num):
         accuracy.reset()
-        pass_start = time.time()
+        train_accs = []
+        train_losses = []
         for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
             img_data = np.array(
                 map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([len(y_data), 1])

-            start = time.time()
             outs = exe.run(
                 fluid.default_main_program(),
                 feed={"pixel": img_data,
@@ -178,21 +185,36 @@ def run_benchmark(model, args):
                 fetch_list=[avg_cost, batch_acc, batch_size_tensor]
             )  # The accuracy is the accumulation of batches, but not the current batch.
             accuracy.add(value=outs[1], weight=outs[2])
-            end = time.time()
+            iters += 1
+            num_samples += len(y_data)
             loss = np.array(outs[0])
             acc = np.array(outs[1])
-            print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
-                  (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
+            train_losses.append(loss)
+            train_accs.append(acc)
+            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+                  (pass_id, iters, loss, acc))

-        pass_end = time.time()
-        train_avg_acc = accuracy.eval()
-        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
-                                 inference_program)
-
-        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
-              (pass_id, train_avg_acc, test_avg_acc,
-               (pass_end - pass_start) / 1000))
+        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+    train_elapsed = time.time() - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+    # evaluation
+    if args.with_test:
+        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+                                 inference_program)
+    exit(0)


+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- mnist Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
...
@@ -87,15 +87,6 @@ def parse_args():
     return args


-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')


 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
     conv1 = fluid.layers.conv2d(
         input=input,
@@ -279,32 +270,31 @@ def run_benchmark(model, args):
                               'label': label},
                     fetch_list=[avg_cost, batch_acc, batch_size_tensor])
             iters += 1
-            num_samples += label[0]
+            num_samples += len(label)
             accuracy.add(value=acc, weight=weight)
             train_losses.append(loss)
             train_accs.append(acc)
             print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
                   (pass_id, iters, loss, acc))
-        pass_train_acc = accuracy.eval()
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        train_elapsed = time.time() - start_time
         print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
               (pass_id, np.mean(train_losses), np.mean(train_accs)))
+    train_elapsed = time.time() - start_time
     examples_per_sec = num_samples / train_elapsed
     print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
           (num_samples, train_elapsed, examples_per_sec))
+    # evaluation
+    if args.with_test:
+        pass_test_acc = test(exe)
+    exit(0)

-    if args.use_cprof:
-        pr.disable()
-        s = StringIO.StringIO()
-        sortby = 'cumulative'
-        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
-        ps.print_stats()
-        print(s.getvalue())


+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- resnet Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
...
#!/bin/bash
# This script benchmarks PaddlePaddle Fluid on a single thread with a single GPU.

-export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
+#export FLAGS_fraction_of_gpu_memory_to_use=0.0
+export CUDNN_PATH=/paddle/cudnn_v5

 # disable openmp and mkl parallel
 #https://github.com/PaddlePaddle/Paddle/issues/7199
@@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0
 export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH

+# only query the gpu used
+nohup stdbuf -oL nvidia-smi \
+    --id=${CUDA_VISIBLE_DEVICES} \
+    --query-gpu=timestamp \
+    --query-compute-apps=pid,process_name,used_memory \
+    --format=csv \
+    --filename=mem.log \
+    -l 1 &
+
+# mnist
+# mnist gpu mnist 128
+FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+    --device=GPU \
+    --batch_size=128 \
+    --skip_batch_num=5 \
+    --iterations=500 \
+    2>&1 | tee -a mnist_gpu_128.log

 # vgg16
-# cifar10 gpu cifar10 128
-FLAGS_benchmark=true python fluid/vgg.py \
+# gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
     --device=GPU \
     --batch_size=128 \
     --skip_batch_num=5 \
     --iterations=30 \
-    2>&1 > vgg16_gpu_128.log
+    2>&1 | tee -a vgg16_gpu_128.log
+
+# flowers gpu 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+    --device=GPU \
+    --batch_size=32 \
+    --data_set=flowers \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a vgg16_gpu_flowers_32.log

 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true python fluid/resnet.py \
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
     --device=GPU \
     --batch_size=128 \
     --data_set=cifar10 \
     --model=resnet_cifar10 \
     --skip_batch_num=5 \
     --iterations=30 \
-    2>&1 > resnet50_gpu_128.log
+    2>&1 | tee -a resnet50_gpu_128.log
+
+# resnet50 gpu flowers 64
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+    --device=GPU \
+    --batch_size=64 \
+    --data_set=flowers \
+    --model=resnet_imagenet \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a resnet50_gpu_flowers_64.log

 # lstm
+# lstm gpu imdb 32  # TensorFlow only supports batch=32
+FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+    --device=GPU \
+    --batch_size=32 \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    --hidden_dim=512 \
+    --emb_dim=512 \
+    --crop_size=1500 \
+    2>&1 | tee -a lstm_gpu_32.log
+
+# seq2seq
+# seq2seq gpu wmt 128
+FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+    --device=GPU \
+    --batch_size=128 \
+    --skip_batch_num=5 \
+    --iterations=30 \
+    2>&1 | tee -a lstm_gpu_128.log
@@ -37,6 +37,14 @@ def parse_args():
         type=int,
         default=32,
         help='The number of sequences in a batch. (default: %(default)d)')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The number of initial minibatches to skip, for a fairer performance test'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
     parser.add_argument(
         '--emb_dim',
         type=int,
@@ -64,6 +72,10 @@ def parse_args():
         default=int(os.environ.get('CROP_SIZE', '1500')),
         help='The max sentence length of input. Since this model uses a plain RNN,'
         ' gradients could explode if the sentence is too long')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
     args = parser.parse_args()
     return args
@@ -157,20 +169,23 @@ def main():
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())

-    def train_loop(pass_num, crop_size):
-        with profiler.profiler(args.device, 'total') as prof:
-            for pass_id in range(pass_num):
     train_reader = batch(
         paddle.reader.shuffle(
-            crop_sentence(imdb.train(word_dict), crop_size),
+            crop_sentence(imdb.train(word_dict), args.crop_size),
             buf_size=25000),
         batch_size=args.batch_size)

-    word_nums = 0
-    pass_start_time = time.time()
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_accs = []
+        train_losses = []
         for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
             tensor_words = to_lodtensor([x[0] for x in data], place)
-            for x in data:
-                word_nums += len(x[0])
             label = numpy.array([x[1] for x in data]).astype("int64")
             label = label.reshape((-1, 1))
             loss_np, acc, weight = exe.run(
@@ -178,16 +193,19 @@ def main():
                 feed={"words": tensor_words,
                       "label": label},
                 fetch_list=[loss, batch_acc, batch_size_tensor])
-            print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
-                  (pass_id, batch_id, loss_np, acc))
+            iters += 1
+            for x in data:
+                num_samples += len(x[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iters, loss_np, acc)
+            )  # The accuracy is the accumulation of batches, but not the current batch.

-    pass_end_time = time.time()
-    time_consumed = pass_end_time - pass_start_time
-    words_per_sec = word_nums / time_consumed
-    print("pass_id=%d, sec/pass: %f, words/s: %f" %
-          (pass_id, time_consumed, words_per_sec))
-
-    train_loop(args.pass_num, args.crop_size)
+    train_elapsed = time.time() - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+    exit(0)


 def to_lodtensor(data, place):
@@ -205,5 +223,14 @@ def to_lodtensor(data, place):
     return res
+def print_arguments(args):
+    print('----------- lstm Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
     main()
@@ -191,25 +191,29 @@ def main():
                 fetch_list=[avg_cost, batch_acc, batch_size_tensor])
             accuracy.add(value=acc, weight=weight)
             iters += 1
-            num_samples += len(data)
+            num_samples += len(y_data)
             print(
                 "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                 (pass_id, iters, loss, acc)
             )  # The accuracy is the accumulation of batches, but not the current batch.
-        pass_train_acc = accuracy.eval()
+        # pass_train_acc = accuracy.eval()
         train_losses.append(loss)
         train_accs.append(acc)
+        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+    train_elapsed = time.time() - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+          (num_samples, train_elapsed, examples_per_sec))
     # evaluation
     if args.with_test:
         pass_test_acc = test(exe)
-        train_elapsed = time.time() - start_time
-        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+    exit(0)


 def print_arguments():
-    print('----------- Configuration Arguments -----------')
+    print('----------- vgg Configuration Arguments -----------')
     for arg, value in sorted(vars(args).iteritems()):
         print('%s: %s' % (arg, value))
     print('------------------------------------------------')
...
This diff has been collapsed.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import tensorflow as tf
import paddle.v2 as paddle
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser("mnist model benchmark.")
parser.add_argument(
'--batch_size', type=int, default=128, help='The minibatch size.')
parser.add_argument(
'--iterations', type=int, default=35, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=5, help='The number of passes.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
args = parser.parse_args()
return args
def run_benchmark(args):
def weight_variable(dtype, shape):
initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
return tf.Variable(initial)
def bias_variable(dtype, shape):
initial = tf.constant(0.1, shape=shape, dtype=dtype)
return tf.Variable(initial)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
labels = tf.placeholder(tf.int64, shape=(None, ))
# conv1, relu, pool1
conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
conv1_bias = bias_variable(DTYPE, [20])
conv1 = tf.nn.conv2d(
images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
pool1 = tf.nn.max_pool(
relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# conv2, relu, pool2
conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
conv2_bias = bias_variable(DTYPE, [50])
conv2 = tf.nn.conv2d(
pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
pool2 = tf.nn.max_pool(
relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# FC
pool_shape = pool2.get_shape().as_list()
hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
fc_bias = bias_variable(DTYPE, [10])
logits = tf.matmul(reshape, fc_weights) + fc_bias
# Get prediction
prediction = tf.nn.softmax(logits)
# Loss
one_hot_labels = tf.one_hot(labels, depth=10)
cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
avg_cost = tf.reduce_mean(cost)
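        # A hypothetical alternative to the two lines above (not part of this
        # file): fusing softmax and cross entropy avoids log(0) on very
        # confident predictions and is the numerically stabler formulation.
        #
        #   cost = tf.nn.softmax_cross_entropy_with_logits(
        #       labels=one_hot_labels, logits=logits)
        #   avg_cost = tf.reduce_mean(cost)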
# Get accuracy
correct = tf.equal(tf.argmax(prediction, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# metrics, g_accuracy
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_accuracy = tf.metrics.accuracy(
labels, tf.argmax(
prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
g_accuracy_reset_op = tf.variables_initializer(vars)
# Optimizer
opt = tf.train.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
train_op = opt.minimize(avg_cost)
# train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=args.batch_size)
def eval_test():
sess.run(g_accuracy_reset_op)
for batch_id, data in enumerate(test_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
loss, acc, g_acc = sess.run(
[avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
return g_acc[1]
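`tf.metrics.accuracy` returns a `(value, update_op)` pair backed by local variables, which is why `eval_test` re-initializes the scope's local variables before streaming over the test set. A stripped-down sketch of that accumulate/reset pattern (illustrative names, TF 1.x APIs):

import tensorflow as tf

labels = tf.placeholder(tf.int64, shape=(None, ))
preds = tf.placeholder(tf.int64, shape=(None, ))
with tf.variable_scope("metrics") as scope:
    acc_value, acc_update = tf.metrics.accuracy(labels, preds)
    local_vars = tf.contrib.framework.get_variables(
        scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
    reset_op = tf.variables_initializer(local_vars)  # zero the running totals

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(acc_update, {labels: [1, 0, 1], preds: [1, 1, 1]})
    print(sess.run(acc_value))  # 0.6667: two of three correct so far
    sess.run(reset_op)          # start a fresh accumulation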
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
for pass_id in range(args.pass_num):
sess.run(g_accuracy_reset_op)
pass_start = time.time()
for batch_id, data in enumerate(train_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
"int64")
start = time.time()
_, loss, acc, g_acc = sess.run(
[train_op, avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
end = time.time()
print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
(pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
pass_end = time.time()
test_avg_acc = eval_test()
            print(
                "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
                % (pass_id, g_acc[1], test_avg_acc,
                   pass_end - pass_start))
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
run_benchmark(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py
Get help: python resnet.py --help
See performance on flowers: python resnet.py
Train on cifar10: python resnet.py --data=cifar10 --with_test
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import paddle.v2 as paddle
import tensorflow as tf
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser('Convolution model benchmark.')
parser.add_argument(
'--model',
type=str,
choices=['resnet'],
default='resnet',
help='The model architecture.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='use real data or fake data')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
        help='The number of initial minibatches to skip, for a fairer performance test'
)
parser.add_argument(
'--iterations',
type=int,
default=105,
help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=300, help='The number of passes.')
parser.add_argument(
'--order',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
        help='The data order. Only NHWC is supported in CPU mode.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--data',
type=str,
default='flowers102',
choices=['flowers102', 'cifar10'],
help='The kinds of data.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
args = parser.parse_args()
return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[
'with_test'] else vars(args)['iterations']
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def fixed_padding(inputs, kernel_size, data_format):
"""Pads the input along the spatial dimensions independently of input size.
Args:
inputs: A tensor of size [batch, channels, height_in, width_in] or
[batch, height_in, width_in, channels] depending on data_format.
kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
Should be a positive integer.
data_format: The input format ('channels_last' or 'channels_first').
Returns:
A tensor with the same format as the input with the data either intact
(if kernel_size == 1) or padded (if kernel_size > 1).
"""
pad_total = kernel_size - 1
pad_beg = pad_total // 2
pad_end = pad_total - pad_beg
if data_format == 'channels_first':
padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end],
[pad_beg, pad_end]])
else:
padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
[pad_beg, pad_end], [0, 0]])
return padded_inputs
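The arithmetic above splits `kernel_size - 1` as evenly as possible between the two sides of each spatial dimension; a quick standalone check of the amounts it produces:

def explicit_pad_amounts(kernel_size):
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    return pad_beg, pad_end

assert explicit_pad_amounts(1) == (0, 0)  # kernel 1 needs no padding
assert explicit_pad_amounts(3) == (1, 1)
assert explicit_pad_amounts(7) == (3, 3)  # 224x224 -> 230x230, so a stride-2
                                          # VALID conv yields a 112x112 output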
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
"""Strided 2-D convolution with explicit padding."""
# The padding is consistent and is based only on `kernel_size`, not on the
# dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
# This is consistent with PaddlePaddle.
# In addition, the calculation for output size in TensorFlow can refer:
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc
if strides > 1:
inputs = fixed_padding(inputs, kernel_size, data_format)
return tf.layers.conv2d(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
padding=('SAME' if strides == 1 else 'VALID'),
use_bias=False,
kernel_initializer=tf.variance_scaling_initializer(),
data_format=data_format)
def conv_bn(inputs,
filters,
kernel_size,
strides,
is_training,
data_format,
act=True):
# def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
# set fused=True for a significant performance boost. See
# https://www.tensorflow.org/performance/performance_guide#common_fused_ops
inputs = conv2d_fixed_padding(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
data_format=data_format)
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if data_format == 'channels_first' else 3,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
if act:
inputs = tf.nn.relu(inputs)
return inputs
def basicblock(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def bottleneck(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = conv_bn(
inputs, filters * 4, 1, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name,
data_format):
# Bottleneck blocks end with 4x the number of filters as they start with
filters_out = 4 * filters if block_fn is bottleneck else filters
def projection_shortcut(inputs):
return conv2d_fixed_padding(
inputs=inputs,
filters=filters_out,
kernel_size=1,
strides=strides,
data_format=data_format)
# Only the first block per block_layer uses projection_shortcut and strides
inputs = block_fn(inputs, filters, is_training, projection_shortcut,
strides, data_format)
for _ in range(1, blocks):
inputs = block_fn(inputs, filters, is_training, None, 1, data_format)
return tf.identity(inputs, name)
def resnet_imagenet(depth, class_dim, data_format):
"""Returns the ResNet model for a given size and number of output classes."""
def resnet_generator(block_fn,
layers,
num_classes,
data_format='channels_last'):
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
"""Constructs the ResNet model given the inputs."""
if data_format == 'channels_first':
# Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
# This provides a large performance boost on GPU. See
# https://www.tensorflow.org/performance/performance_guide#data_formats
inputs = tf.transpose(inputs, [0, 3, 1, 2])
inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = tf.layers.max_pooling2d(
inputs=inputs,
pool_size=3,
strides=2,
padding='SAME',
data_format=data_format)
inputs = tf.identity(inputs, 'initial_max_pool')
inputs = block_layer(inputs, 64, block_fn, layers[0], 1,
is_training, 'block_layer1', data_format)
inputs = block_layer(inputs, 128, block_fn, layers[1], 2,
is_training, 'block_layer2', data_format)
inputs = block_layer(inputs, 256, block_fn, layers[2], 2,
is_training, 'block_layer3', data_format)
inputs = block_layer(inputs, 512, block_fn, layers[3], 2,
is_training, 'block_layer4', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=7,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs,
[-1, 512 if block_fn is basicblock else 2048])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
model_params = {
18: {
'block': basicblock,
'layers': [2, 2, 2, 2]
},
34: {
'block': basicblock,
'layers': [3, 4, 6, 3]
},
50: {
'block': bottleneck,
'layers': [3, 4, 6, 3]
},
101: {
'block': bottleneck,
'layers': [3, 4, 23, 3]
},
152: {
'block': bottleneck,
'layers': [3, 8, 36, 3]
},
200: {
'block': bottleneck,
'layers': [3, 24, 36, 3]
}
}
if depth not in model_params:
raise ValueError('Not a valid depth:', depth)
params = model_params[depth]
return resnet_generator(params['block'], params['layers'], class_dim,
data_format)
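For reference, a minimal usage sketch of the generator above; the placeholder shape is an assumption, and any depth outside the `model_params` keys (18, 34, 50, 101, 152, 200) raises the `ValueError`:

import tensorflow as tf

images = tf.placeholder(tf.float32, shape=(None, 224, 224, 3))
model = resnet_imagenet(depth=50, class_dim=1000, data_format='channels_last')
logits = model(images, is_training=False)  # Tensor of shape (None, 1000)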
def resnet_cifar10(depth, num_classes, data_format):
if depth % 6 != 2:
raise ValueError('depth must be 6n + 2:', depth)
num_blocks = (depth - 2) // 6
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training,
'block_layer1', data_format)
inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training,
'block_layer2', data_format)
inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training,
'block_layer3', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=8,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs, [-1, 64])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
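`resnet_cifar10` accepts only depths of the form 6n + 2 (20, 32, 44, 56, 110, ...): each of the three block layers contains n basic blocks of two convolutions, plus the initial convolution and the final dense layer. A usage sketch under the same assumptions as the example above:

import tensorflow as tf

images = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
model = resnet_cifar10(depth=32, num_classes=10, data_format='channels_last')
logits = model(images, is_training=False)  # Tensor of shape (None, 10)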
def run_benchmark(args, data_format='channels_last', device='/cpu:0'):
"""Our model_fn for ResNet to be used with our Estimator."""
class_dim = 1000
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
if args.data == 'flowers102':
class_dim = 102
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
elif args.data == 'cifar10':
class_dim = 10
dshape = (None, 32, 32, 3)
pdshape = (3, 32, 32)
with tf.device(device):
images = tf.placeholder(DTYPE, shape=dshape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
network = resnet_cifar10(
32, class_dim,
data_format) if args.data == 'cifar10' else resnet_imagenet(
50, class_dim, data_format)
logits = network(inputs=images, is_training=is_training)
cross_entropy = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=onehot_labels)
avg_cost = tf.reduce_mean(cross_entropy)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
lr = 0.1 if args.data == 'cifar10' else 0.01
optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
# Batch norm requires update_ops to be added as a train_op dependency.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=100)
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, np.mean(test_accs)))
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
if args.use_fake_data:
data = train_reader().next()
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype('int64')
iters, num_samples, start_time = 0, 0, 0.0
for pass_id in range(args.pass_num):
if iters == args.iterations:
break
train_accs = []
train_losses = []
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
if not args.use_fake_data:
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_cost, accuracy],
feed_dict={
images: images_data,
labels: labels_data,
is_training: True
})
iters += 1
train_accs.append(acc)
train_losses.append(loss)
num_samples += len(data)
print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
print("Pass=%d, Loss=%f, Accuray=%f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
# evaluation
if args.with_test:
test()
if not args.with_test:
duration = time.time() - start_time
examples_per_sec = num_samples / duration
sec_per_batch = duration / (iters - args.skip_batch_num)
print('Total examples: %d, total time: %.5f' %
(num_samples, duration))
print('%.5f examples/sec, %.5f sec/batch' %
(examples_per_sec, sec_per_batch))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if tf.test.is_built_with_cuda():
device = '/device:GPU:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
data_format = 'channels_first'
else:
device = '/cpu:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
            raise ValueError('Only the NHWC order is supported in CPU mode')
run_benchmark(args, data_format, device)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import tensorflow as tf
import paddle.v2 as paddle
def parse_args():
parser = argparse.ArgumentParser("LSTM model benchmark.")
parser.add_argument(
'--batch_size',
type=int,
default=32,
        help='The number of sequences in a batch. (default: %(default)d)')
parser.add_argument(
'--stacked_num',
type=int,
default=5,
help='Number of lstm layers to stack. (default: %(default)d)')
parser.add_argument(
'--embedding_dim',
type=int,
default=512,
help='Dimension of embedding table. (default: %(default)d)')
parser.add_argument(
'--hidden_dim',
type=int,
default=512,
help='Hidden size of lstm unit. (default: %(default)d)')
parser.add_argument(
'--pass_num',
type=int,
default=10,
help='Epoch number to train. (default: %(default)d)')
parser.add_argument(
'--learning_rate',
type=float,
default=0.0002,
help='Learning rate used to train. (default: %(default)f)')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
args = parser.parse_args()
return args
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def dynamic_lstm_model(dict_size,
embedding_dim,
hidden_dim,
stacked_num,
class_num=2,
is_train=True):
word_idx = tf.placeholder(tf.int64, shape=[None, None])
sequence_length = tf.placeholder(tf.int64, shape=[None, ])
embedding_weights = tf.get_variable('word_embeddings',
[dict_size, embedding_dim])
embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
lstm_cell = tf.nn.rnn_cell.LSTMCell(
num_units=hidden_dim, use_peepholes=False)
stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num)
    # final_state: [LSTMStateTuple(c, h), LSTMStateTuple(c, h), ...], one per stacked layer
_, final_state = tf.nn.dynamic_rnn(
cell=stacked_cell,
inputs=embedding,
dtype=tf.float32,
sequence_length=sequence_length)
w = tf.Variable(
tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
bias = tf.Variable(
tf.constant(
value=0.0, shape=[class_num], dtype=tf.float32))
prediction = tf.matmul(final_state[-1][1], w) + bias
if not is_train:
return (word_idx, sequence_length), tf.nn.softmax(prediction)
label = tf.placeholder(tf.int64, shape=[None, ])
loss = tf.nn.softmax_cross_entropy_with_logits(
labels=tf.one_hot(label, 2), logits=prediction)
avg_loss = tf.reduce_mean(loss)
correct_count = tf.equal(tf.argmax(prediction, 1), label)
acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
reset_op = tf.variables_initializer(vars)
return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
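The classifier above reads `final_state[-1][1]`: index -1 selects the top layer's `LSTMStateTuple` and index 1 its hidden state `h` (index 0 would be the cell state `c`). A self-contained shape sketch (TF 1.x; note that later 1.x releases require distinct cell objects per layer, hence the list comprehension instead of `[lstm_cell] * stacked_num`):

import tensorflow as tf

batch_size, seq_len, hidden_dim, stacked_num = 4, 10, 8, 2
x = tf.random_normal([batch_size, seq_len, hidden_dim])
cells = [tf.nn.rnn_cell.LSTMCell(hidden_dim) for _ in range(stacked_num)]
_, final_state = tf.nn.dynamic_rnn(
    tf.nn.rnn_cell.MultiRNNCell(cells), x, dtype=tf.float32)
top_h = final_state[-1].h  # same tensor as final_state[-1][1]; shape (4, 8)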
def padding_data(data, padding_size, value):
data = data + [value] * padding_size
return data[:padding_size]
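`padding_data` both pads and truncates, so every sequence comes out exactly `padding_size` long; a quick illustration:

assert padding_data([3, 1, 4], 5, 0) == [3, 1, 4, 0, 0]        # pads
assert padding_data([9, 8, 7, 6, 5, 4], 4, 0) == [9, 8, 7, 6]  # truncates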
def train(args):
word_dict = paddle.dataset.imdb.word_dict()
dict_size = len(word_dict)
feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
train_op = adam_optimizer.minimize(avg_loss)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=25000),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.test(word_dict), buf_size=25000),
batch_size=args.batch_size)
def do_validation(sess):
sess.run(reset_op)
for batch_id, data in enumerate(test_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
            # Do not run train_op here: validation must not update the weights.
            loss, fetch_acc, fetch_g_acc = sess.run(
                [avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
return fetch_g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
# clear accuracy local variable
sess.run(reset_op)
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
words_seen += np.sum(sequence_length)
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
_, loss, fetch_acc, fetch_g_acc = sess.run(
[train_op, avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
% (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
test_acc = do_validation(sess)
print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_acc, words_per_sec, time_consumed))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if args.infer_only:
pass
else:
train(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in TensorFlow"""
import tensorflow as tf
import paddle.v2 as paddle
import numpy as np
import argparse
import time
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
    help='The number of initial minibatches to skip, for a fairer performance test')
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
'--data_format',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
    help='The data order, NCHW=[batch, channels, height, width]. '
    'Only NHWC is supported right now.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
args = parser.parse_args()
class VGG16Model(object):
def __init__(self):
self.parameters = []
def batch_norm_relu(self, inputs, is_training):
"""Performs a batch normalization followed by a ReLU."""
# We set fused=True for a significant speed boost. See
# https://www.tensorflow.org/speed/speed_guide#common_fused_ops
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if args.data_format == 'NCHW' else -1,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
inputs = tf.nn.relu(inputs)
return inputs
def conv_bn_layer(self,
name,
images,
kernel_shape,
is_training,
drop_rate=0.0):
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
kernel_shape, dtype=tf.float32, stddev=1e-1),
name='weights')
conv = tf.nn.conv2d(
images,
kernel, [1, 1, 1, 1],
data_format=args.data_format,
padding='SAME')
biases = tf.Variable(
tf.constant(
0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(conv, biases)
out = self.batch_norm_relu(out, is_training)
out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
return out
def fc_layer(self, name, inputs, shape):
with tf.name_scope(name) as scope:
fc_w = tf.Variable(
tf.truncated_normal(
shape, dtype=tf.float32, stddev=1e-1),
name='weights')
fc_b = tf.Variable(
tf.constant(
0.0, shape=[shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
return out
def network(self, images, class_dim, is_training):
""" VGG16 model structure.
TODO(kuke): enable this network to support the 'NCHW' data format
"""
# conv1
conv1_1 = self.conv_bn_layer(
'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
conv1_2 = self.conv_bn_layer(
'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
# pool1
pool1 = tf.nn.max_pool(
conv1_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool1')
# conv2
conv2_1 = self.conv_bn_layer(
'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
conv2_2 = self.conv_bn_layer(
'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
# pool2
pool2 = tf.nn.max_pool(
conv2_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool2')
# conv3
conv3_1 = self.conv_bn_layer(
'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
conv3_2 = self.conv_bn_layer(
'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
conv3_3 = self.conv_bn_layer(
'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
# pool3
pool3 = tf.nn.max_pool(
conv3_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool3')
# conv4
conv4_1 = self.conv_bn_layer(
'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
conv4_2 = self.conv_bn_layer(
'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv4_3 = self.conv_bn_layer(
'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool4
pool4 = tf.nn.max_pool(
conv4_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# conv5
conv5_1 = self.conv_bn_layer(
'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_2 = self.conv_bn_layer(
'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_3 = self.conv_bn_layer(
'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool5
pool5 = tf.nn.max_pool(
conv5_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
            name='pool5')
# flatten
shape = int(np.prod(pool5.get_shape()[1:]))
pool5_flat = tf.reshape(pool5, [-1, shape])
# fc1
drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
fc1 = self.fc_layer('fc1', drop, [shape, 512])
# fc2
bn = self.batch_norm_relu(fc1, is_training)
drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
fc2 = self.fc_layer('fc2', drop, [512, 512])
fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
return fc3
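A minimal usage sketch of the model class above (placeholder shapes assumed; `args.data_format` must be 'NHWC' for these shapes). Feeding `is_training=False` at evaluation time makes batch norm use its moving statistics and turns the dropout layers into no-ops:

import tensorflow as tf

vgg16 = VGG16Model()
images = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
is_training = tf.placeholder('bool')
logits = vgg16.network(images, class_dim=10, is_training=is_training)
# logits: Tensor of shape (None, 10); apply a softmax or a fused
# cross-entropy loss on top for training.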
def run_benchmark():
"""Run benchmark on cifar10 or flowers."""
if args.data_set == "cifar10":
class_dim = 10
raw_shape = (3, 32, 32)
dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
None, 3, 32, 32)
else:
class_dim = 102
raw_shape = (3, 224, 224)
dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
None, 3, 224, 224)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(tf.float32, shape=dat_shape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
vgg16 = VGG16Model()
logits = vgg16.network(images, class_dim, is_training)
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
avg_loss = tf.reduce_mean(loss)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_loss)
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
buf_size=5120),
batch_size=args.batch_size)
# test
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
return np.mean(test_accs)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.num_passes):
# train
num_samples = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
train_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
train_labels = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_loss, accuracy],
feed_dict={
images: train_images,
labels: train_labels,
is_training: True
})
iters += 1
num_samples += len(data)
print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
# test
pass_test_acc = test()
print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, pass_test_acc))
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
print_arguments()
run_benchmark()
.timestamp
*.o
*.a
.svn
......
@@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name,
                  "Tensor %s contains NAN", name);
 }

+void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
+                               int block_id) {
+  auto& global_block = pdesc.Block(block_id);
+
+  const Scope* ancestor_scope = scope;
+  while (ancestor_scope->parent()) {
+    ancestor_scope = ancestor_scope->parent();
+  }
+
+  if (ancestor_scope != scope) {
+    for (auto& var : global_block.AllVars()) {
+      if (var->Name() == framework::kEmptyVarName) {
+        continue;
+      }
+
+      if (var->Persistable()) {
+        auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " global, which pointer is " << ptr;
+      } else {
+        auto* ptr = scope->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " locally, which pointer is " << ptr;
+      }
+    }
+  } else {
+    for (auto& var : global_block.AllVars()) {
+      auto* ptr = scope->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+              << ptr;
+    }
+  }
+}
+
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                    bool create_local_scope, bool create_vars) {
   platform::RecordBlock b(block_id);
@@ -184,8 +221,8 @@ static bool has_fetch_operators(
 void Executor::Run(const ProgramDesc& program, Scope* scope,
                    std::map<std::string, const LoDTensor*>& feed_targets,
                    std::map<std::string, LoDTensor*>& fetch_targets,
-                   const std::string& feed_holder_name,
-                   const std::string& fetch_holder_name, bool create_vars) {
+                   bool create_vars, const std::string& feed_holder_name,
+                   const std::string& fetch_holder_name) {
   platform::RecordBlock b(kProgramId);
   bool has_feed_ops =
       has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
@@ -296,38 +333,13 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                                   bool create_local_scope, bool create_vars) {
-  auto& block = ctx->prog_.Block(ctx->block_id_);
   Scope* local_scope = scope;
   if (create_vars) {
     if (create_local_scope) {
       local_scope = &scope->NewScope();
-      for (auto& var : block.AllVars()) {
-        if (var->Name() == framework::kEmptyVarName) {
-          continue;
-        }
-
-        if (var->Persistable()) {
-          auto* ptr = scope->Var(var->Name());
-          InitializeVariable(ptr, var->GetType());
-          VLOG(3) << "Create Variable " << var->Name()
-                  << " global, which pointer is " << ptr;
-        } else {
-          auto* ptr = local_scope->Var(var->Name());
-          InitializeVariable(ptr, var->GetType());
-          VLOG(3) << "Create Variable " << var->Name()
-                  << " locally, which pointer is " << ptr;
-        }
-      }
-    } else {
-      for (auto& var : block.AllVars()) {
-        auto* ptr = local_scope->Var(var->Name());
-        InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-                << ptr;
-      }
-    }  // if (create_local_scope)
-  }  // if (create_vars)
+    }
+    CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
+  }

   for (auto& op : ctx->ops_) {
     VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
......
@@ -54,9 +54,9 @@ class Executor {
   void Run(const ProgramDesc& program, Scope* scope,
            std::map<std::string, const LoDTensor*>& feed_targets,
            std::map<std::string, LoDTensor*>& fetch_targets,
-           const std::string& feed_holder_name = "feed",
-           const std::string& fetch_holder_name = "fetch",
-           bool create_vars = true);
+           bool create_vars = true,
+           const std::string& feed_holder_name = "feed",
+           const std::string& fetch_holder_name = "fetch");

   static std::unique_ptr<ExecutorPrepareContext> Prepare(
       const ProgramDesc& program, int block_id);
@@ -64,6 +64,8 @@ class Executor {
   static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
       const ProgramDesc& program, const std::vector<int>& block_ids);

+  void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
+
   void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                           bool create_local_scope = true,
                           bool create_vars = true);
......
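Editor's note: taken together, the two executor diffs above split variable creation out of Run(). A caller that wants to reuse variables across repeated runs creates them once via the new CreateVariables() and then passes create_vars = false. A minimal caller-side sketch (the surrounding variables are hypothetical; it mirrors the usage added to test_helper.h further below):

    paddle::framework::Executor executor(place);
    // Create every variable of block 0 once, up front.
    executor.CreateVariables(*program, scope, 0);
    for (int i = 0; i < repeat; ++i) {
      // The variables already exist in `scope`, so skip per-run creation.
      executor.Run(*program, scope, feed_targets, fetch_targets,
                   false /*create_vars*/);
    }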
@@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/framework/lod_tensor.h"
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <iterator>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
@@ -22,11 +27,6 @@ limitations under the License. */
 #include "paddle/fluid/recordio/scanner.h"
 #include "paddle/fluid/recordio/writer.h"

-#include <stdint.h>
-#include <string.h>
-#include <algorithm>
-#include <iterator>
-
 namespace paddle {
 namespace framework {
@@ -294,7 +294,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }

-void WriteToRecordIO(recordio::Writer &writer,
+void WriteToRecordIO(recordio::Writer *writer,
                      const std::vector<LoDTensor> &tensor,
                      const platform::DeviceContext &dev_ctx) {
   std::stringstream buffer;
@@ -303,19 +303,21 @@ void WriteToRecordIO(recordio::Writer &writer,
   for (auto &each : tensor) {
     SerializeToStream(buffer, each, dev_ctx);
   }
-  writer.Write(buffer.str());
+  writer->Write(buffer.str());
 }

 std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner &scanner, const platform::DeviceContext &dev_ctx) {
-  std::istringstream sin(scanner.Next());
-  uint32_t sz;
-  sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
-  std::vector<LoDTensor> result;
-  result.resize(sz);
-  for (uint32_t i = 0; i < sz; ++i) {
-    DeserializeFromStream(sin, &result[i], dev_ctx);
+    recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) {
+  std::vector<LoDTensor> result;
+  if (scanner->HasNext()) {
+    std::istringstream sin(scanner->Next());
+    uint32_t sz;
+    sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+    result.resize(sz);
+    for (uint32_t i = 0; i < sz; ++i) {
+      DeserializeFromStream(sin, &result[i], dev_ctx);
+    }
   }
   return result;
 }
......
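Editor's note: besides taking the scanner by pointer, ReadFromRecordIO now guards the read with scanner->HasNext() and returns an empty vector once the stream is exhausted, instead of reading blindly. A consuming loop under the new contract might look like this (a sketch; stream_ptr and dev_ctx are assumed to be set up as in the test further below):

    recordio::Scanner scanner(std::move(stream_ptr));
    while (true) {
      std::vector<LoDTensor> batch = ReadFromRecordIO(&scanner, dev_ctx);
      if (batch.empty()) break;  // end of stream under the new contract
      // ... consume `batch` ...
    }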
@@ -15,6 +15,9 @@ limitations under the License. */
 #pragma once

 #include <memory>
+#include <string>
+#include <utility>
+#include <vector>
 #ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
@@ -216,12 +219,12 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
 void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
                            const platform::DeviceContext& dev_ctx);

-extern void WriteToRecordIO(recordio::Writer& writer,
+extern void WriteToRecordIO(recordio::Writer* writer,
                             const std::vector<LoDTensor>& tensor,
                             const platform::DeviceContext& dev_ctx);

 extern std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner& scanner, const platform::DeviceContext& dev_ctx);
+    recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);

 }  // namespace framework
 }  // namespace paddle
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/recordio/scanner.h"
-#include "paddle/fluid/recordio/writer.h"
-
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <algorithm>
 #include <memory>
 #include <vector>

+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/recordio/scanner.h"
+#include "paddle/fluid/recordio/writer.h"
+
 namespace paddle {
 namespace framework {
@@ -240,8 +240,8 @@ TEST(LoDTensor, RecordIO) {
       *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
   {
     recordio::Writer writer(stream, recordio::Compressor::kSnappy);
-    WriteToRecordIO(writer, {tensor, tensor}, ctx);
-    WriteToRecordIO(writer, {tensor, tensor}, ctx);
+    WriteToRecordIO(&writer, {tensor, tensor}, ctx);
+    WriteToRecordIO(&writer, {tensor, tensor}, ctx);
     writer.Flush();
   }
@@ -254,11 +254,11 @@ TEST(LoDTensor, RecordIO) {
   {
     std::unique_ptr<std::istream> stream_ptr(stream);
     recordio::Scanner scanner(std::move(stream_ptr));
-    auto tensors = ReadFromRecordIO(scanner, ctx);
+    auto tensors = ReadFromRecordIO(&scanner, ctx);
     ASSERT_EQ(tensors.size(), 2);
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
-    tensors = ReadFromRecordIO(scanner, ctx);
+    tensors = ReadFromRecordIO(&scanner, ctx);
     ASSERT_EQ(tensors.size(), 2);
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
......
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/parallel_executor.h"
-#include "paddle/fluid/platform/profiler.h"

 #include <string>
 #include <vector>
@@ -24,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace framework {
@@ -43,30 +43,40 @@ class ParallelExecutorPrivate {
 #endif
 };
+std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
+  return member_->local_scopes_;
+}
+
 ParallelExecutor::ParallelExecutor(
     size_t num_threads, bool use_event,
     const std::vector<platform::Place> &places,
     const std::unordered_set<std::string> &params,
-    const ProgramDesc &startup_program, const ProgramDesc &main_program,
-    const std::string &loss_var_name, Scope *scope, bool allow_op_delay)
+    const std::unordered_set<std::string> &bcast_vars,
+    const ProgramDesc &main_program, const std::string &loss_var_name,
+    Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;

-  // Step 1. RunStartupProgram and Bcast the params to devs.
-  Executor exe(places[0]);
-  exe.Run(startup_program, scope, 0);
+  // Step 1. Bcast the params to devs.
   // Create local scopes
+  if (local_scopes.empty()) {
     for (size_t i = 0; i < member_->places_.size(); ++i) {
       member_->local_scopes_.push_back(&scope->NewScope());
     }
+  } else {
+    PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
+    for (size_t i = 0; i < member_->places_.size(); ++i) {
+      member_->local_scopes_.push_back(local_scopes[i]);
+    }
+  }

 // Bcast Parameters to all GPUs
 #ifdef PADDLE_WITH_CUDA
   member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
 #endif
-  if (platform::is_gpu_place(places[0]) &&
-      member_->local_scopes_.size() != 1) {  // Is CUDA
-    BCastParamsToGPUs(startup_program);
+  if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
+      local_scopes.empty()) {  // Is CUDA
+    BCastParamsToGPUs(bcast_vars);
   }
   // Startup Program has been run. All local scopes has correct parameters.
@@ -99,19 +109,18 @@ ParallelExecutor::ParallelExecutor(
 }
 void ParallelExecutor::BCastParamsToGPUs(
-    const ProgramDesc &startup_program) const {
+    const std::unordered_set<std::string> &vars) const {
 #ifdef PADDLE_WITH_CUDA
   auto *main_scope = member_->local_scopes_[0];

-  for (auto *var_desc : startup_program.Block(0).AllVars()) {
-    size_t idx = var_desc->Name().find("@GRAD");
-    if (idx != std::string::npos) continue;
-    if (var_desc->GetType() == proto::VarType::LOD_TENSOR) {
-      auto &main_tensor =
-          main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
+  for (auto &var : vars) {
+    auto *main_var = main_scope->FindVar(var);
+    if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
+      continue;
+    }
+
+    auto &main_tensor = main_var->Get<LoDTensor>();

     auto &dims = main_tensor.dims();

     if (paddle::platform::is_gpu_place(main_tensor.place())) {
       size_t numel = main_tensor.numel();
       ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
@@ -123,8 +132,7 @@ void ParallelExecutor::BCastParamsToGPUs(
           buffer = const_cast<void *>(main_tensor.data<void>());
         } else {
           auto local_scope = member_->local_scopes_[i];
-          auto *t =
-              local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
+          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
           t->Resize(dims);
           buffer = t->mutable_data(place, main_tensor.type());
         }
@@ -136,13 +144,12 @@ void ParallelExecutor::BCastParamsToGPUs(
       platform::CPUPlace cpu;
       for (size_t i = 1; i < member_->places_.size(); ++i) {
         auto local_scope = member_->local_scopes_[i];
-        auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
+        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
         t->Resize(dims);
         t->mutable_data(cpu, main_tensor.type());
         paddle::framework::TensorCopy(main_tensor, cpu, t);
       }
     }
-  }
   member_->nccl_ctxs_->WaitAll();
 }
 #else
......
@@ -36,11 +36,14 @@ class ParallelExecutor {
   explicit ParallelExecutor(size_t num_threads, bool use_event,
                             const std::vector<platform::Place>& places,
                             const std::unordered_set<std::string>& params,
-                            const ProgramDesc& startup_program,
+                            const std::unordered_set<std::string>& bcast_vars,
                             const ProgramDesc& main_program,
                             const std::string& loss_var_name, Scope* scope,
+                            const std::vector<Scope*>& local_scopes,
                             bool allow_op_delay);

+  std::vector<Scope*>& GetLocalScopes();
+
   void Run(const std::vector<std::string>& fetch_tensors,
            const std::string& fetched_var_name,
            const std::unordered_map<std::string, LoDTensor>& feed_tensors);
@@ -51,7 +54,7 @@ class ParallelExecutor {
   ParallelExecutorPrivate* member_;

-  void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
+  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
 };

 }  // namespace framework
......
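Editor's note: the constructor changes above let a second ParallelExecutor (e.g. one built for evaluation) share the local scopes of a training executor instead of re-running a startup program, with bcast_vars naming the parameters to broadcast. A rough sketch of the intended wiring (all variables hypothetical):

    // The training executor creates fresh local scopes and broadcasts params.
    ParallelExecutor train_exe(num_threads, use_event, places, params,
                               bcast_vars, train_program, loss_var_name,
                               scope, {} /*local_scopes*/, allow_op_delay);
    // The test executor reuses those scopes, so nothing is re-initialized.
    ParallelExecutor test_exe(num_threads, use_event, places, {}, {},
                              test_program, "" /*no loss*/, scope,
                              train_exe.GetLocalScopes(), allow_op_delay);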
@@ -14,14 +14,13 @@
 #pragma once

+#include <memory>
+#include <vector>
+
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/platform/place.h"

-#include <memory>
-#include <thread>
-#include <vector>
-
 namespace paddle {
 namespace framework {
@@ -31,8 +30,6 @@ class ReaderBase {
   virtual void ReInit() = 0;

-  virtual bool HasNext() const = 0;
-
   virtual ~ReaderBase();
 };
@@ -44,8 +41,6 @@ class DecoratedReader : public ReaderBase {
   void ReInit() override { reader_->ReInit(); }

-  bool HasNext() const override { return reader_->HasNext(); }
-
  protected:
   ReaderBase* reader_;
 };
@@ -80,8 +75,6 @@ class ReaderHolder {
     reader_->ReInit();
   }

-  bool HasNext() const { return reader_->HasNext(); }
-
  private:
   std::unique_ptr<ReaderBase> reader_;
 };
......
@@ -58,7 +58,7 @@ class Scope {
   /// nullptr if cannot find.
   Variable* FindVar(const std::string& name) const;

-  const Scope& parent() const { return *parent_; }
+  const Scope* parent() const { return parent_; }

   /// Find the scope or an ancestor scope that contains the given variable.
   const Scope* FindScope(const Variable* var) const;
......
@@ -46,8 +46,8 @@ TEST(inference, image_classification) {
   // Run inference on CPU
   LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
-                                            FLAGS_repeat);
+  TestInference<paddle::platform::CPUPlace, false>(dirname, cpu_feeds,
+                                                   cpu_fetchs1, FLAGS_repeat);
   LOG(INFO) << output1.dims();

 #ifdef PADDLE_WITH_CUDA
@@ -57,8 +57,8 @@ TEST(inference, image_classification) {
   // Run inference on CUDA GPU
   LOG(INFO) << "--- GPU Runs: ---";
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
-                                             FLAGS_repeat);
+  TestInference<paddle::platform::CUDAPlace, false>(dirname, cpu_feeds,
+                                                    cpu_fetchs2, FLAGS_repeat);
   LOG(INFO) << output2.dims();

   CheckError<float>(output1, output2);
......
@@ -88,7 +88,7 @@ void CheckError(const paddle::framework::LoDTensor& output1,
   EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }

-template <typename Place>
+template <typename Place, bool CreateVars = true>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
@@ -166,8 +166,16 @@ void TestInference(const std::string& dirname,
   // 6. Run the inference program
   {
+    if (!CreateVars) {
+      // If users don't want to create and destroy variables every time they
+      // run, they need to set `create_vars` to false and manually call
+      // `CreateVariables` before running.
+      executor.CreateVariables(*inference_program, scope, 0);
+    }
+
     // Ignore the profiling results of the first run
-    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    executor.Run(*inference_program, scope, feed_targets, fetch_targets,
+                 CreateVars);

     // Enable the profiler
     paddle::platform::EnableProfiler(state);
@@ -178,7 +186,8 @@ void TestInference(const std::string& dirname,
         "run_inference",
         paddle::platform::DeviceContextPool::Instance().Get(place));

-    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    executor.Run(*inference_program, scope, feed_targets, fetch_targets,
+                 CreateVars);
   }

   // Disable the profiler and print the timing information
......
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/adagrad_op.h"
+#include <vector>

 #include <cmath>
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <string>
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
......
@@ -13,6 +13,8 @@
 // limitations under the License.

 #include "paddle/fluid/operators/assign_value_op.h"
+#include <string>
+#include <vector>

 namespace paddle {
 namespace operators {
......
@@ -14,6 +14,7 @@
 #pragma once

+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/enforce.h"
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/auc_op.h"
+#include <string>

 namespace paddle {
 namespace operators {
......
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel<T> {
     std::vector<float> thresholds_list;
     thresholds_list.reserve(num_thresholds);
     for (int i = 1; i < num_thresholds - 1; i++) {
-      thresholds_list[i] = (float)i / (num_thresholds - 1);
+      thresholds_list[i] = static_cast<float>(i) / (num_thresholds - 1);
     }
     const float kEpsilon = 1e-7;
     thresholds_list[0] = 0.0f - kEpsilon;
@@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel<T> {
     float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
     float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
     for (int i = 0; i < num_thresholds; i++) {
-      tp_rate_data[i] =
-          ((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon);
-      fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon);
-      rec_rate_data[i] =
-          ((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon);
+      tp_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+                        (tp_data[i] + fn_data[i] + epsilon);
+      fp_rate_data[i] =
+          static_cast<float>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
+      rec_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+                         (tp_data[i] + fp_data[i] + epsilon);
     }
     *auc_data = 0.0f;
     if (curve == "ROC") {
......
@@ -19,15 +19,15 @@ namespace operators {

 template <>
 void GetAccumulators<paddle::platform::CPUDeviceContext>(
-    const framework::ExecutionContext& ctx, int64_t& num_updates_,
-    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+    const framework::ExecutionContext& ctx, int64_t* num_updates_,
+    int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
   auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
   auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
   auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");

-  old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
-  num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
-  num_updates_ = in_num_updates->data<int64_t>()[0];
+  *old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
+  *num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
+  *num_updates_ = in_num_updates->data<int64_t>()[0];
 }

 template <>
......
@@ -19,18 +19,18 @@ namespace paddle {
 namespace operators {
 template <>
 void GetAccumulators<paddle::platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx, int64_t& num_updates_,
-    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+    const framework::ExecutionContext& ctx, int64_t* num_updates_,
+    int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
   auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
   auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
   auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
   auto stream = ctx.cuda_device_context().stream();
-  memory::Copy(platform::CPUPlace(), &old_num_accumulates_,
+  memory::Copy(platform::CPUPlace(), old_num_accumulates_,
                platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
                sizeof(int64_t), stream);
-  memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(),
+  memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(),
                in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
-  memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(),
+  memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(),
                in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
 }
......
@@ -29,8 +29,8 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

 template <typename DeviceContext>
 void GetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t& num_updates, int64_t& num_accumulates,
-                     int64_t& old_num_accumulates);
+                     int64_t* num_updates, int64_t* num_accumulates,
+                     int64_t* old_num_accumulates);

 template <typename DeviceContext>
 void SetAccumulators(const framework::ExecutionContext& ctx,
@@ -47,8 +47,8 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
     int64_t num_updates = 0;
     int64_t num_accumulates = 0;
     int64_t old_num_accumulates = 0;
-    GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
-                                   old_num_accumulates);
+    GetAccumulators<DeviceContext>(ctx, &num_updates, &num_accumulates,
+                                   &old_num_accumulates);

     // Get attrs
     float average_window = ctx.Attr<float>("average_window");
......
@@ -56,11 +56,11 @@ class GoOp : public framework::OperatorBase {
     // TODO(varunarora): Consider moving this root scope lookup to scope.h.
     const framework::Scope *root_scope = &scope;
-    const framework::Scope *parent_scope = &(root_scope->parent());
+    const framework::Scope *parent_scope = root_scope->parent();

     while (parent_scope != nullptr) {
       root_scope = parent_scope;
-      parent_scope = &(parent_scope->parent());
+      parent_scope = parent_scope->parent();
     }

     framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock);
......
@@ -66,13 +66,7 @@ class ReadOp : public framework::OperatorBase {
     std::vector<std::string> out_arg_names = Outputs("Out");
     std::vector<framework::LoDTensor> ins;
     reader->ReadNext(&ins);
-    if (ins.empty()) {
-      reader->ReInit();
-      reader->ReadNext(&ins);
-      PADDLE_ENFORCE(
-          !ins.empty(),
-          "Reader can not read the next data even it has been re-initialized.");
-    }
+    PADDLE_ENFORCE(!ins.empty(), "There is no next data.");
     PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
     for (size_t i = 0; i < ins.size(); ++i) {
       auto* out =
......
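Editor's note: this hunk and the reader diffs that follow all implement one protocol change: readers no longer expose HasNext(), and ReadNext() signals end-of-data by leaving its output vector empty. Every consumer now follows a pattern like this (a sketch of the convention, not code from the commit):

    std::vector<framework::LoDTensor> ins;
    reader->ReadNext(&ins);
    if (ins.empty()) {
      // End of data: the caller decides whether to stop or to ReInit();
      // read_op above simply fails hard via PADDLE_ENFORCE.
    }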
@@ -63,13 +63,14 @@ class DoubleBufferReader : public framework::DecoratedReader {
     StartPrefetcher();
   }

-  bool HasNext() const override;
   void ReadNext(std::vector<framework::LoDTensor>* out) override;
   void ReInit() override;

   ~DoubleBufferReader() { EndPrefetcher(); }

  private:
+  bool HasNext() const;
+
   void StartPrefetcher() {
     channel_ = framework::MakeChannel<Item>(kChannelSize);
     prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
@@ -109,7 +110,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
     auto place_str = Attr<std::string>("place");
     platform::Place place;
-    if (place_str == "CPU") {
+    if (place_str == "AUTO") {
+      place = dev_place;
+    } else if (place_str == "CPU") {
       place = platform::CPUPlace();
     } else {
       std::istringstream sin(place_str);
@@ -140,29 +143,23 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
       enum_range.insert(string::Sprintf("CUDA:%d", i));
     }
     enum_range.insert("CPU");
-    AddAttr<std::string>("place", "The double buffer place, default is CPU")
-        .SetDefault("CPU")
+    enum_range.insert("AUTO");
+    AddAttr<std::string>("place", "The double buffer place")
+        .SetDefault("AUTO")
         .InEnum({enum_range});
   }
 };

-bool DoubleBufferReader::HasNext() const {
-  while (!channel_->IsClosed() && !channel_->CanReceive()) {
-  }
-  return channel_->CanReceive();
-}
-
 void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  if (!HasNext()) {
-    PADDLE_THROW("There is no next data!");
-  }
-  Item batch;
-  channel_->Receive(&batch);
-  *out = batch.payloads_;
-  if (batch.ctx_) {
-    batch.ctx_->Wait();
+  out->clear();
+  if (HasNext()) {
+    Item batch;
+    channel_->Receive(&batch);
+    *out = batch.payloads_;
+    if (batch.ctx_) {
+      batch.ctx_->Wait();
+    }
   }
 }

 void DoubleBufferReader::ReInit() {
@@ -171,16 +168,26 @@ void DoubleBufferReader::ReInit() {
   StartPrefetcher();
 }

+bool DoubleBufferReader::HasNext() const {
+  while (!channel_->IsClosed() && !channel_->CanReceive()) {
+  }
+  return channel_->CanReceive();
+}
+
 void DoubleBufferReader::PrefetchThreadFunc() {
   VLOG(5) << "A new prefetch thread starts.";
   std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(kCacheSize);
   std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(kCacheSize);
   size_t cached_tensor_id = 0;

-  while (reader_->HasNext()) {
+  while (true) {
     Item batch;
     auto& cpu_batch = cpu_tensor_cache[cached_tensor_id];
     reader_->ReadNext(&cpu_batch);
+    if (cpu_batch.empty()) {
+      // The underlying reader have no next data.
+      break;
+    }
     if (platform::is_gpu_place(place_)) {
       auto& gpu_batch = gpu_tensor_cache[cached_tensor_id];
       auto* gpu_ctx = ctxs_[cached_tensor_id].get();
......
@@ -25,22 +25,12 @@ class MultiPassReader : public framework::DecoratedReader {
       : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}

   void ReadNext(std::vector<framework::LoDTensor>* out) override {
-    if (!HasNext()) {
-      PADDLE_THROW("There is no next data!");
-    }
     reader_->ReadNext(out);
-  }
-
-  bool HasNext() const override {
-    if (reader_->HasNext()) {
-      return true;
-    } else {
+    if (out->empty()) {
       ++pass_count_;
-      if (pass_count_ >= pass_num_) {
-        return false;
-      } else {
+      if (pass_count_ < pass_num_) {
         reader_->ReInit();
-        return true;
+        reader_->ReadNext(out);
       }
     }
   }
......
@@ -52,8 +52,6 @@ class RandomDataGenerator : public framework::ReaderBase {

   void ReInit() override { return; }

-  bool HasNext() const override { return true; }
-
  private:
   float min_;
   float max_;
@@ -74,7 +72,7 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase {
     const auto& ranks = Attr<std::vector<int>>("ranks");
     PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
     PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
-                      int(shape_concat.size()),
+                      static_cast<int>(shape_concat.size()),
                       "The accumulate of all ranks should be equal to the "
                       "shape concat's length.");
     std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
......
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <mutex>
-#include <thread>
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 #include "paddle/fluid/recordio/scanner.h"
@@ -35,17 +33,15 @@ class RecordIOFileReader : public framework::FileReader {
     LOG(INFO) << "Creating file reader" << filename;
   }

-  bool HasNext() const override { return scanner_.HasNext(); }
-
   void ReInit() override { scanner_.Reset(); }

  protected:
   void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
     if (ThreadSafe) {
       std::lock_guard<std::mutex> guard(*mutex_);
-      *out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
+      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
     } else {
-      *out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
+      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
     }
   }
@@ -66,7 +62,7 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
     const auto& ranks = Attr<std::vector<int>>("ranks");
     PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
     PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
-                      int(shape_concat.size()),
+                      static_cast<int>(shape_concat.size()),
                       "The accumulate of all ranks should be equal to the "
                       "shape concat's length.");
     std::string filename = Attr<std::string>("filename");
......
@@ -30,35 +30,33 @@ class ShuffleReader : public framework::DecoratedReader {
       std::random_device device;
       seed_ = device();
     }
-    ReadIntoBuffers();
+    ReloadBuffer();
   }

   void ReadNext(std::vector<framework::LoDTensor>* out) override {
-    if (!HasNext()) {
-      PADDLE_THROW("There is no next data!");
-    }
+    out->clear();
     if (iteration_pos_ >= buffer_.size()) {
       VLOG(10) << "Resetting shuffle buffer";
-      ReadIntoBuffers();
+      ReloadBuffer();
+      if (buffer_.empty()) {
+        return;
+      }
     }
     *out = buffer_[iteration_pos_++];
   }

-  bool HasNext() const override {
-    return iteration_pos_ < buffer_.size() || reader_->HasNext();
-  }
-
  private:
-  void ReadIntoBuffers() {
+  void ReloadBuffer() {
     buffer_.clear();
     buffer_.reserve(buffer_size_);
     iteration_pos_ = 0;
     for (size_t i = 0; i < buffer_size_; ++i) {
-      if (!reader_->HasNext()) {
+      std::vector<framework::LoDTensor> ins;
+      reader_->ReadNext(&ins);
+      if (ins.empty()) {
         break;
       }
-      buffer_.emplace_back();
-      reader_->ReadNext(&buffer_.back());
+      buffer_.emplace_back(ins);
     }
     std::mt19937 g(seed_);
     std::shuffle(buffer_.begin(), buffer_.end(), g);
......
@@ -21,67 +21,27 @@ namespace reader {

 class ThreadedReader : public framework::DecoratedReader {
  public:
-  ThreadedReader(ReaderBase* reader, bool unsafe_mode)
-      : DecoratedReader(reader), unsafe_mode_(unsafe_mode) {}
+  ThreadedReader(ReaderBase* reader, bool safe_mode)
+      : DecoratedReader(reader), safe_mode_(safe_mode) {}

   void ReadNext(std::vector<framework::LoDTensor>* out) override {
     std::lock_guard<std::mutex> lock(mutex_);
-    if (!unsafe_mode_) {
-      if (!reader_->HasNext()) {
-        PADDLE_THROW("There is no next data!");
-      }
-      reader_->ReadNext(out);
-    } else {
-      auto& thread_buffer = thread_buffers_[std::this_thread::get_id()];
-      if (thread_buffer.empty()) {
-        PADDLE_THROW(
-            "thread_buffer is empty! HasNext() must be invoked before "
-            "ReadNext() in the same thread.");
-      }
-      *out = thread_buffer;
-      thread_buffer.clear();
-    }
-  }
-
-  bool HasNext() const override {
-    if (!unsafe_mode_) {
-      PADDLE_THROW(
-          "ThreadedReader::HasNext() is disabled when 'unsafe_mode' is false.");
-    }
-    std::thread::id thread_id = std::this_thread::get_id();
-    std::lock_guard<std::mutex> lock(mutex_);
-    auto& thread_buffer = thread_buffers_[thread_id];
-    if (thread_buffer.empty() && reader_->HasNext()) {
-      reader_->ReadNext(&thread_buffer);
-    }
-    return !thread_buffer.empty();
+    reader_->ReadNext(out);
   }

   void ReInit() override {
-    if (!unsafe_mode_) {
+    if (safe_mode_) {
       PADDLE_THROW(
-          "ThreadedReader::ReInit() is disabled when 'unsafe_mode' is false.");
+          "ThreadedReader::ReInit() is disabled when 'safe_mode' is true.");
     }
     VLOG(5) << "ThreadedReader::ReInit() is invoked! It might be buggy in "
                "multi-thread environment.";
     reader_->ReInit();
   }

-  ~ThreadedReader() {
-    for (auto& p : thread_buffers_) {
-      if (!p.second.empty()) {
-        PADDLE_THROW(
-            "Find an unused data batch in ThreadedReader! Maybe one thread "
-            "invokes 'HasNext()' without subsequent 'ReadNext()'.");
-      }
-    }
-  }
-
  private:
-  bool unsafe_mode_;
-  mutable std::mutex mutex_;
-  mutable std::unordered_map<std::thread::id, std::vector<framework::LoDTensor>>
-      thread_buffers_;
+  bool safe_mode_;
+  std::mutex mutex_;
 };

 class CreateThreadedReaderOp : public framework::OperatorBase {
@@ -98,8 +58,8 @@ class CreateThreadedReaderOp : public framework::OperatorBase {
     }
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
-    bool unsafe_mode = Attr<bool>("unsafe_mode");
-    out->Reset(new ThreadedReader(underlying_reader.Get(), unsafe_mode));
+    bool safe_mode = Attr<bool>("safe_mode");
+    out->Reset(new ThreadedReader(underlying_reader.Get(), safe_mode));
   }
 };
@@ -107,24 +67,19 @@ class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase {
  public:
   CreateThreadedReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
       : DecoratedReaderMakerBase(op_proto, op_checker) {
-    AddAttr<bool>("unsafe_mode",
-                  "When 'unsafe_mode' is false, invoking 'HasNext()' or "
-                  "'ReInit()' is not allowed to avoid unexpected bugs in "
-                  "multi-thread environment.")
-        .SetDefault(false);
+    AddAttr<bool>("safe_mode",
+                  "When 'safe_mode' is true, 'ReInit()' is disabled to avoid "
+                  "unexpected bugs in multi-thread environment.")
+        .SetDefault(true);
     AddComment(R"DOC(
       CreateThreadedReader Operator

       This operator creates a threaded reader. A threaded reader's
       'ReadNext()' can be invoked by several threads at the same
       time.
-      When the attribute 'unsafe_mode' is false, the threaded reader's
-      'HasNext()' and 'ReInit()' will be disabled to avoid unexpected
-      bugs in multi-thread environment. If you really need them, you
-      can enable them by setting 'unsafe_mode' true. In this case,
-      'HasNext()' returning true only guarantees the safety of
-      invoking 'ReadNext()' in the same thread. Each thread must
-      invoke 'HasNext()' and 'ReadNext()' in pairs.
+      When the attribute 'safe_mode' is true, the threaded reader's
+      'ReInit()' is disabled to avoid unexpected bugs in multi-thread
+      environment.
     )DOC");
   }
 };
@@ -134,6 +89,6 @@ class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase {
 }  // namespace paddle

 namespace reader = paddle::operators::reader;
-REGISTER_FILE_READER_OPERATOR(create_threaded_reader,
-                              reader::CreateThreadedReaderOp,
-                              reader::CreateThreadedReaderOpMaker);
+REGISTER_DECORATED_READER_OPERATOR(create_threaded_reader,
+                                   reader::CreateThreadedReaderOp,
+                                   reader::CreateThreadedReaderOpMaker);
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <thread>  // NOLINT
+
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
@@ -30,12 +32,12 @@ class MultiFileReader : public framework::ReaderBase {
   }

   void ReadNext(std::vector<framework::LoDTensor>* out) override;
-  bool HasNext() const override;
   void ReInit() override;

   ~MultiFileReader() { EndScheduler(); }

  private:
+  bool HasNext();
   void StartNewScheduler();
   void EndScheduler();
   void ScheduleThreadFunc();
@@ -52,16 +54,10 @@ class MultiFileReader : public framework::ReaderBase {
 };

 void MultiFileReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  if (!HasNext()) {
-    PADDLE_THROW("There is no next data!");
-  }
-  buffer_->Receive(out);
-}
-
-bool MultiFileReader::HasNext() const {
-  while (!buffer_->IsClosed() && !buffer_->CanReceive()) {
-  }
-  return buffer_->CanReceive();
+  out->clear();
+  if (HasNext()) {
+    buffer_->Receive(out);
+  }
 }

 void MultiFileReader::ReInit() {
@@ -69,6 +65,12 @@ void MultiFileReader::ReInit() {
   StartNewScheduler();
 }

+bool MultiFileReader::HasNext() {
+  while (!buffer_->IsClosed() && !buffer_->CanReceive()) {
+  }
+  return buffer_->CanReceive();
+}
+
 void MultiFileReader::StartNewScheduler() {
   size_t thread_num = prefetchers_.size();
   waiting_file_idx_ = framework::MakeChannel<size_t>(file_names_.size());
@@ -140,9 +142,12 @@ void MultiFileReader::PrefetchThreadFunc(std::string file_name,
   VLOG(5) << "The prefetch thread of file '" << file_name << "' starts.";
   std::unique_ptr<framework::ReaderBase> reader =
       CreateReaderByFileName(file_name, dims_);
-  while (reader->HasNext()) {
+  while (true) {
     std::vector<framework::LoDTensor> ins;
     reader->ReadNext(&ins);
+    if (ins.empty()) {
+      break;
+    }
     try {
       buffer_->Send(&ins);
     } catch (paddle::platform::EnforceNotMet e) {
......
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/spp_op.h"
+#include <string>
+#include <vector>

 namespace paddle {
 namespace operators {
......
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/pooling.h"
......
@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/sum_op.h"
+#include <algorithm>
+#include <string>
 #include <vector>

 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/transpose_op.h"
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/unpool_op.h" #include "paddle/fluid/operators/unpool_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/unpooling.h" #include "paddle/fluid/operators/math/unpooling.h"
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h" #include "paddle/fluid/operators/math/sequence_padding.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mutex>
namespace paddle {
namespace platform {
/*
The current implementation of std::call_once has a bug described in
https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call.
This is likely caused by a deeper bug in pthread_once, which is discussed in
https://patchwork.ozlabs.org/patch/482350/
This wrapper is a hack to work around that bug.
*/
template <typename Callable, typename... Args>
inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
bool good = true;
std::exception ex;
try {
std::call_once(flag,
[&](Args&&... args) {
try {
f(args...);
} catch (const std::exception& e) {
ex = e;
good = false;
} catch (...) {
ex = std::runtime_error("exception caught in call_once");
good = false;
}
},
args...);
} catch (std::system_error& x) {
throw std::runtime_error("call once failed");
}
if (!good) {
throw std::exception(ex);
}
}
} // namespace platform
} // namespace paddle
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/call_once.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
......
...@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include <gtest/gtest.h>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) { ...@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) {
// Conversion operator // Conversion operator
EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
EXPECT_EQ(float(float16(0.5f)), 0.5f); EXPECT_EQ(static_cast<float>(float16(0.5f)), 0.5f);
EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); EXPECT_NEAR(static_cast<double>(float16(0.33333)), 0.33333, 0.0001);
EXPECT_EQ(int(float16(-1)), -1); EXPECT_EQ(static_cast<int>(float16(-1)), -1);
EXPECT_EQ(bool(float16(true)), true); EXPECT_EQ(static_cast<bool>(float16(true)), true);
} }
TEST(float16, arithmetic_cpu) { TEST(float16, arithmetic_cpu) {
EXPECT_EQ(float(float16(1) + float16(1)), 2); EXPECT_EQ(static_cast<float>(float16(1) + float16(1)), 2);
EXPECT_EQ(float(float16(5) + float16(-5)), 0); EXPECT_EQ(static_cast<float>(float16(5) + float16(-5)), 0);
EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001); EXPECT_NEAR(static_cast<float>(float16(0.33333f) + float16(0.66667f)), 1.0f,
EXPECT_EQ(float(float16(3) - float16(5)), -2); 0.001);
EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001); EXPECT_EQ(static_cast<float>(float16(3) - float16(5)), -2);
EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); EXPECT_NEAR(static_cast<float>(float16(0.66667f) - float16(0.33333f)),
EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); 0.33334f, 0.001);
EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); EXPECT_NEAR(static_cast<float>(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); EXPECT_NEAR(static_cast<float>(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
EXPECT_EQ(float(-float16(512.0f)), -512.0f); EXPECT_NEAR(static_cast<float>(float16(2.0f) / float16(3.0f)), 0.66667f,
EXPECT_EQ(float(-float16(-512.0f)), 512.0f); 0.001);
EXPECT_EQ(static_cast<float>(float16(1.0f) / float16(2.0f)), 0.5f);
EXPECT_EQ(static_cast<float>(-float16(512.0f)), -512.0f);
EXPECT_EQ(static_cast<float>(-float16(-512.0f)), 512.0f);
} }
TEST(float16, comparison_cpu) { TEST(float16, comparison_cpu) {
......
...@@ -36,19 +36,19 @@ limitations under the License. */ ...@@ -36,19 +36,19 @@ limitations under the License. */
half *in1, *in2, *out; \ half *in1, *in2, *out; \
half *d_in1, *d_in2, *d_out; \ half *d_in1, *d_in2, *d_out; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc((void**)&d_out, size); \ cudaMalloc(reinterpret_cast<void**>(&d_out), size); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
out = (half*)malloc(size); \ out = reinterpret_cast<half*>(malloc(size)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(out[0])), v_out); \ EXPECT_EQ(static_cast<float>(float16(out[0])), v_out); \
free(in1); \ free(in1); \
free(in2); \ free(in2); \
free(out); \ free(out); \
...@@ -63,17 +63,17 @@ limitations under the License. */ ...@@ -63,17 +63,17 @@ limitations under the License. */
half *in1, *in2; \ half *in1, *in2; \
half *d_in1, *d_in2; \ half *d_in1, *d_in2; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2); \ op_type<<<1, 1>>>(d_in1, d_in2); \
cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(in1[0])), v_out); \ EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out); \
free(in1); \ free(in1); \
free(in2); \ free(in2); \
cudaFree(d_in1); \ cudaFree(d_in1); \
...@@ -87,12 +87,12 @@ limitations under the License. */ ...@@ -87,12 +87,12 @@ limitations under the License. */
half *d_in1, *d_in2; \ half *d_in1, *d_in2; \
bool *out, *d_out; \ bool *out, *d_out; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc((void**)&d_out, 1); \ cudaMalloc(reinterpret_cast<void**>(&d_out), 1); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
out = (bool*)malloc(1); \ out = reinterpret_cast<bool*>(malloc(1)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
...@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) { ...@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) {
LOG(INFO) << "Test Neg on GPU!"; LOG(INFO) << "Test Neg on GPU!";
half *in, *d_in; half *in, *d_in;
int size = sizeof(half); int size = sizeof(half);
cudaMalloc((void**)&d_in, size); cudaMalloc(reinterpret_cast<void**>(&d_in), size);
in = (half*)malloc(size); in = reinterpret_cast<half*>(malloc(size));
in[0] = half(float16(v_in)); in[0] = half(float16(v_in));
cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
Neg<<<1, 1>>>(d_in); Neg<<<1, 1>>>(d_in);
cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
EXPECT_EQ(float(float16(in[0])), v_out); EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
free(in); free(in);
cudaFree(d_in); cudaFree(d_in);
} }
......
...@@ -252,7 +252,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -252,7 +252,6 @@ All parameter, weight, gradient are variables in Paddle.
py::return_value_policy::reference); py::return_value_policy::reference);
py::class_<framework::ReaderHolder>(m, "Reader", "") py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("has_next", &framework::ReaderHolder::HasNext)
.def("reset", &framework::ReaderHolder::ReInit); .def("reset", &framework::ReaderHolder::ReInit);
py::class_<Scope>(m, "Scope", "") py::class_<Scope>(m, "Scope", "")
...@@ -544,13 +543,20 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -544,13 +543,20 @@ All parameter, weight, gradient are variables in Paddle.
[](ParallelExecutor &self, size_t num_threads, bool use_event, [](ParallelExecutor &self, size_t num_threads, bool use_event,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, bool allow_op_delay) { Scope *scope, std::vector<Scope *> &local_scopes,
new (&self) ParallelExecutor(num_threads, use_event, places, bool allow_op_delay) {
params, startup_program, main_program, new (&self)
loss_var_name, scope, allow_op_delay); ParallelExecutor(num_threads, use_event, places, params,
bcast_vars, main_program, loss_var_name,
scope, local_scopes, allow_op_delay);
}) })
.def("local_scopes",
[](ParallelExecutor &self) -> std::vector<Scope *> * {
return &self.GetLocalScopes();
},
py::return_value_policy::reference)
.def("run", &ParallelExecutor::Run); .def("run", &ParallelExecutor::Run);
BindRecordIOWriter(&m); BindRecordIOWriter(&m);
......
...@@ -39,7 +39,7 @@ class RecordIOWriter { ...@@ -39,7 +39,7 @@ class RecordIOWriter {
void CompleteAppendTensor() { void CompleteAppendTensor() {
auto& ctx = auto& ctx =
*platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
framework::WriteToRecordIO(writer_, tensors_, ctx); framework::WriteToRecordIO(&writer_, tensors_, ctx);
tensors_.clear(); tensors_.clear();
} }
......
*pyc *pyc
build build
dist dist
paddlepaddle.egg-info
paddle.egg-info paddle.egg-info
paddlepaddle_gpu.egg-info paddlepaddle_gpu.egg-info
.idea .idea
......
...@@ -102,6 +102,8 @@ def split_dense_variable(var_list, ...@@ -102,6 +102,8 @@ def split_dense_variable(var_list,
the parameter server side can gain better performance. By default the parameter server side can gain better performance. By default
the minimum block size is 1024. The max block size is used to prevent the minimum block size is 1024. The max block size is used to prevent
very large blocks that may cause a send error. very large blocks that may cause a send error.
:return: A list of VarBlocks. Each VarBlock specifies a shard of
the var.
""" """
blocks = [] blocks = []
for var in var_list: for var in var_list:
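For reference, the VarBlock strings produced by split_dense_variable appear to follow a three-field "name:block_id:size" layout; the transpile step below recovers the pieces with split(":"). A minimal sketch of that round trip (parse_var_block and the sample strings are illustrative, not part of this patch):
def parse_var_block(block_str):
    # Assumed layout: 'name:block_id:size', as consumed by split(":") below.
    varname, block_id, size = block_str.split(":")
    return varname, int(block_id), int(size)
# e.g. a 3000-element weight split across two pservers might yield:
for b in ["fc_w:0:1500", "fc_w:1:1500"]:
    print(parse_var_block(b))  # ('fc_w', 0, 1500) then ('fc_w', 1, 1500)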
...@@ -192,22 +194,24 @@ class DistributeTranspiler: ...@@ -192,22 +194,24 @@ class DistributeTranspiler:
self.trainer_id = trainer_id self.trainer_id = trainer_id
pserver_endpoints = pservers.split(",") pserver_endpoints = pservers.split(",")
# step1 # step1: For large parameters and gradients, split them into smaller
# blocks.
param_list = [pg[0] for pg in params_grads] param_list = [pg[0] for pg in params_grads]
grad_list = [pg[1] for pg in params_grads] grad_list = [pg[1] for pg in params_grads]
grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints)) grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
param_blocks = split_dense_variable(param_list, len(pserver_endpoints)) param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
# step2 # step2: Create new vars for the parameters and gradients blocks and
# add ops to do the split.
grad_var_mapping = self._append_split_op(program, grad_blocks) grad_var_mapping = self._append_split_op(program, grad_blocks)
# step3 param_var_mapping = self._create_vars_from_blocklist(program,
param_blocks)
# step3: Add gradients as send op inputs and parameters as send
# op outputs.
send_inputs = [] send_inputs = []
send_outputs = [] send_outputs = []
for b in grad_blocks: # append by order for b in grad_blocks: # append by order
varname, block_id, _ = b.split(":") varname, block_id, _ = b.split(":")
send_inputs.append(grad_var_mapping[varname][int(block_id)]) send_inputs.append(grad_var_mapping[varname][int(block_id)])
param_var_mapping = self._create_vars_from_blocklist(program,
param_blocks)
for b in param_blocks: for b in param_blocks:
varname, block_id, _ = b.split(":") varname, block_id, _ = b.split(":")
send_outputs.append(param_var_mapping[varname][int(block_id)]) send_outputs.append(param_var_mapping[varname][int(block_id)])
...@@ -237,7 +241,7 @@ class DistributeTranspiler: ...@@ -237,7 +241,7 @@ class DistributeTranspiler:
"RPCClient": rpc_client_var}, "RPCClient": rpc_client_var},
attrs={"endpoints": pserver_endpoints, attrs={"endpoints": pserver_endpoints,
"epmap": eplist}) "epmap": eplist})
# step4 # step4: Concat the parameter splits together after recv.
for varname, splited_var in param_var_mapping.iteritems(): for varname, splited_var in param_var_mapping.iteritems():
if len(splited_var) <= 1: if len(splited_var) <= 1:
continue continue
...@@ -258,13 +262,14 @@ class DistributeTranspiler: ...@@ -258,13 +262,14 @@ class DistributeTranspiler:
def get_pserver_program(self, endpoint): def get_pserver_program(self, endpoint):
""" """
Get pserver side program using the endpoint. Get pserver side program using the endpoint.
TODO(panyx0718): Revisit this assumption. What if #blocks > #pservers?
NOTE: assume blocks of the same variable are not distributed NOTE: assume blocks of the same variable are not distributed
on the same pserver, only change param/grad varnames for on the same pserver, only change param/grad varnames for
trainers to fetch. trainers to fetch.
""" """
# step1 # step1
pserver_program = Program() pserver_program = Program()
# step2 # step2: Create vars to receive vars at parameter servers.
recv_inputs = [] recv_inputs = []
for v in self.param_grad_ep_mapping[endpoint]["params"]: for v in self.param_grad_ep_mapping[endpoint]["params"]:
self._clone_var(pserver_program.global_block(), v) self._clone_var(pserver_program.global_block(), v)
...@@ -278,12 +283,6 @@ class DistributeTranspiler: ...@@ -278,12 +283,6 @@ class DistributeTranspiler:
orig_var_name = v.name[:suff_idx] orig_var_name = v.name[:suff_idx]
else: else:
orig_var_name = v.name orig_var_name = v.name
single_trainer_var = pserver_program.global_block().create_var(
name=orig_var_name,
persistable=True,
type=v.type,
dtype=v.dtype,
shape=v.shape)
if self.trainers > 1: if self.trainers > 1:
for trainer_id in xrange(self.trainers): for trainer_id in xrange(self.trainers):
var = pserver_program.global_block().create_var( var = pserver_program.global_block().create_var(
...@@ -294,6 +293,12 @@ class DistributeTranspiler: ...@@ -294,6 +293,12 @@ class DistributeTranspiler:
shape=v.shape) shape=v.shape)
recv_inputs.append(var) recv_inputs.append(var)
else: else:
single_trainer_var = pserver_program.global_block().create_var(
name=orig_var_name,
persistable=True,
type=v.type,
dtype=v.dtype,
shape=v.shape)
recv_inputs.append(single_trainer_var) recv_inputs.append(single_trainer_var)
# step3 # step3
...@@ -344,7 +349,7 @@ class DistributeTranspiler: ...@@ -344,7 +349,7 @@ class DistributeTranspiler:
self._append_pserver_non_opt_ops(block, op) self._append_pserver_non_opt_ops(block, op)
append_block = optimize_block append_block = optimize_block
# append lr decay ops to the child block if exits # append lr decay ops to the child block if exists
lr_ops = self._get_lr_ops() lr_ops = self._get_lr_ops()
if len(lr_ops) > 0: if len(lr_ops) > 0:
for _, op in enumerate(lr_ops): for _, op in enumerate(lr_ops):
...@@ -447,8 +452,10 @@ class DistributeTranspiler: ...@@ -447,8 +452,10 @@ class DistributeTranspiler:
block_list, block_list,
add_trainer_suffix=False): add_trainer_suffix=False):
""" """
Create vars for each split.
NOTE: only grads need to be named for different trainers, use NOTE: only grads need to be named for different trainers, use
add_trainer_suffix to rename the grad vars. add_trainer_suffix to rename the grad vars.
:return: A dict mapping from original var name to each var split.
""" """
block_map = dict() block_map = dict()
var_mapping = dict() var_mapping = dict()
...@@ -615,6 +622,7 @@ class DistributeTranspiler: ...@@ -615,6 +622,7 @@ class DistributeTranspiler:
type="sum", type="sum",
inputs={"X": vars2merge}, inputs={"X": vars2merge},
outputs={"Out": merged_var}) outputs={"Out": merged_var})
# TODO(panyx0718): What if it's SELECTED_ROWS.
if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS: if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
optimize_block.append_op( optimize_block.append_op(
type="scale", type="scale",
...@@ -638,7 +646,7 @@ class DistributeTranspiler: ...@@ -638,7 +646,7 @@ class DistributeTranspiler:
shape=param_block.shape) shape=param_block.shape)
new_inputs[key] = tmpvar new_inputs[key] = tmpvar
elif key == "LearningRate": elif key == "LearningRate":
# leraning rate variable has already be created by non-optimize op, # learning rate variable has already been created by non-optimize op,
# don't create it once again. # don't create it once again.
lr_varname = opt_op.input(key)[0] lr_varname = opt_op.input(key)[0]
if pserver_block.vars.has_key(lr_varname): if pserver_block.vars.has_key(lr_varname):
...@@ -773,6 +781,7 @@ class DistributeTranspiler: ...@@ -773,6 +781,7 @@ class DistributeTranspiler:
return False return False
def _get_input_map_from_op(self, varmap, op): def _get_input_map_from_op(self, varmap, op):
"""Returns a dict from op input name to the vars in varmap."""
iomap = dict() iomap = dict()
for key in op.input_names: for key in op.input_names:
vars = [] vars = []
...@@ -785,6 +794,7 @@ class DistributeTranspiler: ...@@ -785,6 +794,7 @@ class DistributeTranspiler:
return iomap return iomap
def _get_output_map_from_op(self, varmap, op): def _get_output_map_from_op(self, varmap, op):
"""Returns a dict from op output name to the vars in varmap."""
iomap = dict() iomap = dict()
for key in op.output_names: for key in op.output_names:
vars = [] vars = []
...@@ -812,6 +822,7 @@ class DistributeTranspiler: ...@@ -812,6 +822,7 @@ class DistributeTranspiler:
find_ops.append(op) find_ops.append(op)
# make a union find struct by the ops in default_main_program # make a union find struct by the ops in default_main_program
ufind = UnionFind(block.ops) ufind = UnionFind(block.ops)
for op1 in block.ops: for op1 in block.ops:
for op2 in block.ops: for op2 in block.ops:
# NOTE: we need to skip all optimize ops, since it is connected # NOTE: we need to skip all optimize ops, since it is connected
......
...@@ -659,7 +659,7 @@ class Block(object): ...@@ -659,7 +659,7 @@ class Block(object):
def __init__(self, program, idx): def __init__(self, program, idx):
self.desc = program.desc.block(idx) self.desc = program.desc.block(idx)
self.vars = dict() # var_name --> var self.vars = dict() # var_name --> var
self.ops = collections.deque() # operator list self.ops = list() # operator list
self.program = program self.program = program
self.removed_vars = dict() self.removed_vars = dict()
...@@ -831,6 +831,13 @@ class Block(object): ...@@ -831,6 +831,13 @@ class Block(object):
self.ops.append(op) self.ops.append(op)
return op return op
def insert_op(self, index, *args, **kwargs):
self.sync_with_cpp()
op_desc = self.desc.insert_op(index)
op = Operator(block=self, desc=op_desc, *args, **kwargs)
self.ops.insert(index, op)
return op
def delete_ops(self, ops): def delete_ops(self, ops):
# remove from cpp # remove from cpp
# FIXME(typhoonzero): remove only the first occurrence. # FIXME(typhoonzero): remove only the first occurrence.
...@@ -842,12 +849,12 @@ class Block(object): ...@@ -842,12 +849,12 @@ class Block(object):
self.desc.remove_op(start, end + 1) self.desc.remove_op(start, end + 1)
def slice_ops(self, start, end): def slice_ops(self, start, end):
return list(self.ops)[start:end] return self.ops[start:end]
def prepend_op(self, *args, **kwargs): def prepend_op(self, *args, **kwargs):
op_desc = self.desc.prepend_op() op_desc = self.desc.prepend_op()
op = Operator(self, op_desc, *args, **kwargs) op = Operator(self, op_desc, *args, **kwargs)
self.ops.appendleft(op) self.ops.insert(0, op)
return op return op
def sync_with_cpp(self): def sync_with_cpp(self):
...@@ -892,7 +899,7 @@ class Block(object): ...@@ -892,7 +899,7 @@ class Block(object):
for index in range((start_index - 1 - 1), -1, -1): for index in range((start_index - 1 - 1), -1, -1):
op_desc = ops_in_cpp[index] op_desc = ops_in_cpp[index]
op = Operator(self, op_desc) op = Operator(self, op_desc)
self.ops.appendleft(op) self.ops.insert(0, op)
# sync ops append to the end of cpp_ops # sync ops append to the end of cpp_ops
for index in range((end_index + 1), len(ops_in_cpp)): for index in range((end_index + 1), len(ops_in_cpp)):
......
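The new Block.insert_op mirrors desc.insert_op(index) on the C++ side with a list insert on the Python side, calling sync_with_cpp() first so both representations agree on op indices. A hedged usage sketch; the program, the variable 'x', and the scale attribute are made up for illustration:
import paddle.fluid as fluid
block = fluid.default_main_program().global_block()
x = block.create_var(name='x', shape=[1], dtype='float32')
# Insert a scale op at index 0, ahead of any previously appended ops.
# insert_op syncs the Python op list with the C++ OpDescs before
# inserting, so index 0 refers to the same position in both.
block.insert_op(0, type='scale',
                inputs={'X': x},
                outputs={'Out': x},
                attrs={'scale': 2.0})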
...@@ -236,13 +236,9 @@ def monkey_patch_reader_methods(reader): ...@@ -236,13 +236,9 @@ def monkey_patch_reader_methods(reader):
var = scope.find_var(reader.name) var = scope.find_var(reader.name)
return var.get_reader() return var.get_reader()
def eof():
return not __get_reader__().has_next()
def reset(): def reset():
return __get_reader__().reset() return __get_reader__().reset()
reader.eof = eof
reader.reset = reset reader.reset = reset
reader.stop_gradient = True reader.stop_gradient = True
reader.persistable = True reader.persistable = True
...@@ -299,8 +295,7 @@ def open_recordio_file(filename, ...@@ -299,8 +295,7 @@ def open_recordio_file(filename,
shapes(list): List of tuples which declare data shapes. shapes(list): List of tuples which declare data shapes.
lod_levels(list): List of ints which declare data lod_level. lod_levels(list): List of ints which declare data lod_level.
dtypes(list): List of strs which declare data type. dtypes(list): List of strs which declare data type.
pass_num(int): Number of passes to run. After completing the pass_num(int): Number of passes to run.
given number of passes, 'has_next()' will return False.
for_parallel(Bool): Set it to True if you are going to run for_parallel(Bool): Set it to True if you are going to run
subsequent operators in parallel. subsequent operators in parallel.
...@@ -377,8 +372,7 @@ def open_files(filenames, ...@@ -377,8 +372,7 @@ def open_files(filenames,
dtypes(list): List of strs which declare data type. dtypes(list): List of strs which declare data type.
thread_num(int): The maximal concurrent prefetch thread number. thread_num(int): The maximal concurrent prefetch thread number.
buffer_size(int): The size of prefetch buffer. buffer_size(int): The size of prefetch buffer.
pass_num(int): Number of passes to run. After completing the pass_num(int): Number of passes to run.
given number of passes, 'has_next()' will return False.
for_parallel(Bool): Set it to True if you are going to run for_parallel(Bool): Set it to True if you are going to run
subsequent operators in parallel. subsequent operators in parallel.
...@@ -440,7 +434,7 @@ def open_files(filenames, ...@@ -440,7 +434,7 @@ def open_files(filenames,
return monkey_patch_reader_methods(main_prog_reader) return monkey_patch_reader_methods(main_prog_reader)
def __create_decorated_reader__(op_type, reader, attrs={}): def __create_shared_decorated_reader__(op_type, reader, attrs):
var_name = unique_name(op_type) var_name = unique_name(op_type)
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name) startup_var = startup_blk.create_var(name=var_name)
...@@ -456,26 +450,41 @@ def __create_decorated_reader__(op_type, reader, attrs={}): ...@@ -456,26 +450,41 @@ def __create_decorated_reader__(op_type, reader, attrs={}):
return monkey_patch_reader_methods(main_prog_var) return monkey_patch_reader_methods(main_prog_var)
def __create_unshared_decorated_reader__(op_type, reader, attrs):
new_reader_name = unique_name(op_type)
main_blk = default_main_program().current_block()
new_reader = main_blk.create_var(name=new_reader_name)
main_blk.append_op(
type=op_type,
inputs={'UnderlyingReader': reader},
outputs={'Out': [new_reader]},
attrs=attrs)
new_reader.persistable = True
new_reader.stop_gradient = True
return monkey_patch_reader_methods(new_reader)
def shuffle(reader, buffer_size): def shuffle(reader, buffer_size):
return __create_decorated_reader__('create_shuffle_reader', reader, return __create_unshared_decorated_reader__(
{'buffer_size': int(buffer_size)}) 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
def double_buffer(reader, place=None): def double_buffer(reader, place=None):
attrs = dict() attrs = dict()
if place is not None: if place is not None:
attrs['place'] = str(place).upper() attrs['place'] = str(place).upper()
return __create_decorated_reader__('create_double_buffer_reader', reader, return __create_unshared_decorated_reader__('create_double_buffer_reader',
attrs) reader, attrs)
def multi_pass(reader, pass_num): def multi_pass(reader, pass_num):
return __create_decorated_reader__('create_multi_pass_reader', reader, return __create_shared_decorated_reader__(
{'pass_num': int(pass_num)}) 'create_multi_pass_reader', reader, {'pass_num': int(pass_num)})
def parallel(reader): def parallel(reader):
return __create_decorated_reader__('create_threaded_reader', reader) return __create_shared_decorated_reader__('create_threaded_reader', reader,
{})
def read_file(file_obj): def read_file(file_obj):
......
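After this split, shuffle and double_buffer create their decorated reader only in the main program (unshared), while multi_pass and parallel still create it in both the startup and main programs (shared). A hedged sketch of chaining them now that eof() is gone and a new pass is started with reset(); the file name, shapes, and buffer sizes are illustrative:
import paddle.fluid as fluid
reader = fluid.layers.open_files(
    filenames=['./data.recordio'],  # illustrative file
    shapes=[[-1, 784], [-1, 1]],
    lod_levels=[0, 0],
    dtypes=['float32', 'int64'],
    thread_num=1)
reader = fluid.layers.io.shuffle(reader, buffer_size=1000)  # unshared decorator
reader = fluid.layers.io.double_buffer(reader)              # unshared decorator
img, label = fluid.layers.read_file(reader)
# ... build and run the network; when a pass is exhausted,
# reset() re-initializes the reader for the next pass:
reader.reset()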
...@@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor'] ...@@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor']
class ParallelExecutor(object): class ParallelExecutor(object):
def __init__(self, def __init__(self,
loss_name,
use_cuda, use_cuda,
loss_name=None,
main_program=None,
num_threads=None, num_threads=None,
allow_op_delay=False): allow_op_delay=False,
share_vars_from=None):
"""
ParallelExecutor can run a program in parallel.
Args:
use_cuda(bool): Whether to use CUDA or not.
loss_name(str, default None): The loss name, which must be set for training.
main_program(Program, default None): The program that needs to run;
if not provided, default_main_program will be used.
num_threads(int, default None): How many threads are used for
training.
allow_op_delay(bool, default False): Whether to delay and buffer
some operators together for scheduling, which may improve
performance in some cases, default False.
share_vars_from(ParallelExecutor, default None): If provided,
it will share variables from the specified ParallelExecutor.
Returns:
A ParallelExecutor object.
Raises:
TypeError: If share_vars_from is provided but is not a ParallelExecutor
object.
Examples:
.. code-block:: python
train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
"""
self._places = [] self._places = []
self._act_places = [] self._act_places = []
if use_cuda: if use_cuda:
...@@ -50,10 +89,21 @@ class ParallelExecutor(object): ...@@ -50,10 +89,21 @@ class ParallelExecutor(object):
else: else:
min(len(self._places) * 2, multiprocessing.cpu_count()) min(len(self._places) * 2, multiprocessing.cpu_count())
startup = framework.default_startup_program() main = main_program
main = framework.default_main_program() main = main if main else framework.default_main_program()
scope = executor.global_scope() scope = executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes(
) if share_vars_from else []
persistable_vars = [
v.name
for v in filter(lambda var: var.persistable, main.list_vars())
]
self.executor = core.ParallelExecutor( self.executor = core.ParallelExecutor(
num_threads, num_threads,
True if use_cuda else False, # use_event True if use_cuda else False, # use_event
...@@ -62,10 +112,11 @@ class ParallelExecutor(object): ...@@ -62,10 +112,11 @@ class ParallelExecutor(object):
p.name for p in main.global_block().iter_parameters() p.name for p in main.global_block().iter_parameters()
if not p.stop_gradient if not p.stop_gradient
]), ]),
startup.desc, set(persistable_vars),
main.desc, main.desc,
loss_name, loss_name if loss_name else '',
scope, scope,
local_scopes,
allow_op_delay) allow_op_delay)
self.scope = scope self.scope = scope
......
...@@ -26,11 +26,14 @@ def simple_fc_net(use_feed): ...@@ -26,11 +26,14 @@ def simple_fc_net(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32') img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else: else:
reader = fluid.layers.open_recordio_file( reader = fluid.layers.open_files(
filename='./mnist.recordio', filenames=['./mnist.recordio'],
shapes=[[-1, 784], [-1, 1]], shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0], lod_levels=[0, 0],
dtypes=['float32', 'int64']) dtypes=['float32', 'int64'],
thread_num=1,
for_parallel=True)
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader) img, label = fluid.layers.read_file(reader)
hidden = img hidden = img
for _ in xrange(4): for _ in xrange(4):
...@@ -51,11 +54,14 @@ def fc_with_batchnorm(use_feed): ...@@ -51,11 +54,14 @@ def fc_with_batchnorm(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32') img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else: else:
reader = fluid.layers.open_recordio_file( reader = fluid.layers.open_files(
filename='./mnist.recordio', filenames=['mnist.recordio'],
shapes=[[-1, 784], [-1, 1]], shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0], lod_levels=[0, 0],
dtypes=['float32', 'int64']) dtypes=['float32', 'int64'],
thread_num=1,
for_parallel=True)
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader) img, label = fluid.layers.read_file(reader)
hidden = img hidden = img
...@@ -207,7 +213,11 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -207,7 +213,11 @@ class TestParallelExecutorBase(unittest.TestCase):
if memory_opt: if memory_opt:
fluid.memory_optimize(main) fluid.memory_optimize(main)
exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True) place = fluid.CUDAPlace(0)
startup_exe = fluid.Executor(place)
startup_exe.run(startup)
exe = fluid.ParallelExecutor(True, loss_name=loss.name)
if batch_size is not None: if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count() batch_size *= fluid.core.get_cuda_device_count()
begin = time.time() begin = time.time()
...@@ -453,3 +463,41 @@ class TestTransformer(TestParallelExecutorBase): ...@@ -453,3 +463,41 @@ class TestTransformer(TestParallelExecutorBase):
@unittest.skip("transformer is buggy in multi gpu") @unittest.skip("transformer is buggy in multi gpu")
def test_main(self): def test_main(self):
self.check_network_convergence(transformer) self.check_network_convergence(transformer)
class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def test_parallel_testing(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = simple_fc_net(True)
test_program = main.clone(for_test=True)
opt = fluid.optimizer.SGD(learning_rate=0.0001)
opt.minimize(loss)
batch_size = 32
image = numpy.random.normal(size=(batch_size,
784)).astype('float32')
label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
feed_dict = {'image': image, 'label': label}
train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name, main_program=main)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
for i in xrange(5):
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
test_loss = numpy.array(test_loss)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
train_loss = numpy.array(train_loss)
self.assertTrue(numpy.allclose(train_loss, test_loss))