提交 22f03a1b 编写于 作者: T typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into multigpumultinode

...@@ -48,6 +48,13 @@ parser.add_argument( ...@@ -48,6 +48,13 @@ parser.add_argument(
type=int, type=int,
default=16, default=16,
help="The sequence number of a mini-batch data. (default: %(default)d)") help="The sequence number of a mini-batch data. (default: %(default)d)")
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test')
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument( parser.add_argument(
"--dict_size", "--dict_size",
type=int, type=int,
...@@ -72,16 +79,21 @@ parser.add_argument( ...@@ -72,16 +79,21 @@ parser.add_argument(
default=3, default=3,
help="The width for beam searching. (default: %(default)d)") help="The width for beam searching. (default: %(default)d)")
parser.add_argument( parser.add_argument(
"--use_gpu", '--device',
type=distutils.util.strtobool, type=str,
default=True, default='GPU',
help="Whether to use gpu. (default: %(default)d)") choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument( parser.add_argument(
"--max_length", "--max_length",
type=int, type=int,
default=250, default=250,
help="The maximum length of sequence when doing generation. " help="The maximum length of sequence when doing generation. "
"(default: %(default)d)") "(default: %(default)d)")
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
...@@ -281,7 +293,7 @@ def train(): ...@@ -281,7 +293,7 @@ def train():
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size) batch_size=args.batch_size)
place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = Executor(place) exe = Executor(place)
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
...@@ -307,14 +319,20 @@ def train(): ...@@ -307,14 +319,20 @@ def train():
return total_loss / count return total_loss / count
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in xrange(args.pass_num): for pass_id in xrange(args.pass_num):
pass_start_time = time.time() train_accs = []
words_seen = 0 train_losses = []
for batch_id, data in enumerate(train_batch_generator()): for batch_id, data in enumerate(train_batch_generator()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
words_seen += word_num num_samples += word_num
trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
words_seen += word_num num_samples += word_num
lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
fetch_outs = exe.run(framework.default_main_program(), fetch_outs = exe.run(framework.default_main_program(),
...@@ -325,24 +343,36 @@ def train(): ...@@ -325,24 +343,36 @@ def train():
}, },
fetch_list=[avg_cost]) fetch_list=[avg_cost])
avg_cost_val = np.array(fetch_outs[0]) iters += 1
print('pass_id=%d, batch_id=%d, train_loss: %f' % loss = np.array(fetch_outs[0])
(pass_id, batch_id, avg_cost_val)) print(
"Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
pass_end_time = time.time() ) # The accuracy is the accumulation of batches, but not the current batch.
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
(num_samples, train_elapsed, examples_per_sec))
# evaluation
if args.with_test:
test_loss = do_validation() test_loss = do_validation()
time_consumed = pass_end_time - pass_start_time exit(0)
words_per_sec = words_seen / time_consumed
print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_loss, words_per_sec, time_consumed))
def infer(): def infer():
pass pass
def print_arguments(args):
print('----------- seq2seq Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__': if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
print_arguments(args)
if args.infer_only: if args.infer_only:
infer() infer()
else: else:
......
...@@ -35,6 +35,12 @@ def parse_args(): ...@@ -35,6 +35,12 @@ def parse_args():
parser = argparse.ArgumentParser("mnist model benchmark.") parser = argparse.ArgumentParser("mnist model benchmark.")
parser.add_argument( parser.add_argument(
'--batch_size', type=int, default=128, help='The minibatch size.') '--batch_size', type=int, default=128, help='The minibatch size.')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test'
)
parser.add_argument( parser.add_argument(
'--iterations', type=int, default=35, help='The number of minibatches.') '--iterations', type=int, default=35, help='The number of minibatches.')
parser.add_argument( parser.add_argument(
...@@ -53,19 +59,14 @@ def parse_args(): ...@@ -53,19 +59,14 @@ def parse_args():
'--use_nvprof', '--use_nvprof',
action='store_true', action='store_true',
help='If set, use nvprof for CUDA.') help='If set, use nvprof for CUDA.')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
args = parser.parse_args() args = parser.parse_args()
return args return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def cnn_model(data): def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool( conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data, input=data,
...@@ -161,16 +162,22 @@ def run_benchmark(model, args): ...@@ -161,16 +162,22 @@ def run_benchmark(model, args):
paddle.dataset.mnist.train(), batch_size=args.batch_size) paddle.dataset.mnist.train(), batch_size=args.batch_size)
accuracy = fluid.average.WeightedAverage() accuracy = fluid.average.WeightedAverage()
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num): for pass_id in range(args.pass_num):
accuracy.reset() accuracy.reset()
pass_start = time.time() train_accs = []
train_losses = []
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
img_data = np.array( img_data = np.array(
map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE) map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([len(y_data), 1]) y_data = y_data.reshape([len(y_data), 1])
start = time.time()
outs = exe.run( outs = exe.run(
fluid.default_main_program(), fluid.default_main_program(),
feed={"pixel": img_data, feed={"pixel": img_data,
...@@ -178,21 +185,36 @@ def run_benchmark(model, args): ...@@ -178,21 +185,36 @@ def run_benchmark(model, args):
fetch_list=[avg_cost, batch_acc, batch_size_tensor] fetch_list=[avg_cost, batch_acc, batch_size_tensor]
) # The accuracy is the accumulation of batches, but not the current batch. ) # The accuracy is the accumulation of batches, but not the current batch.
accuracy.add(value=outs[1], weight=outs[2]) accuracy.add(value=outs[1], weight=outs[2])
end = time.time() iters += 1
num_samples += len(y_data)
loss = np.array(outs[0]) loss = np.array(outs[0])
acc = np.array(outs[1]) acc = np.array(outs[1])
print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" % train_losses.append(loss)
(pass_id, batch_id, loss, 1 - acc, (end - start) / 1000)) train_accs.append(acc)
print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
pass_end = time.time() (pass_id, iters, loss, acc))
train_avg_acc = accuracy.eval() print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
(num_samples, train_elapsed, examples_per_sec))
# evaluation
if args.with_test:
test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor, test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
inference_program) inference_program)
exit(0)
print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
(pass_id, train_avg_acc, test_avg_acc, def print_arguments(args):
(pass_end - pass_start) / 1000)) vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- mnist Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -87,15 +87,6 @@ def parse_args(): ...@@ -87,15 +87,6 @@ def parse_args():
return args return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
conv1 = fluid.layers.conv2d( conv1 = fluid.layers.conv2d(
input=input, input=input,
...@@ -279,32 +270,31 @@ def run_benchmark(model, args): ...@@ -279,32 +270,31 @@ def run_benchmark(model, args):
'label': label}, 'label': label},
fetch_list=[avg_cost, batch_acc, batch_size_tensor]) fetch_list=[avg_cost, batch_acc, batch_size_tensor])
iters += 1 iters += 1
num_samples += label[0] num_samples += len(label)
accuracy.add(value=acc, weight=weight) accuracy.add(value=acc, weight=weight)
train_losses.append(loss) train_losses.append(loss)
train_accs.append(acc) train_accs.append(acc)
print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
(pass_id, iters, loss, acc)) (pass_id, iters, loss, acc))
pass_train_acc = accuracy.eval()
# evaluation
if args.with_test:
pass_test_acc = test(exe)
train_elapsed = time.time() - start_time
print("Pass: %d, Loss: %f, Train Accuray: %f\n" % print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs))) (pass_id, np.mean(train_losses), np.mean(train_accs)))
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
(num_samples, train_elapsed, examples_per_sec)) (num_samples, train_elapsed, examples_per_sec))
# evaluation
if args.with_test:
pass_test_acc = test(exe)
exit(0)
if args.use_cprof:
pr.disable() def print_arguments(args):
s = StringIO.StringIO() vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
sortby = 'cumulative' vars(args)['device'] == 'GPU')
ps = pstats.Stats(pr, stream=s).sort_stats(sortby) print('----------- resnet Configuration Arguments -----------')
ps.print_stats() for arg, value in sorted(vars(args).iteritems()):
print(s.getvalue()) print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__': if __name__ == '__main__':
......
#!/bin/bash #!/bin/bash
# This script benchmarking the PaddlePaddle Fluid on # This script benchmarking the PaddlePaddle Fluid on
# single thread single GPU. # single thread single GPU.
export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
#export FLAGS_fraction_of_gpu_memory_to_use=0.0
export CUDNN_PATH=/paddle/cudnn_v5
# disable openmp and mkl parallel # disable openmp and mkl parallel
#https://github.com/PaddlePaddle/Paddle/issues/7199 #https://github.com/PaddlePaddle/Paddle/issues/7199
...@@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0 ...@@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
# only query the gpu used
nohup stdbuf -oL nvidia-smi \
--id=${CUDA_VISIBLE_DEVICES} \
--query-gpu=timestamp \
--query-compute-apps=pid,process_name,used_memory \
--format=csv \
--filename=mem.log \
-l 1 &
# mnist
# mnist gpu mnist 128
FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=500 \
2>&1 | tee -a mnist_gpu_128.log
# vgg16 # vgg16
# cifar10 gpu cifar10 128 # gpu cifar10 128
FLAGS_benchmark=true python fluid/vgg.py \ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
--device=GPU \ --device=GPU \
--batch_size=128 \ --batch_size=128 \
--skip_batch_num=5 \ --skip_batch_num=5 \
--iterations=30 \ --iterations=30 \
2>&1 > vgg16_gpu_128.log 2>&1 | tee -a vgg16_gpu_128.log
# flowers gpu 128
FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
--device=GPU \
--batch_size=32 \
--data_set=flowers \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a vgg16_gpu_flowers_32.log
# resnet50 # resnet50
# resnet50 gpu cifar10 128 # resnet50 gpu cifar10 128
FLAGS_benchmark=true python fluid/resnet.py \ FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
--device=GPU \ --device=GPU \
--batch_size=128 \ --batch_size=128 \
--data_set=cifar10 \ --data_set=cifar10 \
--model=resnet_cifar10 \ --model=resnet_cifar10 \
--skip_batch_num=5 \ --skip_batch_num=5 \
--iterations=30 \ --iterations=30 \
2>&1 > resnet50_gpu_128.log 2>&1 | tee -a resnet50_gpu_128.log
# resnet50 gpu flowers 64
FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
--device=GPU \
--batch_size=64 \
--data_set=flowers \
--model=resnet_imagenet \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a resnet50_gpu_flowers_64.log
# lstm # lstm
# lstm gpu imdb 32 # tensorflow only support batch=32
FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
--device=GPU \
--batch_size=32 \
--skip_batch_num=5 \
--iterations=30 \
--hidden_dim=512 \
--emb_dim=512 \
--crop_size=1500 \
2>&1 | tee -a lstm_gpu_32.log
# seq2seq
# seq2seq gpu wmb 128
FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a lstm_gpu_128.log
...@@ -37,6 +37,14 @@ def parse_args(): ...@@ -37,6 +37,14 @@ def parse_args():
type=int, type=int,
default=32, default=32,
help='The sequence number of a batch data. (default: %(default)d)') help='The sequence number of a batch data. (default: %(default)d)')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test'
)
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument( parser.add_argument(
'--emb_dim', '--emb_dim',
type=int, type=int,
...@@ -64,6 +72,10 @@ def parse_args(): ...@@ -64,6 +72,10 @@ def parse_args():
default=int(os.environ.get('CROP_SIZE', '1500')), default=int(os.environ.get('CROP_SIZE', '1500')),
help='The max sentence length of input. Since this model use plain RNN,' help='The max sentence length of input. Since this model use plain RNN,'
' Gradient could be explored if sentence is too long') ' Gradient could be explored if sentence is too long')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -157,20 +169,23 @@ def main(): ...@@ -157,20 +169,23 @@ def main():
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
def train_loop(pass_num, crop_size):
with profiler.profiler(args.device, 'total') as prof:
for pass_id in range(pass_num):
train_reader = batch( train_reader = batch(
paddle.reader.shuffle( paddle.reader.shuffle(
crop_sentence(imdb.train(word_dict), crop_size), crop_sentence(imdb.train(word_dict), args.crop_size),
buf_size=25000), buf_size=25000),
batch_size=args.batch_size) batch_size=args.batch_size)
word_nums = 0
pass_start_time = time.time() iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num):
train_accs = []
train_losses = []
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
tensor_words = to_lodtensor([x[0] for x in data], place) tensor_words = to_lodtensor([x[0] for x in data], place)
for x in data:
word_nums += len(x[0])
label = numpy.array([x[1] for x in data]).astype("int64") label = numpy.array([x[1] for x in data]).astype("int64")
label = label.reshape((-1, 1)) label = label.reshape((-1, 1))
loss_np, acc, weight = exe.run( loss_np, acc, weight = exe.run(
...@@ -178,16 +193,19 @@ def main(): ...@@ -178,16 +193,19 @@ def main():
feed={"words": tensor_words, feed={"words": tensor_words,
"label": label}, "label": label},
fetch_list=[loss, batch_acc, batch_size_tensor]) fetch_list=[loss, batch_acc, batch_size_tensor])
print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" % iters += 1
(pass_id, batch_id, loss_np, acc)) for x in data:
num_samples += len(x[0])
pass_end_time = time.time() print(
time_consumed = pass_end_time - pass_start_time "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
words_per_sec = word_nums / time_consumed (pass_id, iters, loss_np, acc)
print("pass_id=%d, sec/pass: %f, words/s: %f" % ) # The accuracy is the accumulation of batches, but not the current batch.
(pass_id, time_consumed, words_per_sec))
train_loop(args.pass_num, args.crop_size) train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
(num_samples, train_elapsed, examples_per_sec))
exit(0)
def to_lodtensor(data, place): def to_lodtensor(data, place):
...@@ -205,5 +223,14 @@ def to_lodtensor(data, place): ...@@ -205,5 +223,14 @@ def to_lodtensor(data, place):
return res return res
def print_arguments(args):
print('----------- lstm Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__': if __name__ == '__main__':
args = parse_args()
print_arguments(args)
main() main()
...@@ -191,25 +191,29 @@ def main(): ...@@ -191,25 +191,29 @@ def main():
fetch_list=[avg_cost, batch_acc, batch_size_tensor]) fetch_list=[avg_cost, batch_acc, batch_size_tensor])
accuracy.add(value=acc, weight=weight) accuracy.add(value=acc, weight=weight)
iters += 1 iters += 1
num_samples += len(data) num_samples += len(y_data)
print( print(
"Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss, acc) (pass_id, iters, loss, acc)
) # The accuracy is the accumulation of batches, but not the current batch. ) # The accuracy is the accumulation of batches, but not the current batch.
pass_train_acc = accuracy.eval() # pass_train_acc = accuracy.eval()
train_losses.append(loss) train_losses.append(loss)
train_accs.append(acc) train_accs.append(acc)
print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
(num_samples, train_elapsed, examples_per_sec))
# evaluation # evaluation
if args.with_test: if args.with_test:
pass_test_acc = test(exe) pass_test_acc = test(exe)
train_elapsed = time.time() - start_time exit(0)
print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
def print_arguments(): def print_arguments():
print('----------- Configuration Arguments -----------') print('----------- vgg Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()): for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value)) print('%s: %s' % (arg, value))
print('------------------------------------------------') print('------------------------------------------------')
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.framework import dtypes
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell
from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple
from tensorflow.contrib.rnn.python.ops import core_rnn_cell
from tensorflow.python.ops import array_ops
from tensorflow.python.util import nest
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
import numpy as np
import os
import argparse
import time
import paddle.v2 as paddle
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--embedding_dim",
type=int,
default=512,
help="The dimension of embedding table. (default: %(default)d)")
parser.add_argument(
"--encoder_size",
type=int,
default=512,
help="The size of encoder bi-rnn unit. (default: %(default)d)")
parser.add_argument(
"--decoder_size",
type=int,
default=512,
help="The size of decoder rnn unit. (default: %(default)d)")
parser.add_argument(
"--batch_size",
type=int,
default=128,
help="The sequence number of a mini-batch data. (default: %(default)d)")
parser.add_argument(
"--dict_size",
type=int,
default=30000,
help="The dictionary capacity. Dictionaries of source sequence and "
"target dictionary have same capacity. (default: %(default)d)")
parser.add_argument(
"--max_time_steps",
type=int,
default=81,
help="Max number of time steps for sequence. (default: %(default)d)")
parser.add_argument(
"--pass_num",
type=int,
default=10,
help="The pass number to train. (default: %(default)d)")
parser.add_argument(
"--learning_rate",
type=float,
default=0.0002,
help="Learning rate used to train the model. (default: %(default)f)")
parser.add_argument(
"--infer_only", action='store_true', help="If set, run forward only.")
parser.add_argument(
"--beam_size",
type=int,
default=3,
help="The width for beam searching. (default: %(default)d)")
parser.add_argument(
"--max_generation_length",
type=int,
default=250,
help="The maximum length of sequence when doing generation. "
"(default: %(default)d)")
parser.add_argument(
"--save_freq",
type=int,
default=500,
help="Save model checkpoint every this interation. (default: %(default)d)")
parser.add_argument(
"--model_dir",
type=str,
default='./checkpoint',
help="Path to save model checkpoints. (default: %(default)d)")
_Linear = core_rnn_cell._Linear # pylint: disable=invalid-name
START_TOKEN_IDX = 0
END_TOKEN_IDX = 1
class LSTMCellWithSimpleAttention(RNNCell):
"""Add attention mechanism to BasicLSTMCell.
This class is a wrapper based on tensorflow's `BasicLSTMCell`.
"""
def __init__(self,
num_units,
encoder_vector,
encoder_proj,
source_sequence_length,
forget_bias=1.0,
state_is_tuple=True,
activation=None,
reuse=None):
super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse)
if not state_is_tuple:
logging.warn("%s: Using a concatenated state is slower and will "
"soon be deprecated. Use state_is_tuple=True.", self)
self._num_units = num_units
# set padding part to 0
self._encoder_vector = self._reset_padding(encoder_vector,
source_sequence_length)
self._encoder_proj = self._reset_padding(encoder_proj,
source_sequence_length)
self._forget_bias = forget_bias
self._state_is_tuple = state_is_tuple
self._activation = activation or math_ops.tanh
self._linear = None
@property
def state_size(self):
return (LSTMStateTuple(self._num_units, self._num_units) \
if self._state_is_tuple else 2 * self._num_units)
@property
def output_size(self):
return self._num_units
def zero_state(self, batch_size, dtype):
state_size = self.state_size
if hasattr(self, "_last_zero_state"):
(last_state_size, last_batch_size, last_dtype,
last_output) = getattr(self, "_last_zero_state")
if (last_batch_size == batch_size and last_dtype == dtype and
last_state_size == state_size):
return last_output
with ops.name_scope(
type(self).__name__ + "ZeroState", values=[batch_size]):
output = _zero_state_tensors(state_size, batch_size, dtype)
self._last_zero_state = (state_size, batch_size, dtype, output)
return output
def call(self, inputs, state):
sigmoid = math_ops.sigmoid
# Parameters of gates are concatenated into one multiply for efficiency.
if self._state_is_tuple:
c, h = state
else:
c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
# get context from encoder outputs
context = self._simple_attention(self._encoder_vector,
self._encoder_proj, h)
if self._linear is None:
self._linear = _Linear([inputs, context, h], 4 * self._num_units,
True)
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
i, j, f, o = array_ops.split(
value=self._linear([inputs, context, h]),
num_or_size_splits=4,
axis=1)
new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
self._activation(j))
new_h = self._activation(new_c) * sigmoid(o)
if self._state_is_tuple:
new_state = LSTMStateTuple(new_c, new_h)
else:
new_state = array_ops.concat([new_c, new_h], 1)
return new_h, new_state
def _simple_attention(self, encoder_vec, encoder_proj, decoder_state):
"""Implement the attention function.
The implementation has the same logic to the fluid decoder.
"""
decoder_state_proj = tf.contrib.layers.fully_connected(
inputs=decoder_state,
num_outputs=self._num_units,
activation_fn=None,
biases_initializer=None)
decoder_state_expand = tf.tile(
tf.expand_dims(
input=decoder_state_proj, axis=1),
[1, tf.shape(encoder_proj)[1], 1])
concated = tf.concat([decoder_state_expand, encoder_proj], axis=2)
# need reduce the first dimension
attention_weights = tf.contrib.layers.fully_connected(
inputs=tf.reshape(
concated, shape=[-1, self._num_units * 2]),
num_outputs=1,
activation_fn=tf.nn.tanh,
biases_initializer=None)
attention_weights_reshaped = tf.reshape(
attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1])
# normalize the attention weights using softmax
attention_weights_normed = tf.nn.softmax(
attention_weights_reshaped, dim=1)
scaled = tf.multiply(attention_weights_normed, encoder_vec)
context = tf.reduce_sum(scaled, axis=1)
return context
def _reset_padding(self,
memory,
memory_sequence_length,
check_inner_dims_defined=True):
"""Reset the padding part for encoder inputs.
This funtion comes from tensorflow's `_prepare_memory` function.
"""
memory = nest.map_structure(
lambda m: ops.convert_to_tensor(m, name="memory"), memory)
if memory_sequence_length is not None:
memory_sequence_length = ops.convert_to_tensor(
memory_sequence_length, name="memory_sequence_length")
if check_inner_dims_defined:
def _check_dims(m):
if not m.get_shape()[2:].is_fully_defined():
raise ValueError(
"Expected memory %s to have fully defined inner dims, "
"but saw shape: %s" % (m.name, m.get_shape()))
nest.map_structure(_check_dims, memory)
if memory_sequence_length is None:
seq_len_mask = None
else:
seq_len_mask = array_ops.sequence_mask(
memory_sequence_length,
maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
dtype=nest.flatten(memory)[0].dtype)
seq_len_batch_size = (memory_sequence_length.shape[0].value or
array_ops.shape(memory_sequence_length)[0])
def _maybe_mask(m, seq_len_mask):
rank = m.get_shape().ndims
rank = rank if rank is not None else array_ops.rank(m)
extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
m_batch_size = m.shape[0].value or array_ops.shape(m)[0]
if memory_sequence_length is not None:
message = ("memory_sequence_length and memory tensor "
"batch sizes do not match.")
with ops.control_dependencies([
check_ops.assert_equal(
seq_len_batch_size, m_batch_size, message=message)
]):
seq_len_mask = array_ops.reshape(
seq_len_mask,
array_ops.concat(
(array_ops.shape(seq_len_mask), extra_ones), 0))
return m * seq_len_mask
else:
return m
return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask),
memory)
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size,
max_generation_length):
src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
src_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
src_embedding_weights = tf.get_variable("source_word_embeddings",
[source_dict_dim, embedding_dim])
src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)
src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
# no peephole
encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
cell_fw=src_forward_cell,
cell_bw=src_reversed_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
dtype=tf.float32)
# concat the forward outputs and backward outputs
encoded_vec = tf.concat(encoder_outputs, axis=2)
# project the encoder outputs to size of decoder lstm
encoded_proj = tf.contrib.layers.fully_connected(
inputs=tf.reshape(
encoded_vec, shape=[-1, embedding_dim * 2]),
num_outputs=decoder_size,
activation_fn=None,
biases_initializer=None)
encoded_proj_reshape = tf.reshape(
encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])
# get init state for decoder lstm's H
backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
decoder_boot = tf.contrib.layers.fully_connected(
inputs=tf.reshape(
backword_first, shape=[-1, embedding_dim]),
num_outputs=decoder_size,
activation_fn=tf.nn.tanh,
biases_initializer=None)
# prepare the initial state for decoder lstm
cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
initial_state = LSTMStateTuple(cell_init, decoder_boot)
# create decoder lstm cell
decoder_cell = LSTMCellWithSimpleAttention(
decoder_size,
encoded_vec
if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
encoded_proj_reshape if not is_generating else
seq2seq.tile_batch(encoded_proj_reshape, beam_size),
src_sequence_length if not is_generating else
seq2seq.tile_batch(src_sequence_length, beam_size),
forget_bias=0.0)
output_layer = Dense(target_dict_dim, name='output_projection')
if not is_generating:
trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
trg_embedding_weights = tf.get_variable(
"target_word_embeddings", [target_dict_dim, embedding_dim])
trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
trg_word_idx)
training_helper = seq2seq.TrainingHelper(
inputs=trg_embedding,
sequence_length=trg_sequence_length,
time_major=False,
name='training_helper')
training_decoder = seq2seq.BasicDecoder(
cell=decoder_cell,
helper=training_helper,
initial_state=initial_state,
output_layer=output_layer)
# get the max length of target sequence
max_decoder_length = tf.reduce_max(trg_sequence_length)
decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
decoder=training_decoder,
output_time_major=False,
impute_finished=True,
maximum_iterations=max_decoder_length)
decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
decoder_pred_train = tf.argmax(
decoder_logits_train, axis=-1, name='decoder_pred_train')
masks = tf.sequence_mask(
lengths=trg_sequence_length,
maxlen=max_decoder_length,
dtype=tf.float32,
name='masks')
# place holder of label sequence
lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])
# compute the loss
loss = seq2seq.sequence_loss(
logits=decoder_logits_train,
targets=lbl_word_idx,
weights=masks,
average_across_timesteps=True,
average_across_batch=True)
# return feeding list and loss operator
return {
'src_word_idx': src_word_idx,
'src_sequence_length': src_sequence_length,
'trg_word_idx': trg_word_idx,
'trg_sequence_length': trg_sequence_length,
'lbl_word_idx': lbl_word_idx
}, loss
else:
start_tokens = tf.ones([tf.shape(src_word_idx)[0], ],
tf.int32) * START_TOKEN_IDX
# share the same embedding weights with target word
trg_embedding_weights = tf.get_variable(
"target_word_embeddings", [target_dict_dim, embedding_dim])
inference_decoder = beam_search_decoder.BeamSearchDecoder(
cell=decoder_cell,
embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens),
start_tokens=start_tokens,
end_token=END_TOKEN_IDX,
initial_state=tf.nn.rnn_cell.LSTMStateTuple(
tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
beam_width=beam_size,
output_layer=output_layer)
decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
decoder=inference_decoder,
output_time_major=False,
#impute_finished=True,# error occurs
maximum_iterations=max_generation_length)
predicted_ids = decoder_outputs_decode.predicted_ids
return {
'src_word_idx': src_word_idx,
'src_sequence_length': src_sequence_length
}, predicted_ids
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in vars(args).iteritems():
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def padding_data(data, padding_size, value):
data = data + [value] * padding_size
return data[:padding_size]
def save(sess, path, var_list=None, global_step=None):
saver = tf.train.Saver(var_list)
save_path = saver.save(sess, save_path=path, global_step=global_step)
print('Model save at %s' % save_path)
def restore(sess, path, var_list=None):
# var_list = None returns the list of all saveable variables
saver = tf.train.Saver(var_list)
saver.restore(sess, save_path=path)
print('model restored from %s' % path)
def adapt_batch_data(data):
src_seq = map(lambda x: x[0], data)
trg_seq = map(lambda x: x[1], data)
lbl_seq = map(lambda x: x[2], data)
src_sequence_length = np.array(
[len(seq) for seq in src_seq]).astype('int32')
src_seq_maxlen = np.max(src_sequence_length)
trg_sequence_length = np.array(
[len(seq) for seq in trg_seq]).astype('int32')
trg_seq_maxlen = np.max(trg_sequence_length)
src_seq = np.array(
[padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
for seq in src_seq]).astype('int32')
trg_seq = np.array(
[padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
for seq in trg_seq]).astype('int32')
lbl_seq = np.array(
[padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
for seq in lbl_seq]).astype('int32')
return {
'src_word_idx': src_seq,
'src_sequence_length': src_sequence_length,
'trg_word_idx': trg_seq,
'trg_sequence_length': trg_sequence_length,
'lbl_word_idx': lbl_seq
}
def train():
feeding_dict, loss = seq_to_seq_net(
embedding_dim=args.embedding_dim,
encoder_size=args.encoder_size,
decoder_size=args.decoder_size,
source_dict_dim=args.dict_size,
target_dict_dim=args.dict_size,
is_generating=False,
beam_size=args.beam_size,
max_generation_length=args.max_generation_length)
global_step = tf.Variable(0, trainable=False, name='global_step')
trainable_params = tf.trainable_variables()
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
gradients = tf.gradients(loss, trainable_params)
# may clip the parameters
clip_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
updates = optimizer.apply_gradients(
zip(gradients, trainable_params), global_step=global_step)
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
train_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
def do_validataion():
total_loss = 0.0
count = 0
for batch_id, data in enumerate(test_batch_generator()):
adapted_batch_data = adapt_batch_data(data)
outputs = sess.run([loss],
feed_dict={
item[1]: adapted_batch_data[item[0]]
for item in feeding_dict.items()
})
total_loss += outputs[0]
count += 1
return total_loss / count
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_batch_generator()):
adapted_batch_data = adapt_batch_data(data)
words_seen += np.sum(adapted_batch_data['src_sequence_length'])
words_seen += np.sum(adapted_batch_data['trg_sequence_length'])
outputs = sess.run([updates, loss],
feed_dict={
item[1]: adapted_batch_data[item[0]]
for item in feeding_dict.items()
})
print("pass_id=%d, batch_id=%d, train_loss: %f" %
(pass_id, batch_id, outputs[1]))
pass_end_time = time.time()
test_loss = do_validataion()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_loss, words_per_sec, time_consumed))
def infer():
feeding_dict, predicted_ids = seq_to_seq_net(
embedding_dim=args.embedding_dim,
encoder_size=args.encoder_size,
decoder_size=args.decoder_size,
source_dict_dim=args.dict_size,
target_dict_dim=args.dict_size,
is_generating=True,
beam_size=args.beam_size,
max_generation_length=args.max_generation_length)
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
with tf.Session(config=config) as sess:
restore(sess, './checkpoint/tf_seq2seq-1500')
for batch_id, data in enumerate(test_batch_generator()):
src_seq = map(lambda x: x[0], data)
source_language_seq = [
src_dict[item] for seq in src_seq for item in seq
]
src_sequence_length = np.array(
[len(seq) for seq in src_seq]).astype('int32')
src_seq_maxlen = np.max(src_sequence_length)
src_seq = np.array([
padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
for seq in src_seq
]).astype('int32')
outputs = sess.run([predicted_ids],
feed_dict={
feeding_dict['src_word_idx']: src_seq,
feeding_dict['src_sequence_length']:
src_sequence_length
})
print("\nDecoder result comparison: ")
source_language_seq = ' '.join(source_language_seq).lstrip(
'<s>').rstrip('<e>').strip()
inference_seq = ''
print(" --> source: " + source_language_seq)
for item in outputs[0][0]:
if item[0] == END_TOKEN_IDX: break
inference_seq += ' ' + trg_dict.get(item[0], '<unk>')
print(" --> inference: " + inference_seq)
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
if args.infer_only:
infer()
else:
train()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import tensorflow as tf
import paddle.v2 as paddle
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser("mnist model benchmark.")
parser.add_argument(
'--batch_size', type=int, default=128, help='The minibatch size.')
parser.add_argument(
'--iterations', type=int, default=35, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=5, help='The number of passes.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
args = parser.parse_args()
return args
def run_benchmark(args):
def weight_variable(dtype, shape):
initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
return tf.Variable(initial)
def bias_variable(dtype, shape):
initial = tf.constant(0.1, shape=shape, dtype=dtype)
return tf.Variable(initial)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
labels = tf.placeholder(tf.int64, shape=(None, ))
# conv1, relu, pool1
conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
conv1_bias = bias_variable(DTYPE, [20])
conv1 = tf.nn.conv2d(
images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
pool1 = tf.nn.max_pool(
relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# conv2, relu, pool2
conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
conv2_bias = bias_variable(DTYPE, [50])
conv2 = tf.nn.conv2d(
pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
pool2 = tf.nn.max_pool(
relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# FC
pool_shape = pool2.get_shape().as_list()
hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
fc_bias = bias_variable(DTYPE, [10])
logits = tf.matmul(reshape, fc_weights) + fc_bias
# Get prediction
prediction = tf.nn.softmax(logits)
# Loss
one_hot_labels = tf.one_hot(labels, depth=10)
cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
avg_cost = tf.reduce_mean(cost)
# Get accuracy
correct = tf.equal(tf.argmax(prediction, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# metrics, g_accuracy
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_accuracy = tf.metrics.accuracy(
labels, tf.argmax(
prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
g_accuracy_reset_op = tf.variables_initializer(vars)
# Optimizer
opt = tf.train.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
train_op = opt.minimize(avg_cost)
# train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=args.batch_size)
def eval_test():
sess.run(g_accuracy_reset_op)
for batch_id, data in enumerate(test_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
loss, acc, g_acc = sess.run(
[avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
return g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
for pass_id in range(args.pass_num):
sess.run(g_accuracy_reset_op)
pass_start = time.time()
for batch_id, data in enumerate(train_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
"int64")
start = time.time()
_, loss, acc, g_acc = sess.run(
[train_op, avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
end = time.time()
print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
(pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
pass_end = time.time()
test_avg_acc = eval_test()
print(
"pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
% (pass_id, g_acc[1], test_avg_acc,
(pass_end - pass_start) / 1000))
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
run_benchmark(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py
Get help: python resnet.py --help
See performance on flowers: python resnet.py
Train on cifar10: python resnet.py --data=cifar10 --with_test
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import paddle.v2 as paddle
import tensorflow as tf
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser('Convolution model benchmark.')
parser.add_argument(
'--model',
type=str,
choices=['resnet'],
default='resnet',
help='The model architecture.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='use real data or fake data')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test'
)
parser.add_argument(
'--iterations',
type=int,
default=105,
help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=300, help='The number of passes.')
parser.add_argument(
'--order',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
help='The data order, now only support NCHW.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--data',
type=str,
default='flowers102',
choices=['flowers102', 'cifar10'],
help='The kinds of data.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
args = parser.parse_args()
return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[
'with_test'] else vars(args)['iterations']
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def fixed_padding(inputs, kernel_size, data_format):
"""Pads the input along the spatial dimensions independently of input size.
Args:
inputs: A tensor of size [batch, channels, height_in, width_in] or
[batch, height_in, width_in, channels] depending on data_format.
kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
Should be a positive integer.
data_format: The input format ('channels_last' or 'channels_first').
Returns:
A tensor with the same format as the input with the data either intact
(if kernel_size == 1) or padded (if kernel_size > 1).
"""
pad_total = kernel_size - 1
pad_beg = pad_total // 2
pad_end = pad_total - pad_beg
if data_format == 'channels_first':
padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end],
[pad_beg, pad_end]])
else:
padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
[pad_beg, pad_end], [0, 0]])
return padded_inputs
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
"""Strided 2-D convolution with explicit padding."""
# The padding is consistent and is based only on `kernel_size`, not on the
# dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
# This is consistent with PaddlePaddle.
# In addition, the calculation for output size in TensorFlow can refer:
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc
if strides > 1:
inputs = fixed_padding(inputs, kernel_size, data_format)
return tf.layers.conv2d(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
padding=('SAME' if strides == 1 else 'VALID'),
use_bias=False,
kernel_initializer=tf.variance_scaling_initializer(),
data_format=data_format)
def conv_bn(inputs,
filters,
kernel_size,
strides,
is_training,
data_format,
act=True):
# def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
# set fused=True for a significant performance boost. See
# https://www.tensorflow.org/performance/performance_guide#common_fused_ops
inputs = conv2d_fixed_padding(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
data_format=data_format)
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if data_format == 'channels_first' else 3,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
if act:
inputs = tf.nn.relu(inputs)
return inputs
def basicblock(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def bottleneck(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = conv_bn(
inputs, filters * 4, 1, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name,
data_format):
# Bottleneck blocks end with 4x the number of filters as they start with
filters_out = 4 * filters if block_fn is bottleneck else filters
def projection_shortcut(inputs):
return conv2d_fixed_padding(
inputs=inputs,
filters=filters_out,
kernel_size=1,
strides=strides,
data_format=data_format)
# Only the first block per block_layer uses projection_shortcut and strides
inputs = block_fn(inputs, filters, is_training, projection_shortcut,
strides, data_format)
for _ in range(1, blocks):
inputs = block_fn(inputs, filters, is_training, None, 1, data_format)
return tf.identity(inputs, name)
def resnet_imagenet(depth, class_dim, data_format):
"""Returns the ResNet model for a given size and number of output classes."""
def resnet_generator(block_fn,
layers,
num_classes,
data_format='channels_last'):
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
"""Constructs the ResNet model given the inputs."""
if data_format == 'channels_first':
# Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
# This provides a large performance boost on GPU. See
# https://www.tensorflow.org/performance/performance_guide#data_formats
inputs = tf.transpose(inputs, [0, 3, 1, 2])
inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = tf.layers.max_pooling2d(
inputs=inputs,
pool_size=3,
strides=2,
padding='SAME',
data_format=data_format)
inputs = tf.identity(inputs, 'initial_max_pool')
inputs = block_layer(inputs, 64, block_fn, layers[0], 1,
is_training, 'block_layer1', data_format)
inputs = block_layer(inputs, 128, block_fn, layers[1], 2,
is_training, 'block_layer2', data_format)
inputs = block_layer(inputs, 256, block_fn, layers[2], 2,
is_training, 'block_layer3', data_format)
inputs = block_layer(inputs, 512, block_fn, layers[3], 2,
is_training, 'block_layer4', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=7,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs,
[-1, 512 if block_fn is basicblock else 2048])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
model_params = {
18: {
'block': basicblock,
'layers': [2, 2, 2, 2]
},
34: {
'block': basicblock,
'layers': [3, 4, 6, 3]
},
50: {
'block': bottleneck,
'layers': [3, 4, 6, 3]
},
101: {
'block': bottleneck,
'layers': [3, 4, 23, 3]
},
152: {
'block': bottleneck,
'layers': [3, 8, 36, 3]
},
200: {
'block': bottleneck,
'layers': [3, 24, 36, 3]
}
}
if depth not in model_params:
raise ValueError('Not a valid depth:', depth)
params = model_params[depth]
return resnet_generator(params['block'], params['layers'], class_dim,
data_format)
def resnet_cifar10(depth, num_classes, data_format):
if depth % 6 != 2:
raise ValueError('depth must be 6n + 2:', depth)
num_blocks = (depth - 2) // 6
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training,
'block_layer1', data_format)
inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training,
'block_layer2', data_format)
inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training,
'block_layer3', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=8,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs, [-1, 64])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
def run_benchmark(args, data_format='channels_last', device='/cpu:0'):
"""Our model_fn for ResNet to be used with our Estimator."""
class_dim = 1000
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
if args.data == 'flowers102':
class_dim = 102
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
elif args.data == 'cifar10':
class_dim = 10
dshape = (None, 32, 32, 3)
pdshape = (3, 32, 32)
with tf.device(device):
images = tf.placeholder(DTYPE, shape=dshape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
network = resnet_cifar10(
32, class_dim,
data_format) if args.data == 'cifar10' else resnet_imagenet(
50, class_dim, data_format)
logits = network(inputs=images, is_training=is_training)
cross_entropy = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=onehot_labels)
avg_cost = tf.reduce_mean(cross_entropy)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
lr = 0.1 if args.data == 'cifar10' else 0.01
optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
# Batch norm requires update_ops to be added as a train_op dependency.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=100)
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, np.mean(test_accs)))
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
if args.use_fake_data:
data = train_reader().next()
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype('int64')
iters, num_samples, start_time = 0, 0, 0.0
for pass_id in range(args.pass_num):
if iters == args.iterations:
break
train_accs = []
train_losses = []
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
if not args.use_fake_data:
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_cost, accuracy],
feed_dict={
images: images_data,
labels: labels_data,
is_training: True
})
iters += 1
train_accs.append(acc)
train_losses.append(loss)
num_samples += len(data)
print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
print("Pass=%d, Loss=%f, Accuray=%f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
# evaluation
if args.with_test:
test()
if not args.with_test:
duration = time.time() - start_time
examples_per_sec = num_samples / duration
sec_per_batch = duration / (iters - args.skip_batch_num)
print('Total examples: %d, total time: %.5f' %
(num_samples, duration))
print('%.5f examples/sec, %.5f sec/batch' %
(examples_per_sec, sec_per_batch))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if tf.test.is_built_with_cuda():
device = '/device:GPU:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
data_format = 'channels_first'
else:
device = '/cpu:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
raise ValueError('Only support NHWC order in CPU mode')
run_benchmark(args, data_format, device)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import tensorflow as tf
import paddle.v2 as paddle
def parse_args():
parser = argparse.ArgumentParser("LSTM model benchmark.")
parser.add_argument(
'--batch_size',
type=int,
default=32,
help='The sequence number of a batch data. (default: %(default)d)')
parser.add_argument(
'--stacked_num',
type=int,
default=5,
help='Number of lstm layers to stack. (default: %(default)d)')
parser.add_argument(
'--embedding_dim',
type=int,
default=512,
help='Dimension of embedding table. (default: %(default)d)')
parser.add_argument(
'--hidden_dim',
type=int,
default=512,
help='Hidden size of lstm unit. (default: %(default)d)')
parser.add_argument(
'--pass_num',
type=int,
default=10,
help='Epoch number to train. (default: %(default)d)')
parser.add_argument(
'--learning_rate',
type=float,
default=0.0002,
help='Learning rate used to train. (default: %(default)f)')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
args = parser.parse_args()
return args
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def dynamic_lstm_model(dict_size,
embedding_dim,
hidden_dim,
stacked_num,
class_num=2,
is_train=True):
word_idx = tf.placeholder(tf.int64, shape=[None, None])
sequence_length = tf.placeholder(tf.int64, shape=[None, ])
embedding_weights = tf.get_variable('word_embeddings',
[dict_size, embedding_dim])
embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
lstm_cell = tf.nn.rnn_cell.LSTMCell(
num_units=hidden_dim, use_peepholes=False)
stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num)
# final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
_, final_state = tf.nn.dynamic_rnn(
cell=stacked_cell,
inputs=embedding,
dtype=tf.float32,
sequence_length=sequence_length)
w = tf.Variable(
tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
bias = tf.Variable(
tf.constant(
value=0.0, shape=[class_num], dtype=tf.float32))
prediction = tf.matmul(final_state[-1][1], w) + bias
if not is_train:
return (word_idx, sequence_length), tf.nn.softmax(prediction)
label = tf.placeholder(tf.int64, shape=[None, ])
loss = tf.nn.softmax_cross_entropy_with_logits(
labels=tf.one_hot(label, 2), logits=prediction)
avg_loss = tf.reduce_mean(loss)
correct_count = tf.equal(tf.argmax(prediction, 1), label)
acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
reset_op = tf.variables_initializer(vars)
return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
def padding_data(data, padding_size, value):
data = data + [value] * padding_size
return data[:padding_size]
def train(args):
word_dict = paddle.dataset.imdb.word_dict()
dict_size = len(word_dict)
feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
train_op = adam_optimizer.minimize(avg_loss)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=25000),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.test(word_dict), buf_size=25000),
batch_size=args.batch_size)
def do_validation(sess):
sess.run(reset_op)
for batch_id, data in enumerate(test_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
_, loss, fetch_acc, fetch_g_acc = sess.run(
[train_op, avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
return fetch_g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
# clear accuracy local variable
sess.run(reset_op)
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
words_seen += np.sum(sequence_length)
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
_, loss, fetch_acc, fetch_g_acc = sess.run(
[train_op, avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
% (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
test_acc = do_validation(sess)
print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_acc, words_per_sec, time_consumed))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if args.infer_only:
pass
else:
train(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in TensorFlow"""
import tensorflow as tf
import paddle.v2 as paddle
import numpy as np
import argparse
import time
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test')
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
'--data_format',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
help='The data order, NCHW=[batch, channels, height, width].'
'Only support NHWC right now.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
args = parser.parse_args()
class VGG16Model(object):
def __init__(self):
self.parameters = []
def batch_norm_relu(self, inputs, is_training):
"""Performs a batch normalization followed by a ReLU."""
# We set fused=True for a significant speed boost. See
# https://www.tensorflow.org/speed/speed_guide#common_fused_ops
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if args.data_format == 'NCHW' else -1,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
inputs = tf.nn.relu(inputs)
return inputs
def conv_bn_layer(self,
name,
images,
kernel_shape,
is_training,
drop_rate=0.0):
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
kernel_shape, dtype=tf.float32, stddev=1e-1),
name='weights')
conv = tf.nn.conv2d(
images,
kernel, [1, 1, 1, 1],
data_format=args.data_format,
padding='SAME')
biases = tf.Variable(
tf.constant(
0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(conv, biases)
out = self.batch_norm_relu(out, is_training)
out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
return out
def fc_layer(self, name, inputs, shape):
with tf.name_scope(name) as scope:
fc_w = tf.Variable(
tf.truncated_normal(
shape, dtype=tf.float32, stddev=1e-1),
name='weights')
fc_b = tf.Variable(
tf.constant(
0.0, shape=[shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
return out
def network(self, images, class_dim, is_training):
""" VGG16 model structure.
TODO(kuke): enable this network to support the 'NCHW' data format
"""
# conv1
conv1_1 = self.conv_bn_layer(
'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
conv1_2 = self.conv_bn_layer(
'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
# pool1
pool1 = tf.nn.max_pool(
conv1_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool1')
# conv2
conv2_1 = self.conv_bn_layer(
'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
conv2_2 = self.conv_bn_layer(
'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
# pool2
pool2 = tf.nn.max_pool(
conv2_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool2')
# conv3
conv3_1 = self.conv_bn_layer(
'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
conv3_2 = self.conv_bn_layer(
'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
conv3_3 = self.conv_bn_layer(
'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
# pool3
pool3 = tf.nn.max_pool(
conv3_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool3')
# conv4
conv4_1 = self.conv_bn_layer(
'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
conv4_2 = self.conv_bn_layer(
'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv4_3 = self.conv_bn_layer(
'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool4
pool4 = tf.nn.max_pool(
conv4_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# conv5
conv5_1 = self.conv_bn_layer(
'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_2 = self.conv_bn_layer(
'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_3 = self.conv_bn_layer(
'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool5
pool5 = tf.nn.max_pool(
conv5_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# flatten
shape = int(np.prod(pool5.get_shape()[1:]))
pool5_flat = tf.reshape(pool5, [-1, shape])
# fc1
drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
fc1 = self.fc_layer('fc1', drop, [shape, 512])
# fc2
bn = self.batch_norm_relu(fc1, is_training)
drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
fc2 = self.fc_layer('fc2', drop, [512, 512])
fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
return fc3
def run_benchmark():
"""Run benchmark on cifar10 or flowers."""
if args.data_set == "cifar10":
class_dim = 10
raw_shape = (3, 32, 32)
dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
None, 3, 32, 32)
else:
class_dim = 102
raw_shape = (3, 224, 224)
dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
None, 3, 224, 224)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(tf.float32, shape=dat_shape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
vgg16 = VGG16Model()
logits = vgg16.network(images, class_dim, is_training)
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
avg_loss = tf.reduce_mean(loss)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_loss)
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
buf_size=5120),
batch_size=args.batch_size)
# test
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
return np.mean(test_accs)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.num_passes):
# train
num_samples = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
train_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
train_labels = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_loss, accuracy],
feed_dict={
images: train_images,
labels: train_labels,
is_training: True
})
iters += 1
num_samples += len(data)
print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
# test
pass_test_acc = test()
print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, pass_test_acc))
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
print_arguments()
run_benchmark()
...@@ -244,11 +244,11 @@ function(cc_test TARGET_NAME) ...@@ -244,11 +244,11 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS}) add_executable(${TARGET_NAME} ${cc_test_SRCS})
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS) # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START") if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END) list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif() endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS} COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
...@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME) ...@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
endif() endif()
endfunction(nv_test) endfunction(nv_test)
...@@ -387,8 +387,8 @@ function(hip_test TARGET_NAME) ...@@ -387,8 +387,8 @@ function(hip_test TARGET_NAME)
endif() endif()
add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources}) add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
endif() endif()
endfunction(hip_test) endfunction(hip_test)
......
# FileManager设计文档
## 目标
在本文档中,我们设计说明了名为FileManager系统,方便用户上传自己的训练数据以进行分布式训练
主要功能包括:
- 提供常用的命令行管理命令管理文件和目录
- 支持大文件的断点上传、下载
## 名词解释
- PFS:是`Paddlepaddle cloud File System`的缩写,是对用户文件存储空间的抽象,与之相对的是local filesystem。目前我们用CephFS来搭建。
- [CephFS](http://docs.ceph.com/docs/master/cephfs/):一个POSIX兼容的文件系统。
- Chunk:逻辑划上文件分块的单位。
## 模块
### 架构图
<image src=./src/filemanager.png width=900>
### PFSClient
- 功能: 详细设计[link](./pfs/pfsclient.md)
- 提供用户管理文件的命令
- 需要可以跨平台执行
- 双向验证
PFSClient需要和Ingress之间做双向验证<sup>[tls](#tls)</sup>,所以用户需要首先在`cloud.paddlepaddle.org`上注册一下,申请用户空间,并且把系统生成的CA(certificate authority)、Key、CRT(CA signed certificate)下载到本地,然后才能使用PFSClient。
### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/)
- 功能:
提供七层协议的反向代理、基于粘性会话的负载均衡功能。
- 透传用户身份的办法
Ingress需要把PFSClient的身份信息传给PFSServer,配置的方法参考[link](http://www.integralist.co.uk/posts/clientcertauth.html#3)
### PFSServer
PFSServer提供RESTful API接口,接收处理PFSClient端的文件管理请求,并且把结果返回PFSClient端。
RESTful API
- /api/v1/files
- `GET /api/v1/files`: Get metadata of files or directories.
- `POST /api/v1/files`: Create files or directories.
- `PATCH /api/v1/files`: Update files or directories.
- `DELETE /api/v1/files`: Delete files or directories.
- /api/v1/file/chunks
- `GET /api/v1/storage/file/chunks`: Get chunks's metadata of a file.
- /api/v1/storage/files
- `GET /api/v1/storage/files`: Download files or directories.
- `POST /api/v1/storage/files`: Upload files or directories.
- /api/v1/storage/file/chunks
- `GET /api/v1/storage/file/chunks`: Download chunks's data.
- `POST /api/v1/storage/file/chunks`: Upload chunks's data.
## 文件传输优化
### 分块文件传输
用户文件可能是比较大的,上传到Cloud或者下载到本地的时间可能比较长,而且在传输的过程中也可能出现网络不稳定的情况。为了应对以上的问题,我们提出了Chunk的概念,一个Chunk由所在的文件偏移、数据、数据长度及校验值组成。文件的上传和下载都是通过对Chunk的操作来实现的。由于Chunk比较小(默认256K),完成一个传输动作完成的时间也比较短,不容易出错。PFSClient需要在传输完毕最后一个Chunk的时候检查destination文件的MD5值是否和source文件一致。
一个典型的Chunk如下所示:
```
type Chunk struct {
fileOffset int64
checksum uint32
len uint32
data []byte
}
```
### 生成sparse文件
当destination文件不存在或者大小和source文件不一致时,可以用[Fallocate](https://Go.org/pkg/syscall/#Fallocate)生成sparse文件,然后就可以并发写入多个Chunk。
### 覆盖不一致的部分
文件传输的的关键在于需要PFSClient端对比source和destination的文件Chunks的checksum是否保持一致,不一致的由PFSClient下载或者传输Chunk完成。这样已经传输成功的部分就不用重新传输了。
## 用户使用流程
参考[link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md)
## 框架生成
[swagger](https://github.com/swagger-api/swagger-codegen)生成PFSClient和PFSServer的框架部分,以便我们可以把更多的精力放到逻辑本身上。
## 参考文档
- <a name=tls></a>[TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md)
- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/)
- [linux man document](https://linux.die.net/man/)
# PFSClient
## Description
The `pfs` command is a Command Line Interface to manage your files on PaddlePaddle Cloud
## Synopsis
```
paddle [options] pfs <subcommand> [parameters]
```
## Options
```
--profile (string)
Use a specific profile from your credential file.
--help (string)
Display more information about command
--version
Output version information and exit
--debug
Show detailed debugging log
--only-show-errors (boolean)
Only errors and warnings are displayed. All other output is suppressed.
```
## Path Arguments
When using a command, we need to specify path arguments. There are two path argument type: `localpath` and `pfspath`.
A `pfspath` begin with `/pfs`, eg: `/pfs/$DATACENTER/home/$USER/folder`.
[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to config datacenters.
## order of Path Arguments
Commonly, if there are two path arguments, the first is the source, and the second is the destination.
## Subcommonds
- rm - remove files or directories
```
Synopsis:
rm [-r] [-v] <PFSPath> ...
Options:
-r
Remove directories and their contents recursively
-v
Cause rm to be verbose, showing files after they are removed.
Examples:
paddle pfs rm /pfs/$DATACENTER/home/$USER/file
paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder
```
- mv - move (rename) files
```
Synopsis:
mv [-f | -n] [-v] <LocalPath> <PFSPath>
mv [-f | -n] [-v] <LocalPath> ... <PFSPath>
mv [-f | -n] [-v] <PFSPath> <LocalPath>
mv [-f | -n] [-v] <PFSPath> ... <LocalPath>
mv [-f | -n] [-v] <PFSPath> <PFSPath>
mv [-f | -n] [-v] <PFSPath> ... <PFSPath>
Options:
-f
Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
-n
Do not overwrite an existing file. (The -n option overrides previous -f options.)
-v
Cause mv to be verbose, showing files after they are moved.
Examples:
paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt
```
- cp - copy files or directories
```
Synopsis:
cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> <PFSPath>
cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> ... <PFSPath>
cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <LocalPath>
cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <LocalPath>
cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <PFSPath>
cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <PFSPath>
Options:
-r
Copy directories recursively
-f
Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
-n
Do not overwrite an existing file. (The -n option overrides previous -f options.)
-v
Cause cp to be verbose, showing files after they are copied.
--preserve--links
Reserve links when copy links
Examples:
paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file
paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file
```
- ls- list files
```
Synopsis:
ls [-r] <PFSPath> ...
Options:
-R
List directory(ies) recursively
Examples:
paddle pfs ls /pfs/$DATACENTER/home/$USER/file
paddle pfs ls /pfs/$DATACENTER/home/$USER/folder
```
- mkdir - mkdir directory(ies)
Create intermediate directory(ies) as required.
```
Synopsis:
mkdir <PFSPath> ...
Examples:
paddle pfs mkdir /pfs/$DATACENTER/home/$USER/folder
```
...@@ -16,3 +16,4 @@ ...@@ -16,3 +16,4 @@
block.md block.md
scope.md scope.md
executor.md executor.md
parallel_executor.md
...@@ -16,3 +16,4 @@ Core Concepts ...@@ -16,3 +16,4 @@ Core Concepts
block.md block.md
scope.md scope.md
executor.md executor.md
parallel_executor.md
...@@ -9,5 +9,5 @@ ...@@ -9,5 +9,5 @@
use_eigen_cn.md use_eigen_cn.md
name_convention.md name_convention.md
support_new_device.md support_new_device.md
releasing_process.md releasing_process_cn.md
op_markdown_format.md op_markdown_format.md
...@@ -9,5 +9,5 @@ Development ...@@ -9,5 +9,5 @@ Development
use_eigen_en.md use_eigen_en.md
name_convention.md name_convention.md
support_new_device.md support_new_device.md
releasing_process.md releasing_process_en.md
op_markdown_format.md op_markdown_format.md
...@@ -10,19 +10,10 @@ PaddlePaddle每次发新的版本,遵循以下流程: ...@@ -10,19 +10,10 @@ PaddlePaddle每次发新的版本,遵循以下流程:
* 使用Regression Test List作为检查列表,测试本次release的正确性。 * 使用Regression Test List作为检查列表,测试本次release的正确性。
* 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步 * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步
* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True` * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`
* 编译这个版本的python wheel包,并发布到pypi。 * 将这个版本的python wheel包发布到pypi。
* 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64` * 更新Docker镜像(参考后面的操作细节)。
* pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel` 1. 第三步完成后,将`release/版本号`分支合入master分支,将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。
* 上传方法: 1. 协同完成Release Note的书写。
```
cd build/python
pip install twine
twine upload dist/[package to upload]
```
* 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步
1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
1. 协同完成Release Note的书写
需要注意的是: 需要注意的是:
...@@ -31,13 +22,18 @@ PaddlePaddle每次发新的版本,遵循以下流程: ...@@ -31,13 +22,18 @@ PaddlePaddle每次发新的版本,遵循以下流程:
## 发布wheel包到pypi ## 发布wheel包到pypi
使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) 1. 使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以 完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以
弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。等待编译完成后 弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。
可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m``cp27mu`的版本。然后按照上述的方法 <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
使用`twine`工具上传即可。 1. 等待编译完成后可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m``cp27mu`的版本。
1. 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png"> 1. 上传:
```
cd build/python
pip install twine
twine upload dist/[package to upload]
```
* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux * 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。 发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
...@@ -48,10 +44,20 @@ PaddlePaddle每次发新的版本,遵循以下流程: ...@@ -48,10 +44,20 @@ PaddlePaddle每次发新的版本,遵循以下流程:
上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上 上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上
版本号对应的tag即可: 版本号对应的tag即可:
1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成后是否最新。 ```
1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`,latest tag可以是latest或latest-gpu等。 docker pull [镜像]:latest
1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]` docker tag [镜像]:latest [镜像]:[version]
1. 执行 `docker push paddlepaddle/paddle:[version]` docker push [镜像]:[version]
```
需要更新的镜像tag包括:
* `[version]`: CPU版本
* `[version]-openblas`: openblas版本
* `[version]-gpu`: GPU版本(CUDA 8.0 cudnn 5)
* `[version]-gpu-[cudaver]-[cudnnver]`: 不同cuda, cudnn版本的镜像
之后可进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看是否发布成功。
## PaddlePaddle 分支规范 ## PaddlePaddle 分支规范
...@@ -76,7 +82,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git- ...@@ -76,7 +82,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
### PaddlePaddle Book中所有章节 ### PaddlePaddle Book中所有章节
PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练(V2和Fluid)模型正确性。
<table> <table>
<thead> <thead>
......
# PaddlePaddle Releasing Process
PaddlePaddle manages its branches using "git-flow branching model", and [Semantic Versioning](http://semver.org/) as it's version number semantics.
Each time we release a new PaddlePaddle version, we should follow the below steps:
1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The
first tag should be `0.10.0rc1`, and the second should be `0.10.0.rc2` and so on.
1. After that, we should do:
* Run all regression test on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm
that this release has no major bugs.
* If regression test fails, we must fix those bugs and create a new `release/[version]`
branch from previous release branch.
* Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
* Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
* Update the Docker images (see below instructions for detail).
1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
then merge `master` to `develop`.
1. Update the Release Note.
***NOTE:***
* Do ***NOT*** merge commits from develop branch to release branches to keep the release branch contain
features only for current release, so that we can test on that version.
* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch.
## Publish Wheel Packages to pypi
1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
to build all wheel packages needed to publish. As shown in the following picture, choose a build
version, click "..." button on the right side of "Run" button, and switch to the second tab in the
pop-up box, choose the current release branch and click "Run Build" button. You may repeat this
step to start different versions of builds.
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`.
1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we
upload the package using `twine`, we need to rename the package from `linux_x86_64` to
`manylinux1_x86_64`.
1. Start the upload:
```
cd build/python
pip install twine
twine upload dist/[package to upload]
```
* NOTE: We use a special Docker image to build our releases to support more Linux distributions, you can
download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using
scripts under `tools/manylinux1`.
* pypi does not allow overwrite the already uploaded version of wheel package, even if you delete the
old version. you must change the version number before upload a new one.
## Publish Docker Images
Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:
```
docker pull [image]:latest
docker tag [image]:latest [image]:[version]
docker push [image]:[version]
```
Tags that need to be updated are:
* `[version]`: CPU only version image
* `[version]-openblas`: openblas version image
* `[version]-gpu`: GPU version(using CUDA 8.0 cudnn 5)
* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions
You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
## Branching Model
We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
with some modifications:
* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
regression tests are run.
* `release/[version]` branch is used to publish each release. Latest release version branches have
bugfix only for that version, but no feature updates.
* Developer forks are not required to follow
[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
branching model, all forks is like a feature branch.
* Advise: developer fork's develop branch is used to sync up with main repo's develop branch.
* Advise: developer use it's fork's develop branch to for new branch to start developing.
* Use that branch on developer's fork to create pull requests and start reviews.
* developer can push new commits to that branch when the pull request is open.
* Bug fixes are also started from developers forked repo. And, bug fixes branch can merge to
`master`, `develop` and `releases`.
## PaddlePaddle Regression Test List
### All Chapters of PaddlePaddle Book
We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including
V1 (`paddle_trainer` training) and V2 training and Fluid training.
<table>
<thead>
<tr>
<th></th>
<th>Linear Regression</th>
<th>Recognize Digits</th>
<th>Image Classification</th>
<th>Word2Vec</th>
<th>Personalized Recommendation</th>
<th>Sentiment Analysis</th>
<th>Semantic Role Labeling</th>
<th>Machine Translation</th>
</tr>
</thead>
<tbody>
<tr>
<td>API.V2 + Docker + GPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> API.V2 + Docker + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>`paddle_trainer` + Docker + GPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>`paddle_trainer` + Docker + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> API.V2 + Ubuntu + GPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>API.V2 + Ubuntu + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> `paddle_trainer` + Ubuntu + GPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> `paddle_trainer` + Ubuntu + CPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
</tbody>
</table>
.timestamp
*.o *.o
*.a *.a
.svn .svn
......
...@@ -7,9 +7,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) ...@@ -7,9 +7,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim)
if(WITH_GPU) if(WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place memory device_context framework_proto)
else() else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto) cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place memory device_context framework_proto)
endif() endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
...@@ -21,9 +21,9 @@ endif() ...@@ -21,9 +21,9 @@ endif()
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init) nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
......
...@@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name, ...@@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name,
"Tensor %s contains NAN", name); "Tensor %s contains NAN", name);
} }
void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
int block_id) {
auto& global_block = pdesc.Block(block_id);
const Scope* ancestor_scope = scope;
while (ancestor_scope->parent()) {
ancestor_scope = ancestor_scope->parent();
}
if (ancestor_scope != scope) {
for (auto& var : global_block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var->Persistable()) {
auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : global_block.AllVars()) {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
}
}
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars) {
platform::RecordBlock b(block_id); platform::RecordBlock b(block_id);
...@@ -184,8 +221,8 @@ static bool has_fetch_operators( ...@@ -184,8 +221,8 @@ static bool has_fetch_operators(
void Executor::Run(const ProgramDesc& program, Scope* scope, void Executor::Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets, std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets, std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name, bool create_vars, const std::string& feed_holder_name,
const std::string& fetch_holder_name, bool create_vars) { const std::string& fetch_holder_name) {
platform::RecordBlock b(kProgramId); platform::RecordBlock b(kProgramId);
bool has_feed_ops = bool has_feed_ops =
has_feed_operators(program.Block(0), feed_targets, feed_holder_name); has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
...@@ -296,38 +333,13 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( ...@@ -296,38 +333,13 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars) {
auto& block = ctx->prog_.Block(ctx->block_id_);
Scope* local_scope = scope; Scope* local_scope = scope;
if (create_vars) { if (create_vars) {
if (create_local_scope) { if (create_local_scope) {
local_scope = &scope->NewScope(); local_scope = &scope->NewScope();
for (auto& var : block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
} }
CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
if (var->Persistable()) {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = local_scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : block.AllVars()) {
auto* ptr = local_scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
} }
} // if (create_local_scope)
} // if (create_vars)
for (auto& op : ctx->ops_) { for (auto& op : ctx->ops_) {
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
......
...@@ -54,9 +54,9 @@ class Executor { ...@@ -54,9 +54,9 @@ class Executor {
void Run(const ProgramDesc& program, Scope* scope, void Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets, std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets, std::map<std::string, LoDTensor*>& fetch_targets,
bool create_vars = true,
const std::string& feed_holder_name = "feed", const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch", const std::string& fetch_holder_name = "fetch");
bool create_vars = true);
static std::unique_ptr<ExecutorPrepareContext> Prepare( static std::unique_ptr<ExecutorPrepareContext> Prepare(
const ProgramDesc& program, int block_id); const ProgramDesc& program, int block_id);
...@@ -64,6 +64,8 @@ class Executor { ...@@ -64,6 +64,8 @@ class Executor {
static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare( static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids); const ProgramDesc& program, const std::vector<int>& block_ids);
void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true, bool create_local_scope = true,
bool create_vars = true); bool create_vars = true);
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/platform/profiler.h"
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -24,6 +23,7 @@ limitations under the License. */ ...@@ -24,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -43,30 +43,40 @@ class ParallelExecutorPrivate { ...@@ -43,30 +43,40 @@ class ParallelExecutorPrivate {
#endif #endif
}; };
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
return member_->local_scopes_;
}
ParallelExecutor::ParallelExecutor( ParallelExecutor::ParallelExecutor(
size_t num_threads, bool use_event, size_t num_threads, bool use_event,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program, const ProgramDesc &main_program, const std::unordered_set<std::string> &bcast_vars,
const std::string &loss_var_name, Scope *scope, bool allow_op_delay) const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
: member_(new ParallelExecutorPrivate(places)) { : member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope; member_->global_scope_ = scope;
// Step 1. RunStartupProgram and Bcast the params to devs. // Step 1. Bcast the params to devs.
Executor exe(places[0]);
exe.Run(startup_program, scope, 0);
// Create local scopes // Create local scopes
if (local_scopes.empty()) {
for (size_t i = 0; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(&scope->NewScope()); member_->local_scopes_.push_back(&scope->NewScope());
} }
} else {
PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(local_scopes[i]);
}
}
// Bcast Parameters to all GPUs // Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_)); member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
#endif #endif
if (platform::is_gpu_place(places[0]) && if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
member_->local_scopes_.size() != 1) { // Is CUDA local_scopes.empty()) { // Is CUDA
BCastParamsToGPUs(startup_program); BCastParamsToGPUs(bcast_vars);
} }
// Startup Program has been run. All local scopes has correct parameters. // Startup Program has been run. All local scopes has correct parameters.
...@@ -99,16 +109,17 @@ ParallelExecutor::ParallelExecutor( ...@@ -99,16 +109,17 @@ ParallelExecutor::ParallelExecutor(
} }
void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BCastParamsToGPUs(
const ProgramDesc &startup_program) const { const std::unordered_set<std::string> &vars) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto *main_scope = member_->local_scopes_[0]; auto *main_scope = member_->local_scopes_[0];
for (auto *var_desc : startup_program.Block(0).AllVars()) { for (auto &var : vars) {
size_t idx = var_desc->Name().find("@GRAD"); auto *main_var = main_scope->FindVar(var);
if (idx != std::string::npos) continue; if (!main_var->IsType<LoDTensor>()) {
if (var_desc->GetType() == proto::VarType::LOD_TENSOR) { continue;
auto &main_tensor = }
main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
auto &main_tensor = main_var->Get<LoDTensor>();
auto &dims = main_tensor.dims(); auto &dims = main_tensor.dims();
...@@ -123,8 +134,7 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -123,8 +134,7 @@ void ParallelExecutor::BCastParamsToGPUs(
buffer = const_cast<void *>(main_tensor.data<void>()); buffer = const_cast<void *>(main_tensor.data<void>());
} else { } else {
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
auto *t = auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
t->Resize(dims); t->Resize(dims);
buffer = t->mutable_data(place, main_tensor.type()); buffer = t->mutable_data(place, main_tensor.type());
} }
...@@ -136,13 +146,12 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -136,13 +146,12 @@ void ParallelExecutor::BCastParamsToGPUs(
platform::CPUPlace cpu; platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) { for (size_t i = 1; i < member_->places_.size(); ++i) {
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>(); auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
t->Resize(dims); t->Resize(dims);
t->mutable_data(cpu, main_tensor.type()); t->mutable_data(cpu, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t); paddle::framework::TensorCopy(main_tensor, cpu, t);
} }
} }
}
member_->nccl_ctxs_->WaitAll(); member_->nccl_ctxs_->WaitAll();
} }
#else #else
......
...@@ -36,11 +36,14 @@ class ParallelExecutor { ...@@ -36,11 +36,14 @@ class ParallelExecutor {
explicit ParallelExecutor(size_t num_threads, bool use_event, explicit ParallelExecutor(size_t num_threads, bool use_event,
const std::vector<platform::Place>& places, const std::vector<platform::Place>& places,
const std::unordered_set<std::string>& params, const std::unordered_set<std::string>& params,
const ProgramDesc& startup_program, const std::unordered_set<std::string>& bcast_vars,
const ProgramDesc& main_program, const ProgramDesc& main_program,
const std::string& loss_var_name, Scope* scope, const std::string& loss_var_name, Scope* scope,
const std::vector<Scope*>& local_scopes,
bool allow_op_delay); bool allow_op_delay);
std::vector<Scope*>& GetLocalScopes();
void Run(const std::vector<std::string>& fetch_tensors, void Run(const std::vector<std::string>& fetch_tensors,
const std::string& fetched_var_name, const std::string& fetched_var_name,
const std::unordered_map<std::string, LoDTensor>& feed_tensors); const std::unordered_map<std::string, LoDTensor>& feed_tensors);
...@@ -51,7 +54,7 @@ class ParallelExecutor { ...@@ -51,7 +54,7 @@ class ParallelExecutor {
ParallelExecutorPrivate* member_; ParallelExecutorPrivate* member_;
void BCastParamsToGPUs(const ProgramDesc& startup_program) const; void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
}; };
} // namespace framework } // namespace framework
......
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include <memory> // for unique_ptr #include <memory> // for unique_ptr
#include <mutex> // for call_once
#include <set> #include <set>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
...@@ -39,6 +38,7 @@ Scope::~Scope() { ...@@ -39,6 +38,7 @@ Scope::~Scope() {
} }
Scope& Scope::NewScope() const { Scope& Scope::NewScope() const {
std::unique_lock<std::mutex> lock(mutex_);
kids_.push_back(new Scope(this)); kids_.push_back(new Scope(this));
return *kids_.back(); return *kids_.back();
} }
...@@ -92,6 +92,7 @@ std::vector<std::string> Scope::LocalVarNames() const { ...@@ -92,6 +92,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
} }
void Scope::DeleteScope(Scope* scope) { void Scope::DeleteScope(Scope* scope) {
std::unique_lock<std::mutex> lock(mutex_);
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it); this->kids_.erase(it);
...@@ -103,7 +104,7 @@ void Scope::DeleteScope(Scope* scope) { ...@@ -103,7 +104,7 @@ void Scope::DeleteScope(Scope* scope) {
} }
} }
void Scope::EraseVars(std::vector<std::string>& var_names) { void Scope::EraseVars(const std::vector<std::string>& var_names) {
std::set<std::string> var_set(var_names.begin(), var_names.end()); std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) { for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) { if (var_set.find(it->first) != var_set.end()) {
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <list> #include <list>
#include <mutex> // NOLINT
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
...@@ -51,13 +52,13 @@ class Scope { ...@@ -51,13 +52,13 @@ class Scope {
/// Create a variable with a scope-unique name. /// Create a variable with a scope-unique name.
Variable* Var(std::string* name = nullptr); Variable* Var(std::string* name = nullptr);
void EraseVars(std::vector<std::string>& var_names); void EraseVars(const std::vector<std::string>& var_names);
/// Find a variable in the scope or any of its ancestors. Returns /// Find a variable in the scope or any of its ancestors. Returns
/// nullptr if cannot find. /// nullptr if cannot find.
Variable* FindVar(const std::string& name) const; Variable* FindVar(const std::string& name) const;
const Scope& parent() const { return *parent_; } const Scope* parent() const { return parent_; }
/// Find the scope or an ancestor scope that contains the given variable. /// Find the scope or an ancestor scope that contains the given variable.
const Scope* FindScope(const Variable* var) const; const Scope* FindScope(const Variable* var) const;
...@@ -88,6 +89,9 @@ class Scope { ...@@ -88,6 +89,9 @@ class Scope {
Scope const* parent_{nullptr}; Scope const* parent_{nullptr};
DISABLE_COPY_AND_ASSIGN(Scope); DISABLE_COPY_AND_ASSIGN(Scope);
private:
mutable std::mutex mutex_;
}; };
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init) set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
SRCS io.cc SRCS io.cc
......
...@@ -46,8 +46,8 @@ TEST(inference, image_classification) { ...@@ -46,8 +46,8 @@ TEST(inference, image_classification) {
// Run inference on CPU // Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "--- CPU Runs: ---";
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1, TestInference<paddle::platform::CPUPlace, false>(dirname, cpu_feeds,
FLAGS_repeat); cpu_fetchs1, FLAGS_repeat);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -57,8 +57,8 @@ TEST(inference, image_classification) { ...@@ -57,8 +57,8 @@ TEST(inference, image_classification) {
// Run inference on CUDA GPU // Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: ---"; LOG(INFO) << "--- GPU Runs: ---";
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2, TestInference<paddle::platform::CUDAPlace, false>(dirname, cpu_feeds,
FLAGS_repeat); cpu_fetchs2, FLAGS_repeat);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); CheckError<float>(output1, output2);
......
...@@ -88,7 +88,7 @@ void CheckError(const paddle::framework::LoDTensor& output1, ...@@ -88,7 +88,7 @@ void CheckError(const paddle::framework::LoDTensor& output1,
EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
} }
template <typename Place> template <typename Place, bool CreateVars = true>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs, const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
...@@ -166,8 +166,16 @@ void TestInference(const std::string& dirname, ...@@ -166,8 +166,16 @@ void TestInference(const std::string& dirname,
// 6. Run the inference program // 6. Run the inference program
{ {
if (!CreateVars) {
// If users don't want to create and destroy variables every time they
// run, they need to set `create_vars` to false and manually call
// `CreateVariables` before running.
executor.CreateVariables(*inference_program, scope, 0);
}
// Ignore the profiling results of the first run // Ignore the profiling results of the first run
executor.Run(*inference_program, scope, feed_targets, fetch_targets); executor.Run(*inference_program, scope, feed_targets, fetch_targets,
CreateVars);
// Enable the profiler // Enable the profiler
paddle::platform::EnableProfiler(state); paddle::platform::EnableProfiler(state);
...@@ -178,7 +186,8 @@ void TestInference(const std::string& dirname, ...@@ -178,7 +186,8 @@ void TestInference(const std::string& dirname,
"run_inference", "run_inference",
paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::platform::DeviceContextPool::Instance().Get(place));
executor.Run(*inference_program, scope, feed_targets, fetch_targets); executor.Run(*inference_program, scope, feed_targets, fetch_targets,
CreateVars);
} }
// Disable the profiler and print the timing information // Disable the profiler and print the timing information
......
add_subdirectory(detail) add_subdirectory(detail)
cc_library(memory SRCS memory.cc DEPS place enforce) cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce)
cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(paddle_memory cc_library(memory
DEPS DEPS
memory malloc
memcpy memcpy)
meta_data
meta_cache
memory_block
buddy_allocator
system_allocator)
cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)
#if (WITH_GPU) #if (WITH_GPU)
# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place paddle_memory) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory)
#endif() #endif()
cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc)
if(${WITH_GPU}) if(${WITH_GPU})
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
else(${WITH_GPU}) else(${WITH_GPU})
...@@ -6,10 +8,4 @@ endif(${WITH_GPU}) ...@@ -6,10 +8,4 @@ endif(${WITH_GPU})
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
cc_library(meta_data SRCS meta_data.cc) cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
cc_library(meta_cache SRCS meta_cache.cc)
cc_library(memory_block SRCS memory_block.cc)
cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
...@@ -46,7 +46,8 @@ inline size_t align(size_t size, size_t alignment) { ...@@ -46,7 +46,8 @@ inline size_t align(size_t size, size_t alignment) {
void* BuddyAllocator::Alloc(size_t unaligned_size) { void* BuddyAllocator::Alloc(size_t unaligned_size) {
// adjust allocation alignment // adjust allocation alignment
size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); size_t size =
align(unaligned_size + sizeof(MemoryBlock::Desc), min_chunk_size_);
// acquire the allocator lock // acquire the allocator lock
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
...@@ -103,7 +104,7 @@ void BuddyAllocator::Free(void* p) { ...@@ -103,7 +104,7 @@ void BuddyAllocator::Free(void* p) {
return; return;
} }
block->mark_as_free(cache_); block->mark_as_free(&cache_);
total_used_ -= block->total_size(cache_); total_used_ -= block->total_size(cache_);
total_free_ += block->total_size(cache_); total_free_ += block->total_size(cache_);
...@@ -122,7 +123,7 @@ void BuddyAllocator::Free(void* p) { ...@@ -122,7 +123,7 @@ void BuddyAllocator::Free(void* p) {
right_buddy)); right_buddy));
// merge its right buddy to the block // merge its right buddy to the block
block->merge(cache_, right_buddy); block->merge(&cache_, right_buddy);
} }
} }
...@@ -139,7 +140,7 @@ void BuddyAllocator::Free(void* p) { ...@@ -139,7 +140,7 @@ void BuddyAllocator::Free(void* p) {
left_buddy->total_size(cache_), left_buddy)); left_buddy->total_size(cache_), left_buddy));
// merge the block to its left buddy // merge the block to its left buddy
left_buddy->merge(cache_, block); left_buddy->merge(&cache_, block);
block = left_buddy; block = left_buddy;
} }
} }
...@@ -163,13 +164,13 @@ size_t BuddyAllocator::Used() { return total_used_; } ...@@ -163,13 +164,13 @@ size_t BuddyAllocator::Used() { return total_used_; }
void* BuddyAllocator::SystemAlloc(size_t size) { void* BuddyAllocator::SystemAlloc(size_t size) {
size_t index = 0; size_t index = 0;
void* p = system_allocator_->Alloc(index, size); void* p = system_allocator_->Alloc(&index, size);
VLOG(10) << "Allocated " << p << " from system allocator."; VLOG(10) << "Allocated " << p << " from system allocator.";
if (p == nullptr) return nullptr; if (p == nullptr) return nullptr;
static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index, static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
size, nullptr, nullptr); size, nullptr, nullptr);
return static_cast<MemoryBlock*>(p)->data(); return static_cast<MemoryBlock*>(p)->data();
...@@ -187,14 +188,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { ...@@ -187,14 +188,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
// Allocate a new maximum sized block // Allocate a new maximum sized block
size_t index = 0; size_t index = 0;
void* p = system_allocator_->Alloc(index, max_chunk_size_); void* p = system_allocator_->Alloc(&index, max_chunk_size_);
if (p == nullptr) return pool_.end(); if (p == nullptr) return pool_.end();
VLOG(10) << "Creating and inserting new block " << p VLOG(10) << "Creating and inserting new block " << p
<< " from system allocator"; << " from system allocator";
static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr); max_chunk_size_, nullptr, nullptr);
// gpu fallback allocation // gpu fallback allocation
...@@ -238,11 +239,11 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, ...@@ -238,11 +239,11 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_) VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_)
<< ") into"; << ") into";
block->split(cache_, size); block->split(&cache_, size);
VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_) VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_)
<< ")"; << ")";
block->set_type(cache_, MemoryBlock::ARENA_CHUNK); block->set_type(&cache_, MemoryBlock::ARENA_CHUNK);
// the rest of memory if exist // the rest of memory if exist
if (block->has_right_buddy(cache_)) { if (block->has_right_buddy(cache_)) {
......
...@@ -14,18 +14,18 @@ limitations under the License. */ ...@@ -14,18 +14,18 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/memory/detail/meta_cache.h" #include <mutex> // NOLINT
#include "paddle/fluid/memory/detail/meta_data.h" #include <set>
#include <tuple>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include <mutex>
#include <set>
#include <unordered_map>
#include <vector>
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
......
...@@ -13,143 +13,142 @@ See the License for the specific language governing permissions and ...@@ -13,143 +13,142 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_cache.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, void MemoryBlock::init(MetadataCache* cache, Type t, size_t index, size_t size,
void* left_buddy, void* right_buddy) { void* left_buddy, void* right_buddy) {
cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, cache->save(
this, MemoryBlock::Desc(t, index, size - sizeof(MemoryBlock::Desc), size,
static_cast<MemoryBlock*>(left_buddy), static_cast<MemoryBlock*>(left_buddy),
static_cast<MemoryBlock*>(right_buddy))); static_cast<MemoryBlock*>(right_buddy)));
} }
MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { MemoryBlock::Type MemoryBlock::type(const MetadataCache& cache) const {
return cache.load(this).type; return cache.load(this).type;
} }
size_t MemoryBlock::size(MetadataCache& cache) const { size_t MemoryBlock::size(const MetadataCache& cache) const {
return cache.load(this).size; return cache.load(this).size;
} }
size_t MemoryBlock::total_size(MetadataCache& cache) const { size_t MemoryBlock::index(const MetadataCache& cache) const {
return cache.load(this).index;
}
size_t MemoryBlock::total_size(const MetadataCache& cache) const {
return cache.load(this).total_size; return cache.load(this).total_size;
} }
MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { bool MemoryBlock::has_left_buddy(const MetadataCache& cache) const {
return left_buddy(cache) != nullptr;
}
bool MemoryBlock::has_right_buddy(const MetadataCache& cache) const {
return right_buddy(cache) != nullptr;
}
MemoryBlock* MemoryBlock::left_buddy(const MetadataCache& cache) const {
return cache.load(this).left_buddy; return cache.load(this).left_buddy;
} }
MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { MemoryBlock* MemoryBlock::right_buddy(const MetadataCache& cache) const {
return cache.load(this).right_buddy; return cache.load(this).right_buddy;
} }
void MemoryBlock::split(MetadataCache& cache, size_t size) { void MemoryBlock::split(MetadataCache* cache, size_t size) {
// make sure the split fits // make sure the split fits
PADDLE_ASSERT(total_size(cache) >= size); PADDLE_ASSERT(total_size(*cache) >= size);
// bail out if there is no room for another partition // bail out if there is no room for another partition
if (total_size(cache) - size <= sizeof(Metadata)) { if (total_size(*cache) - size <= sizeof(MemoryBlock::Desc)) {
return; return;
} }
// find the position of the split // find the position of the split
void* right_partition = reinterpret_cast<uint8_t*>(this) + size; void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
size_t remaining_size = total_size(cache) - size; size_t remaining_size = total_size(*cache) - size;
// Add the new block as a buddy // Add the new block as a buddy
auto metadata = cache.load(this); auto metadata = cache->load(this);
// Write the metadata for the new block // Write the metadata for the new block
auto new_block_right_buddy = metadata.right_buddy; auto new_block_right_buddy = metadata.right_buddy;
cache.store( cache->save(static_cast<MemoryBlock*>(right_partition),
static_cast<MemoryBlock*>(right_partition), MemoryBlock::Desc(FREE_CHUNK, index(*cache),
Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), remaining_size - sizeof(MemoryBlock::Desc),
remaining_size, this, new_block_right_buddy)); remaining_size, this, new_block_right_buddy));
metadata.right_buddy = static_cast<MemoryBlock*>(right_partition); metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
metadata.size = size - sizeof(Metadata); metadata.size = size - sizeof(MemoryBlock::Desc);
metadata.total_size = size; metadata.total_size = size;
cache.store(this, metadata); cache->save(this, metadata);
// Write metadata for the new block's right buddy // Write metadata for the new block's right buddy
if (new_block_right_buddy != nullptr) { if (new_block_right_buddy != nullptr) {
auto buddy_metadata = cache.load(new_block_right_buddy); auto buddy_metadata = cache->load(new_block_right_buddy);
buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition); buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
cache.store(new_block_right_buddy, buddy_metadata); cache->save(new_block_right_buddy, buddy_metadata);
} }
} }
void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) {
// only free blocks can be merged // only free blocks can be merged
PADDLE_ASSERT(type(cache) == FREE_CHUNK); PADDLE_ASSERT(type(*cache) == FREE_CHUNK);
PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); PADDLE_ASSERT(right_buddy->type(*cache) == FREE_CHUNK);
auto metadata = cache.load(this); auto metadata = cache->load(this);
// link this->buddy's buddy // link this->buddy's buddy
metadata.right_buddy = right_buddy->right_buddy(cache); metadata.right_buddy = right_buddy->right_buddy(*cache);
// link buddy's buddy -> this // link buddy's buddy -> this
if (metadata.right_buddy != nullptr) { if (metadata.right_buddy != nullptr) {
auto buddy_metadata = cache.load(metadata.right_buddy); auto buddy_metadata = cache->load(metadata.right_buddy);
buddy_metadata.left_buddy = this; buddy_metadata.left_buddy = this;
cache.store(metadata.right_buddy, buddy_metadata); cache->save(metadata.right_buddy, buddy_metadata);
} }
metadata.size += right_buddy->total_size(cache); metadata.size += right_buddy->total_size(*cache);
metadata.total_size += right_buddy->total_size(cache); metadata.total_size += right_buddy->total_size(*cache);
cache.store(this, metadata); cache->save(this, metadata);
cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); cache->save(right_buddy,
MemoryBlock::Desc(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
} }
void MemoryBlock::mark_as_free(MetadataCache& cache) { void MemoryBlock::mark_as_free(MetadataCache* cache) {
// check for double free or corruption // check for double free or corruption
PADDLE_ASSERT(type(cache) != FREE_CHUNK); PADDLE_ASSERT(type(*cache) != FREE_CHUNK);
PADDLE_ASSERT(type(cache) != INVALID_CHUNK); PADDLE_ASSERT(type(*cache) != INVALID_CHUNK);
set_type(cache, FREE_CHUNK); set_type(cache, FREE_CHUNK);
} }
void MemoryBlock::set_type(MetadataCache& cache, Type t) { void MemoryBlock::set_type(MetadataCache* cache, Type t) {
auto metadata = cache.load(this); auto metadata = cache->load(this);
metadata.type = t; metadata.type = t;
cache->save(this, metadata);
cache.store(this, metadata);
}
bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
return left_buddy(cache) != nullptr;
}
bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
return right_buddy(cache) != nullptr;
}
size_t MemoryBlock::index(MetadataCache& cache) const {
return cache.load(this).index;
} }
void* MemoryBlock::data() const { void* MemoryBlock::data() const {
return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1; return const_cast<MemoryBlock::Desc*>(
reinterpret_cast<const MemoryBlock::Desc*>(this)) +
1;
} }
MemoryBlock* MemoryBlock::metadata() const { MemoryBlock* MemoryBlock::metadata() const {
return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>( return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
reinterpret_cast<const Metadata*>(this) - 1)); reinterpret_cast<const MemoryBlock::Desc*>(this) - 1));
} }
} // namespace detail } // namespace detail
......
...@@ -11,21 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,21 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cstddef> #include <cstdint>
#include <unordered_map>
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
// Forward Declarations // Forward declaration.
class MetadataCache; class MetadataCache;
/*! \brief A class used to interpret the contents of a memory block */ // MemoryBlock represents Each allocated memory block, which contains
class MemoryBlock { // MemoryBlock::Desc and the payload.
public: struct MemoryBlock {
enum Type { enum Type {
FREE_CHUNK, // memory is free and idle FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied ARENA_CHUNK, // memory is being occupied
...@@ -33,57 +33,96 @@ class MemoryBlock { ...@@ -33,57 +33,96 @@ class MemoryBlock {
INVALID_CHUNK // memory is invalid INVALID_CHUNK // memory is invalid
}; };
public: // init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
void init(MetadataCache& cache, Type t, size_t index, size_t size, // If it is a CPU memory block, the MetadataCache writes the
// MemoryBlock::Desc to the beginning of the block; or, if it is a GPU memory
// block, the MetadataCache writes the Meatadata to a std::map in
// the CPU.
void init(MetadataCache* cache, Type t, size_t index, size_t size,
void* left_buddy, void* right_buddy); void* left_buddy, void* right_buddy);
public: // All these accessors returns fields in the MemoryBlock::Desc of the memory
/*! \brief The type of the allocation */ // block. They all need a MetadataCache instance as their first
Type type(MetadataCache& cache) const; // parameter because they read the MemoryBlock::Desc from the cache.
Type type(const MetadataCache& cache) const;
/*! \brief The size of the data region */ size_t size(const MetadataCache& cache) const;
size_t size(MetadataCache& cache) const; size_t index(const MetadataCache& cache) const;
size_t total_size(const MetadataCache& cache) const;
bool has_left_buddy(const MetadataCache& cache) const;
bool has_right_buddy(const MetadataCache& cache) const;
MemoryBlock* left_buddy(const MetadataCache& cache) const;
MemoryBlock* right_buddy(const MetadataCache& cache) const;
/*! \brief An index to track the allocator */ // Split the allocation into left/right blocks.
size_t index(MetadataCache& cache) const; void split(MetadataCache* cache, size_t size);
/*! \brief The total size of the block */ // Merge left and right blocks together.
size_t total_size(MetadataCache& cache) const; void merge(MetadataCache* cache, MemoryBlock* right_buddy);
/*! \brief Check the left buddy of the block */ // Mark the allocation as free.
bool has_left_buddy(MetadataCache& cache) const; void mark_as_free(MetadataCache* cache);
/*! \brief Check the right buddy of the block */ // Change the type of the allocation.
bool has_right_buddy(MetadataCache& cache) const; void set_type(MetadataCache* cache, Type t);
/*! \brief Get the left buddy */
MemoryBlock* left_buddy(MetadataCache& cache) const;
/*! \brief Get the right buddy */
MemoryBlock* right_buddy(MetadataCache& cache) const;
public:
/*! \brief Split the allocation into left/right blocks */
void split(MetadataCache& cache, size_t size);
/*! \brief Merge left and right blocks together */
void merge(MetadataCache& cache, MemoryBlock* right_buddy);
/*! \brief Mark the allocation as free */
void mark_as_free(MetadataCache& cache);
/*! \brief Change the type of the allocation */
void set_type(MetadataCache& cache, Type t);
public:
/*! \brief Get a pointer to the memory block's data */
void* data() const; void* data() const;
/*! \brief Get a pointer to the memory block's metadata */
MemoryBlock* metadata() const; MemoryBlock* metadata() const;
// MemoryBlock::Desc describes a MemoryBlock.
struct Desc {
Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
MemoryBlock* r);
Desc();
// Updates guard_begin and guard_end by hashes of the Metadata object.
void update_guards();
// Checks that guard_begin and guard_end are hashes of the Metadata object.
bool check_guards() const;
// TODO(gangliao): compress this
size_t guard_begin = 0;
MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK;
size_t index = 0;
size_t size = 0;
size_t total_size = 0;
MemoryBlock* left_buddy = nullptr;
MemoryBlock* right_buddy = nullptr;
size_t guard_end = 0;
};
};
// A cache for accessing memory block meta-data that may be expensive
// to access directly. This class exists to unify the
// MemoryBlock::Desc format between GPU and CPU allocations. It should
// be removed when the CPU can access all GPU allocations directly via
// UVM.
class MetadataCache {
public: public:
static size_t overhead(); explicit MetadataCache(bool uses_gpu);
// Disable copying and assignment.
MetadataCache(const MetadataCache&) = delete;
MetadataCache& operator=(const MetadataCache&) = delete;
// Returns the MemoryBlock::Desc for a memory block. When MetadataCache is
// used to manage CPU memory, the MemoryBlock::Desc resides at the beginning
// of the memory block; when used to manage GPU memory, the
// Meatadata resides in CPU memory indexed by cache_.
MemoryBlock::Desc load(const MemoryBlock* memory_block) const;
// Saves the MemoryBlock::Desc of a memory block into the cache. For CPU
// memory block, writes the MemoryBlock::Desc to the beginning of the memory
// block; whereas for GPU memory, writes it to cache_.
void save(MemoryBlock* memory_block, const MemoryBlock::Desc& meta_data);
// For GPU memory block, erases its MemoryBlock::Desc from cache_.
void invalidate(MemoryBlock* memory_block);
private:
typedef std::unordered_map<const MemoryBlock*, MemoryBlock::Desc> MetadataMap;
MetadataMap cache_;
bool uses_gpu_;
}; };
} // namespace detail } // namespace detail
......
...@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/meta_data.h"
#include <functional> #include <functional>
#include "paddle/fluid/memory/detail/memory_block.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock::Desc::Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
MemoryBlock* l, MemoryBlock* r) MemoryBlock* l, MemoryBlock* r)
: type(t), : type(t),
index(i), index(i),
...@@ -29,7 +29,7 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, ...@@ -29,7 +29,7 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
left_buddy(l), left_buddy(l),
right_buddy(r) {} right_buddy(r) {}
Metadata::Metadata() MemoryBlock::Desc::Desc()
: type(MemoryBlock::INVALID_CHUNK), : type(MemoryBlock::INVALID_CHUNK),
index(0), index(0),
size(0), size(0),
...@@ -37,32 +37,36 @@ Metadata::Metadata() ...@@ -37,32 +37,36 @@ Metadata::Metadata()
left_buddy(nullptr), left_buddy(nullptr),
right_buddy(nullptr) {} right_buddy(nullptr) {}
namespace {
template <class T> template <class T>
inline void hash_combine(std::size_t& seed, const T& v) { inline void hash_combine(std::size_t* seed, const T& v) {
std::hash<T> hasher; std::hash<T> hasher;
seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); (*seed) ^= hasher(v) + 0x9e3779b9 + ((*seed) << 6) + ((*seed) >> 2);
} }
inline size_t hash(const Metadata* metadata, size_t initial_seed) { inline size_t hash(const MemoryBlock::Desc& metadata, size_t initial_seed) {
size_t seed = initial_seed; size_t seed = initial_seed;
hash_combine(seed, (size_t)metadata->type); hash_combine(&seed, static_cast<size_t>(metadata.type));
hash_combine(seed, metadata->index); hash_combine(&seed, metadata.index);
hash_combine(seed, metadata->size); hash_combine(&seed, metadata.size);
hash_combine(seed, metadata->total_size); hash_combine(&seed, metadata.total_size);
hash_combine(seed, metadata->left_buddy); hash_combine(&seed, metadata.left_buddy);
hash_combine(seed, metadata->right_buddy); hash_combine(&seed, metadata.right_buddy);
return seed; return seed;
} }
void Metadata::update_guards() { } // namespace
guard_begin = hash(this, 1);
guard_end = hash(this, 2); void MemoryBlock::Desc::update_guards() {
guard_begin = hash(*this, 1);
guard_end = hash(*this, 2);
} }
bool Metadata::check_guards() const { bool MemoryBlock::Desc::check_guards() const {
return guard_begin == hash(this, 1) && guard_end == hash(this, 2); return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2);
} }
} // namespace detail } // namespace detail
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/meta_cache.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
...@@ -23,29 +22,28 @@ namespace detail { ...@@ -23,29 +22,28 @@ namespace detail {
MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
Metadata MetadataCache::load(const MemoryBlock* block) { MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
if (uses_gpu_) { if (uses_gpu_) {
auto existing_metadata = cache_.find(block); auto existing_desc = cache_.find(block);
PADDLE_ASSERT(existing_metadata->second.check_guards()); PADDLE_ASSERT(existing_desc->second.check_guards());
return existing_metadata->second; return existing_desc->second;
} else { } else {
auto* meta = reinterpret_cast<const Metadata*>(block); auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
VLOG(10) << "Load MetaData type=" << meta->type; VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type;
PADDLE_ASSERT(meta->check_guards()); PADDLE_ASSERT(desc->check_guards());
return *reinterpret_cast<const Metadata*>(block); return *reinterpret_cast<const MemoryBlock::Desc*>(block);
} }
} }
void MetadataCache::store(MemoryBlock* block, void MetadataCache::save(MemoryBlock* block,
const Metadata& original_metadata) { const MemoryBlock::Desc& original_desc) {
auto metadata = original_metadata; auto desc = original_desc;
desc.update_guards();
metadata.update_guards();
if (uses_gpu_) { if (uses_gpu_) {
cache_[block] = metadata; cache_[block] = desc;
} else { } else {
*reinterpret_cast<Metadata*>(block) = metadata; *reinterpret_cast<MemoryBlock::Desc*>(block) = desc;
} }
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include <unordered_map>
namespace paddle {
namespace memory {
namespace detail {
/**
* \brief A cache for accessing memory block meta-data that may be expensive
* to access directly.
*
* \note This class exists to unify the metadata format between GPU and CPU
* allocations. It should be removed when the CPU can access all GPU
* allocations directly via UVM.
*/
class MetadataCache {
public:
explicit MetadataCache(bool uses_gpu);
public:
/*! \brief Load the associated metadata for the specified memory block. */
Metadata load(const MemoryBlock* memory_block);
/*! \brief Store the associated metadata for the specified memory block. */
void store(MemoryBlock* memory_block, const Metadata& meta_data);
/*! \brief Indicate that the specified metadata will no longer be used. */
void invalidate(MemoryBlock* memory_block);
public:
MetadataCache(const MetadataCache&) = delete;
MetadataCache& operator=(const MetadataCache&) = delete;
private:
bool uses_gpu_;
private:
typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;
private:
MetadataMap cache_;
};
} // namespace detail
} // namespace memory
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/memory/detail/memory_block.h"
#include <stddef.h>
namespace paddle {
namespace memory {
namespace detail {
class Metadata {
public:
Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
MemoryBlock* r);
Metadata();
public:
/*! \brief Update the guards when metadata is changed */
void update_guards();
/*! \brief Check consistency to previous modification */
bool check_guards() const;
public:
// TODO(gangliao): compress this
// clang-format off
size_t guard_begin = 0;
MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK;
size_t index = 0;
size_t size = 0;
size_t total_size = 0;
MemoryBlock* left_buddy = nullptr;
MemoryBlock* right_buddy = nullptr;
size_t guard_end = 0;
// clang-format on
};
} // namespace detail
} // namespace memory
} // namespace paddle
...@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and ...@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include <stdlib.h> // for malloc and free #include <stdlib.h> // for malloc and free
#include <sys/mman.h> // for mlock and munlock #include <sys/mman.h> // for mlock and munlock
#include <algorithm> // for std::max #include <algorithm> // for std::max
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
// If use_pinned_memory is true, CPUAllocator calls mlock, which // If use_pinned_memory is true, CPUAllocator calls mlock, which
// returns pinned and locked memory as staging areas for data exchange // returns pinned and locked memory as staging areas for data exchange
...@@ -35,13 +35,13 @@ namespace paddle { ...@@ -35,13 +35,13 @@ namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
void* CPUAllocator::Alloc(size_t& index, size_t size) { void* CPUAllocator::Alloc(size_t* index, size_t size) {
// According to http://www.cplusplus.com/reference/cstdlib/malloc/, // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
// malloc might not return nullptr if size is zero, but the returned // malloc might not return nullptr if size is zero, but the returned
// pointer shall not be dereferenced -- so we make it nullptr. // pointer shall not be dereferenced -- so we make it nullptr.
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
index = 0; // unlock memory *index = 0; // unlock memory
void* p; void* p;
...@@ -56,7 +56,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) { ...@@ -56,7 +56,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) {
if (p != nullptr) { if (p != nullptr) {
if (FLAGS_use_pinned_memory) { if (FLAGS_use_pinned_memory) {
index = 1; *index = 1;
mlock(p, size); // lock memory mlock(p, size); // lock memory
} }
} }
...@@ -75,7 +75,7 @@ bool CPUAllocator::UseGpu() const { return false; } ...@@ -75,7 +75,7 @@ bool CPUAllocator::UseGpu() const { return false; }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
void* GPUAllocator::Alloc(size_t& index, size_t size) { void* GPUAllocator::Alloc(size_t* index, size_t size) {
// CUDA documentation doesn't explain if cudaMalloc returns nullptr // CUDA documentation doesn't explain if cudaMalloc returns nullptr
// if size is 0. We just make sure it does. // if size is 0. We just make sure it does.
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
...@@ -93,7 +93,7 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { ...@@ -93,7 +93,7 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
} }
if (result == cudaSuccess) { if (result == cudaSuccess) {
index = 0; *index = 0;
gpu_alloc_size_ += size; gpu_alloc_size_ += size;
return p; return p;
} else { } else {
...@@ -133,7 +133,7 @@ bool GPUAllocator::UseGpu() const { return true; } ...@@ -133,7 +133,7 @@ bool GPUAllocator::UseGpu() const { return true; }
// PINNED memory allows direct DMA transfers by the GPU to and from system // PINNED memory allows direct DMA transfers by the GPU to and from system
// memory. It’s locked to a physical address. // memory. It’s locked to a physical address.
void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
// NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
...@@ -154,7 +154,7 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { ...@@ -154,7 +154,7 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
cudaError_t result = cudaMallocHost(&p, size); cudaError_t result = cudaMallocHost(&p, size);
if (result == cudaSuccess) { if (result == cudaSuccess) {
index = 1; // PINNED memory *index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size; cuda_pinnd_alloc_size_ += size;
return p; return p;
} else { } else {
......
...@@ -29,14 +29,14 @@ namespace detail { ...@@ -29,14 +29,14 @@ namespace detail {
class SystemAllocator { class SystemAllocator {
public: public:
virtual ~SystemAllocator() {} virtual ~SystemAllocator() {}
virtual void* Alloc(size_t& index, size_t size) = 0; virtual void* Alloc(size_t* index, size_t size) = 0;
virtual void Free(void* p, size_t size, size_t index) = 0; virtual void Free(void* p, size_t size, size_t index) = 0;
virtual bool UseGpu() const = 0; virtual bool UseGpu() const = 0;
}; };
class CPUAllocator : public SystemAllocator { class CPUAllocator : public SystemAllocator {
public: public:
virtual void* Alloc(size_t& index, size_t size); virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index); virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const; virtual bool UseGpu() const;
}; };
...@@ -46,7 +46,7 @@ class GPUAllocator : public SystemAllocator { ...@@ -46,7 +46,7 @@ class GPUAllocator : public SystemAllocator {
public: public:
explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
virtual void* Alloc(size_t& index, size_t size); virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index); virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const; virtual bool UseGpu() const;
...@@ -58,7 +58,7 @@ class GPUAllocator : public SystemAllocator { ...@@ -58,7 +58,7 @@ class GPUAllocator : public SystemAllocator {
class CUDAPinnedAllocator : public SystemAllocator { class CUDAPinnedAllocator : public SystemAllocator {
public: public:
virtual void* Alloc(size_t& index, size_t size); virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index); virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const; virtual bool UseGpu() const;
......
...@@ -22,11 +22,11 @@ limitations under the License. */ ...@@ -22,11 +22,11 @@ limitations under the License. */
DECLARE_bool(use_pinned_memory); DECLARE_bool(use_pinned_memory);
void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
bool freed = false; bool freed = false;
{ {
size_t index; size_t index;
void* p = a.Alloc(index, size); void* p = a->Alloc(&index, size);
if (size > 0) { if (size > 0) {
EXPECT_NE(p, nullptr); EXPECT_NE(p, nullptr);
} else { } else {
...@@ -36,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { ...@@ -36,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
int* i = static_cast<int*>(p); int* i = static_cast<int*>(p);
std::shared_ptr<int> ptr(i, [&](void* p) { std::shared_ptr<int> ptr(i, [&](void* p) {
freed = true; freed = true;
a.Free(p, size, index); a->Free(p, size, index);
}); });
} }
EXPECT_TRUE(freed); EXPECT_TRUE(freed);
...@@ -45,21 +45,21 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { ...@@ -45,21 +45,21 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
TEST(CPUAllocator, NoLockMem) { TEST(CPUAllocator, NoLockMem) {
FLAGS_use_pinned_memory = false; FLAGS_use_pinned_memory = false;
paddle::memory::detail::CPUAllocator a; paddle::memory::detail::CPUAllocator a;
TestAllocator(a, 2048); TestAllocator(&a, 2048);
TestAllocator(a, 0); TestAllocator(&a, 0);
} }
TEST(CPUAllocator, LockMem) { TEST(CPUAllocator, LockMem) {
FLAGS_use_pinned_memory = true; FLAGS_use_pinned_memory = true;
paddle::memory::detail::CPUAllocator a; paddle::memory::detail::CPUAllocator a;
TestAllocator(a, 2048); TestAllocator(&a, 2048);
TestAllocator(a, 0); TestAllocator(&a, 0);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TEST(GPUAllocator, Alloc) { TEST(GPUAllocator, Alloc) {
paddle::memory::detail::GPUAllocator a(0); paddle::memory::detail::GPUAllocator a(0);
TestAllocator(a, 2048); TestAllocator(&a, 2048);
TestAllocator(a, 0); TestAllocator(&a, 0);
} }
#endif #endif
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/malloc.h"
#include "glog/logging.h" #include "glog/logging.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
/**
* \brief Allocate memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] size Allocation size.
*
* \return Allocated memory block address.
*
* \note If return nullptr, it indicates memory allocation failed
* because insufficient memory in current system. When Alloc
* function is invoked, you must check the returned memory
* address is valid or not.
*/
template <typename Place>
void* Alloc(Place place, size_t size);
/**
* \brief Free memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] ptr Memory block address to free.
*
*/
template <typename Place>
void Free(Place place, void* ptr);
/**
* \brief Total size of used memory in one place.
*
* \param[in] place Allocation place (CPU or GPU).
*
*/
template <typename Place>
size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
};
size_t memory_usage(const platform::Place& p);
/**
* \brief Free memory block in one place.
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD");
public:
explicit PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
Place place_;
};
/**
* \brief Free memory block in one place does not meet POD
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PlainDeleter {
public:
explicit PlainDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
private:
Place place_;
};
} // namespace memory
} // namespace paddle
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/malloc.h"
#include <unordered_map> #include <unordered_map>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -28,7 +27,7 @@ inline bool is_aligned(void const *p) { ...@@ -28,7 +27,7 @@ inline bool is_aligned(void const *p) {
} }
size_t align(size_t size, paddle::platform::CPUPlace place) { size_t align(size_t size, paddle::platform::CPUPlace place) {
size += sizeof(paddle::memory::detail::Metadata); size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::CpuMinChunkSize(); size_t alignment = paddle::platform::CpuMinChunkSize();
size_t remaining = size % alignment; size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining); return remaining == 0 ? size : size + (alignment - remaining);
...@@ -86,7 +85,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { ...@@ -86,7 +85,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
size_t align(size_t size, paddle::platform::CUDAPlace place) { size_t align(size_t size, paddle::platform::CUDAPlace place) {
size += sizeof(paddle::memory::detail::Metadata); size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::GpuMinChunkSize(); size_t alignment = paddle::platform::GpuMinChunkSize();
size_t remaining = size % alignment; size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining); return remaining == 0 ? size : size + (alignment - remaining);
...@@ -142,7 +141,7 @@ TEST(BuddyAllocator, GPUMultAlloc) { ...@@ -142,7 +141,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
} }
size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) { size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
size += sizeof(paddle::memory::detail::Metadata); size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::CUDAPinnedMinChunkSize(); size_t alignment = paddle::platform::CUDAPinnedMinChunkSize();
size_t remaining = size % alignment; size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining); return remaining == 0 ? size : size + (alignment - remaining);
......
...@@ -14,91 +14,5 @@ limitations under the License. */ ...@@ -14,91 +14,5 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
namespace paddle {
namespace memory {
/**
* \brief Allocate memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] size Allocation size.
*
* \return Allocated memory block address.
*
* \note If return nullptr, it indicates memory allocation failed
* because insufficient memory in current system. When Alloc
* function is invoked, you must check the returned memory
* address is valid or not.
*/
template <typename Place>
void* Alloc(Place place, size_t size);
/**
* \brief Free memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] ptr Memory block address to free.
*
*/
template <typename Place>
void Free(Place place, void* ptr);
/**
* \brief Total size of used memory in one place.
*
* \param[in] place Allocation place (CPU or GPU).
*
*/
template <typename Place>
size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
};
size_t memory_usage(const platform::Place& p);
/**
* \brief Free memory block in one place.
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD");
public:
explicit PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
Place place_;
};
/**
* \brief Free memory block in one place does not meet POD
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PlainDeleter {
public:
explicit PlainDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
private:
Place place_;
};
} // namespace memory
} // namespace paddle
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
......
...@@ -263,7 +263,7 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) ...@@ -263,7 +263,7 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
limitations under the License. */ limitations under the License. */
#include "mkldnn.hpp" #include "mkldnn.hpp"
#include "mkldnn_activation_op.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mkldnn_activation_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -40,18 +40,24 @@ void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm, ...@@ -40,18 +40,24 @@ void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace()); const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
// get memory dim // get memory dim
PADDLE_ENFORCE(src->dims().size() == 4, PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
"Input dim must be with 4, i.e. NCHW"); "Input dim must be with 2 or 4");
std::vector<int> src_tz = framework::vectorize2int(src->dims()); std::vector<int> src_tz = framework::vectorize2int(src->dims());
// create memory description // create memory description
// TODO(kbinias-intel): support more formats auto data_md = src_tz.size() == 2
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nc)
: platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw); mkldnn::memory::format::nchw);
// create memory primitives // create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data); auto src_memory =
auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data); mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(src_data)));
auto dst_memory =
mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(dst_data)));
auto forward_desc = mkldnn::eltwise_forward::desc( auto forward_desc = mkldnn::eltwise_forward::desc(
mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta); mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
...@@ -91,15 +97,21 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm, ...@@ -91,15 +97,21 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
std::vector<int> src_tz = framework::vectorize2int(x->dims()); std::vector<int> src_tz = framework::vectorize2int(x->dims());
// create memory description // create memory description
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, auto data_md = src_tz.size() == 2
? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nc)
: platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw); mkldnn::memory::format::nchw);
// create memory primitives // create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src); auto src_memory = mkldnn::memory(
{data_md, mkldnn_engine}, static_cast<void *>(const_cast<float *>(src)));
auto diff_src_memory = auto diff_src_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src); mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(diff_src)));
auto diff_dst_memory = auto diff_dst_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst); mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(diff_dst)));
auto backward_desc = auto backward_desc =
mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta); mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/adagrad_op.h" #include "paddle/fluid/operators/adagrad_op.h"
#include <vector>
#include <cmath> #include <cmath>
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/assign_value_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/auc_op.h" #include "paddle/fluid/operators/auc_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel<T> {
std::vector<float> thresholds_list; std::vector<float> thresholds_list;
thresholds_list.reserve(num_thresholds); thresholds_list.reserve(num_thresholds);
for (int i = 1; i < num_thresholds - 1; i++) { for (int i = 1; i < num_thresholds - 1; i++) {
thresholds_list[i] = (float)i / (num_thresholds - 1); thresholds_list[i] = static_cast<float>(i) / (num_thresholds - 1);
} }
const float kEpsilon = 1e-7; const float kEpsilon = 1e-7;
thresholds_list[0] = 0.0f - kEpsilon; thresholds_list[0] = 0.0f - kEpsilon;
...@@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel<T> {
float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace()); float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace()); float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
for (int i = 0; i < num_thresholds; i++) { for (int i = 0; i < num_thresholds; i++) {
tp_rate_data[i] = tp_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon); (tp_data[i] + fn_data[i] + epsilon);
fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon); fp_rate_data[i] =
rec_rate_data[i] = static_cast<float>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon); rec_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
(tp_data[i] + fp_data[i] + epsilon);
} }
*auc_data = 0.0f; *auc_data = 0.0f;
if (curve == "ROC") { if (curve == "ROC") {
......
...@@ -19,15 +19,15 @@ namespace operators { ...@@ -19,15 +19,15 @@ namespace operators {
template <> template <>
void GetAccumulators<paddle::platform::CPUDeviceContext>( void GetAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_, const framework::ExecutionContext& ctx, int64_t* num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) { int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates"); auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates"); auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates"); auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0]; *old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
num_accumulates_ = in_num_accumulates->data<int64_t>()[0]; *num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
num_updates_ = in_num_updates->data<int64_t>()[0]; *num_updates_ = in_num_updates->data<int64_t>()[0];
} }
template <> template <>
......
...@@ -19,18 +19,18 @@ namespace paddle { ...@@ -19,18 +19,18 @@ namespace paddle {
namespace operators { namespace operators {
template <> template <>
void GetAccumulators<paddle::platform::CUDADeviceContext>( void GetAccumulators<paddle::platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_, const framework::ExecutionContext& ctx, int64_t* num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) { int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates"); auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates"); auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates"); auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
memory::Copy(platform::CPUPlace(), &old_num_accumulates_, memory::Copy(platform::CPUPlace(), old_num_accumulates_,
platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(), platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
sizeof(int64_t), stream); sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(), memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(),
in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream); in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(), memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(),
in_num_updates->data<int64_t>(), sizeof(int64_t), stream); in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
} }
......
...@@ -29,8 +29,8 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>; ...@@ -29,8 +29,8 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext> template <typename DeviceContext>
void GetAccumulators(const framework::ExecutionContext& ctx, void GetAccumulators(const framework::ExecutionContext& ctx,
int64_t& num_updates, int64_t& num_accumulates, int64_t* num_updates, int64_t* num_accumulates,
int64_t& old_num_accumulates); int64_t* old_num_accumulates);
template <typename DeviceContext> template <typename DeviceContext>
void SetAccumulators(const framework::ExecutionContext& ctx, void SetAccumulators(const framework::ExecutionContext& ctx,
...@@ -47,8 +47,8 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> { ...@@ -47,8 +47,8 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
int64_t num_updates = 0; int64_t num_updates = 0;
int64_t num_accumulates = 0; int64_t num_accumulates = 0;
int64_t old_num_accumulates = 0; int64_t old_num_accumulates = 0;
GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates, GetAccumulators<DeviceContext>(ctx, &num_updates, &num_accumulates,
old_num_accumulates); &old_num_accumulates);
// Get attrs // Get attrs
float average_window = ctx.Attr<float>("average_window"); float average_window = ctx.Attr<float>("average_window");
......
...@@ -5,5 +5,5 @@ if(WITH_DISTRIBUTE) ...@@ -5,5 +5,5 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
cares zlib protobuf sendrecvop_grpc) cares zlib protobuf sendrecvop_grpc)
cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op)
endif() endif()
...@@ -137,7 +137,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, ...@@ -137,7 +137,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
auto* var = p_scope->FindVar(in_var_name_val); auto* var = p_scope->FindVar(in_var_name_val);
::grpc::ByteBuffer req; ::grpc::ByteBuffer req;
SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req); SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
// var handle // var handle
VarHandle var_h; VarHandle var_h;
......
...@@ -138,39 +138,48 @@ class RequestPrefetch final : public RequestBase { ...@@ -138,39 +138,48 @@ class RequestPrefetch final : public RequestBase {
framework::Scope* scope, framework::Scope* scope,
const platform::DeviceContext* dev_ctx, const platform::DeviceContext* dev_ctx,
framework::Executor* executor, framework::Executor* executor,
framework::ProgramDesc* program, int blkid) framework::ProgramDesc* program,
framework::ExecutorPrepareContext* prefetch_ctx)
: RequestBase(service, cq, dev_ctx), : RequestBase(service, cq, dev_ctx),
responder_(&ctx_), responder_(&ctx_),
scope_(scope), scope_(scope),
executor_(executor), executor_(executor),
program_(program), program_(program),
blkid_(blkid) { prefetch_ctx_(prefetch_ctx) {
request_.reset(new VariableResponse(scope, dev_ctx_));
int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable); int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
cq_, this); cq_, cq_, this);
} }
virtual ~RequestPrefetch() {} virtual ~RequestPrefetch() {}
virtual std::string GetReqName() { return request_.varname(); } virtual std::string GetReqName() { return request_->Varname(); }
virtual void Process() { virtual void Process() {
// prefetch process... // prefetch process...
::grpc::ByteBuffer reply; ::grpc::ByteBuffer reply;
// TODO(Yancey1989): execute the Block which containers prefetch ops
VLOG(3) << "RequestPrefetch Process in"; std::string var_name = request_->OutVarname();
auto var_desc = program_->Block(0).FindVar(var_name);
framework::Scope* local_scope = &scope_->NewScope();
auto* var = local_scope->FindVar(var_name);
InitializeVariable(var, var_desc->GetType());
executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false);
SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
responder_.Finish(reply, ::grpc::Status::OK, this); responder_.Finish(reply, ::grpc::Status::OK, this);
status_ = FINISH; status_ = FINISH;
} }
protected: protected:
sendrecv::VariableMessage request_; std::shared_ptr<VariableResponse> request_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
framework::Scope* scope_; framework::Scope* scope_;
framework::Executor* executor_; framework::Executor* executor_;
framework::ProgramDesc* program_; framework::ProgramDesc* program_;
framework::ExecutorPrepareContext* prefetch_ctx_;
int blkid_; int blkid_;
}; };
...@@ -268,7 +277,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { ...@@ -268,7 +277,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
} }
RequestPrefetch* prefetch = RequestPrefetch* prefetch =
new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_, new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_,
executor_, program_, prefetch_blk_id_); executor_, program_, prefetch_ctx_);
VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status(); VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
} }
......
...@@ -63,6 +63,10 @@ class AsyncGRPCServer final { ...@@ -63,6 +63,10 @@ class AsyncGRPCServer final {
void SetExecutor(framework::Executor *executor) { executor_ = executor; } void SetExecutor(framework::Executor *executor) { executor_ = executor; }
void SetPrefetchPreparedCtx(framework::ExecutorPrepareContext *prepared) {
prefetch_ctx_ = prepared;
}
int GetSelectedPort() { return selected_port_; } int GetSelectedPort() { return selected_port_; }
const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
...@@ -111,6 +115,7 @@ class AsyncGRPCServer final { ...@@ -111,6 +115,7 @@ class AsyncGRPCServer final {
std::unique_ptr<std::thread> t_prefetch_; std::unique_ptr<std::thread> t_prefetch_;
int prefetch_blk_id_; int prefetch_blk_id_;
framework::ExecutorPrepareContext *prefetch_ctx_;
framework::ProgramDesc *program_; framework::ProgramDesc *program_;
framework::Executor *executor_; framework::Executor *executor_;
int selected_port_; int selected_port_;
......
...@@ -20,43 +20,121 @@ limitations under the License. */ ...@@ -20,43 +20,121 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
namespace framework = paddle::framework; namespace framework = paddle::framework;
namespace platform = paddle::platform; namespace platform = paddle::platform;
namespace detail = paddle::operators::detail; namespace detail = paddle::operators::detail;
USE_OP(lookup_table);
std::unique_ptr<detail::AsyncGRPCServer> rpc_service_; std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;
framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0);
auto* block = program->AppendBlock(*root_block);
framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
framework::VariableNameMap output({{"Output", {"out"}}});
auto op = block->AppendOp();
op->SetType("lookup_table");
op->SetInput("W", {"w"});
op->SetInput("Ids", {"ids"});
op->SetOutput("Out", {"out"});
auto& out = *root_block->Var("out");
out.SetType(framework::proto::VarType::SELECTED_ROWS);
out.SetShape({10, 10});
return block;
}
void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
auto w_var = scope->Var("w");
w_var->GetMutable<framework::SelectedRows>();
auto out_var = scope->Var("out");
out_var->GetMutable<framework::SelectedRows>();
auto ids_var = scope->Var("ids");
ids_var->GetMutable<framework::SelectedRows>();
}
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto ids_var = scope->Var("ids")->GetMutable<framework::SelectedRows>();
auto rows = ids_var->mutable_rows();
for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2);
ids_var->mutable_value()->Resize({rows_numel, 1});
ids_var->mutable_value()->mutable_data<float>(*place);
}
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
auto rows = w->mutable_rows();
for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
auto w_value = w->mutable_value();
w_value->Resize({rows_numel, 10});
auto ptr = w_value->mutable_data<float>(*place);
for (int64_t i = 0; i < w_value->numel(); ++i) {
ptr[i] = static_cast<float>(i / 10);
}
}
void StartServer(const std::string& endpoint) { void StartServer(const std::string& endpoint) {
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
framework::ProgramDesc program;
framework::Scope scope;
platform::CPUPlace place;
framework::Executor exe(place);
platform::CPUDeviceContext ctx(place);
auto* block = AppendPrefetchBlcok(&program);
auto prepared = exe.Prepare(program, block->ID());
InitTensorsOnServer(&scope, &place, 10);
rpc_service_->SetProgram(&program);
rpc_service_->SetPrefetchPreparedCtx(prepared.get());
rpc_service_->SetDevCtx(&ctx);
rpc_service_->SetScope(&scope);
rpc_service_->SetExecutor(&exe);
rpc_service_->RunSyncUpdate(); rpc_service_->RunSyncUpdate();
} }
TEST(PREFETCH, CPU) { TEST(PREFETCH, CPU) {
// start up a server instance backend // start up a server instance backend
// TODO(Yancey1989): Need to start a server with optimize blocks and
// prefetch blocks.
std::thread server_thread(StartServer, "127.0.0.1:8889"); std::thread server_thread(StartServer, "127.0.0.1:8889");
sleep(2);
framework::Scope scope; framework::Scope scope;
platform::CPUPlace place; platform::CPUPlace place;
platform::CPUDeviceContext ctx(place); platform::CPUDeviceContext ctx(place);
// create var on local scope // create var on local scope
std::string in_var_name("in"); int64_t rows_numel = 5;
InitTensorsOnClient(&scope, &place, rows_numel);
std::string in_var_name("ids");
std::string out_var_name("out"); std::string out_var_name("out");
auto* in_var = scope.Var(in_var_name);
auto* in_tensor = in_var->GetMutable<framework::LoDTensor>();
in_tensor->Resize({10, 10});
VLOG(3) << "before mutable_data";
in_tensor->mutable_data<int>(place);
scope.Var(out_var_name);
VLOG(3) << "before fetch";
detail::RPCClient client; detail::RPCClient client;
client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name, client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name,
out_var_name); out_var_name);
client.Wait(); client.Wait();
auto var = scope.Var(out_var_name);
auto value = var->GetMutable<framework::SelectedRows>()->value();
auto ptr = value.mutable_data<float>(place);
rpc_service_->ShutDown(); rpc_service_->ShutDown();
server_thread.join(); server_thread.join();
rpc_service_.reset(nullptr); rpc_service_.reset(nullptr);
for (int64_t i = 0; i < rows_numel; ++i) {
EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
}
} }
...@@ -21,7 +21,7 @@ service SendRecvService { ...@@ -21,7 +21,7 @@ service SendRecvService {
rpc SendVariable(VariableMessage) returns (VoidMessage) {} rpc SendVariable(VariableMessage) returns (VoidMessage) {}
// Argument VariableMessage for GetVariable should only contain varname. // Argument VariableMessage for GetVariable should only contain varname.
rpc GetVariable(VariableMessage) returns (VariableMessage) {} rpc GetVariable(VariableMessage) returns (VariableMessage) {}
// Prefetch variable by Ids // pre-fetch variable by given variable name and Ids
rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
} }
...@@ -67,6 +67,8 @@ message VariableMessage { ...@@ -67,6 +67,8 @@ message VariableMessage {
bytes serialized = 8; bytes serialized = 8;
// selected_rows data // selected_rows data
bytes rows = 9; bytes rows = 9;
// Look up table block execution output variable name.
string out_varname = 10;
} }
message VoidMessage {} message VoidMessage {}
...@@ -30,11 +30,9 @@ namespace detail { ...@@ -30,11 +30,9 @@ namespace detail {
void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg) { ::grpc::ByteBuffer* msg,
const std::string& out_name) {
using VarMsg = sendrecv::VariableMessage; using VarMsg = sendrecv::VariableMessage;
sendrecv::VariableMessage request;
std::string header;
request.AppendToString(&header);
// When using GPU, need to free the copied CPU buffer // When using GPU, need to free the copied CPU buffer
// when the ByteBuffer destroies // when the ByteBuffer destroies
// TODO(typhoonzero): add unref here, if we have dependent // TODO(typhoonzero): add unref here, if we have dependent
...@@ -52,6 +50,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -52,6 +50,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
e.WriteUint64(VarMsg::kTypeFieldNumber, 1); e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
} }
if (!out_name.empty()) {
e.WriteString(VarMsg::kOutVarnameFieldNumber, out_name);
}
switch (framework::ToVarType(var->Type())) { switch (framework::ToVarType(var->Type())) {
case framework::proto::VarType_Type_LOD_TENSOR: { case framework::proto::VarType_Type_LOD_TENSOR: {
auto tensor = var->Get<framework::LoDTensor>(); auto tensor = var->Get<framework::LoDTensor>();
......
...@@ -46,7 +46,8 @@ typedef void (*DestroyCallback)(void*); ...@@ -46,7 +46,8 @@ typedef void (*DestroyCallback)(void*);
void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg); ::grpc::ByteBuffer* msg,
const std::string& out_varname = std::string());
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
......
...@@ -416,6 +416,20 @@ int VariableResponse::Parse(Source* source) { ...@@ -416,6 +416,20 @@ int VariableResponse::Parse(Source* source) {
} }
break; break;
} }
case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
uint32_t length;
if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
return tag;
}
std::string temp;
if (!input.ReadString(&temp, length)) {
return tag;
}
meta_.set_out_varname(temp);
break;
}
default: { default: {
// Unknown tag, return unknown error. // Unknown tag, return unknown error.
......
...@@ -55,6 +55,7 @@ class VariableResponse { ...@@ -55,6 +55,7 @@ class VariableResponse {
int Parse(const ::grpc::ByteBuffer& byte_buffer); int Parse(const ::grpc::ByteBuffer& byte_buffer);
inline std::string Varname() { return meta_.varname(); } inline std::string Varname() { return meta_.varname(); }
inline std::string OutVarname() { return meta_.out_varname(); }
// should call parse first. // should call parse first.
framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); } framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); }
......
...@@ -56,11 +56,11 @@ class GoOp : public framework::OperatorBase { ...@@ -56,11 +56,11 @@ class GoOp : public framework::OperatorBase {
// TODO(varunarora): Consider moving this root scope lookup to scope.h. // TODO(varunarora): Consider moving this root scope lookup to scope.h.
const framework::Scope *root_scope = &scope; const framework::Scope *root_scope = &scope;
const framework::Scope *parent_scope = &(root_scope->parent()); const framework::Scope *parent_scope = root_scope->parent();
while (parent_scope != nullptr) { while (parent_scope != nullptr) {
root_scope = parent_scope; root_scope = parent_scope;
parent_scope = &(parent_scope->parent()); parent_scope = parent_scope->parent();
} }
framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock); framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock);
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -35,7 +37,7 @@ class LoDResetKernel : public framework::OpKernel<T> { ...@@ -35,7 +37,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
if (lod_t->lod().size() > 0) { if (lod_t->lod().size() > 0) {
auto y_lod = lod_t->lod(); auto y_lod = lod_t->lod();
auto last_level = y_lod[y_lod.size() - 1]; auto last_level = y_lod[y_lod.size() - 1];
PADDLE_ENFORCE_EQ(last_level.back(), in->dims()[0], PADDLE_ENFORCE_EQ((int64_t)(last_level.back()), in->dims()[0],
"Last value of `Y`'s last level LoD should be equal " "Last value of `Y`'s last level LoD should be equal "
"to the first dimension of `X`"); "to the first dimension of `X`");
out->set_lod(y_lod); out->set_lod(y_lod);
......
...@@ -39,10 +39,13 @@ class CreateBatchReaderOp : public framework::OperatorBase { ...@@ -39,10 +39,13 @@ class CreateBatchReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out")) auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>(); ->template GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
out->Reset( out->Reset(
new BatchReader(underlying_reader.Get(), Attr<int>("batch_size"))); new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
} }
......
...@@ -99,10 +99,13 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { ...@@ -99,10 +99,13 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out")) auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>(); ->template GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto place_str = Attr<std::string>("place"); auto place_str = Attr<std::string>("place");
platform::Place place; platform::Place place;
......
...@@ -62,12 +62,15 @@ class CreateMultiPassReaderOp : public framework::OperatorBase { ...@@ -62,12 +62,15 @@ class CreateMultiPassReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
auto* out = detail::Ref(scope.FindVar(Output("Out")))
.GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>(); ->Get<framework::ReaderHolder>();
auto& out = detail::Ref(scope.FindVar(Output("Out")));
int pass_num = Attr<int>("pass_num"); int pass_num = Attr<int>("pass_num");
out.GetMutable<framework::ReaderHolder>()->Reset( out->Reset(new MultiPassReader(underlying_reader.Get(), pass_num));
new MultiPassReader(underlying_reader.Get(), pass_num));
} }
}; };
......
...@@ -80,10 +80,14 @@ class CreateShuffleReaderOp : public framework::OperatorBase { ...@@ -80,10 +80,14 @@ class CreateShuffleReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
auto* out = detail::Ref(scope.FindVar(Output("Out")))
.GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>(); ->Get<framework::ReaderHolder>();
auto& var = detail::Ref(scope.FindVar(Output("Out"))); out->Reset(
var.GetMutable<framework::ReaderHolder>()->Reset(
new ShuffleReader(underlying_reader.Get(), new ShuffleReader(underlying_reader.Get(),
static_cast<size_t>(Attr<int>("buffer_size")))); static_cast<size_t>(Attr<int>("buffer_size"))));
} }
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/spp_op.h" #include "paddle/fluid/operators/spp_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/math/pooling.h"
......
...@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/operators/sum_op.h"
#include <algorithm>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/transpose_op.h"
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/unpool_op.h" #include "paddle/fluid/operators/unpool_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/unpooling.h" #include "paddle/fluid/operators/math/unpooling.h"
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h" #include "paddle/fluid/operators/math/sequence_padding.h"
......
...@@ -42,12 +42,12 @@ ENDIF() ...@@ -42,12 +42,12 @@ ENDIF()
# memcpy depends on device_context, here add deps individually for # memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies # avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator cc_library(device_context SRCS device_context.cc DEPS malloc
system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mutex>
namespace paddle {
namespace platform {
/*
The current implementation of std::call_once has a bug described in
https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call.
This is likely caused by a deeper bug of pthread_once, which is discussed in
https://patchwork.ozlabs.org/patch/482350/
This wrap is a hack to avoid this bug.
*/
template <typename Callable, typename... Args>
inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
bool good = true;
std::exception ex;
try {
std::call_once(flag,
[&](Args&&... args) {
try {
f(args...);
} catch (const std::exception& e) {
ex = e;
good = false;
} catch (...) {
ex = std::runtime_error("excption caught in call_once");
good = false;
}
},
args...);
} catch (std::system_error& x) {
throw std::runtime_error("call once failed");
}
if (!good) {
throw std::exception(ex);
}
}
} // namespace platform
} // namespace paddle
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/call_once.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
......
...@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include <gtest/gtest.h>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) { ...@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) {
// Conversion operator // Conversion operator
EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
EXPECT_EQ(float(float16(0.5f)), 0.5f); EXPECT_EQ(static_cast<float>(float16(0.5f)), 0.5f);
EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); EXPECT_NEAR(static_cast<double>(float16(0.33333)), 0.33333, 0.0001);
EXPECT_EQ(int(float16(-1)), -1); EXPECT_EQ(static_cast<int>(float16(-1)), -1);
EXPECT_EQ(bool(float16(true)), true); EXPECT_EQ(static_cast<bool>(float16(true)), true);
} }
TEST(float16, arithmetic_cpu) { TEST(float16, arithmetic_cpu) {
EXPECT_EQ(float(float16(1) + float16(1)), 2); EXPECT_EQ(static_cast<float>(float16(1) + float16(1)), 2);
EXPECT_EQ(float(float16(5) + float16(-5)), 0); EXPECT_EQ(static_cast<float>(float16(5) + float16(-5)), 0);
EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001); EXPECT_NEAR(static_cast<float>(float16(0.33333f) + float16(0.66667f)), 1.0f,
EXPECT_EQ(float(float16(3) - float16(5)), -2); 0.001);
EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001); EXPECT_EQ(static_cast<float>(float16(3) - float16(5)), -2);
EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); EXPECT_NEAR(static_cast<float>(float16(0.66667f) - float16(0.33333f)),
EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); 0.33334f, 0.001);
EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); EXPECT_NEAR(static_cast<float>(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); EXPECT_NEAR(static_cast<float>(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
EXPECT_EQ(float(-float16(512.0f)), -512.0f); EXPECT_NEAR(static_cast<float>(float16(2.0f) / float16(3.0f)), 0.66667f,
EXPECT_EQ(float(-float16(-512.0f)), 512.0f); 0.001);
EXPECT_EQ(static_cast<float>(float16(1.0f) / float16(2.0f)), 0.5f);
EXPECT_EQ(static_cast<float>(-float16(512.0f)), -512.0f);
EXPECT_EQ(static_cast<float>(-float16(-512.0f)), 512.0f);
} }
TEST(float16, comparison_cpu) { TEST(float16, comparison_cpu) {
......
...@@ -36,19 +36,19 @@ limitations under the License. */ ...@@ -36,19 +36,19 @@ limitations under the License. */
half *in1, *in2, *out; \ half *in1, *in2, *out; \
half *d_in1, *d_in2, *d_out; \ half *d_in1, *d_in2, *d_out; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc((void**)&d_out, size); \ cudaMalloc(reinterpret_cast<void**>(&d_out), size); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
out = (half*)malloc(size); \ out = reinterpret_cast<half*>(malloc(size)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(out[0])), v_out); \ EXPECT_EQ(static_cast<float>(float16(out[0])), v_out); \
free(in1); \ free(in1); \
free(in2); \ free(in2); \
free(out); \ free(out); \
...@@ -63,17 +63,17 @@ limitations under the License. */ ...@@ -63,17 +63,17 @@ limitations under the License. */
half *in1, *in2; \ half *in1, *in2; \
half *d_in1, *d_in2; \ half *d_in1, *d_in2; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2); \ op_type<<<1, 1>>>(d_in1, d_in2); \
cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(in1[0])), v_out); \ EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out); \
free(in1); \ free(in1); \
free(in2); \ free(in2); \
cudaFree(d_in1); \ cudaFree(d_in1); \
...@@ -87,12 +87,12 @@ limitations under the License. */ ...@@ -87,12 +87,12 @@ limitations under the License. */
half *d_in1, *d_in2; \ half *d_in1, *d_in2; \
bool *out, *d_out; \ bool *out, *d_out; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc((void**)&d_out, 1); \ cudaMalloc(reinterpret_cast<void**>(&d_out), 1); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
out = (bool*)malloc(1); \ out = reinterpret_cast<bool*>(malloc(1)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
...@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) { ...@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) {
LOG(INFO) << "Test Neg on GPU!"; LOG(INFO) << "Test Neg on GPU!";
half *in, *d_in; half *in, *d_in;
int size = sizeof(half); int size = sizeof(half);
cudaMalloc((void**)&d_in, size); cudaMalloc(reinterpret_cast<void**>(&d_in), size);
in = (half*)malloc(size); in = reinterpret_cast<half*>(malloc(size));
in[0] = half(float16(v_in)); in[0] = half(float16(v_in));
cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
Neg<<<1, 1>>>(d_in); Neg<<<1, 1>>>(d_in);
cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
EXPECT_EQ(float(float16(in[0])), v_out); EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
free(in); free(in);
cudaFree(d_in); cudaFree(d_in);
} }
......
...@@ -2,13 +2,13 @@ if(WITH_PYTHON) ...@@ -2,13 +2,13 @@ if(WITH_PYTHON)
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED hip_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
parallel_executor parallel_executor
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
else() else()
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
parallel_executor parallel_executor
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID) if(NOT APPLE AND NOT ANDROID)
......
...@@ -544,13 +544,20 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -544,13 +544,20 @@ All parameter, weight, gradient are variables in Paddle.
[](ParallelExecutor &self, size_t num_threads, bool use_event, [](ParallelExecutor &self, size_t num_threads, bool use_event,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, bool allow_op_delay) { Scope *scope, std::vector<Scope *> &local_scopes,
new (&self) ParallelExecutor(num_threads, use_event, places, bool allow_op_delay) {
params, startup_program, main_program, new (&self)
loss_var_name, scope, allow_op_delay); ParallelExecutor(num_threads, use_event, places, params,
bcast_vars, main_program, loss_var_name,
scope, local_scopes, allow_op_delay);
}) })
.def("local_scopes",
[](ParallelExecutor &self) -> std::vector<Scope *> * {
return &self.GetLocalScopes();
},
py::return_value_policy::reference)
.def("run", &ParallelExecutor::Run); .def("run", &ParallelExecutor::Run);
BindRecordIOWriter(&m); BindRecordIOWriter(&m);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <Python.h>
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
......
...@@ -6,6 +6,6 @@ if(WITH_TESTING) ...@@ -6,6 +6,6 @@ if(WITH_TESTING)
add_library(paddle_test_util STATIC TestUtil.cpp) add_library(paddle_test_util STATIC TestUtil.cpp)
add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE)
cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags) cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init memory gtest gflags)
endif() endif()
endif() endif()
*pyc *pyc
build build
dist dist
paddlepaddle.egg-info
paddle.egg-info paddle.egg-info
paddlepaddle_gpu.egg-info paddlepaddle_gpu.egg-info
.idea .idea
......
...@@ -16,6 +16,7 @@ import sys ...@@ -16,6 +16,7 @@ import sys
import re import re
from graphviz import GraphPreviewGenerator from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2 import proto.framework_pb2 as framework_pb2
from google.protobuf import text_format
_vartype2str_ = [ _vartype2str_ = [
"UNK", "UNK",
...@@ -100,7 +101,7 @@ def repr_var(vardesc): ...@@ -100,7 +101,7 @@ def repr_var(vardesc):
def pprint_program_codes(program_desc): def pprint_program_codes(program_desc):
reprs = [] reprs = []
for block_idx in range(program_desc.num_blocks()): for block_idx in range(program_desc.desc.num_blocks()):
block_desc = program_desc.block(block_idx) block_desc = program_desc.block(block_idx)
block_repr = pprint_block_codes(block_desc) block_repr = pprint_block_codes(block_desc)
reprs.append(block_repr) reprs.append(block_repr)
...@@ -127,7 +128,7 @@ def pprint_block_codes(block_desc, show_backward=False): ...@@ -127,7 +128,7 @@ def pprint_block_codes(block_desc, show_backward=False):
if type(block_desc) is not framework_pb2.BlockDesc: if type(block_desc) is not framework_pb2.BlockDesc:
block_desc = framework_pb2.BlockDesc.FromString( block_desc = framework_pb2.BlockDesc.FromString(
block_desc.serialize_to_string()) block_desc.desc.serialize_to_string())
var_reprs = [] var_reprs = []
op_reprs = [] op_reprs = []
for var in block_desc.vars: for var in block_desc.vars:
...@@ -237,13 +238,13 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): ...@@ -237,13 +238,13 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
# draw parameters and args # draw parameters and args
vars = {} vars = {}
for var in desc.vars: for var in desc.vars:
shape = [str(i) for i in var.lod_tensor.tensor.dims] # TODO(gongwb): format the var.type
if not shape:
shape = ['null']
# create var # create var
if var.persistable: if var.persistable:
varn = graph.add_param( varn = graph.add_param(
var.name, var.type, shape, highlight=need_highlight(var.name)) var.name,
str(var.type).replace("\n", "<br />", 1),
highlight=need_highlight(var.name))
else: else:
varn = graph.add_arg(var.name, highlight=need_highlight(var.name)) varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
vars[var.name] = varn vars[var.name] = varn
...@@ -268,4 +269,4 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): ...@@ -268,4 +269,4 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
for var in op.outputs: for var in op.outputs:
add_op_link_var(opn, var, True) add_op_link_var(opn, var, True)
graph(path, show=True) graph(path, show=False)
...@@ -102,6 +102,8 @@ def split_dense_variable(var_list, ...@@ -102,6 +102,8 @@ def split_dense_variable(var_list,
the parameter server side can gain better performance. By default the parameter server side can gain better performance. By default
minimum block size is 1024. The max block size is used to prevent minimum block size is 1024. The max block size is used to prevent
very large blocks that may cause send error. very large blocks that may cause send error.
:return: A list of VarBlocks. Each VarBlock specifies a shard of
the var.
""" """
blocks = [] blocks = []
for var in var_list: for var in var_list:
...@@ -192,22 +194,24 @@ class DistributeTranspiler: ...@@ -192,22 +194,24 @@ class DistributeTranspiler:
self.trainer_id = trainer_id self.trainer_id = trainer_id
pserver_endpoints = pservers.split(",") pserver_endpoints = pservers.split(",")
# step1 # step1: For large parameters and gradients, split them into smaller
# blocks.
param_list = [pg[0] for pg in params_grads] param_list = [pg[0] for pg in params_grads]
grad_list = [pg[1] for pg in params_grads] grad_list = [pg[1] for pg in params_grads]
grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints)) grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
param_blocks = split_dense_variable(param_list, len(pserver_endpoints)) param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
# step2 # step2: Create new vars for the parameters and gradients blocks and
# add ops to do the split.
grad_var_mapping = self._append_split_op(program, grad_blocks) grad_var_mapping = self._append_split_op(program, grad_blocks)
# step3 param_var_mapping = self._create_vars_from_blocklist(program,
param_blocks)
# step3: Add gradients as send op inputs and parameters as send
# op outputs.
send_inputs = [] send_inputs = []
send_outputs = [] send_outputs = []
for b in grad_blocks: # append by order for b in grad_blocks: # append by order
varname, block_id, _ = b.split(":") varname, block_id, _ = b.split(":")
send_inputs.append(grad_var_mapping[varname][int(block_id)]) send_inputs.append(grad_var_mapping[varname][int(block_id)])
param_var_mapping = self._create_vars_from_blocklist(program,
param_blocks)
for b in param_blocks: for b in param_blocks:
varname, block_id, _ = b.split(":") varname, block_id, _ = b.split(":")
send_outputs.append(param_var_mapping[varname][int(block_id)]) send_outputs.append(param_var_mapping[varname][int(block_id)])
...@@ -237,7 +241,7 @@ class DistributeTranspiler: ...@@ -237,7 +241,7 @@ class DistributeTranspiler:
"RPCClient": rpc_client_var}, "RPCClient": rpc_client_var},
attrs={"endpoints": pserver_endpoints, attrs={"endpoints": pserver_endpoints,
"epmap": eplist}) "epmap": eplist})
# step4 # step4: Concat the parameters splits together after recv.
for varname, splited_var in param_var_mapping.iteritems(): for varname, splited_var in param_var_mapping.iteritems():
if len(splited_var) <= 1: if len(splited_var) <= 1:
continue continue
...@@ -258,13 +262,14 @@ class DistributeTranspiler: ...@@ -258,13 +262,14 @@ class DistributeTranspiler:
def get_pserver_program(self, endpoint): def get_pserver_program(self, endpoint):
""" """
Get pserver side program using the endpoint. Get pserver side program using the endpoint.
TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
NOTE: assume blocks of the same variable is not distributed NOTE: assume blocks of the same variable is not distributed
on the same pserver, only change param/grad varnames for on the same pserver, only change param/grad varnames for
trainers to fetch. trainers to fetch.
""" """
# step1 # step1
pserver_program = Program() pserver_program = Program()
# step2 # step2: Create vars to receive vars at parameter servers.
recv_inputs = [] recv_inputs = []
for v in self.param_grad_ep_mapping[endpoint]["params"]: for v in self.param_grad_ep_mapping[endpoint]["params"]:
self._clone_var(pserver_program.global_block(), v) self._clone_var(pserver_program.global_block(), v)
...@@ -278,12 +283,6 @@ class DistributeTranspiler: ...@@ -278,12 +283,6 @@ class DistributeTranspiler:
orig_var_name = v.name[:suff_idx] orig_var_name = v.name[:suff_idx]
else: else:
orig_var_name = v.name orig_var_name = v.name
single_trainer_var = pserver_program.global_block().create_var(
name=orig_var_name,
persistable=True,
type=v.type,
dtype=v.dtype,
shape=v.shape)
if self.trainers > 1: if self.trainers > 1:
for trainer_id in xrange(self.trainers): for trainer_id in xrange(self.trainers):
var = pserver_program.global_block().create_var( var = pserver_program.global_block().create_var(
...@@ -294,6 +293,12 @@ class DistributeTranspiler: ...@@ -294,6 +293,12 @@ class DistributeTranspiler:
shape=v.shape) shape=v.shape)
recv_inputs.append(var) recv_inputs.append(var)
else: else:
single_trainer_var = pserver_program.global_block().create_var(
name=orig_var_name,
persistable=True,
type=v.type,
dtype=v.dtype,
shape=v.shape)
recv_inputs.append(single_trainer_var) recv_inputs.append(single_trainer_var)
# step3 # step3
...@@ -344,7 +349,7 @@ class DistributeTranspiler: ...@@ -344,7 +349,7 @@ class DistributeTranspiler:
self._append_pserver_non_opt_ops(block, op) self._append_pserver_non_opt_ops(block, op)
append_block = optimize_block append_block = optimize_block
# append lr decay ops to the child block if exits # append lr decay ops to the child block if exists
lr_ops = self._get_lr_ops() lr_ops = self._get_lr_ops()
if len(lr_ops) > 0: if len(lr_ops) > 0:
for _, op in enumerate(lr_ops): for _, op in enumerate(lr_ops):
...@@ -447,8 +452,10 @@ class DistributeTranspiler: ...@@ -447,8 +452,10 @@ class DistributeTranspiler:
block_list, block_list,
add_trainer_suffix=False): add_trainer_suffix=False):
""" """
Create vars for each split.
NOTE: only grads need to be named for different trainers, use NOTE: only grads need to be named for different trainers, use
add_trainer_suffix to rename the grad vars. add_trainer_suffix to rename the grad vars.
:return: A dict mapping from original var name to each var split.
""" """
block_map = dict() block_map = dict()
var_mapping = dict() var_mapping = dict()
...@@ -615,6 +622,7 @@ class DistributeTranspiler: ...@@ -615,6 +622,7 @@ class DistributeTranspiler:
type="sum", type="sum",
inputs={"X": vars2merge}, inputs={"X": vars2merge},
outputs={"Out": merged_var}) outputs={"Out": merged_var})
# TODO(panyx0718): What if it's SELECTED_ROWS.
if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS: if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
optimize_block.append_op( optimize_block.append_op(
type="scale", type="scale",
...@@ -638,7 +646,7 @@ class DistributeTranspiler: ...@@ -638,7 +646,7 @@ class DistributeTranspiler:
shape=param_block.shape) shape=param_block.shape)
new_inputs[key] = tmpvar new_inputs[key] = tmpvar
elif key == "LearningRate": elif key == "LearningRate":
# leraning rate variable has already be created by non-optimize op, # learning rate variable has already be created by non-optimize op,
# don't create it once again. # don't create it once again.
lr_varname = opt_op.input(key)[0] lr_varname = opt_op.input(key)[0]
if pserver_block.vars.has_key(lr_varname): if pserver_block.vars.has_key(lr_varname):
...@@ -773,6 +781,7 @@ class DistributeTranspiler: ...@@ -773,6 +781,7 @@ class DistributeTranspiler:
return False return False
def _get_input_map_from_op(self, varmap, op): def _get_input_map_from_op(self, varmap, op):
"""Returns a dict from op input name to the vars in varmap."""
iomap = dict() iomap = dict()
for key in op.input_names: for key in op.input_names:
vars = [] vars = []
...@@ -785,6 +794,7 @@ class DistributeTranspiler: ...@@ -785,6 +794,7 @@ class DistributeTranspiler:
return iomap return iomap
def _get_output_map_from_op(self, varmap, op): def _get_output_map_from_op(self, varmap, op):
"""Returns a dict from op output name to the vars in varmap."""
iomap = dict() iomap = dict()
for key in op.output_names: for key in op.output_names:
vars = [] vars = []
...@@ -812,6 +822,7 @@ class DistributeTranspiler: ...@@ -812,6 +822,7 @@ class DistributeTranspiler:
find_ops.append(op) find_ops.append(op)
# make a union find struct by the ops in default_main_program # make a union find struct by the ops in default_main_program
ufind = UnionFind(block.ops) ufind = UnionFind(block.ops)
for op1 in block.ops: for op1 in block.ops:
for op2 in block.ops: for op2 in block.ops:
# NOTE: we need to skip all optimize ops, since it is connected # NOTE: we need to skip all optimize ops, since it is connected
......
...@@ -640,12 +640,26 @@ class Operator(object): ...@@ -640,12 +640,26 @@ class Operator(object):
""" """
return self.desc.block_attr(name) return self.desc.block_attr(name)
def all_attrs(self):
"""
Get the attribute dict
Returns(dict): The Operator's attribute dict
"""
attr_names = self.attr_names
attr_map = {}
for n in attr_names:
if n == 'sub_block':
attr_map[n] = self.block_attr(n)
else:
attr_map[n] = self.attr(n)
return attr_map
class Block(object): class Block(object):
def __init__(self, program, idx): def __init__(self, program, idx):
self.desc = program.desc.block(idx) self.desc = program.desc.block(idx)
self.vars = dict() # var_name --> var self.vars = dict() # var_name --> var
self.ops = collections.deque() # operator list self.ops = list() # operator list
self.program = program self.program = program
self.removed_vars = dict() self.removed_vars = dict()
...@@ -817,6 +831,13 @@ class Block(object): ...@@ -817,6 +831,13 @@ class Block(object):
self.ops.append(op) self.ops.append(op)
return op return op
def insert_op(self, index, *args, **kwargs):
self.sync_with_cpp()
op_desc = self.desc.insert_op(index)
op = Operator(block=self, desc=op_desc, *args, **kwargs)
self.ops.insert(index, op)
return op
def delete_ops(self, ops): def delete_ops(self, ops):
# remove from cpp # remove from cpp
# FIXME(typhoonzero): remove only the first occurrence. # FIXME(typhoonzero): remove only the first occurrence.
...@@ -828,17 +849,17 @@ class Block(object): ...@@ -828,17 +849,17 @@ class Block(object):
self.desc.remove_op(start, end + 1) self.desc.remove_op(start, end + 1)
def slice_ops(self, start, end): def slice_ops(self, start, end):
return list(self.ops)[start:end] return self.ops[start:end]
def prepend_op(self, *args, **kwargs): def prepend_op(self, *args, **kwargs):
op_desc = self.desc.prepend_op() op_desc = self.desc.prepend_op()
op = Operator(self, op_desc, *args, **kwargs) op = Operator(self, op_desc, *args, **kwargs)
self.ops.appendleft(op) self.ops.insert(0, op)
return op return op
def sync_with_cpp(self): def sync_with_cpp(self):
""" """
Sync with the desc on the c++ end. Sync from the desc on the c++ end.
This method is used to synchronize the c++ desc instance generated by backward. This method is used to synchronize the c++ desc instance generated by backward.
""" """
...@@ -878,7 +899,7 @@ class Block(object): ...@@ -878,7 +899,7 @@ class Block(object):
for index in range((start_index - 1 - 1), -1, -1): for index in range((start_index - 1 - 1), -1, -1):
op_desc = ops_in_cpp[index] op_desc = ops_in_cpp[index]
op = Operator(self, op_desc) op = Operator(self, op_desc)
self.ops.appendleft(op) self.ops.insert(0, op)
# sync ops append to the end of cpp_ops # sync ops append to the end of cpp_ops
for index in range((end_index + 1), len(ops_in_cpp)): for index in range((end_index + 1), len(ops_in_cpp)):
......
...@@ -83,7 +83,7 @@ class Graph(object): ...@@ -83,7 +83,7 @@ class Graph(object):
file = open(dot_path, 'w') file = open(dot_path, 'w')
file.write(self.__str__()) file.write(self.__str__())
image_path = os.path.join( image_path = os.path.join(
os.path.dirname(__file__), dot_path[:-3] + "pdf") os.path.dirname(dot_path), dot_path[:-3] + "pdf")
cmd = ["dot", "-Tpdf", dot_path, "-o", image_path] cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
subprocess.Popen( subprocess.Popen(
cmd, cmd,
...@@ -199,7 +199,7 @@ class GraphPreviewGenerator(object): ...@@ -199,7 +199,7 @@ class GraphPreviewGenerator(object):
else: else:
self.graph.show(path) self.graph.show(path)
def add_param(self, name, data_type, shape, highlight=False): def add_param(self, name, data_type, highlight=False):
label = '\n'.join([ label = '\n'.join([
'<<table cellpadding="5">', '<<table cellpadding="5">',
' <tr>', ' <tr>',
...@@ -214,11 +214,6 @@ class GraphPreviewGenerator(object): ...@@ -214,11 +214,6 @@ class GraphPreviewGenerator(object):
str(data_type), str(data_type),
' </td>' ' </td>'
' </tr>', ' </tr>',
' <tr>',
' <td>',
'[%s]' % 'x'.join(shape),
' </td>'
' </tr>',
'</table>>', '</table>>',
]) ])
return self.graph.node( return self.graph.node(
......
...@@ -255,7 +255,32 @@ def _copy_reader_var_(block, var): ...@@ -255,7 +255,32 @@ def _copy_reader_var_(block, var):
new_var.desc.set_shapes(var.desc.shapes()) new_var.desc.set_shapes(var.desc.shapes())
new_var.desc.set_dtypes(var.desc.dtypes()) new_var.desc.set_dtypes(var.desc.dtypes())
new_var.persistable = True new_var.persistable = True
return monkey_patch_reader_methods(new_var) return new_var
def _copy_reader_create_op_(block, op):
input_param_names = op.input_names
new_input_map = {}
for param_name in input_param_names:
new_input_map[param_name] = []
arg_names = op.input(param_name)
for arg_name in arg_names:
new_input_map[param_name].append(block.var(arg_name))
output_param_names = op.output_names
new_output_map = {}
for param_name in output_param_names:
new_output_map[param_name] = []
arg_names = op.output(param_name)
for arg_name in arg_names:
new_output_map[param_name].append(block.var(arg_name))
new_op = block.append_op(
type=op.type,
inputs=new_input_map,
outputs=new_output_map,
attrs=op.all_attrs())
return new_op
def open_recordio_file(filename, shapes, lod_levels, dtypes): def open_recordio_file(filename, shapes, lod_levels, dtypes):
...@@ -283,8 +308,9 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes): ...@@ -283,8 +308,9 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes):
startup_var.desc.set_dtypes(dtypes) startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(), main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var) startup_var)
return monkey_patch_reader_methods(main_prog_var)
def open_files(filenames, thread_num, shapes, lod_levels, dtypes): def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
...@@ -313,22 +339,25 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes): ...@@ -313,22 +339,25 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
startup_var.desc.set_dtypes(dtypes) startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(), main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var) startup_var)
return monkey_patch_reader_methods(main_prog_var)
def __create_decorated_reader__(op_type, reader, attrs): def __create_decorated_reader__(op_type, reader, attrs):
var_name = unique_name(op_type) var_name = unique_name(op_type)
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name) startup_var = startup_blk.create_var(name=var_name)
startup_blk.append_op( startop_op = startup_blk.append_op(
type=op_type, type=op_type,
inputs={'UnderlyingReader': reader}, inputs={'UnderlyingReader': reader},
outputs={'Out': [startup_var]}, outputs={'Out': [startup_var]},
attrs=attrs) attrs=attrs)
startup_var.persistable = True startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(), main_prog_block = default_main_program().current_block()
startup_var) main_prog_var = _copy_reader_var_(main_prog_block, startup_var)
_copy_reader_create_op_(main_prog_block, startop_op)
return monkey_patch_reader_methods(main_prog_var)
def create_shuffle_reader(reader, buffer_size): def create_shuffle_reader(reader, buffer_size):
......
...@@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor'] ...@@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor']
class ParallelExecutor(object): class ParallelExecutor(object):
def __init__(self, def __init__(self,
loss_name,
use_cuda, use_cuda,
loss_name=None,
main_program=None,
num_threads=None, num_threads=None,
allow_op_delay=False): allow_op_delay=False,
share_vars_from=None):
"""
ParallelExecutor can run program in parallel.
Args:
use_cuda(bool): Whether to use CUDA or not.
loss_name(str, default None): The loss name must set in training.
main_program(Program, default None): The program that need to run,
if not provided, then default_main_program will be used.
num_threads(int, default None): How many threads are used for
training.
allow_op_delay(bool, default False): Whether to delay and buffer
some operators together for scheduling or not, which may
improve performance in some cases, defalut False.
share_vars_from(ParallelExecutor, default None): If provied,
it will share variables from the specified ParallelExecutor.
Returns:
A ParallelExecutor object.
Raises:
TypeError: If share_vars_from is provided, but not ParallelExecutor
object.
Examples:
.. code-block:: python
train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
"""
self._places = [] self._places = []
self._act_places = [] self._act_places = []
if use_cuda: if use_cuda:
...@@ -50,10 +89,21 @@ class ParallelExecutor(object): ...@@ -50,10 +89,21 @@ class ParallelExecutor(object):
else: else:
min(len(self._places) * 2, multiprocessing.cpu_count()) min(len(self._places) * 2, multiprocessing.cpu_count())
startup = framework.default_startup_program() main = main_program
main = framework.default_main_program() main = main if main else framework.default_main_program()
scope = executor.global_scope() scope = executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes(
) if share_vars_from else []
persistable_vars = [
v.name
for v in filter(lambda var: var.persistable, main.list_vars())
]
self.executor = core.ParallelExecutor( self.executor = core.ParallelExecutor(
num_threads, num_threads,
True if use_cuda else False, # use_event True if use_cuda else False, # use_event
...@@ -62,10 +112,11 @@ class ParallelExecutor(object): ...@@ -62,10 +112,11 @@ class ParallelExecutor(object):
p.name for p in main.global_block().iter_parameters() p.name for p in main.global_block().iter_parameters()
if not p.stop_gradient if not p.stop_gradient
]), ]),
startup.desc, set(persistable_vars),
main.desc, main.desc,
loss_name, loss_name if loss_name else '',
scope, scope,
local_scopes,
allow_op_delay) allow_op_delay)
self.scope = scope self.scope = scope
......
...@@ -535,9 +535,37 @@ class TestSwish(OpTest): ...@@ -535,9 +535,37 @@ class TestSwish(OpTest):
#--------------------test MKLDNN-------------------- #--------------------test MKLDNN--------------------
class TestMKLDNNRelu(TestRelu): class TestMKLDNNReluDim2(TestRelu):
def setUp(self): def setUp(self):
super(TestMKLDNNRelu, self).setUp() super(TestMKLDNNReluDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanhDim2(TestTanh):
def setUp(self):
super(TestMKLDNNTanhDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrtDim2(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrtDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbsDim2(TestAbs):
def setUp(self):
super(TestMKLDNNAbsDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNReluDim4(TestRelu):
def setUp(self):
super(TestMKLDNNReluDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs # The same reason with TestAbs
...@@ -549,9 +577,9 @@ class TestMKLDNNRelu(TestRelu): ...@@ -549,9 +577,9 @@ class TestMKLDNNRelu(TestRelu):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanh(TestTanh): class TestMKLDNNTanhDim4(TestTanh):
def setUp(self): def setUp(self):
super(TestMKLDNNTanh, self).setUp() super(TestMKLDNNTanhDim4, self).setUp()
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
...@@ -560,9 +588,9 @@ class TestMKLDNNTanh(TestTanh): ...@@ -560,9 +588,9 @@ class TestMKLDNNTanh(TestTanh):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrt(TestSqrt): class TestMKLDNNSqrtDim4(TestSqrt):
def setUp(self): def setUp(self):
super(TestMKLDNNSqrt, self).setUp() super(TestMKLDNNSqrtDim4, self).setUp()
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
...@@ -571,9 +599,9 @@ class TestMKLDNNSqrt(TestSqrt): ...@@ -571,9 +599,9 @@ class TestMKLDNNSqrt(TestSqrt):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbs(TestAbs): class TestMKLDNNAbsDim4(TestAbs):
def setUp(self): def setUp(self):
super(TestMKLDNNAbs, self).setUp() super(TestMKLDNNAbsDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs # The same reason with TestAbs
......
...@@ -51,7 +51,9 @@ class TestDebugger(unittest.TestCase): ...@@ -51,7 +51,9 @@ class TestDebugger(unittest.TestCase):
outputs={"Out": mul_out}, outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1}) attrs={"x_num_col_dims": 1})
print(debuger.pprint_program_codes(p.desc)) print(debuger.pprint_program_codes(p))
debuger.draw_block_graphviz(p.block(0), path="./test.dot")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -207,7 +207,11 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -207,7 +207,11 @@ class TestParallelExecutorBase(unittest.TestCase):
if memory_opt: if memory_opt:
fluid.memory_optimize(main) fluid.memory_optimize(main)
exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True) place = fluid.CUDAPlace(0)
startup_exe = fluid.Executor(place)
startup_exe.run(startup)
exe = fluid.ParallelExecutor(True, loss_name=loss.name)
if batch_size is not None: if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count() batch_size *= fluid.core.get_cuda_device_count()
begin = time.time() begin = time.time()
...@@ -453,3 +457,41 @@ class TestTransformer(TestParallelExecutorBase): ...@@ -453,3 +457,41 @@ class TestTransformer(TestParallelExecutorBase):
@unittest.skip("transformer is buggy in multi gpu") @unittest.skip("transformer is buggy in multi gpu")
def test_main(self): def test_main(self):
self.check_network_convergence(transformer) self.check_network_convergence(transformer)
class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def test_parallel_testing(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = simple_fc_net(True)
test_program = main.clone(for_test=True)
opt = fluid.optimizer.SGD(learning_rate=0.0001)
opt.minimize(loss)
batch_size = 32
image = numpy.random.normal(size=(batch_size,
784)).astype('float32')
label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
feed_dict = {'image': image, 'label': label}
train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name, main_program=main)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
for i in xrange(5):
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
test_loss = numpy.array(test_loss)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
train_loss = numpy.array(train_loss)
self.assertTrue(numpy.allclose(train_loss, test_loss))
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle import paddle.v2 as paddle
import paddle.dataset.mnist as mnist import paddle.v2.dataset.mnist as mnist
class TestRecordIO(unittest.TestCase): class TestRecordIO(unittest.TestCase):
......
...@@ -107,6 +107,7 @@ package_dir={ ...@@ -107,6 +107,7 @@ package_dir={
# So that package points to other directory. # So that package points to other directory.
'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform', 'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework', 'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
} }
if '${WITH_FLUID_ONLY}'== 'OFF': if '${WITH_FLUID_ONLY}'== 'OFF':
package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle' package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册