Commit 6e2424e4 authored by Abhinav Arora

Merge remote-tracking branch 'origin/develop' into cpplint_ops_a

...@@ -48,6 +48,13 @@ parser.add_argument( ...@@ -48,6 +48,13 @@ parser.add_argument(
type=int, type=int,
default=16, default=16,
help="The sequence number of a mini-batch data. (default: %(default)d)") help="The sequence number of a mini-batch data. (default: %(default)d)")
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The number of initial mini-batches to skip, for a more accurate performance test.')
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument( parser.add_argument(
"--dict_size", "--dict_size",
type=int, type=int,
...@@ -72,16 +79,21 @@ parser.add_argument( ...@@ -72,16 +79,21 @@ parser.add_argument(
default=3, default=3,
help="The width for beam searching. (default: %(default)d)") help="The width for beam searching. (default: %(default)d)")
parser.add_argument(
-    "--use_gpu",
-    type=distutils.util.strtobool,
-    default=True,
-    help="Whether to use gpu. (default: %(default)d)")
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
parser.add_argument( parser.add_argument(
"--max_length", "--max_length",
type=int, type=int,
default=250, default=250,
help="The maximum length of sequence when doing generation. " help="The maximum length of sequence when doing generation. "
"(default: %(default)d)") "(default: %(default)d)")
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
...@@ -281,7 +293,7 @@ def train(): ...@@ -281,7 +293,7 @@ def train():
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size) batch_size=args.batch_size)
-    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = Executor(place) exe = Executor(place)
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
...@@ -307,14 +319,20 @@ def train(): ...@@ -307,14 +319,20 @@ def train():
return total_loss / count return total_loss / count
+    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in xrange(args.pass_num):
-        pass_start_time = time.time()
-        words_seen = 0
+        train_accs = []
+        train_losses = []
        for batch_id, data in enumerate(train_batch_generator()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
            src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
-            words_seen += word_num
+            num_samples += word_num
            trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
-            words_seen += word_num
+            num_samples += word_num
            lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)

            fetch_outs = exe.run(framework.default_main_program(),
@@ -325,24 +343,36 @@ def train():
                                 },
                                 fetch_list=[avg_cost])

-            avg_cost_val = np.array(fetch_outs[0])
-            print('pass_id=%d, batch_id=%d, train_loss: %f' %
-                  (pass_id, batch_id, avg_cost_val))
+            iters += 1
+            loss = np.array(fetch_outs[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
+            )  # The accuracy is the accumulation of batches, but not the current batch.

-        pass_end_time = time.time()
-        test_loss = do_validation()
-        time_consumed = pass_end_time - pass_start_time
-        words_per_sec = words_seen / time_consumed
-        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
-              (pass_id, test_loss, words_per_sec, time_consumed))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            test_loss = do_validation()
+        exit(0)
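The same warm-up-then-measure pattern is repeated in each benchmark script touched by this commit: the first `skip_batch_num` mini-batches run but are not timed, the timer and sample counter are reset once warm-up ends, and at most `iterations` mini-batches are measured before throughput is reported. A minimal, self-contained sketch of that pattern (the function and parameter names below are illustrative, not part of the commit):

import time

def measure_throughput(batches, run_batch, skip_batch_num=5, iterations=80):
    """Distilled sketch of the benchmark timing pattern above.

    `run_batch(data)` is assumed to execute one mini-batch and return the
    number of examples it processed.
    """
    iters, num_samples, start_time = 0, 0, time.time()
    for data in batches:
        if iters == skip_batch_num:      # discard warm-up time
            start_time = time.time()
            num_samples = 0
        if iters == iterations:          # cap the number of measured batches
            break
        num_samples += run_batch(data)
        iters += 1
    elapsed = time.time() - start_time
    examples_per_sec = num_samples / elapsed if elapsed > 0 else float('nan')
    return num_samples, elapsed, examples_per_sec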
def infer(): def infer():
pass pass
def print_arguments(args):
print('----------- seq2seq Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__': if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
print_arguments(args)
if args.infer_only: if args.infer_only:
infer() infer()
else: else:
......
...@@ -35,6 +35,12 @@ def parse_args(): ...@@ -35,6 +35,12 @@ def parse_args():
parser = argparse.ArgumentParser("mnist model benchmark.") parser = argparse.ArgumentParser("mnist model benchmark.")
parser.add_argument( parser.add_argument(
'--batch_size', type=int, default=128, help='The minibatch size.') '--batch_size', type=int, default=128, help='The minibatch size.')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The number of initial mini-batches to skip, for a more accurate performance test'
)
parser.add_argument( parser.add_argument(
'--iterations', type=int, default=35, help='The number of minibatches.') '--iterations', type=int, default=35, help='The number of minibatches.')
parser.add_argument( parser.add_argument(
...@@ -53,19 +59,14 @@ def parse_args(): ...@@ -53,19 +59,14 @@ def parse_args():
'--use_nvprof', '--use_nvprof',
action='store_true', action='store_true',
help='If set, use nvprof for CUDA.') help='If set, use nvprof for CUDA.')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
args = parser.parse_args() args = parser.parse_args()
return args return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def cnn_model(data): def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool( conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data, input=data,
...@@ -161,16 +162,22 @@ def run_benchmark(model, args): ...@@ -161,16 +162,22 @@ def run_benchmark(model, args):
paddle.dataset.mnist.train(), batch_size=args.batch_size) paddle.dataset.mnist.train(), batch_size=args.batch_size)
accuracy = fluid.average.WeightedAverage() accuracy = fluid.average.WeightedAverage()
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num): for pass_id in range(args.pass_num):
accuracy.reset() accuracy.reset()
-        pass_start = time.time()
+        train_accs = []
+        train_losses = []
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
img_data = np.array( img_data = np.array(
map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE) map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([len(y_data), 1]) y_data = y_data.reshape([len(y_data), 1])
start = time.time()
outs = exe.run( outs = exe.run(
fluid.default_main_program(), fluid.default_main_program(),
feed={"pixel": img_data, feed={"pixel": img_data,
...@@ -178,21 +185,36 @@ def run_benchmark(model, args): ...@@ -178,21 +185,36 @@ def run_benchmark(model, args):
fetch_list=[avg_cost, batch_acc, batch_size_tensor] fetch_list=[avg_cost, batch_acc, batch_size_tensor]
) # The accuracy is the accumulation of batches, but not the current batch. ) # The accuracy is the accumulation of batches, but not the current batch.
accuracy.add(value=outs[1], weight=outs[2]) accuracy.add(value=outs[1], weight=outs[2])
-            end = time.time()
+            iters += 1
+            num_samples += len(y_data)
loss = np.array(outs[0]) loss = np.array(outs[0])
acc = np.array(outs[1]) acc = np.array(outs[1])
print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" % train_losses.append(loss)
(pass_id, batch_id, loss, 1 - acc, (end - start) / 1000)) train_accs.append(acc)
print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
(pass_id, iters, loss, acc))
print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
pass_end = time.time() print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
(num_samples, train_elapsed, examples_per_sec))
# evaluation
if args.with_test:
test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
inference_program)
exit(0)
train_avg_acc = accuracy.eval()
test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
inference_program)
print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" % def print_arguments(args):
(pass_id, train_avg_acc, test_avg_acc, vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
(pass_end - pass_start) / 1000)) vars(args)['device'] == 'GPU')
print('----------- mnist Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -87,15 +87,6 @@ def parse_args(): ...@@ -87,15 +87,6 @@ def parse_args():
return args return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
conv1 = fluid.layers.conv2d( conv1 = fluid.layers.conv2d(
input=input, input=input,
...@@ -279,32 +270,31 @@ def run_benchmark(model, args): ...@@ -279,32 +270,31 @@ def run_benchmark(model, args):
'label': label}, 'label': label},
fetch_list=[avg_cost, batch_acc, batch_size_tensor]) fetch_list=[avg_cost, batch_acc, batch_size_tensor])
iters += 1 iters += 1
-            num_samples += label[0]
+            num_samples += len(label)
accuracy.add(value=acc, weight=weight) accuracy.add(value=acc, weight=weight)
train_losses.append(loss) train_losses.append(loss)
train_accs.append(acc) train_accs.append(acc)
print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
(pass_id, iters, loss, acc)) (pass_id, iters, loss, acc))
-        pass_train_acc = accuracy.eval()
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        train_elapsed = time.time() - start_time
        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            pass_test_acc = test(exe)
+        exit(0)
-    if args.use_cprof:
-        pr.disable()
-        s = StringIO.StringIO()
-        sortby = 'cumulative'
-        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
-        ps.print_stats()
-        print(s.getvalue())
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- resnet Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
if __name__ == '__main__': if __name__ == '__main__':
......
#!/bin/bash #!/bin/bash
# This script benchmarking the PaddlePaddle Fluid on # This script benchmarking the PaddlePaddle Fluid on
# single thread single GPU. # single thread single GPU.
-export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
+#export FLAGS_fraction_of_gpu_memory_to_use=0.0
+export CUDNN_PATH=/paddle/cudnn_v5
# disable openmp and mkl parallel # disable openmp and mkl parallel
#https://github.com/PaddlePaddle/Paddle/issues/7199 #https://github.com/PaddlePaddle/Paddle/issues/7199
...@@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0 ...@@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
# only query the gpu used
nohup stdbuf -oL nvidia-smi \
--id=${CUDA_VISIBLE_DEVICES} \
--query-gpu=timestamp \
--query-compute-apps=pid,process_name,used_memory \
--format=csv \
--filename=mem.log \
-l 1 &
# mnist
# mnist gpu mnist 128
FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=500 \
2>&1 | tee -a mnist_gpu_128.log
# vgg16
-# cifar10 gpu cifar10 128
-FLAGS_benchmark=true python fluid/vgg.py \
+# gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
  --device=GPU \
  --batch_size=128 \
  --skip_batch_num=5 \
  --iterations=30 \
- 2>&1 > vgg16_gpu_128.log
+ 2>&1 | tee -a vgg16_gpu_128.log
# flowers gpu 128
FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
--device=GPU \
--batch_size=32 \
--data_set=flowers \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a vgg16_gpu_flowers_32.log
# resnet50 # resnet50
# resnet50 gpu cifar10 128 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true python fluid/resnet.py \
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
  --device=GPU \
  --batch_size=128 \
  --data_set=cifar10 \
  --model=resnet_cifar10 \
  --skip_batch_num=5 \
  --iterations=30 \
- 2>&1 > resnet50_gpu_128.log
+ 2>&1 | tee -a resnet50_gpu_128.log
# resnet50 gpu flowers 64
FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
--device=GPU \
--batch_size=64 \
--data_set=flowers \
--model=resnet_imagenet \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a resnet50_gpu_flowers_64.log
# lstm # lstm
# lstm gpu imdb 32 # tensorflow only support batch=32
FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
--device=GPU \
--batch_size=32 \
--skip_batch_num=5 \
--iterations=30 \
--hidden_dim=512 \
--emb_dim=512 \
--crop_size=1500 \
2>&1 | tee -a lstm_gpu_32.log
# seq2seq
# seq2seq gpu wmt14 128
FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a lstm_gpu_128.log
...@@ -37,6 +37,14 @@ def parse_args(): ...@@ -37,6 +37,14 @@ def parse_args():
type=int, type=int,
default=32, default=32,
help='The sequence number of a batch data. (default: %(default)d)') help='The sequence number of a batch data. (default: %(default)d)')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test'
)
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument( parser.add_argument(
'--emb_dim', '--emb_dim',
type=int, type=int,
...@@ -64,6 +72,10 @@ def parse_args(): ...@@ -64,6 +72,10 @@ def parse_args():
default=int(os.environ.get('CROP_SIZE', '1500')), default=int(os.environ.get('CROP_SIZE', '1500')),
help='The max sentence length of input. Since this model uses a plain RNN,'
' gradients could explode if the sentence is too long')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -157,37 +169,43 @@ def main(): ...@@ -157,37 +169,43 @@ def main():
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
-    def train_loop(pass_num, crop_size):
-        with profiler.profiler(args.device, 'total') as prof:
-            for pass_id in range(pass_num):
-                train_reader = batch(
-                    paddle.reader.shuffle(
-                        crop_sentence(imdb.train(word_dict), crop_size),
-                        buf_size=25000),
-                    batch_size=args.batch_size)
-                word_nums = 0
-                pass_start_time = time.time()
-                for batch_id, data in enumerate(train_reader()):
-                    tensor_words = to_lodtensor([x[0] for x in data], place)
-                    for x in data:
-                        word_nums += len(x[0])
-                    label = numpy.array([x[1] for x in data]).astype("int64")
-                    label = label.reshape((-1, 1))
-                    loss_np, acc, weight = exe.run(
-                        fluid.default_main_program(),
-                        feed={"words": tensor_words,
-                              "label": label},
-                        fetch_list=[loss, batch_acc, batch_size_tensor])
-                    print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
-                          (pass_id, batch_id, loss_np, acc))
-
-                pass_end_time = time.time()
-                time_consumed = pass_end_time - pass_start_time
-                words_per_sec = word_nums / time_consumed
-                print("pass_id=%d, sec/pass: %f, words/s: %f" %
-                      (pass_id, time_consumed, words_per_sec))
-
-    train_loop(args.pass_num, args.crop_size)
+    train_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.train(word_dict), args.crop_size),
+            buf_size=25000),
+        batch_size=args.batch_size)
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            tensor_words = to_lodtensor([x[0] for x in data], place)
+            label = numpy.array([x[1] for x in data]).astype("int64")
+            label = label.reshape((-1, 1))
+            loss_np, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"words": tensor_words,
+                      "label": label},
+                fetch_list=[loss, batch_acc, batch_size_tensor])
+            iters += 1
+            for x in data:
+                num_samples += len(x[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iters, loss_np, acc)
+            )  # The accuracy is the accumulation of batches, but not the current batch.
+
+    train_elapsed = time.time() - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+    exit(0)
def to_lodtensor(data, place): def to_lodtensor(data, place):
...@@ -205,5 +223,14 @@ def to_lodtensor(data, place): ...@@ -205,5 +223,14 @@ def to_lodtensor(data, place):
return res return res
def print_arguments(args):
print('----------- lstm Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__': if __name__ == '__main__':
args = parse_args()
print_arguments(args)
main() main()
...@@ -191,25 +191,29 @@ def main(): ...@@ -191,25 +191,29 @@ def main():
fetch_list=[avg_cost, batch_acc, batch_size_tensor]) fetch_list=[avg_cost, batch_acc, batch_size_tensor])
accuracy.add(value=acc, weight=weight) accuracy.add(value=acc, weight=weight)
iters += 1 iters += 1
-            num_samples += len(data)
+            num_samples += len(y_data)
print( print(
"Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss, acc) (pass_id, iters, loss, acc)
) # The accuracy is the accumulation of batches, but not the current batch. ) # The accuracy is the accumulation of batches, but not the current batch.
-            pass_train_acc = accuracy.eval()
+            # pass_train_acc = accuracy.eval()
train_losses.append(loss) train_losses.append(loss)
train_accs.append(acc) train_accs.append(acc)
print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
train_elapsed = time.time() - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
(num_samples, train_elapsed, examples_per_sec))
# evaluation # evaluation
if args.with_test: if args.with_test:
pass_test_acc = test(exe) pass_test_acc = test(exe)
train_elapsed = time.time() - start_time exit(0)
print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
def print_arguments(): def print_arguments():
-    print('----------- Configuration Arguments -----------')
+    print('----------- vgg Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()): for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value)) print('%s: %s' % (arg, value))
print('------------------------------------------------') print('------------------------------------------------')
......
.timestamp
*.o *.o
*.a *.a
.svn .svn
......
...@@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name, ...@@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name,
"Tensor %s contains NAN", name); "Tensor %s contains NAN", name);
} }
void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
int block_id) {
auto& global_block = pdesc.Block(block_id);
const Scope* ancestor_scope = scope;
while (ancestor_scope->parent()) {
ancestor_scope = ancestor_scope->parent();
}
if (ancestor_scope != scope) {
for (auto& var : global_block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var->Persistable()) {
auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : global_block.AllVars()) {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
}
}
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars) {
platform::RecordBlock b(block_id); platform::RecordBlock b(block_id);
...@@ -184,8 +221,8 @@ static bool has_fetch_operators( ...@@ -184,8 +221,8 @@ static bool has_fetch_operators(
void Executor::Run(const ProgramDesc& program, Scope* scope, void Executor::Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets, std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets, std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name, bool create_vars, const std::string& feed_holder_name,
const std::string& fetch_holder_name, bool create_vars) { const std::string& fetch_holder_name) {
platform::RecordBlock b(kProgramId); platform::RecordBlock b(kProgramId);
bool has_feed_ops = bool has_feed_ops =
has_feed_operators(program.Block(0), feed_targets, feed_holder_name); has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
...@@ -296,38 +333,13 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( ...@@ -296,38 +333,13 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars) {
auto& block = ctx->prog_.Block(ctx->block_id_);
Scope* local_scope = scope; Scope* local_scope = scope;
if (create_vars) { if (create_vars) {
if (create_local_scope) { if (create_local_scope) {
local_scope = &scope->NewScope(); local_scope = &scope->NewScope();
for (auto& var : block.AllVars()) { }
if (var->Name() == framework::kEmptyVarName) { CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
continue; }
}
if (var->Persistable()) {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = local_scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : block.AllVars()) {
auto* ptr = local_scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
} // if (create_local_scope)
} // if (create_vars)
for (auto& op : ctx->ops_) { for (auto& op : ctx->ops_) {
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
......
...@@ -54,9 +54,9 @@ class Executor { ...@@ -54,9 +54,9 @@ class Executor {
void Run(const ProgramDesc& program, Scope* scope, void Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets, std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets, std::map<std::string, LoDTensor*>& fetch_targets,
bool create_vars = true,
const std::string& feed_holder_name = "feed", const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch", const std::string& fetch_holder_name = "fetch");
bool create_vars = true);
static std::unique_ptr<ExecutorPrepareContext> Prepare( static std::unique_ptr<ExecutorPrepareContext> Prepare(
const ProgramDesc& program, int block_id); const ProgramDesc& program, int block_id);
...@@ -64,6 +64,8 @@ class Executor { ...@@ -64,6 +64,8 @@ class Executor {
static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare( static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids); const ProgramDesc& program, const std::vector<int>& block_ids);
void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true, bool create_local_scope = true,
bool create_vars = true); bool create_vars = true);
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/platform/profiler.h"
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -24,6 +23,7 @@ limitations under the License. */ ...@@ -24,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -43,30 +43,40 @@ class ParallelExecutorPrivate { ...@@ -43,30 +43,40 @@ class ParallelExecutorPrivate {
#endif #endif
}; };
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
return member_->local_scopes_;
}
ParallelExecutor::ParallelExecutor( ParallelExecutor::ParallelExecutor(
size_t num_threads, bool use_event, size_t num_threads, bool use_event,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program, const ProgramDesc &main_program, const std::unordered_set<std::string> &bcast_vars,
const std::string &loss_var_name, Scope *scope, bool allow_op_delay) const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
: member_(new ParallelExecutorPrivate(places)) { : member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope; member_->global_scope_ = scope;
// Step 1. RunStartupProgram and Bcast the params to devs. // Step 1. Bcast the params to devs.
Executor exe(places[0]);
exe.Run(startup_program, scope, 0);
// Create local scopes // Create local scopes
for (size_t i = 0; i < member_->places_.size(); ++i) { if (local_scopes.empty()) {
member_->local_scopes_.push_back(&scope->NewScope()); for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(&scope->NewScope());
}
} else {
PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(local_scopes[i]);
}
} }
// Bcast Parameters to all GPUs // Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_)); member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
#endif #endif
if (platform::is_gpu_place(places[0]) && if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
member_->local_scopes_.size() != 1) { // Is CUDA local_scopes.empty()) { // Is CUDA
BCastParamsToGPUs(startup_program); BCastParamsToGPUs(bcast_vars);
} }
// Startup Program has been run. All local scopes has correct parameters. // Startup Program has been run. All local scopes has correct parameters.
...@@ -99,48 +109,47 @@ ParallelExecutor::ParallelExecutor( ...@@ -99,48 +109,47 @@ ParallelExecutor::ParallelExecutor(
} }
void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BCastParamsToGPUs(
const ProgramDesc &startup_program) const { const std::unordered_set<std::string> &vars) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto *main_scope = member_->local_scopes_[0]; auto *main_scope = member_->local_scopes_[0];
for (auto *var_desc : startup_program.Block(0).AllVars()) { for (auto &var : vars) {
size_t idx = var_desc->Name().find("@GRAD"); auto *main_var = main_scope->FindVar(var);
if (idx != std::string::npos) continue; if (!main_var->IsType<LoDTensor>()) {
if (var_desc->GetType() == proto::VarType::LOD_TENSOR) { continue;
auto &main_tensor = }
main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
auto &main_tensor = main_var->Get<LoDTensor>();
auto &dims = main_tensor.dims();
auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) {
size_t numel = main_tensor.numel(); if (paddle::platform::is_gpu_place(main_tensor.place())) {
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); size_t numel = main_tensor.numel();
platform::NCCLGroupGuard guard; ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
for (size_t i = 0; i < member_->places_.size(); ++i) { platform::NCCLGroupGuard guard;
auto place = member_->places_[i]; for (size_t i = 0; i < member_->places_.size(); ++i) {
void *buffer; auto place = member_->places_[i];
if (i == 0) { void *buffer;
buffer = const_cast<void *>(main_tensor.data<void>()); if (i == 0) {
} else { buffer = const_cast<void *>(main_tensor.data<void>());
auto local_scope = member_->local_scopes_[i]; } else {
auto *t =
local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
t->Resize(dims);
buffer = t->mutable_data(place, main_tensor.type());
}
auto &nccl_ctx = member_->nccl_ctxs_->at(place);
platform::dynload::ncclBcast(buffer, numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
}
} else {
platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) {
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>(); auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
t->Resize(dims); t->Resize(dims);
t->mutable_data(cpu, main_tensor.type()); buffer = t->mutable_data(place, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t);
} }
auto &nccl_ctx = member_->nccl_ctxs_->at(place);
platform::dynload::ncclBcast(buffer, numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
}
} else {
platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) {
auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
t->Resize(dims);
t->mutable_data(cpu, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t);
} }
} }
member_->nccl_ctxs_->WaitAll(); member_->nccl_ctxs_->WaitAll();
......
...@@ -36,11 +36,14 @@ class ParallelExecutor { ...@@ -36,11 +36,14 @@ class ParallelExecutor {
explicit ParallelExecutor(size_t num_threads, bool use_event, explicit ParallelExecutor(size_t num_threads, bool use_event,
const std::vector<platform::Place>& places, const std::vector<platform::Place>& places,
const std::unordered_set<std::string>& params, const std::unordered_set<std::string>& params,
-                            const ProgramDesc& startup_program,
+                            const std::unordered_set<std::string>& bcast_vars,
                             const ProgramDesc& main_program,
                             const std::string& loss_var_name, Scope* scope,
+                            const std::vector<Scope*>& local_scopes,
bool allow_op_delay); bool allow_op_delay);
std::vector<Scope*>& GetLocalScopes();
void Run(const std::vector<std::string>& fetch_tensors, void Run(const std::vector<std::string>& fetch_tensors,
const std::string& fetched_var_name, const std::string& fetched_var_name,
const std::unordered_map<std::string, LoDTensor>& feed_tensors); const std::unordered_map<std::string, LoDTensor>& feed_tensors);
...@@ -51,7 +54,7 @@ class ParallelExecutor { ...@@ -51,7 +54,7 @@ class ParallelExecutor {
ParallelExecutorPrivate* member_; ParallelExecutorPrivate* member_;
-  void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
+  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
}; };
} // namespace framework } // namespace framework
......
...@@ -58,7 +58,7 @@ class Scope { ...@@ -58,7 +58,7 @@ class Scope {
/// nullptr if cannot find. /// nullptr if cannot find.
Variable* FindVar(const std::string& name) const; Variable* FindVar(const std::string& name) const;
-  const Scope& parent() const { return *parent_; }
+  const Scope* parent() const { return parent_; }
/// Find the scope or an ancestor scope that contains the given variable. /// Find the scope or an ancestor scope that contains the given variable.
const Scope* FindScope(const Variable* var) const; const Scope* FindScope(const Variable* var) const;
......
...@@ -46,8 +46,8 @@ TEST(inference, image_classification) { ...@@ -46,8 +46,8 @@ TEST(inference, image_classification) {
// Run inference on CPU // Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
-                                            FLAGS_repeat);
+  TestInference<paddle::platform::CPUPlace, false>(dirname, cpu_feeds,
+                                                   cpu_fetchs1, FLAGS_repeat);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -57,8 +57,8 @@ TEST(inference, image_classification) { ...@@ -57,8 +57,8 @@ TEST(inference, image_classification) {
// Run inference on CUDA GPU // Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: ---"; LOG(INFO) << "--- GPU Runs: ---";
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
-                                             FLAGS_repeat);
+  TestInference<paddle::platform::CUDAPlace, false>(dirname, cpu_feeds,
+                                                    cpu_fetchs2, FLAGS_repeat);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); CheckError<float>(output1, output2);
......
...@@ -88,7 +88,7 @@ void CheckError(const paddle::framework::LoDTensor& output1, ...@@ -88,7 +88,7 @@ void CheckError(const paddle::framework::LoDTensor& output1,
EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
} }
-template <typename Place>
+template <typename Place, bool CreateVars = true>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs, const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
...@@ -166,8 +166,16 @@ void TestInference(const std::string& dirname, ...@@ -166,8 +166,16 @@ void TestInference(const std::string& dirname,
// 6. Run the inference program // 6. Run the inference program
{ {
if (!CreateVars) {
// If users don't want to create and destroy variables every time they
// run, they need to set `create_vars` to false and manually call
// `CreateVariables` before running.
executor.CreateVariables(*inference_program, scope, 0);
}
// Ignore the profiling results of the first run // Ignore the profiling results of the first run
-    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    executor.Run(*inference_program, scope, feed_targets, fetch_targets,
+                 CreateVars);
// Enable the profiler // Enable the profiler
paddle::platform::EnableProfiler(state); paddle::platform::EnableProfiler(state);
...@@ -178,7 +186,8 @@ void TestInference(const std::string& dirname, ...@@ -178,7 +186,8 @@ void TestInference(const std::string& dirname,
"run_inference", "run_inference",
paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::platform::DeviceContextPool::Instance().Get(place));
-      executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+      executor.Run(*inference_program, scope, feed_targets, fetch_targets,
+                   CreateVars);
} }
// Disable the profiler and print the timing information // Disable the profiler and print the timing information
......
...@@ -56,11 +56,11 @@ class GoOp : public framework::OperatorBase { ...@@ -56,11 +56,11 @@ class GoOp : public framework::OperatorBase {
// TODO(varunarora): Consider moving this root scope lookup to scope.h. // TODO(varunarora): Consider moving this root scope lookup to scope.h.
const framework::Scope *root_scope = &scope; const framework::Scope *root_scope = &scope;
-    const framework::Scope *parent_scope = &(root_scope->parent());
+    const framework::Scope *parent_scope = root_scope->parent();
while (parent_scope != nullptr) { while (parent_scope != nullptr) {
root_scope = parent_scope; root_scope = parent_scope;
-      parent_scope = &(parent_scope->parent());
+      parent_scope = parent_scope->parent();
} }
framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock); framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock);
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -35,7 +37,7 @@ class LoDResetKernel : public framework::OpKernel<T> { ...@@ -35,7 +37,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
if (lod_t->lod().size() > 0) { if (lod_t->lod().size() > 0) {
auto y_lod = lod_t->lod(); auto y_lod = lod_t->lod();
auto last_level = y_lod[y_lod.size() - 1]; auto last_level = y_lod[y_lod.size() - 1];
-      PADDLE_ENFORCE_EQ(last_level.back(), in->dims()[0],
+      PADDLE_ENFORCE_EQ((int64_t)(last_level.back()), in->dims()[0],
"Last value of `Y`'s last level LoD should be equal " "Last value of `Y`'s last level LoD should be equal "
"to the first dimension of `X`"); "to the first dimension of `X`");
out->set_lod(y_lod); out->set_lod(y_lod);
......
...@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include <gtest/gtest.h>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) { ...@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) {
// Conversion operator // Conversion operator
EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
EXPECT_EQ(float(float16(0.5f)), 0.5f); EXPECT_EQ(static_cast<float>(float16(0.5f)), 0.5f);
EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); EXPECT_NEAR(static_cast<double>(float16(0.33333)), 0.33333, 0.0001);
EXPECT_EQ(int(float16(-1)), -1); EXPECT_EQ(static_cast<int>(float16(-1)), -1);
EXPECT_EQ(bool(float16(true)), true); EXPECT_EQ(static_cast<bool>(float16(true)), true);
} }
TEST(float16, arithmetic_cpu) { TEST(float16, arithmetic_cpu) {
EXPECT_EQ(float(float16(1) + float16(1)), 2); EXPECT_EQ(static_cast<float>(float16(1) + float16(1)), 2);
EXPECT_EQ(float(float16(5) + float16(-5)), 0); EXPECT_EQ(static_cast<float>(float16(5) + float16(-5)), 0);
EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001); EXPECT_NEAR(static_cast<float>(float16(0.33333f) + float16(0.66667f)), 1.0f,
EXPECT_EQ(float(float16(3) - float16(5)), -2); 0.001);
EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001); EXPECT_EQ(static_cast<float>(float16(3) - float16(5)), -2);
EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); EXPECT_NEAR(static_cast<float>(float16(0.66667f) - float16(0.33333f)),
EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); 0.33334f, 0.001);
EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); EXPECT_NEAR(static_cast<float>(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); EXPECT_NEAR(static_cast<float>(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
EXPECT_EQ(float(-float16(512.0f)), -512.0f); EXPECT_NEAR(static_cast<float>(float16(2.0f) / float16(3.0f)), 0.66667f,
EXPECT_EQ(float(-float16(-512.0f)), 512.0f); 0.001);
EXPECT_EQ(static_cast<float>(float16(1.0f) / float16(2.0f)), 0.5f);
EXPECT_EQ(static_cast<float>(-float16(512.0f)), -512.0f);
EXPECT_EQ(static_cast<float>(-float16(-512.0f)), 512.0f);
} }
TEST(float16, comparison_cpu) { TEST(float16, comparison_cpu) {
......
...@@ -36,19 +36,19 @@ limitations under the License. */ ...@@ -36,19 +36,19 @@ limitations under the License. */
half *in1, *in2, *out; \ half *in1, *in2, *out; \
half *d_in1, *d_in2, *d_out; \ half *d_in1, *d_in2, *d_out; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc((void**)&d_out, size); \ cudaMalloc(reinterpret_cast<void**>(&d_out), size); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
out = (half*)malloc(size); \ out = reinterpret_cast<half*>(malloc(size)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(out[0])), v_out); \ EXPECT_EQ(static_cast<float>(float16(out[0])), v_out); \
free(in1); \ free(in1); \
free(in2); \ free(in2); \
free(out); \ free(out); \
...@@ -63,17 +63,17 @@ limitations under the License. */ ...@@ -63,17 +63,17 @@ limitations under the License. */
half *in1, *in2; \ half *in1, *in2; \
half *d_in1, *d_in2; \ half *d_in1, *d_in2; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2); \ op_type<<<1, 1>>>(d_in1, d_in2); \
cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(in1[0])), v_out); \ EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out); \
free(in1); \ free(in1); \
free(in2); \ free(in2); \
cudaFree(d_in1); \ cudaFree(d_in1); \
...@@ -87,12 +87,12 @@ limitations under the License. */ ...@@ -87,12 +87,12 @@ limitations under the License. */
half *d_in1, *d_in2; \ half *d_in1, *d_in2; \
bool *out, *d_out; \ bool *out, *d_out; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc((void**)&d_out, 1); \ cudaMalloc(reinterpret_cast<void**>(&d_out), 1); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
out = (bool*)malloc(1); \ out = reinterpret_cast<bool*>(malloc(1)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
...@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) { ...@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) {
LOG(INFO) << "Test Neg on GPU!"; LOG(INFO) << "Test Neg on GPU!";
half *in, *d_in; half *in, *d_in;
int size = sizeof(half); int size = sizeof(half);
cudaMalloc((void**)&d_in, size); cudaMalloc(reinterpret_cast<void**>(&d_in), size);
in = (half*)malloc(size); in = reinterpret_cast<half*>(malloc(size));
in[0] = half(float16(v_in)); in[0] = half(float16(v_in));
cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
Neg<<<1, 1>>>(d_in); Neg<<<1, 1>>>(d_in);
cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
EXPECT_EQ(float(float16(in[0])), v_out); EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
free(in); free(in);
cudaFree(d_in); cudaFree(d_in);
} }
......
...@@ -544,13 +544,20 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -544,13 +544,20 @@ All parameter, weight, gradient are variables in Paddle.
[](ParallelExecutor &self, size_t num_threads, bool use_event, [](ParallelExecutor &self, size_t num_threads, bool use_event,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, bool allow_op_delay) { Scope *scope, std::vector<Scope *> &local_scopes,
new (&self) ParallelExecutor(num_threads, use_event, places, bool allow_op_delay) {
params, startup_program, main_program, new (&self)
loss_var_name, scope, allow_op_delay); ParallelExecutor(num_threads, use_event, places, params,
bcast_vars, main_program, loss_var_name,
scope, local_scopes, allow_op_delay);
}) })
.def("local_scopes",
[](ParallelExecutor &self) -> std::vector<Scope *> * {
return &self.GetLocalScopes();
},
py::return_value_policy::reference)
.def("run", &ParallelExecutor::Run); .def("run", &ParallelExecutor::Run);
BindRecordIOWriter(&m); BindRecordIOWriter(&m);
......
*pyc *pyc
build build
dist dist
paddlepaddle.egg-info
paddle.egg-info paddle.egg-info
paddlepaddle_gpu.egg-info paddlepaddle_gpu.egg-info
.idea .idea
......
...@@ -659,7 +659,7 @@ class Block(object): ...@@ -659,7 +659,7 @@ class Block(object):
def __init__(self, program, idx): def __init__(self, program, idx):
self.desc = program.desc.block(idx) self.desc = program.desc.block(idx)
self.vars = dict() # var_name --> var self.vars = dict() # var_name --> var
-        self.ops = collections.deque()  # operator list
+        self.ops = list()  # operator list
self.program = program self.program = program
self.removed_vars = dict() self.removed_vars = dict()
...@@ -831,6 +831,13 @@ class Block(object): ...@@ -831,6 +831,13 @@ class Block(object):
self.ops.append(op) self.ops.append(op)
return op return op
def insert_op(self, index, *args, **kwargs):
self.sync_with_cpp()
op_desc = self.desc.insert_op(index)
op = Operator(block=self, desc=op_desc, *args, **kwargs)
self.ops.insert(index, op)
return op
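The new `Block.insert_op` mirrors `append_op` but places the operator at a chosen index after syncing with the C++ block description. A hedged usage sketch, assuming it accepts the same keyword arguments as `append_op` (the `scale` operator and variable names below are only illustrative):

import paddle.fluid as fluid

block = fluid.default_main_program().global_block()
x = block.create_var(name='x', shape=[1], dtype='float32')
y = block.create_var(name='x_scaled', shape=[1], dtype='float32')
# Insert a scale op at the front of the block, before all existing operators.
block.insert_op(
    0,
    type='scale',
    inputs={'X': [x]},
    outputs={'Out': [y]},
    attrs={'scale': 2.0})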
def delete_ops(self, ops): def delete_ops(self, ops):
# remove from cpp # remove from cpp
# FIXME(typhoonzero): remove only the first occurrence. # FIXME(typhoonzero): remove only the first occurrence.
...@@ -842,12 +849,12 @@ class Block(object): ...@@ -842,12 +849,12 @@ class Block(object):
self.desc.remove_op(start, end + 1) self.desc.remove_op(start, end + 1)
def slice_ops(self, start, end): def slice_ops(self, start, end):
-        return list(self.ops)[start:end]
+        return self.ops[start:end]
def prepend_op(self, *args, **kwargs): def prepend_op(self, *args, **kwargs):
op_desc = self.desc.prepend_op() op_desc = self.desc.prepend_op()
op = Operator(self, op_desc, *args, **kwargs) op = Operator(self, op_desc, *args, **kwargs)
-        self.ops.appendleft(op)
+        self.ops.insert(0, op)
return op return op
def sync_with_cpp(self): def sync_with_cpp(self):
...@@ -892,7 +899,7 @@ class Block(object): ...@@ -892,7 +899,7 @@ class Block(object):
for index in range((start_index - 1 - 1), -1, -1): for index in range((start_index - 1 - 1), -1, -1):
op_desc = ops_in_cpp[index] op_desc = ops_in_cpp[index]
op = Operator(self, op_desc) op = Operator(self, op_desc)
-            self.ops.appendleft(op)
+            self.ops.insert(0, op)
# sync ops append to the end of cpp_ops # sync ops append to the end of cpp_ops
for index in range((end_index + 1), len(ops_in_cpp)): for index in range((end_index + 1), len(ops_in_cpp)):
......
...@@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor'] ...@@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor']
class ParallelExecutor(object): class ParallelExecutor(object):
    def __init__(self,
-                 loss_name,
                 use_cuda,
+                 loss_name=None,
+                 main_program=None,
                 num_threads=None,
-                 allow_op_delay=False):
+                 allow_op_delay=False,
+                 share_vars_from=None):
"""
ParallelExecutor can run program in parallel.
Args:
use_cuda(bool): Whether to use CUDA or not.
loss_name(str, default None): The loss name must set in training.
main_program(Program, default None): The program that need to run,
if not provided, then default_main_program will be used.
num_threads(int, default None): How many threads are used for
training.
allow_op_delay(bool, default False): Whether to delay and buffer
some operators together for scheduling or not, which may
improve performance in some cases, default False.
share_vars_from(ParallelExecutor, default None): If provided,
it will share variables from the specified ParallelExecutor.
Returns:
A ParallelExecutor object.
Raises:
TypeError: If share_vars_from is provided, but not ParallelExecutor
object.
Examples:
.. code-block:: python
train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
"""
self._places = [] self._places = []
self._act_places = [] self._act_places = []
if use_cuda: if use_cuda:
...@@ -50,10 +89,21 @@ class ParallelExecutor(object): ...@@ -50,10 +89,21 @@ class ParallelExecutor(object):
else: else:
min(len(self._places) * 2, multiprocessing.cpu_count()) min(len(self._places) * 2, multiprocessing.cpu_count())
-        startup = framework.default_startup_program()
-        main = framework.default_main_program()
+        main = main_program
+        main = main if main else framework.default_main_program()
scope = executor.global_scope() scope = executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes(
) if share_vars_from else []
persistable_vars = [
v.name
for v in filter(lambda var: var.persistable, main.list_vars())
]
self.executor = core.ParallelExecutor( self.executor = core.ParallelExecutor(
num_threads, num_threads,
True if use_cuda else False, # use_event True if use_cuda else False, # use_event
...@@ -62,10 +112,11 @@ class ParallelExecutor(object): ...@@ -62,10 +112,11 @@ class ParallelExecutor(object):
p.name for p in main.global_block().iter_parameters() p.name for p in main.global_block().iter_parameters()
if not p.stop_gradient if not p.stop_gradient
]), ]),
-            startup.desc,
+            set(persistable_vars),
            main.desc,
-            loss_name,
+            loss_name if loss_name else '',
            scope,
local_scopes,
allow_op_delay) allow_op_delay)
self.scope = scope self.scope = scope
......
...@@ -535,9 +535,37 @@ class TestSwish(OpTest): ...@@ -535,9 +535,37 @@ class TestSwish(OpTest):
#--------------------test MKLDNN-------------------- #--------------------test MKLDNN--------------------
-class TestMKLDNNRelu(TestRelu):
+class TestMKLDNNReluDim2(TestRelu):
    def setUp(self):
-        super(TestMKLDNNRelu, self).setUp()
+        super(TestMKLDNNReluDim2, self).setUp()
+        self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanhDim2(TestTanh):
def setUp(self):
super(TestMKLDNNTanhDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrtDim2(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrtDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbsDim2(TestAbs):
def setUp(self):
super(TestMKLDNNAbsDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNReluDim4(TestRelu):
def setUp(self):
super(TestMKLDNNReluDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs # The same reason with TestAbs
...@@ -549,9 +577,9 @@ class TestMKLDNNRelu(TestRelu): ...@@ -549,9 +577,9 @@ class TestMKLDNNRelu(TestRelu):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
-class TestMKLDNNTanh(TestTanh):
+class TestMKLDNNTanhDim4(TestTanh):
    def setUp(self):
-        super(TestMKLDNNTanh, self).setUp()
+        super(TestMKLDNNTanhDim4, self).setUp()
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
...@@ -560,9 +588,9 @@ class TestMKLDNNTanh(TestTanh): ...@@ -560,9 +588,9 @@ class TestMKLDNNTanh(TestTanh):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
-class TestMKLDNNSqrt(TestSqrt):
+class TestMKLDNNSqrtDim4(TestSqrt):
    def setUp(self):
-        super(TestMKLDNNSqrt, self).setUp()
+        super(TestMKLDNNSqrtDim4, self).setUp()
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
...@@ -571,9 +599,9 @@ class TestMKLDNNSqrt(TestSqrt): ...@@ -571,9 +599,9 @@ class TestMKLDNNSqrt(TestSqrt):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
-class TestMKLDNNAbs(TestAbs):
+class TestMKLDNNAbsDim4(TestAbs):
    def setUp(self):
-        super(TestMKLDNNAbs, self).setUp()
+        super(TestMKLDNNAbsDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs # The same reason with TestAbs
......
...@@ -207,7 +207,11 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -207,7 +207,11 @@ class TestParallelExecutorBase(unittest.TestCase):
if memory_opt: if memory_opt:
fluid.memory_optimize(main) fluid.memory_optimize(main)
-        exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
+        place = fluid.CUDAPlace(0)
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(startup)
+        exe = fluid.ParallelExecutor(True, loss_name=loss.name)
if batch_size is not None: if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count() batch_size *= fluid.core.get_cuda_device_count()
begin = time.time() begin = time.time()
...@@ -453,3 +457,41 @@ class TestTransformer(TestParallelExecutorBase): ...@@ -453,3 +457,41 @@ class TestTransformer(TestParallelExecutorBase):
@unittest.skip("transformer is buggy in multi gpu") @unittest.skip("transformer is buggy in multi gpu")
def test_main(self): def test_main(self):
self.check_network_convergence(transformer) self.check_network_convergence(transformer)
class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def test_parallel_testing(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = simple_fc_net(True)
test_program = main.clone(for_test=True)
opt = fluid.optimizer.SGD(learning_rate=0.0001)
opt.minimize(loss)
batch_size = 32
image = numpy.random.normal(size=(batch_size,
784)).astype('float32')
label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
feed_dict = {'image': image, 'label': label}
train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name, main_program=main)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
for i in xrange(5):
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
test_loss = numpy.array(test_loss)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
train_loss = numpy.array(train_loss)
self.assertTrue(numpy.allclose(train_loss, test_loss))