Commit f36588dc authored by Michał Gallus, committed by Tao Luo

Add MKL-DNN benchmarking for chinese_ner (#1048)

* Add MKL-DNN Benchmarking to CRNN-CTC

* Add MKL-DNN benchmarking for chinese_ner

* Make crnn-ctc scripts more portable

* Merge CRNN-CTC train & inference scripts

* Remove LD_LIBRARY_PATH from crnn-ctc scripts

* CRNN-CTC scripts: set parallel to true

Abort script if batch_size is lower than num of cores

* CRNN-CTC scripts: limit mode options in infer

* CRNN-CTC scripts: set mkldnn parallel to False

* CRNN-CTC scripts: remove mkldnn parallel warning

* Chinese-ner: Merge train & infer scripts, update readme

* Chinese_ner: add --parallel flag for train
Parent dc731b0b
infer.py:

import numpy as np
import argparse
import time

import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle

import reader


def parse_args():
    parser = argparse.ArgumentParser("Run inference.")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=6,
        help='The size of a batch. (default: %(default)d)')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type. (default: %(default)s)')
    parser.add_argument(
        '--model_path',
        type=str,
        default='model/params_pass_0',
        help='A path to the model. (default: %(default)s)')
    parser.add_argument(
        '--test_data_dir',
        type=str,
        default='data/test_files',
        help='A directory with test data files. (default: %(default)s)')
    parser.add_argument(
        '--test_label_file',
        type=str,
        default='data/label_dict',
        help='A file with test labels. (default: %(default)s)')
    parser.add_argument(
        '--num_passes', type=int, default=1, help='The number of passes.')
    parser.add_argument(
        '--skip_pass_num',
        type=int,
        default=0,
        help='The number of initial passes to skip in statistics calculations.')
    parser.add_argument(
        '--profile', action='store_true', help='If set, do profiling.')
    args = parser.parse_args()
    return args


def print_arguments(args):
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')

def load_reverse_dict(dict_path):
    return dict((idx, line.strip().split("\t")[0])
                for idx, line in enumerate(open(dict_path, "r").readlines()))


def infer(args):
    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(
        name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1)

    label_reverse_dict = load_reverse_dict(args.test_label_file)

    test_data = paddle.batch(
        reader.file_reader(args.test_data_dir), batch_size=args.batch_size)
    place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, mention, target], place=place)
    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(args.model_path, exe)
        total_passes = args.num_passes + args.skip_pass_num
        batch_times = [0] * total_passes
        word_counts = [0] * total_passes
        wpses = [0] * total_passes
        all_iters = 0
        for pass_id in range(total_passes):
            if pass_id < args.skip_pass_num:
                print("Warm-up pass")
            if pass_id == args.skip_pass_num:
                profiler.reset_profiler()
            iters = 0
            for data in test_data():
                start = time.time()
                crf_decode = exe.run(inference_program,
                                     feed=feeder.feed(data),
                                     fetch_list=fetch_targets,
                                     return_numpy=False)
                batch_time = time.time() - start
                lod_info = (crf_decode[0].lod())[0]
                np_data = np.array(crf_decode[0])
                word_count = 0
                assert len(data) == len(lod_info) - 1
                for sen_index in xrange(len(data)):
                    assert len(data[sen_index][0]) == lod_info[
@@ -47,14 +113,50 @@
                        gold_tag = label_reverse_dict[data[sen_index][2][
                            word_index]]
                        tag = label_reverse_dict[np_data[tag_index][0]]
                        word_index += 1
                    word_count += word_index
                batch_times[pass_id] += batch_time
                word_counts[pass_id] += word_count
                iters += 1
                all_iters += 1
            batch_times[pass_id] /= iters
            word_counts[pass_id] /= iters
            wps = word_counts[pass_id] / batch_times[pass_id]
            wpses[pass_id] = wps
            print(
                "Pass: %d, iterations (total): %d (%d), latency: %.5f s, words: %d, wps: %f"
                % (pass_id, iters, all_iters, batch_times[pass_id],
                   word_counts[pass_id], wps))

        # Postprocess benchmark data (statistics exclude the warm-up passes)
        latencies = batch_times[args.skip_pass_num:]
        latency_avg = np.average(latencies)
        latency_std = np.std(latencies)
        latency_pc99 = np.percentile(latencies, 99)
        wpses = wpses[args.skip_pass_num:]
        wps_avg = np.average(wpses)
        wps_std = np.std(wpses)
        wps_pc01 = np.percentile(wpses, 1)

        # Benchmark output
        print('\nTotal passes (incl. warm-up): %d' % (total_passes))
        print('Total iterations (incl. warm-up): %d' % (all_iters))
        print('Total examples (incl. warm-up): %d' % (all_iters * args.batch_size))
        print('avg latency: %.5f, std latency: %.5f, 99pc latency: %.5f' %
              (latency_avg, latency_std, latency_pc99))
        print('avg wps: %.5f, std wps: %.5f, wps for 99pc latency: %.5f' %
              (wps_avg, wps_std, wps_pc01))

if __name__ == "__main__":
    args = parse_args()
    print_arguments(args)
    if args.profile:
        if args.device == 'GPU':
            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
                infer(args)
        else:
            with profiler.profiler('CPU', sorted_key='total') as cpuprof:
                infer(args)
    else:
        infer(args)
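
For reference, a standalone benchmarking run of the script above might look as follows (a sketch only: the flags are those defined in `parse_args`, the model path assumes a model previously saved through `train.sh CPU`, and `--skip_pass_num 2` excludes two warm-up passes from the statistics):

```
python infer.py \
    --device CPU \
    --batch_size 6 \
    --num_passes 10 \
    --skip_pass_num 2 \
    --model_path cpu_model/params_pass_0 \
    --test_data_dir data/test_files \
    --test_label_file data/label_dict
```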
## Purpose of this directory

This directory provides exemplary execution commands, wrapped in the bash scripts described below.
## Preparation
To add execution permissions for shell scripts, run in this directory:
`chmod +x *.sh`
## Performance tips
For best performance, set the environment variables below:
```
KMP_AFFINITY=granularity=fine,compact,1,0
OMP_NUM_THREADS=<num_of_physical_cores>
```
You can export them in your shell, for example, or set them inside the specific scripts.
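
One way to export them for the current shell session (a sketch assuming `lscpu` is available; the second line counts the unique physical cores across all sockets):

```
export KMP_AFFINITY=granularity=fine,compact,1,0
export OMP_NUM_THREADS=`lscpu --parse=Core,Socket | grep -v '^#' | sort -u | wc -l`
```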
## Training
### CPU with mkldnn
Run:
`./train.sh MKLDNN`
### CPU without mkldnn
Run:
`./train.sh CPU`
### GPU
Run:
`./train.sh GPU`
## Inference
### CPU with mkldnn
Run:
`./infer.sh MKLDNN`
### CPU without mkldnn
Run:
`./infer.sh CPU`
### GPU
Run:
`./infer.sh GPU`
infer.sh:

#!/bin/bash

export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1

mode=$1 # CPU, GPU or MKLDNN
if [ "$mode" = "CPU" ]; then
    device="CPU"
    model_path="cpu_model"
elif [ "$mode" = "GPU" ]; then
    device="GPU"
    model_path="gpu_model"
elif [ "$mode" = "MKLDNN" ]; then
    device="CPU"
    model_path="mkldnn_model"
    export FLAGS_use_mkldnn=1
else
    echo "Invalid mode provided. Please use one of {GPU, CPU, MKLDNN}"
    exit 1
fi
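
# Detect whether hyper-threading is enabled (lscpu reports more than one
# thread per core) and choose a matching KMP_AFFINITY setting, unless the
# caller has already exported one.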
ht=`lscpu | grep "per core" | awk -F':' '{print $2}' | xargs`
if [ $ht -eq 1 ]; then # HT is OFF
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,0,0"
    fi
    if [ -z "$OMP_DYNAMIC" ]; then
        export OMP_DYNAMIC="FALSE"
    fi
else # HT is ON
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,1,0"
    fi
fi
python ../infer.py \
    --device $device \
    --num_passes 1 \
    --skip_pass_num 2 \
    --profile \
    --test_data_dir ../data/test_files \
    --test_label_file ../data/label_dict \
    --model_path $model_path/params_pass_0
train.sh:

#!/bin/bash

export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1

mode=$1 # CPU, GPU or MKLDNN
if [ "$mode" = "CPU" ]; then
    device="CPU"
    parallel="--parallel True"
    save_model_dir="cpu_model"
elif [ "$mode" = "GPU" ]; then
    device="GPU"
    parallel="--parallel True"
    save_model_dir="gpu_model"
elif [ "$mode" = "MKLDNN" ]; then
    device="CPU"
    parallel=""
    save_model_dir="mkldnn_model"
    export FLAGS_use_mkldnn=1
else
    echo "Invalid mode provided. Please use one of {GPU, CPU, MKLDNN}"
    exit 1
fi
ht=`lscpu | grep "per core" | awk -F':' '{print $2}' | xargs`
if [ $ht -eq 1 ]; then # HT is OFF
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,0,0"
    fi
    if [ -z "$OMP_DYNAMIC" ]; then
        export OMP_DYNAMIC="FALSE"
    fi
else # HT is ON
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,1,0"
    fi
fi
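
# Note: $parallel is intentionally left unquoted below; in MKLDNN mode it
# is empty and expands to nothing, so no --parallel flag is passed.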
python ../train.py \
    --device $device \
    $parallel \
    --model_save_dir $save_model_dir \
    --test_data_dir ../data/test_files \
    --train_data_dir ../data/train_files \
    --num_passes 1
train.py:

import os
import math
import time
import argparse

import numpy as np
import paddle
@@ -10,6 +11,65 @@
from paddle.fluid.initializer import NormalInitializer

import reader

def parse_args():
    parser = argparse.ArgumentParser("Run training.")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=256,
        help='The size of a batch. (default: %(default)d)')
    parser.add_argument(
        '--word_dict_len',
        type=int,
        default=1942563,
        help='The length of the word dictionary. (default: %(default)d)')
    parser.add_argument(
        '--label_dict_len',
        type=int,
        default=49,
        help='The length of the label dictionary. (default: %(default)d)')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type. (default: %(default)s)')
    parser.add_argument(
        '--train_data_dir',
        type=str,
        default='data/train_files',
        help='A directory with train data files. (default: %(default)s)')
    parser.add_argument(
        '--parallel',
        type=bool,
        default=False,
        help="Whether to use parallel training. (default: %(default)s)")
    parser.add_argument(
        '--test_data_dir',
        type=str,
        default='data/test_files',
        help='A directory with test data files. (default: %(default)s)')
    parser.add_argument(
        '--model_save_dir',
        type=str,
        default='./output',
        help='A directory for saving models. (default: %(default)s)')
    parser.add_argument(
        '--num_passes',
        type=int,
        default=1000,
        help='The number of epochs. (default: %(default)d)')
    args = parser.parse_args()
    return args
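
# NOTE: argparse's type=bool does not parse strings as booleans:
# bool("False") evaluates to True, so any non-empty value passed to
# --parallel enables it. The accompanying train.sh only ever passes
# "True" or omits the flag, so the default (False) applies otherwise.
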
def print_arguments(args):
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')

def load_reverse_dict(dict_path):
    return dict((idx, line.strip().split("\t")[0])
                for idx, line in enumerate(open(dict_path, "r").readlines()))
@@ -197,32 +257,27 @@ def test(test_exe, chunk_evaluator, inference_program, test_data, place,
    return chunk_evaluator.eval()

def main(args):
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        avg_cost, feature_out, word, mention, target = ner_net(
            args.word_dict_len, args.label_dict_len)

        crf_decode = fluid.layers.crf_decoding(
            input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

        sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
        sgd_optimizer.minimize(avg_cost)

        (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
         num_correct_chunks) = fluid.layers.chunk_eval(
             input=crf_decode,
             label=target,
             chunk_scheme="IOB",
             num_chunk_types=int(math.ceil((args.label_dict_len - 1) / 2.0)))
        chunk_evaluator = fluid.metrics.ChunkEvaluator()
@@ -233,28 +288,33 @@
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                reader.file_reader(args.train_data_dir), buf_size=2000000),
            batch_size=args.batch_size)
        test_reader = paddle.batch(
            paddle.reader.shuffle(
                reader.file_reader(args.test_data_dir), buf_size=2000000),
            batch_size=args.batch_size)

        place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
        feeder = fluid.DataFeeder(
            feed_list=[word, mention, target], place=place)

        exe = fluid.Executor(place)
        exe.run(startup)

        if args.parallel:
            train_exe = fluid.ParallelExecutor(
                loss_name=avg_cost.name, use_cuda=(args.device == 'GPU'))
            test_exe = fluid.ParallelExecutor(
                use_cuda=(args.device == 'GPU'),
                main_program=inference_program,
                share_vars_from=train_exe)
        else:
            train_exe = exe
            test_exe = exe

        batch_id = 0
        for pass_id in xrange(args.num_passes):
            chunk_evaluator.reset()
            train_reader_iter = train_reader()
            start_time = time.time()
@@ -286,15 +346,13 @@
                [num_infer_chunks, num_label_chunks, num_correct_chunks])
            print("[Test] precision:" + str(p) + ", recall:" + str(r) + ", f1:"
                  + str(f1))
            save_dirname = os.path.join(args.model_save_dir,
                                        "params_pass_%d" % pass_id)
            fluid.io.save_inference_model(
                save_dirname, ['word', 'mention', 'target'], [crf_decode], exe)


if __name__ == "__main__":
    args = parse_args()
    print_arguments(args)
    main(args)
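
For reference, a CPU training run with the new `--parallel` flag might be launched as follows (a sketch using the flags defined in `parse_args` above; the data paths are the defaults expected by `reader.file_reader`):

```
python train.py \
    --device CPU \
    --parallel True \
    --batch_size 256 \
    --model_save_dir cpu_model \
    --train_data_dir data/train_files \
    --test_data_dir data/test_files \
    --num_passes 1
```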
train.sh (CRNN-CTC scripts):

@@ -18,10 +18,6 @@ elif [ "$mode" = "GPU" ]; then
     save_model_dir="gpu_model"
     parallel="True"
 elif [ "$mode" = "MKLDNN" ]; then
-    if [ $core_num -gt $batch_size ]; then
-        echo "Batch size should be greater or equal to the number of
-              available cores, when parallel mode is set to True."
-    fi
     use_gpu="False"
     save_model_dir="mkldnn_model"
     parallel="False"