Commit f36588dc authored by Michał Gallus, committed by Tao Luo

Add MKL-DNN benchmarking for chinese_ner (#1048)

* Add MKL-DNN Benchmarking to CRNN-CTC

* Add MKL-DNN benchmarking for chinese_ner

* Make crnn-ctc scripts more portable

* Merge CRNN-CTC train & inference scripts

* Remove LD_LIBRARY_PATH from crnn-ctc scripts

* CRNN-CTC scripts: set parallel to true

Abort script if batch_size is lower than num of cores

* CRNN-CTC scripts: limit mode options in infer

* CRNN-CTC scripts: set mkldnn parallel to False

* CRNN-CTC scripts: remove mkldnn parallel warning

* Chinese-ner: Merge train & infer scripts, update readme

* Chinese_ner: add --parallel flag for train
Parent dc731b0b
import numpy as np
import argparse
import time
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle
import reader


def parse_args():
    parser = argparse.ArgumentParser("Run inference.")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=6,
        help='The size of a batch. (default: %(default)d)')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type. (default: %(default)s)')
    parser.add_argument(
        '--model_path',
        type=str,
        default='model/params_pass_0',
        help='A path to the model. (default: %(default)s)')
    parser.add_argument(
        '--test_data_dir',
        type=str,
        default='data/test_files',
        help='A directory with test data files. (default: %(default)s)')
    parser.add_argument(
        '--test_label_file',
        type=str,
        default='data/label_dict',
        help='A file with test labels. (default: %(default)s)')
    parser.add_argument(
        '--num_passes', type=int, default=1, help='The number of passes.')
    parser.add_argument(
        '--skip_pass_num',
        type=int,
        default=0,
        help='The number of initial passes to skip in statistics calculations.')
    parser.add_argument(
        '--profile', action='store_true', help='If set, do profiling.')
    args = parser.parse_args()
    return args


def print_arguments(args):
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


def load_reverse_dict(dict_path):
    return dict((idx, line.strip().split("\t")[0])
                for idx, line in enumerate(open(dict_path, "r").readlines()))
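
# Illustration (hypothetical file contents): load_reverse_dict maps each line
# index to the tag name in that line's first tab-separated column, so a
# label_dict file whose first two lines start with "B-PER" and "I-PER" yields
# {0: 'B-PER', 1: 'I-PER'} for turning predicted label indices back into tags.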


def infer(args):
    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(
        name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1)

    label_reverse_dict = load_reverse_dict(args.test_label_file)

    test_data = paddle.batch(
        reader.file_reader(args.test_data_dir), batch_size=args.batch_size)
    place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, mention, target], place=place)
    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(args.model_path, exe)
        total_passes = args.num_passes + args.skip_pass_num
        batch_times = [0] * total_passes
        word_counts = [0] * total_passes
        wpses = [0] * total_passes
        all_iters = 0
        for pass_id in range(total_passes):
            if pass_id < args.skip_pass_num:
                print("Warm-up pass")
            if pass_id == args.skip_pass_num:
                profiler.reset_profiler()
            iters = 0
            for data in test_data():
                start = time.time()
                crf_decode = exe.run(inference_program,
                                     feed=feeder.feed(data),
                                     fetch_list=fetch_targets,
                                     return_numpy=False)
                batch_time = time.time() - start
                lod_info = (crf_decode[0].lod())[0]
                np_data = np.array(crf_decode[0])
                word_count = 0
                assert len(data) == len(lod_info) - 1
                for sen_index in xrange(len(data)):
                    assert len(data[sen_index][0]) == lod_info[
                        sen_index + 1] - lod_info[sen_index]
                    word_index = 0
                    # Tags are decoded as in plain inference, but not printed
                    # in benchmark mode.
                    for tag_index in xrange(lod_info[sen_index],
                                            lod_info[sen_index + 1]):
                        word = str(data[sen_index][0][word_index])
                        gold_tag = label_reverse_dict[data[sen_index][2][
                            word_index]]
                        tag = label_reverse_dict[np_data[tag_index][0]]
                        word_index += 1
                    word_count += word_index
                batch_times[pass_id] += batch_time
                word_counts[pass_id] += word_count
                iters += 1
                all_iters += 1
            batch_times[pass_id] /= iters
            word_counts[pass_id] /= iters
            wps = word_counts[pass_id] / batch_times[pass_id]
            wpses[pass_id] = wps
            print(
                "Pass: %d, iterations (total): %d (%d), latency: %.5f s, words: %d, wps: %f"
                % (pass_id, iters, all_iters, batch_times[pass_id],
                   word_counts[pass_id], wps))

        # Postprocess benchmark data, excluding warm-up passes from both the
        # latency and the wps statistics.
        latencies = batch_times[args.skip_pass_num:]
        latency_avg = np.average(latencies)
        latency_std = np.std(latencies)
        latency_pc99 = np.percentile(latencies, 99)
        measured_wpses = wpses[args.skip_pass_num:]
        wps_avg = np.average(measured_wpses)
        wps_std = np.std(measured_wpses)
        wps_pc01 = np.percentile(measured_wpses, 1)

        # Benchmark output
        print('\nTotal passes (incl. warm-up): %d' % total_passes)
        print('Total iterations (incl. warm-up): %d' % all_iters)
        print('Total examples (incl. warm-up): %d' % (all_iters * args.batch_size))
        print('avg latency: %.5f, std latency: %.5f, 99pc latency: %.5f' %
              (latency_avg, latency_std, latency_pc99))
        print('avg wps: %.5f, std wps: %.5f, wps for 99pc latency: %.5f' %
              (wps_avg, wps_std, wps_pc01))
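
# Worked example (hypothetical numbers): with --skip_pass_num 1 and per-pass
# mean latencies of [0.90, 0.50, 0.52, 0.49] seconds, the 0.90 s warm-up pass
# is dropped, so latency_avg is about 0.503 s, and the wps statistics are
# likewise computed over the three measured passes only.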


if __name__ == "__main__":
    args = parse_args()
    print_arguments(args)
    if args.profile:
        if args.device == 'GPU':
            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
                infer(args)
        else:
            with profiler.profiler('CPU', sorted_key='total') as cpuprof:
                infer(args)
    else:
        infer(args)
## Purpose of this directory
This directory provides example execution commands, packaged in the bash scripts described below.
## Preparation
To add execution permissions for shell scripts, run in this directory:
`chmod +x *.sh`
## Performance tips
For best performance, set the following environment variables:
```
KMP_AFFINITY=granularity=fine,compact,1,0
OMP_NUM_THREADS=<num_of_physical_cores>
```
You can export them in your shell, or set them directly inside the individual scripts.
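One way to determine `<num_of_physical_cores>` is to parse `lscpu`, which the scripts in this directory also consult for their affinity setup. The following Python sketch illustrates the idea on Linux; it assumes `lscpu` reports the `Core(s) per socket` and `Socket(s)` fields and is not part of the benchmark scripts themselves:
```
import subprocess

def num_physical_cores():
    # Multiply cores per socket by the socket count, both parsed from lscpu.
    info = subprocess.check_output(["lscpu"]).decode()
    fields = dict(line.split(":", 1) for line in info.splitlines() if ":" in line)
    return int(fields["Core(s) per socket"]) * int(fields["Socket(s)"])

print(num_physical_cores())
```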
## Training
### CPU with mkldnn
Run:
`./train.sh MKLDNN`
### CPU without mkldnn
Run:
`./train.sh CPU`
### GPU
Run:
`./train.sh GPU`
## Inference
### CPU with mkldnn
Run:
`./infer.sh MKLDNN`
### CPU without mkldnn
Run:
`./infer.sh CPU`
### GPU
Run:
`./infer.sh GPU`
#!/bin/bash
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1

mode=$1  # one of: CPU, GPU, MKLDNN

if [ "$mode" = "CPU" ]; then
    device="CPU"
    model_path="cpu_model"
elif [ "$mode" = "GPU" ]; then
    device="GPU"
    model_path="gpu_model"
elif [ "$mode" = "MKLDNN" ]; then
    device="CPU"
    model_path="mkldnn_model"
    export FLAGS_use_mkldnn=1
else
    echo "Invalid mode provided. Please use one of {GPU, CPU, MKLDNN}"
    exit 1
fi

# Detect hyper-threading from the "Thread(s) per core" field of lscpu and
# choose a matching KMP_AFFINITY setting unless one is already exported.
ht=$(lscpu | grep "per core" | awk -F':' '{print $2}' | xargs)
if [ $ht -eq 1 ]; then  # HT is OFF
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,0,0"
    fi
    if [ -z "$OMP_DYNAMIC" ]; then
        export OMP_DYNAMIC="FALSE"
    fi
else  # HT is ON
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,1,0"
    fi
fi

python ../infer.py \
    --device $device \
    --num_passes 1 \
    --skip_pass_num 2 \
    --profile \
    --test_data_dir ../data/test_files \
    --test_label_file ../data/label_dict \
    --model_path $model_path/params_pass_0
#!/bin/bash
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1

mode=$1  # one of: CPU, GPU, MKLDNN

if [ "$mode" = "CPU" ]; then
    device="CPU"
    parallel="--parallel True"
    save_model_dir="cpu_model"
elif [ "$mode" = "GPU" ]; then
    device="GPU"
    parallel="--parallel True"
    save_model_dir="gpu_model"
elif [ "$mode" = "MKLDNN" ]; then
    device="CPU"
    parallel=""
    save_model_dir="mkldnn_model"
    export FLAGS_use_mkldnn=1
else
    echo "Invalid mode provided. Please use one of {GPU, CPU, MKLDNN}"
    exit 1
fi

# Same hyper-threading detection and affinity defaults as in infer.sh.
ht=$(lscpu | grep "per core" | awk -F':' '{print $2}' | xargs)
if [ $ht -eq 1 ]; then  # HT is OFF
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,0,0"
    fi
    if [ -z "$OMP_DYNAMIC" ]; then
        export OMP_DYNAMIC="FALSE"
    fi
else  # HT is ON
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,1,0"
    fi
fi

python ../train.py \
    --device $device \
    $parallel \
    --model_save_dir $save_model_dir \
    --test_data_dir ../data/test_files \
    --train_data_dir ../data/train_files \
    --num_passes 1
import os
import math
import time
import argparse
import numpy as np
import paddle
@@ -10,6 +11,65 @@ from paddle.fluid.initializer import NormalInitializer
import reader


def parse_args():
    parser = argparse.ArgumentParser("Run training.")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=256,
        help='The size of a batch. (default: %(default)d)')
    parser.add_argument(
        '--word_dict_len',
        type=int,
        default=1942563,
        help='The length of the word dictionary. (default: %(default)d)')
    parser.add_argument(
        '--label_dict_len',
        type=int,
        default=49,
        help='The length of the label dictionary. (default: %(default)d)')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type. (default: %(default)s)')
    parser.add_argument(
        '--train_data_dir',
        type=str,
        default='data/train_files',
        help='A directory with train data files. (default: %(default)s)')
    # Note: with type=bool, argparse treats any non-empty value (even
    # "False") as True, so pass --parallel only to enable parallel training.
    parser.add_argument(
        '--parallel',
        type=bool,
        default=False,
        help="Whether to use parallel training. (default: %(default)s)")
    parser.add_argument(
        '--test_data_dir',
        type=str,
        default='data/test_files',
        help='A directory with test data files. (default: %(default)s)')
    parser.add_argument(
        '--model_save_dir',
        type=str,
        default='./output',
        help='A directory for saving models. (default: %(default)s)')
    parser.add_argument(
        '--num_passes',
        type=int,
        default=1000,
        help='The number of epochs. (default: %(default)d)')
    args = parser.parse_args()
    return args


def print_arguments(args):
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


def load_reverse_dict(dict_path):
    return dict((idx, line.strip().split("\t")[0])
                for idx, line in enumerate(open(dict_path, "r").readlines()))
@@ -197,32 +257,27 @@ def test(test_exe, chunk_evaluator, inference_program, test_data, place,
    return chunk_evaluator.eval()


def main(args):
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        avg_cost, feature_out, word, mention, target = ner_net(
            args.word_dict_len, args.label_dict_len)

        crf_decode = fluid.layers.crf_decoding(
            input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

        sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
        sgd_optimizer.minimize(avg_cost)

        # Under the IOB scheme every chunk type has a B- and an I- tag, plus
        # one O tag overall, e.g. label_dict_len = 49 gives 24 chunk types.
        (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
         num_correct_chunks) = fluid.layers.chunk_eval(
             input=crf_decode,
             label=target,
             chunk_scheme="IOB",
             num_chunk_types=int(math.ceil((args.label_dict_len - 1) / 2.0)))

        chunk_evaluator = fluid.metrics.ChunkEvaluator()
@@ -233,28 +288,33 @@ def main(train_data_file, test_data_file, model_save_dir, num_passes):
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                reader.file_reader(args.train_data_dir), buf_size=2000000),
            batch_size=args.batch_size)
        test_reader = paddle.batch(
            paddle.reader.shuffle(
                reader.file_reader(args.test_data_dir), buf_size=2000000),
            batch_size=args.batch_size)

        place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
        feeder = fluid.DataFeeder(
            feed_list=[word, mention, target], place=place)

        exe = fluid.Executor(place)
        exe.run(startup)
        if args.parallel:
            train_exe = fluid.ParallelExecutor(
                loss_name=avg_cost.name, use_cuda=(args.device == 'GPU'))
            test_exe = fluid.ParallelExecutor(
                use_cuda=(args.device == 'GPU'),
                main_program=inference_program,
                share_vars_from=train_exe)
        else:
            # Run on a single place, e.g. for the MKL-DNN configuration,
            # where train.sh does not pass --parallel.
            train_exe = exe
            test_exe = exe

        batch_id = 0
        for pass_id in xrange(args.num_passes):
            chunk_evaluator.reset()
            train_reader_iter = train_reader()
            start_time = time.time()
@@ -286,15 +346,13 @@ def main(train_data_file, test_data_file, model_save_dir, num_passes):
                [num_infer_chunks, num_label_chunks, num_correct_chunks])
            print("[Test] precision:" + str(p) + ", recall:" + str(r) + ", f1:"
                  + str(f1))
            save_dirname = os.path.join(args.model_save_dir,
                                        "params_pass_%d" % pass_id)
            fluid.io.save_inference_model(
                save_dirname, ['word', 'mention', 'target'], [crf_decode], exe)


if __name__ == "__main__":
    args = parse_args()
    print_arguments(args)
    main(args)
@@ -18,10 +18,6 @@ elif [ "$mode" = "GPU" ]; then
    save_model_dir="gpu_model"
    parallel="True"
elif [ "$mode" = "MKLDNN" ]; then
    use_gpu="False"
    save_model_dir="mkldnn_model"
    parallel="False"