未验证 提交 2291634c 编写于 作者: L lidanqing 提交者: GitHub

Add user local data preprocess support (#23692)

* add local data preprocess support for imagenet
test=develop

* add local data2bin tests
test=develop

* locally two tests passed
test=develop

* change according to reviews
test=develop
上级 cca5f8fa
...@@ -93,6 +93,13 @@ function(inference_analysis_api_qat_test_run TARGET_NAME test_binary fp32_model_ ...@@ -93,6 +93,13 @@ function(inference_analysis_api_qat_test_run TARGET_NAME test_binary fp32_model_
--iterations=2) --iterations=2)
endfunction() endfunction()
# Registers a Python test that runs a dataset preprocessing script in local mode.
#   target           - test name handed to py_test
#   py_script_source - script file name, resolved against CMAKE_CURRENT_SOURCE_DIR
#   data_dir         - forwarded to the script as --data_dir
#   output_file      - forwarded to the script as --output_file
# The script is always invoked with the --local switch.
function(preprocess_data2bin_test_run target py_script_source data_dir output_file)
  set(script_path ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source})
  set(script_args
      --data_dir=${data_dir}
      --output_file=${output_file}
      --local)
  py_test(${target} SRCS ${script_path} ARGS ${script_args})
endfunction()
if(NOT APPLE AND WITH_MKLML) if(NOT APPLE AND WITH_MKLML)
# RNN1 # RNN1
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
...@@ -286,8 +293,6 @@ if(WITH_MKLDNN) ...@@ -286,8 +293,6 @@ if(WITH_MKLDNN)
# download dataset if necessary # download dataset if necessary
download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz") download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz")
# download small demo set of pascalvoc for testing local userdata preprocessing
download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz")
# build test binary to be used in subsequent tests # build test binary to be used in subsequent tests
inference_analysis_api_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC}) inference_analysis_api_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC})
...@@ -320,6 +325,19 @@ if(WITH_MKLDNN) ...@@ -320,6 +325,19 @@ if(WITH_MKLDNN)
set(MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC "mkldnn_quantizer_config_tester.cc") set(MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC "mkldnn_quantizer_config_tester.cc")
inference_analysis_api_test_build(${MKLDNN_QUANTIZER_CONFIG_TEST_APP} ${MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC}) inference_analysis_api_test_build(${MKLDNN_QUANTIZER_CONFIG_TEST_APP} ${MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC})
inference_analysis_test_run(test_mkldnn_quantizer_config COMMAND ${MKLDNN_QUANTIZER_CONFIG_TEST_APP}) inference_analysis_test_run(test_mkldnn_quantizer_config COMMAND ${MKLDNN_QUANTIZER_CONFIG_TEST_APP})
# Local-data preprocessing test for ImageNet: request the small sample archive,
# then register a test that runs full_ILSVRC2012_val_preprocess.py on it via
# preprocess_data2bin_test_run.
# NOTE(review): presumably the archive unpacks to ${INT8_DATA_DIR}/imagenet_small;
# confirm against download_int8_data.
download_int8_data(${INT8_DATA_DIR} "imagenet_small.tar.gz")
set(IMAGENET_SMALL_DATA_DIR "${INT8_DATA_DIR}/imagenet_small")
set(IMAGENET_SMALL_OUTPUT_FILE "imagenet_small.bin")
preprocess_data2bin_test_run(preprocess_local_imagenet "full_ILSVRC2012_val_preprocess.py" ${IMAGENET_SMALL_DATA_DIR} ${IMAGENET_SMALL_OUTPUT_FILE})
# Local-data preprocessing test for PASCAL VOC: same pattern as above, using the
# small pascalvoc sample and full_pascalvoc_test_preprocess.py.
download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz")
set(PASCALVOC_SMALL_DATA_DIR "${INT8_DATA_DIR}/pascalvoc_small")
set(PASCALVOC_SMALL_OUTPUT_FILE "pascalvoc_small.bin")
preprocess_data2bin_test_run(preprocess_local_pascalvoc "full_pascalvoc_test_preprocess.py" ${PASCALVOC_SMALL_DATA_DIR} ${PASCALVOC_SMALL_OUTPUT_FILE})
endif() endif()
# bert, max_len=20, embedding_dim=128 # bert, max_len=20, embedding_dim=128
......
...@@ -24,6 +24,7 @@ import math ...@@ -24,6 +24,7 @@ import math
from paddle.dataset.common import download from paddle.dataset.common import download
import tarfile import tarfile
import StringIO import StringIO
import argparse
random.seed(0) random.seed(0)
np.random.seed(0) np.random.seed(0)
...@@ -131,7 +132,7 @@ def check_integrity(filename, target_hash): ...@@ -131,7 +132,7 @@ def check_integrity(filename, target_hash):
return False return False
def convert(tar_file, output_file): def convert_Imagenet_tar2bin(tar_file, output_file):
print('Converting 50000 images to binary file ...\n') print('Converting 50000 images to binary file ...\n')
tar = tarfile.open(name=tar_file, mode='r:gz') tar = tarfile.open(name=tar_file, mode='r:gz')
...@@ -205,9 +206,90 @@ def run_convert(): ...@@ -205,9 +206,90 @@ def run_convert():
"Can not convert the dataset to binary file with try limit {0}". "Can not convert the dataset to binary file with try limit {0}".
format(try_limit)) format(try_limit))
download_concat(cache_folder, zip_path) download_concat(cache_folder, zip_path)
convert(zip_path, output_file) convert_Imagenet_tar2bin(zip_path, output_file)
print("\nSuccess! The binary file can be found at {0}".format(output_file)) print("\nSuccess! The binary file can be found at {0}".format(output_file))
def convert_Imagenet_local2bin(args):
    """Convert a local, ImageNet-style image set into one binary file.

    The label list (``args.label_list``, resolved against ``args.data_dir``)
    holds one ``<relative image path> <integer label>`` pair per line. Blank
    lines are ignored. The output file (``args.output_file``, also resolved
    against ``args.data_dir``) is laid out as:

        int64 image count | float32 image data, one slot per entry |
        int64 labels, one per entry

    Entries whose image file does not exist are skipped; their slots are
    never written.

    Args:
        args: parsed command-line namespace providing ``data_dir``,
            ``label_list``, ``output_file`` and ``data_dim``.

    Raises:
        AssertionError: if ``args.data_dir`` is empty.
    """
    data_dir = args.data_dir
    # Validate before any path is derived from data_dir.
    assert data_dir, 'Once set --local, user need to provide the --data_dir'
    label_list_path = os.path.join(data_dir, args.label_list)
    bin_file_path = os.path.join(data_dir, args.output_file)

    with open(label_list_path) as flist:
        # Drop blank lines: they cannot be unpacked into (path, label) and
        # would otherwise abort the conversion with a ValueError.
        lines = [line.strip() for line in flist if line.strip()]
    num_images = len(lines)

    # Loop-invariant layout values.
    # assumes process_image yields DATA_DIM x DATA_DIM x 3 values -- TODO confirm
    image_bytes = SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3
    labels_offset = SIZE_INT64 + image_bytes * num_images

    with open(bin_file_path, "w+b") as of:
        of.seek(0)
        num = np.array(int(num_images)).astype('int64')
        of.write(num.tobytes())
        for idx, line in enumerate(lines):
            img_path, label = line.split()
            img_path = os.path.join(data_dir, img_path)
            if not os.path.exists(img_path):
                continue

            # save image (float32) into its fixed-size slot
            img = Image.open(img_path)
            img = process_image(img)
            np_img = np.array(img)
            of.seek(SIZE_INT64 + image_bytes * idx)
            of.write(np_img.astype('float32').tobytes())

            # save label (int64) into the label table that follows all images
            np_label = np.array(int(label))
            of.seek(labels_offset + idx * SIZE_INT64)
            of.write(np_label.astype('int64').tobytes())

    # The bin file should contain
    # number of images + all images data + all corresponding labels,
    # so the expected file size is as follows.
    # NOTE(review): the offsets above use DATA_DIM while this check uses
    # args.data_dim; the two only agree when --data_dim is left at a value
    # equal to DATA_DIM -- confirm the intended behaviour.
    target_size = (SIZE_INT64 + num_images * 3 * args.data_dim * args.data_dim
                   * SIZE_FLOAT32 + num_images * SIZE_INT64)
    if os.path.getsize(bin_file_path) == target_size:
        print(
            "Success! The user data output binary file can be found at: {0}".
            format(bin_file_path))
    else:
        print("Conversion failed!")
def main_preprocess_Imagenet(args):
    """Command-line entry point for the ImageNet preprocessing script.

    Builds the option parser, parses the command line and dispatches:
    with ``--local`` the local dataset is converted through
    ``convert_Imagenet_local2bin``; otherwise ``run_convert`` is called.

    Note: the ``args`` parameter is not read. Options always come from
    ``parse_args()`` called without arguments.
    """
    cli = argparse.ArgumentParser(
        description="Convert the full Imagenet val set or local data to binary file.",
        usage=None,
        add_help=True)

    # (flag, keyword arguments) pairs, registered in this exact order.
    option_table = [
        ('--local', dict(
            action="store_true",
            help="If used, user need to set --data_dir and then convert file")),
        ("--data_dir", dict(
            default="", type=str, help="Dataset root directory")),
        ("--label_list", dict(
            type=str,
            default="val_list.txt",
            help="List of object labels with same sequence as denoted in the annotation file"
        )),
        ("--output_file", dict(
            type=str,
            default="imagenet_small.bin",
            help="File path of the output binary file")),
        ("--data_dim", dict(
            type=int,
            default=DATA_DIM,
            help="Image preprocess with data_dim width and height")),
    ]
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    options = cli.parse_args()
    if not options.local:
        run_convert()
        return
    convert_Imagenet_local2bin(options)
if __name__ == '__main__': if __name__ == '__main__':
run_convert() main_preprocess_Imagenet(sys.argv)
...@@ -28,6 +28,8 @@ DATA_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.t ...@@ -28,6 +28,8 @@ DATA_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.t
DATA_DIR = os.path.expanduser("~/.cache/paddle/dataset/pascalvoc/") DATA_DIR = os.path.expanduser("~/.cache/paddle/dataset/pascalvoc/")
TAR_FILE = "VOCtest_06-Nov-2007.tar" TAR_FILE = "VOCtest_06-Nov-2007.tar"
TAR_PATH = os.path.join(DATA_DIR, TAR_FILE) TAR_PATH = os.path.join(DATA_DIR, TAR_FILE)
SIZE_FLOAT32 = 4
SIZE_INT64 = 8
RESIZE_H = 300 RESIZE_H = 300
RESIZE_W = 300 RESIZE_W = 300
MEAN_VALUE = [127.5, 127.5, 127.5] MEAN_VALUE = [127.5, 127.5, 127.5]
...@@ -60,6 +62,7 @@ def preprocess(img): ...@@ -60,6 +62,7 @@ def preprocess(img):
def convert_pascalvoc_local2bin(args): def convert_pascalvoc_local2bin(args):
data_dir = os.path.expanduser(args.data_dir) data_dir = os.path.expanduser(args.data_dir)
label_fpath = os.path.join(data_dir, args.label_file) label_fpath = os.path.join(data_dir, args.label_file)
assert data_dir, 'Once set --local, user need to provide the --data_dir'
flabel = open(label_fpath) flabel = open(label_fpath)
label_list = [line.strip() for line in flabel] label_list = [line.strip() for line in flabel]
...@@ -128,10 +131,14 @@ def convert_pascalvoc_local2bin(args): ...@@ -128,10 +131,14 @@ def convert_pascalvoc_local2bin(args):
f1.close() f1.close()
object_nums_sum = sum(object_nums) object_nums_sum = sum(object_nums)
target_size = 8 + image_nums * 3 * args.resize_h * args.resize_h * 4 + image_nums * 8 + object_nums_sum * ( # The data should be contains
8 + 4 * 4 + 8) # number of images + all images data + an array that represent object numbers of each image
# + labels of all objects in images + bboxes of all objects + difficulties of all objects
# so the target size should be as follows:
target_size = SIZE_INT64 + image_nums * 3 * args.resize_h * args.resize_h * SIZE_FLOAT32 + image_nums * SIZE_INT64 + object_nums_sum * (
SIZE_INT64 + 4 * SIZE_FLOAT32 + SIZE_INT64)
if (os.path.getsize(output_file_path) == target_size): if (os.path.getsize(output_file_path) == target_size):
print("Success! \nThe output binary file can be found at: ", print("Success! \nThe local data output binary file can be found at: ",
output_file_path) output_file_path)
else: else:
print("Conversion failed!") print("Conversion failed!")
...@@ -223,6 +230,9 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path): ...@@ -223,6 +230,9 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path):
if line_idx % per_percentage: if line_idx % per_percentage:
print_processbar(line_idx / per_percentage) print_processbar(line_idx / per_percentage)
# The data is stored in binary form in the following sequence:
# number of images -> all image data -> an array that represents the number of objects in each image
# -> labels of all objects in the images -> bboxes of all objects -> difficulties of all objects
f1.write(np.array(object_nums).astype('uint64').tobytes()) f1.write(np.array(object_nums).astype('uint64').tobytes())
f1.write(np.array(lbls).astype('int64').tobytes()) f1.write(np.array(lbls).astype('int64').tobytes())
f1.write(np.array(boxes).astype('float32').tobytes()) f1.write(np.array(boxes).astype('float32').tobytes())
...@@ -269,12 +279,11 @@ def main_pascalvoc_preprocess(args): ...@@ -269,12 +279,11 @@ def main_pascalvoc_preprocess(args):
usage=None, usage=None,
add_help=True) add_help=True)
parser.add_argument( parser.add_argument(
'--choice', choices=['local', 'VOC_test_2007'], required=True) '--local',
action="store_true",
help="If used, user need to set --data_dir and then convert file")
parser.add_argument( parser.add_argument(
"--data_dir", "--data_dir", default="", type=str, help="Dataset root directory")
default="./third_party/inference_demo/int8v2/pascalvoc_small",
type=str,
help="Dataset root directory")
parser.add_argument( parser.add_argument(
"--img_annotation_list", "--img_annotation_list",
type=str, type=str,
...@@ -313,9 +322,9 @@ def main_pascalvoc_preprocess(args): ...@@ -313,9 +322,9 @@ def main_pascalvoc_preprocess(args):
default=AP_VERSION, default=AP_VERSION,
help="Image preprocess with ap_version") help="Image preprocess with ap_version")
args = parser.parse_args() args = parser.parse_args()
if args.choice == 'local': if args.local:
convert_pascalvoc_local2bin(args) convert_pascalvoc_local2bin(args)
elif args.choice == 'VOC_test_2007': else:
run_convert() run_convert()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册