From 1b45847e470d4c0cc26ee969e5191431a68b7736 Mon Sep 17 00:00:00 2001 From: lidanqing Date: Thu, 23 Apr 2020 10:47:08 +0200 Subject: [PATCH] Add user local data preprocess support (#23692) (#24075) --- .../fluid/inference/tests/api/CMakeLists.txt | 22 ++++- .../api/full_ILSVRC2012_val_preprocess.py | 88 ++++++++++++++++++- .../api/full_pascalvoc_test_preprocess.py | 29 +++--- 3 files changed, 124 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 13c48fcf576..ab11c7d8215 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -93,6 +93,13 @@ function(inference_analysis_api_qat_test_run TARGET_NAME test_binary fp32_model_ --iterations=2) endfunction() +function(preprocess_data2bin_test_run target py_script_source data_dir output_file) + py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source} + ARGS --data_dir=${data_dir} + --output_file=${output_file} + --local) +endfunction() + if(NOT APPLE AND WITH_MKLML) # RNN1 set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") @@ -286,8 +293,6 @@ if(WITH_MKLDNN) # download dataset if necessary download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz") - # download small demo set of pascalvoc for testing local userdata preprocessing - download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz") # build test binary to be used in subsequent tests inference_analysis_api_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC}) @@ -320,6 +325,19 @@ if(WITH_MKLDNN) set(MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC "mkldnn_quantizer_config_tester.cc") inference_analysis_api_test_build(${MKLDNN_QUANTIZER_CONFIG_TEST_APP} ${MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC}) inference_analysis_test_run(test_mkldnn_quantizer_config COMMAND ${MKLDNN_QUANTIZER_CONFIG_TEST_APP}) + + # preprocess data2bin imagenet + download_int8_data(${INT8_DATA_DIR} 
"imagenet_small.tar.gz") + set(IMAGENET_SMALL_DATA_DIR "${INT8_DATA_DIR}/imagenet_small") + set(IMAGENET_SMALL_OUTPUT_FILE "imagenet_small.bin") + preprocess_data2bin_test_run(preprocess_local_imagenet "full_ILSVRC2012_val_preprocess.py" ${IMAGENET_SMALL_DATA_DIR} ${IMAGENET_SMALL_OUTPUT_FILE}) + + # preprocess data2bin pascalvoc + download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz") + set(PASCALVOC_SMALL_DATA_DIR "${INT8_DATA_DIR}/pascalvoc_small") + set(PASCALVOC_SMALL_OUTPUT_FILE "pascalvoc_small.bin") + preprocess_data2bin_test_run(preprocess_local_pascalvoc "full_pascalvoc_test_preprocess.py" ${PASCALVOC_SMALL_DATA_DIR} ${PASCALVOC_SMALL_OUTPUT_FILE}) + endif() # bert, max_len=20, embedding_dim=128 diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py index 826c45311f4..c5610961d65 100644 --- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py +++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py @@ -24,6 +24,7 @@ import math from paddle.dataset.common import download import tarfile import StringIO +import argparse random.seed(0) np.random.seed(0) @@ -131,7 +132,7 @@ def check_integrity(filename, target_hash): return False -def convert(tar_file, output_file): +def convert_Imagenet_tar2bin(tar_file, output_file): print('Converting 50000 images to binary file ...\n') tar = tarfile.open(name=tar_file, mode='r:gz') @@ -205,9 +206,90 @@ def run_convert(): "Can not convert the dataset to binary file with try limit {0}". format(try_limit)) download_concat(cache_folder, zip_path) - convert(zip_path, output_file) + convert_Imagenet_tar2bin(zip_path, output_file) print("\nSuccess! 
The binary file can be found at {0}".format(output_file)) +def convert_Imagenet_local2bin(args): + data_dir = args.data_dir + label_list_path = os.path.join(args.data_dir, args.label_list) + bin_file_path = os.path.join(args.data_dir, args.output_file) + assert data_dir, 'Once set --local, user need to provide the --data_dir' + with open(label_list_path) as flist: + lines = [line.strip() for line in flist] + num_images = len(lines) + + with open(bin_file_path, "w+b") as of: + of.seek(0) + num = np.array(int(num_images)).astype('int64') + of.write(num.tobytes()) + for idx, line in enumerate(lines): + img_path, label = line.split() + img_path = os.path.join(data_dir, img_path) + if not os.path.exists(img_path): + continue + + #save image(float32) to file + img = Image.open(img_path) + img = process_image(img) + np_img = np.array(img) + of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 * + idx) + of.write(np_img.astype('float32').tobytes()) + + #save label(int64_t) to file + label_int = (int)(label) + np_label = np.array(label_int) + of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 * + num_images + idx * SIZE_INT64) + of.write(np_label.astype('int64').tobytes()) + + # The bin file should contain + # number of images + all images data + all corresponding labels + # so the file target_size should be as follows + target_size = SIZE_INT64 + num_images * 3 * args.data_dim * args.data_dim * SIZE_FLOAT32 + num_images * SIZE_INT64 + if (os.path.getsize(bin_file_path) == target_size): + print( + "Success! The user data output binary file can be found at: {0}". 
+ format(bin_file_path)) + else: + print("Conversion failed!") + + +def main_preprocess_Imagenet(args): + parser = argparse.ArgumentParser( + description="Convert the full Imagenet val set or local data to binary file.", + usage=None, + add_help=True) + parser.add_argument( + '--local', + action="store_true", + help="If used, user need to set --data_dir and then convert file") + parser.add_argument( + "--data_dir", default="", type=str, help="Dataset root directory") + parser.add_argument( + "--label_list", + type=str, + default="val_list.txt", + help="List of object labels with same sequence as denoted in the annotation file" + ) + parser.add_argument( + "--output_file", + type=str, + default="imagenet_small.bin", + help="File path of the output binary file") + parser.add_argument( + "--data_dim", + type=int, + default=DATA_DIM, + help="Image preprocess with data_dim width and height") + + args = parser.parse_args() + if args.local: + convert_Imagenet_local2bin(args) + else: + run_convert() + + if __name__ == '__main__': - run_convert() + main_preprocess_Imagenet(sys.argv) diff --git a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py index 4c0df6d9e57..8a098aa1eb4 100644 --- a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py +++ b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py @@ -28,6 +28,8 @@ DATA_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.t DATA_DIR = os.path.expanduser("~/.cache/paddle/dataset/pascalvoc/") TAR_FILE = "VOCtest_06-Nov-2007.tar" TAR_PATH = os.path.join(DATA_DIR, TAR_FILE) +SIZE_FLOAT32 = 4 +SIZE_INT64 = 8 RESIZE_H = 300 RESIZE_W = 300 MEAN_VALUE = [127.5, 127.5, 127.5] @@ -60,6 +62,7 @@ def preprocess(img): def convert_pascalvoc_local2bin(args): data_dir = os.path.expanduser(args.data_dir) label_fpath = os.path.join(data_dir, args.label_file) + assert data_dir, 'Once set --local, user need to 
provide the --data_dir' flabel = open(label_fpath) label_list = [line.strip() for line in flabel] @@ -128,10 +131,14 @@ def convert_pascalvoc_local2bin(args): f1.close() object_nums_sum = sum(object_nums) - target_size = 8 + image_nums * 3 * args.resize_h * args.resize_h * 4 + image_nums * 8 + object_nums_sum * ( - 8 + 4 * 4 + 8) + # The data should contain + # number of images + all images data + an array that represents object numbers of each image + # + labels of all objects in images + bboxes of all objects + difficulties of all objects + # so the target size should be as follows: + target_size = SIZE_INT64 + image_nums * 3 * args.resize_h * args.resize_h * SIZE_FLOAT32 + image_nums * SIZE_INT64 + object_nums_sum * ( + SIZE_INT64 + 4 * SIZE_FLOAT32 + SIZE_INT64) if (os.path.getsize(output_file_path) == target_size): - print("Success! \nThe output binary file can be found at: ", + print("Success! \nThe local data output binary file can be found at: ", output_file_path) else: print("Conversion failed!") @@ -223,6 +230,9 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path): if line_idx % per_percentage: print_processbar(line_idx / per_percentage) + # The data should be stored in binary in the following sequence: + # number of images->all images data->an array that represents object numbers in each image + # ->labels of all objects in images->bboxes of all objects->difficulties of all objects f1.write(np.array(object_nums).astype('uint64').tobytes()) f1.write(np.array(lbls).astype('int64').tobytes()) f1.write(np.array(boxes).astype('float32').tobytes()) @@ -269,12 +279,11 @@ def main_pascalvoc_preprocess(args): usage=None, add_help=True) parser.add_argument( - '--choice', choices=['local', 'VOC_test_2007'], required=True) + '--local', + action="store_true", + help="If used, user need to set --data_dir and then convert file") parser.add_argument( - "--data_dir", - default="./third_party/inference_demo/int8v2/pascalvoc_small", - type=str, - help="Dataset root 
directory") + "--data_dir", default="", type=str, help="Dataset root directory") parser.add_argument( "--img_annotation_list", type=str, @@ -313,9 +322,9 @@ def main_pascalvoc_preprocess(args): default=AP_VERSION, help="Image preprocess with ap_version") args = parser.parse_args() - if args.choice == 'local': + if args.local: convert_pascalvoc_local2bin(args) - elif args.choice == 'VOC_test_2007': + else: run_convert() -- GitLab