From 1b45847e470d4c0cc26ee969e5191431a68b7736 Mon Sep 17 00:00:00 2001
From: lidanqing <danqing.li@intel.com>
Date: Thu, 23 Apr 2020 10:47:08 +0200
Subject: [PATCH] Add user local data preprocess support (#23692) (#24075)

---
 .../fluid/inference/tests/api/CMakeLists.txt  | 22 ++++-
 .../api/full_ILSVRC2012_val_preprocess.py     | 88 ++++++++++++++++++-
 .../api/full_pascalvoc_test_preprocess.py     | 29 +++---
 3 files changed, 124 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 13c48fcf576..ab11c7d8215 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -93,6 +93,13 @@ function(inference_analysis_api_qat_test_run TARGET_NAME test_binary fp32_model_
              --iterations=2)
 endfunction()
 
+function(preprocess_data2bin_test_run target py_script_source data_dir output_file)
+	py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source}
+	        ARGS --data_dir=${data_dir}
+		     --output_file=${output_file}
+		     --local)
+endfunction()
+
 if(NOT APPLE AND WITH_MKLML)
     # RNN1
     set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
@@ -286,8 +293,6 @@ if(WITH_MKLDNN)
   # download dataset if necessary
   download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz")
 
-  # download small demo set of pascalvoc for testing local userdata preprocessing
-  download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz")
 
   # build test binary to be used in subsequent tests
   inference_analysis_api_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC})
@@ -320,6 +325,19 @@ if(WITH_MKLDNN)
   set(MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC "mkldnn_quantizer_config_tester.cc")
   inference_analysis_api_test_build(${MKLDNN_QUANTIZER_CONFIG_TEST_APP} ${MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC})
   inference_analysis_test_run(test_mkldnn_quantizer_config COMMAND ${MKLDNN_QUANTIZER_CONFIG_TEST_APP})
+
+  # test converting a small local ImageNet data set to a binary file
+    download_int8_data(${INT8_DATA_DIR} "imagenet_small.tar.gz")
+    set(IMAGENET_SMALL_DATA_DIR "${INT8_DATA_DIR}/imagenet_small")
+    set(IMAGENET_SMALL_OUTPUT_FILE "imagenet_small.bin")
+    preprocess_data2bin_test_run(preprocess_local_imagenet "full_ILSVRC2012_val_preprocess.py" ${IMAGENET_SMALL_DATA_DIR} ${IMAGENET_SMALL_OUTPUT_FILE})
+    
+  # test converting a small local Pascal VOC data set to a binary file
+  download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz")
+  set(PASCALVOC_SMALL_DATA_DIR "${INT8_DATA_DIR}/pascalvoc_small")
+  set(PASCALVOC_SMALL_OUTPUT_FILE "pascalvoc_small.bin")
+  preprocess_data2bin_test_run(preprocess_local_pascalvoc "full_pascalvoc_test_preprocess.py" ${PASCALVOC_SMALL_DATA_DIR} ${PASCALVOC_SMALL_OUTPUT_FILE})
+
 endif()
 
 # bert, max_len=20, embedding_dim=128
diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
index 826c45311f4..c5610961d65 100644
--- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
+++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
@@ -24,6 +24,7 @@ import math
 from paddle.dataset.common import download
 import tarfile
 import StringIO
+import argparse
 
 random.seed(0)
 np.random.seed(0)
@@ -131,7 +132,7 @@ def check_integrity(filename, target_hash):
         return False
 
 
-def convert(tar_file, output_file):
+def convert_Imagenet_tar2bin(tar_file, output_file):
     print('Converting 50000 images to binary file ...\n')
     tar = tarfile.open(name=tar_file, mode='r:gz')
 
@@ -205,9 +206,111 @@ def run_convert():
                 "Can not convert the dataset to binary file with try limit {0}".
                 format(try_limit))
         download_concat(cache_folder, zip_path)
-        convert(zip_path, output_file)
+        convert_Imagenet_tar2bin(zip_path, output_file)
     print("\nSuccess! The binary file can be found at {0}".format(output_file))
 
 
+def convert_Imagenet_local2bin(args):
+    """Convert a local image dataset into a single binary file.
+
+    The label list ``args.label_list`` is looked up inside ``args.data_dir``;
+    every non-empty line holds an image path (relative to ``args.data_dir``)
+    and an integer label, separated by whitespace. Each image goes through
+    ``process_image`` and is stored as float32; each label is stored as int64.
+
+    The output file ``args.output_file`` is created inside ``args.data_dir``
+    with the layout:
+    number of images (int64) -> all images data -> all corresponding labels.
+
+    Listed images that do not exist on disk are skipped with a warning. They
+    are not counted in the header, so the file never contains unwritten gaps.
+    """
+    data_dir = args.data_dir
+    assert data_dir, 'Once set --local, user need to provide the --data_dir'
+    label_list_path = os.path.join(data_dir, args.label_list)
+    bin_file_path = os.path.join(data_dir, args.output_file)
+
+    # Collect the usable (image path, label) pairs first, so that the image
+    # count written to the header matches the records that follow it.
+    samples = []
+    with open(label_list_path) as flist:
+        for line in flist:
+            line = line.strip()
+            if not line:
+                continue
+            img_path, label = line.split()
+            img_path = os.path.join(data_dir, img_path)
+            if not os.path.exists(img_path):
+                print("Warning: skip missing image {0}".format(img_path))
+                continue
+            samples.append((img_path, int(label)))
+    num_images = len(samples)
+
+    with open(bin_file_path, "w+b") as of:
+        of.write(np.array(num_images).astype('int64').tobytes())
+
+        #save images(float32) to file, one right after another
+        for img_path, _ in samples:
+            img = process_image(Image.open(img_path))
+            of.write(np.array(img).astype('float32').tobytes())
+
+        #save labels(int64_t) to file, right after the last image
+        labels = [label for _, label in samples]
+        of.write(np.array(labels).astype('int64').tobytes())
+
+    # The bin file should contain
+    # number of images + all images data + all corresponding labels
+    # so the file target_size should be as follows
+    target_size = SIZE_INT64 + num_images * 3 * args.data_dim * args.data_dim * SIZE_FLOAT32 + num_images * SIZE_INT64
+    if os.path.getsize(bin_file_path) == target_size:
+        print(
+            "Success! The user data output binary file can be found at: {0}".
+            format(bin_file_path))
+    else:
+        print("Conversion failed!")
+
+
+def main_preprocess_Imagenet(args):
+    """Parse the command line and run the requested conversion.
+
+    Args:
+        args: the full argument vector including the program name, e.g.
+            ``sys.argv``. If empty or None, argparse reads ``sys.argv``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert the full Imagenet val set or local data to binary file.",
+        usage=None,
+        add_help=True)
+    parser.add_argument(
+        '--local',
+        action="store_true",
+        help="If used, user need to set --data_dir and then convert file")
+    parser.add_argument(
+        "--data_dir", default="", type=str, help="Dataset root directory")
+    parser.add_argument(
+        "--label_list",
+        type=str,
+        default="val_list.txt",
+        help="File under --data_dir listing one '<image path> <label>' pair per line"
+    )
+    parser.add_argument(
+        "--output_file",
+        type=str,
+        default="imagenet_small.bin",
+        help="File path of the output binary file")
+    parser.add_argument(
+        "--data_dim",
+        type=int,
+        default=DATA_DIM,
+        help="Image preprocess with data_dim width and height")
+
+    # Skip the program name; fall back to sys.argv when nothing is passed.
+    parsed_args = parser.parse_args(args[1:] if args else None)
+    if parsed_args.local:
+        convert_Imagenet_local2bin(parsed_args)
+    else:
+        run_convert()
+
+
 if __name__ == '__main__':
-    run_convert()
+    main_preprocess_Imagenet(sys.argv)
diff --git a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py
index 4c0df6d9e57..8a098aa1eb4 100644
--- a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py
+++ b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py
@@ -28,6 +28,8 @@ DATA_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.t
 DATA_DIR = os.path.expanduser("~/.cache/paddle/dataset/pascalvoc/")
 TAR_FILE = "VOCtest_06-Nov-2007.tar"
 TAR_PATH = os.path.join(DATA_DIR, TAR_FILE)
+SIZE_FLOAT32 = 4
+SIZE_INT64 = 8
 RESIZE_H = 300
 RESIZE_W = 300
 MEAN_VALUE = [127.5, 127.5, 127.5]
@@ -60,6 +62,7 @@ def preprocess(img):
 def convert_pascalvoc_local2bin(args):
     data_dir = os.path.expanduser(args.data_dir)
     label_fpath = os.path.join(data_dir, args.label_file)
+    assert data_dir, 'Once set --local, user need to provide the --data_dir'
     flabel = open(label_fpath)
     label_list = [line.strip() for line in flabel]
 
@@ -128,10 +131,14 @@ def convert_pascalvoc_local2bin(args):
     f1.close()
 
     object_nums_sum = sum(object_nums)
-    target_size = 8 + image_nums * 3 * args.resize_h * args.resize_h * 4 + image_nums * 8 + object_nums_sum * (
-        8 + 4 * 4 + 8)
+    # The output file should contain:
+    # number of images + all images data + an array that represents the object count of each image
+    # + labels of all objects in images + bboxes of all objects + difficulties of all objects,
+    # so the target size should be as follows:
+    target_size = SIZE_INT64 + image_nums * 3 * args.resize_h * args.resize_h * SIZE_FLOAT32 + image_nums * SIZE_INT64 + object_nums_sum * (
+        SIZE_INT64 + 4 * SIZE_FLOAT32 + SIZE_INT64)
     if (os.path.getsize(output_file_path) == target_size):
-        print("Success! \nThe output binary file can be found at: ",
+        print("Success! \nThe local data output binary file can be found at: ",
               output_file_path)
     else:
         print("Conversion failed!")
@@ -223,6 +230,9 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path):
         if line_idx % per_percentage:
             print_processbar(line_idx / per_percentage)
 
+    # The data is stored in binary in the following sequence:
+    # number of images->all images data->an array that represents the object count of each image
+    # ->labels of all objects in images->bboxes of all objects->difficulties of all objects
     f1.write(np.array(object_nums).astype('uint64').tobytes())
     f1.write(np.array(lbls).astype('int64').tobytes())
     f1.write(np.array(boxes).astype('float32').tobytes())
@@ -269,12 +279,11 @@ def main_pascalvoc_preprocess(args):
         usage=None,
         add_help=True)
     parser.add_argument(
-        '--choice', choices=['local', 'VOC_test_2007'], required=True)
+        '--local',
+        action="store_true",
+        help="If used, user need to set --data_dir and then convert file")
     parser.add_argument(
-        "--data_dir",
-        default="./third_party/inference_demo/int8v2/pascalvoc_small",
-        type=str,
-        help="Dataset root directory")
+        "--data_dir", default="", type=str, help="Dataset root directory")
     parser.add_argument(
         "--img_annotation_list",
         type=str,
@@ -313,9 +322,9 @@ def main_pascalvoc_preprocess(args):
         default=AP_VERSION,
         help="Image preprocess with ap_version")
     args = parser.parse_args()
-    if args.choice == 'local':
+    if args.local:
         convert_pascalvoc_local2bin(args)
-    elif args.choice == 'VOC_test_2007':
+    else:
         run_convert()
 
 
-- 
GitLab