Refine ppdet download (#3628)

* refine download sh to py * update QUICK_STARTED

Refine ppdet download (#3628)
* refine download sh to py * update QUICK_STARTED
8116ac6f · Kaipeng Deng · GitHub · db627af7 · db627af7 · 8116ac6f
11 changed files
--- a/dataset/coco/download.sh
+++ b/dataset/coco/download.sh
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd "$DIR"
-
-# Download the data.
-echo "Downloading..."
-wget http://images.cocodataset.org/zips/train2014.zip
-wget http://images.cocodataset.org/zips/val2014.zip
-wget http://images.cocodataset.org/zips/train2017.zip
-wget http://images.cocodataset.org/zips/val2017.zip
-wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
-wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
-# Extract the data.
-echo "Extracting..."
-unzip train2014.zip
-unzip val2014.zip
-unzip train2017.zip
-unzip val2017.zip
-unzip annotations_trainval2014.zip
-unzip annotations_trainval2017.zip
-
--- a/dataset/coco/download_coco.py
+++ b/dataset/coco/download_coco.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import sys
+import os.path as osp
+import logging
+
+from ppdet.utils.download import download_dataset
+
+# Show INFO-level log records, including those emitted while downloading.
+logging.basicConfig(level=logging.INFO)
+
+# Resolve the real path of the invoked script (sys.argv[0]) and use the
+# directory that contains it as the download root.
+script_file = osp.realpath(sys.argv[0])
+download_path = osp.dirname(script_file)
+download_dataset(download_path, 'coco')
--- a/dataset/fruit/download.sh
+++ b/dataset/fruit/download.sh
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd "$DIR"
-
-# Download the data.
-echo "Downloading..."
-wget https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar
-# Extract the data.
-echo "Extracting..."
-tar xvf fruit-detection.tar
-rm -rf fruit-detection.tar
--- a/dataset/fruit/download_fruit.py
+++ b/dataset/fruit/download_fruit.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import sys
+import os.path as osp
+import logging
+
+from ppdet.utils.download import download_dataset
+
+# Show INFO-level log records, including those emitted while downloading.
+logging.basicConfig(level=logging.INFO)
+
+# Resolve the real path of the invoked script (sys.argv[0]) and use the
+# directory that contains it as the download root.
+script_file = osp.realpath(sys.argv[0])
+download_path = osp.dirname(script_file)
+download_dataset(download_path, 'fruit')
--- a/dataset/voc/download.sh
+++ b/dataset/voc/download.sh
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd "$DIR"
-
-# Download the data.
-echo "Downloading..."
-wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
-wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
-wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
-# Extract the data.
-echo "Extracting..."
-tar -xf VOCtrainval_11-May-2012.tar
-tar -xf VOCtrainval_06-Nov-2007.tar
-tar -xf VOCtest_06-Nov-2007.tar
-
-echo "Creating data lists..."
-python -c 'from ppdet.utils.voc_utils import merge_and_create_list; merge_and_create_list("VOCdevkit", ["2007", "2012"], "VOCdevkit/VOC_all")'
--- a/dataset/voc/download_voc.py
+++ b/dataset/voc/download_voc.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import sys
+import os.path as osp
+import logging
+
+from ppdet.utils.download import download_dataset
+
+# Show INFO-level log records, including those emitted while downloading.
+logging.basicConfig(level=logging.INFO)
+
+# Resolve the real path of the invoked script (sys.argv[0]) and use the
+# directory that contains it as the download root.
+script_file = osp.realpath(sys.argv[0])
+download_path = osp.dirname(script_file)
+download_dataset(download_path, 'voc')
--- a/docs/INSTALL.md
+++ b/docs/INSTALL.md
@@ -110,15 +110,15 @@ On the other hand, to download the datasets, run the following commands:
 - COCO

 ```
-cd dataset/coco
-./download.sh
+export PYTHONPATH=$PYTHONPATH:.
+python dataset/coco/download_coco.py
 ```

 - Pascal VOC

 ```
-cd dataset/voc
-./download.sh
+export PYTHONPATH=$PYTHONPATH:.
+python dataset/voc/download_voc.py
 ```

 **Download datasets automatically:**

--- a/docs/INSTALL_cn.md
+++ b/docs/INSTALL_cn.md
@@ -109,15 +109,15 @@ ln -sf <path/to/voc> <path/to/paddle_detection>/dataset/voc
 - COCO

 ```
-cd dataset/coco
-./download.sh
+export PYTHONPATH=$PYTHONPATH:.
+python dataset/coco/download_coco.py
 ```

 - Pascal VOC

 ```
-cd dataset/voc
-./download.sh
+export PYTHONPATH=$PYTHONPATH:.
+python dataset/voc/download_voc.py
 ```

 **自动下载数据集：**

--- a/docs/QUICK_STARTED.md
+++ b/docs/QUICK_STARTED.md
@@ -6,11 +6,11 @@ This tutorial fine-tunes a tiny dataset by pretrained detection model for users

 ## Data Preparation

-Dataset refers to [Kaggle](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection), which contains 240 images in train dataset and 60 images in test dataset. Data categories are apple, orange and banana. Download [here](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar) and uncompress the dataset after download, script for data preparation is located at [download.sh](../dataset/fruit/download.sh). Command is as follows:
+The dataset comes from [Kaggle](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection) and contains 240 images in the training set and 60 images in the test set. The data categories are apple, orange and banana. Download it [here](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar) and uncompress it after downloading. The data preparation script is located at [download_fruit.py](../dataset/fruit/download_fruit.py). The command is as follows:

 ```bash
-cd dataset/fruit
-sh download.sh
+export PYTHONPATH=$PYTHONPATH:.
+python dataset/fruit/download_fruit.py
 ```

 - **Note: before started, run the following command and specifiy the GPU**

--- a/docs/QUICK_STARTED_cn.md
+++ b/docs/QUICK_STARTED_cn.md
@@ -6,11 +6,11 @@

 ## 数据准备

-数据集参考[Kaggle数据集](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection)，其中训练数据集240张图片，测试数据集60张图片，数据类别为3类：苹果，橘子，香蕉。[下载链接](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar)。数据下载后分别解压即可, 数据准备脚本位于[download.sh](../dataset/fruit/download.sh)。下载数据方式如下：
+数据集参考[Kaggle数据集](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection)，其中训练数据集240张图片，测试数据集60张图片，数据类别为3类：苹果，橘子，香蕉。[下载链接](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar)。数据下载后分别解压即可, 数据准备脚本位于[download_fruit.py](../dataset/fruit/download_fruit.py)。下载数据方式如下：

 ```bash
-cd dataset/fruit
-sh download.sh
+export PYTHONPATH=$PYTHONPATH:.
+python dataset/fruit/download_fruit.py
 ```

 - **注：在开始前，运行如下命令并指定GPU**

--- a/ppdet/utils/download.py
+++ b/ppdet/utils/download.py
@@ -35,7 +35,7 @@ __all__ = ['get_weights_path', 'get_dataset_path']
 WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights")
 DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset")

-# dict of {dataset_name: (downalod_info, sub_dirs)}
+# dict of {dataset_name: (download_info, sub_dirs)}
 # download info: (url, md5sum)
 DATASETS = {
    'coco': ([
@@ -60,6 +60,11 @@ DATASETS = {
            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar',
            'b6e924de25625d8de591ea690078ad9f', ),
    ], ["VOCdevkit/VOC_all"]),
+    'fruit': ([
+        (
+            'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar',
+            '374554a7633b1b68d6a5fbb7c061b8ba', ),
+    ], ["fruit-detection"]),
 }

 DOWNLOAD_RETRY_LIMIT = 3
@@ -103,9 +108,19 @@ def get_dataset_path(path, annotation, image_dir):

            # voc should merge dir and create list after download
            if name == 'voc':
+                _merge_voc_dir(data_dir, dataset[1][0])
+            return data_dir
+
+    # not match any dataset in DATASETS
+    raise ValueError("Dataset {} is not valid and cannot parse dataset type "
+                     "'{}' for automaticly downloading, which only supports "
+                     "'voc' and 'coco' currently".format(path, osp.split(path)[-1]))
+
+
+def _merge_voc_dir(data_dir, output_subdir):
    logger.info("Download voc dataset successed, merge "
                "VOC2007 and VOC2012 to VOC_all...")
-                output_dir = osp.join(data_dir, dataset[1][0])
+    output_dir = osp.join(data_dir, output_subdir)
    devkit_dir = "/".join(output_dir.split('/')[:-1])
    years = ['2007', '2012']
    # merge dir in output_tmp_dir at first, move to 
@@ -113,7 +128,7 @@ def get_dataset_path(path, annotation, image_dir):
    output_tmp_dir = osp.join(data_dir, 'tmp')
    if osp.isdir(output_tmp_dir):
        shutil.rmtree(output_tmp_dir)
-                # NOTE(dengkaipeng): since using auto download VOC
+    # NOTE: since using auto download VOC
    # dataset, VOC default label list should be used, 
    # do not generate label_list.txt here. For default
    # label, see ../data/source/voc_loader.py
@@ -122,12 +137,6 @@ def get_dataset_path(path, annotation, image_dir):
    # remove source directory VOC2007 and VOC2012
    shutil.rmtree(osp.join(devkit_dir, "VOC2007"))
    shutil.rmtree(osp.join(devkit_dir, "VOC2012"))
-            return data_dir
-
-    # not match any dataset in DATASETS
-    raise ValueError("Dataset {} is not valid and cannot parse dataset type "
-                     "'{}' for automaticly downloading, which only supports "
-                     "'voc' and 'coco' currently".format(path, osp.split(path)[-1]))


 def map_path(url, root_dir):
@@ -173,6 +182,19 @@ def get_path(url, root_dir, md5sum=None):
    return fullpath


+def download_dataset(path, dataset=None):
+    """
+    Download every archive registered for a dataset in DATASETS
+
+    Args:
+        path (str): root directory passed to get_path for each archive.
+        dataset (str): key of DATASETS naming the dataset to fetch.
+
+    Returns:
+        None. An unknown dataset name is logged as an error and
+        nothing is downloaded.
+    """
+    if dataset not in DATASETS:
+        # format the names as a plain sorted list, a keys() view
+        # renders as "dict_keys([...])" under Python 3
+        logger.error("Unknown dataset {}, it should be one of "
+                     "{}".format(dataset, sorted(DATASETS)))
+        return
+    # DATASETS maps name -> (download_info, sub_dirs), where each
+    # download_info entry is (url, md5sum)
+    dataset_info = DATASETS[dataset][0]
+    for url, md5sum in dataset_info:
+        get_path(url, path, md5sum)
+    if dataset == 'voc':
+        # voc should merge VOC2007 and VOC2012 after download
+        _merge_voc_dir(path, DATASETS[dataset][1][0])
+    logger.info("Download dataset {} finished.".format(dataset))
+
+
 def _dataset_exists(path, annotation, image_dir):
    """
    Check if user define dataset exists