From 8116ac6f2ccc23153264b51450e773e0f2e2eafe Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Thu, 17 Oct 2019 11:25:47 +0800 Subject: [PATCH] Refine ppdet download (#3628) * refine download sh to py * update QUICK_STARTED --- dataset/coco/download.sh | 20 ----------- dataset/coco/download_coco.py | 25 +++++++++++++ dataset/fruit/download.sh | 10 ------ dataset/fruit/download_fruit.py | 25 +++++++++++++ dataset/voc/download.sh | 16 --------- dataset/voc/download_voc.py | 25 +++++++++++++ docs/INSTALL.md | 8 ++--- docs/INSTALL_cn.md | 8 ++--- docs/QUICK_STARTED.md | 6 ++-- docs/QUICK_STARTED_cn.md | 6 ++-- ppdet/utils/download.py | 62 ++++++++++++++++++++++----------- 11 files changed, 131 insertions(+), 80 deletions(-) delete mode 100644 dataset/coco/download.sh create mode 100644 dataset/coco/download_coco.py delete mode 100644 dataset/fruit/download.sh create mode 100644 dataset/fruit/download_fruit.py delete mode 100755 dataset/voc/download.sh create mode 100644 dataset/voc/download_voc.py diff --git a/dataset/coco/download.sh b/dataset/coco/download.sh deleted file mode 100644 index 6f262cceb..000000000 --- a/dataset/coco/download.sh +++ /dev/null @@ -1,20 +0,0 @@ -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd "$DIR" - -# Download the data. -echo "Downloading..." -wget http://images.cocodataset.org/zips/train2014.zip -wget http://images.cocodataset.org/zips/val2014.zip -wget http://images.cocodataset.org/zips/train2017.zip -wget http://images.cocodataset.org/zips/val2017.zip -wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip -wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip -# Extract the data. -echo "Extracting..." -unzip train2014.zip -unzip val2014.zip -unzip train2017.zip -unzip val2017.zip -unzip annotations_trainval2014.zip -unzip annotations_trainval2017.zip - diff --git a/dataset/coco/download_coco.py b/dataset/coco/download_coco.py new file mode 100644 index 000000000..2b4f7e764 --- /dev/null +++ b/dataset/coco/download_coco.py @@ -0,0 +1,25 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import sys +import os.path as osp +import logging + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'coco') diff --git a/dataset/fruit/download.sh b/dataset/fruit/download.sh deleted file mode 100644 index 2ea8d72c2..000000000 --- a/dataset/fruit/download.sh +++ /dev/null @@ -1,10 +0,0 @@ -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd "$DIR" - -# Download the data. -echo "Downloading..." -wget https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar -# Extract the data. -echo "Extracting..." 
-tar xvf fruit-detection.tar -rm -rf fruit-detection.tar diff --git a/dataset/fruit/download_fruit.py b/dataset/fruit/download_fruit.py new file mode 100644 index 000000000..5cce18895 --- /dev/null +++ b/dataset/fruit/download_fruit.py @@ -0,0 +1,25 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import sys +import os.path as osp +import logging + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'fruit') diff --git a/dataset/voc/download.sh b/dataset/voc/download.sh deleted file mode 100755 index 2c7341a41..000000000 --- a/dataset/voc/download.sh +++ /dev/null @@ -1,16 +0,0 @@ -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd "$DIR" - -# Download the data. -echo "Downloading..." -wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar -wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar -wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar -# Extract the data. -echo "Extracting..." -tar -xf VOCtrainval_11-May-2012.tar -tar -xf VOCtrainval_06-Nov-2007.tar -tar -xf VOCtest_06-Nov-2007.tar - -echo "Creating data lists..." -python -c 'from ppdet.utils.voc_utils import merge_and_create_list; merge_and_create_list("VOCdevkit", ["2007", "2012"], "VOCdevkit/VOC_all")' diff --git a/dataset/voc/download_voc.py b/dataset/voc/download_voc.py new file mode 100644 index 000000000..e7f32657f --- /dev/null +++ b/dataset/voc/download_voc.py @@ -0,0 +1,25 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import sys +import os.path as osp +import logging + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'voc') diff --git a/docs/INSTALL.md b/docs/INSTALL.md index 19759bf3d..3f99b19b6 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -110,15 +110,15 @@ On the other hand, to download the datasets, run the following commands: - COCO ``` -cd dataset/coco -./download.sh +export PYTHONPATH=$PYTHONPATH:. +python dataset/coco/download_coco.py ``` - Pascal VOC ``` -cd dataset/voc -./download.sh +export PYTHONPATH=$PYTHONPATH:. 
+python dataset/voc/download_voc.py ``` **Download datasets automatically:** diff --git a/docs/INSTALL_cn.md b/docs/INSTALL_cn.md index 1003bb850..6fe1c484e 100644 --- a/docs/INSTALL_cn.md +++ b/docs/INSTALL_cn.md @@ -109,15 +109,15 @@ ln -sf /dataset/voc - COCO ``` -cd dataset/coco -./download.sh +export PYTHONPATH=$PYTHONPATH:. +python dataset/coco/download_coco.py ``` - Pascal VOC ``` -cd dataset/voc -./download.sh +export PYTHONPATH=$PYTHONPATH:. +python dataset/voc/download_voc.py ``` **自动下载数据集:** diff --git a/docs/QUICK_STARTED.md b/docs/QUICK_STARTED.md index d93f76b2d..5b687b35e 100644 --- a/docs/QUICK_STARTED.md +++ b/docs/QUICK_STARTED.md @@ -6,11 +6,11 @@ This tutorial fine-tunes a tiny dataset by pretrained detection model for users ## Data Preparation -Dataset refers to [Kaggle](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection), which contains 240 images in train dataset and 60 images in test dataset. Data categories are apple, orange and banana. Download [here](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar) and uncompress the dataset after download, script for data preparation is located at [download.sh](../dataset/fruit/download.sh). Command is as follows: +Dataset refers to [Kaggle](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection), which contains 240 images in train dataset and 60 images in test dataset. Data categories are apple, orange and banana. Download [here](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar) and uncompress the dataset after download, script for data preparation is located at [download_fruit.py](../dataset/fruit/download_fruit.py). Command is as follows: ```bash -cd dataset/fruit -sh download.sh +export PYTHONPATH=$PYTHONPATH:. +python dataset/fruit/download_fruit.py ``` - **Note: before started, run the following command and specifiy the GPU** diff --git a/docs/QUICK_STARTED_cn.md b/docs/QUICK_STARTED_cn.md index fe15870b2..78c019a6b 100644 --- a/docs/QUICK_STARTED_cn.md +++ b/docs/QUICK_STARTED_cn.md @@ -6,11 +6,11 @@ ## 数据准备 -数据集参考[Kaggle数据集](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection),其中训练数据集240张图片,测试数据集60张图片,数据类别为3类:苹果,橘子,香蕉。[下载链接](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar)。数据下载后分别解压即可, 数据准备脚本位于[download.sh](../dataset/fruit/download.sh)。下载数据方式如下: +数据集参考[Kaggle数据集](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection),其中训练数据集240张图片,测试数据集60张图片,数据类别为3类:苹果,橘子,香蕉。[下载链接](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar)。数据下载后分别解压即可, 数据准备脚本位于[download_fruit.py](../dataset/fruit/download_fruit.py)。下载数据方式如下: ```bash -cd dataset/fruit -sh download.sh +export PYTHONPATH=$PYTHONPATH:. 
+python dataset/fruit/download_fruit.py ``` - **注:在开始前,运行如下命令并指定GPU** diff --git a/ppdet/utils/download.py b/ppdet/utils/download.py index b40e1404d..05f627491 100644 --- a/ppdet/utils/download.py +++ b/ppdet/utils/download.py @@ -35,7 +35,7 @@ __all__ = ['get_weights_path', 'get_dataset_path'] WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights") DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset") -# dict of {dataset_name: (downalod_info, sub_dirs)} +# dict of {dataset_name: (download_info, sub_dirs)} # download info: (url, md5sum) DATASETS = { 'coco': ([ @@ -60,6 +60,11 @@ DATASETS = { 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar', 'b6e924de25625d8de591ea690078ad9f', ), ], ["VOCdevkit/VOC_all"]), + 'fruit': ([ + ( + 'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar', + '374554a7633b1b68d6a5fbb7c061b8ba', ), + ], ["fruit-detection"]), } DOWNLOAD_RETRY_LIMIT = 3 @@ -103,25 +108,7 @@ def get_dataset_path(path, annotation, image_dir): # voc should merge dir and create list after download if name == 'voc': - logger.info("Download voc dataset successed, merge " - "VOC2007 and VOC2012 to VOC_all...") - output_dir = osp.join(data_dir, dataset[1][0]) - devkit_dir = "/".join(output_dir.split('/')[:-1]) - years = ['2007', '2012'] - # merge dir in output_tmp_dir at first, move to - # output_dir after merge sucessed. - output_tmp_dir = osp.join(data_dir, 'tmp') - if osp.isdir(output_tmp_dir): - shutil.rmtree(output_tmp_dir) - # NOTE(dengkaipeng): since using auto download VOC - # dataset, VOC default label list should be used, - # do not generate label_list.txt here. For default - # label, see ../data/source/voc_loader.py - merge_and_create_list(devkit_dir, years, output_tmp_dir) - shutil.move(output_tmp_dir, output_dir) - # remove source directory VOC2007 and VOC2012 - shutil.rmtree(osp.join(devkit_dir, "VOC2007")) - shutil.rmtree(osp.join(devkit_dir, "VOC2012")) + _merge_voc_dir(data_dir, dataset[1][0]) return data_dir # not match any dataset in DATASETS @@ -130,6 +117,28 @@ def get_dataset_path(path, annotation, image_dir): "'voc' and 'coco' currently".format(path, osp.split(path)[-1])) +def _merge_voc_dir(data_dir, output_subdir): + logger.info("Download voc dataset successed, merge " + "VOC2007 and VOC2012 to VOC_all...") + output_dir = osp.join(data_dir, output_subdir) + devkit_dir = "/".join(output_dir.split('/')[:-1]) + years = ['2007', '2012'] + # merge dir in output_tmp_dir at first, move to + # output_dir after merge sucessed. + output_tmp_dir = osp.join(data_dir, 'tmp') + if osp.isdir(output_tmp_dir): + shutil.rmtree(output_tmp_dir) + # NOTE: since using auto download VOC + # dataset, VOC default label list should be used, + # do not generate label_list.txt here. 
For default + # label, see ../data/source/voc_loader.py + merge_and_create_list(devkit_dir, years, output_tmp_dir) + shutil.move(output_tmp_dir, output_dir) + # remove source directory VOC2007 and VOC2012 + shutil.rmtree(osp.join(devkit_dir, "VOC2007")) + shutil.rmtree(osp.join(devkit_dir, "VOC2012")) + + def map_path(url, root_dir): # parse path after download to decompress under root_dir fname = url.split('/')[-1] @@ -173,6 +182,19 @@ def get_path(url, root_dir, md5sum=None): return fullpath +def download_dataset(path, dataset=None): + if dataset not in DATASETS.keys(): + logger.error("Unknown dataset {}, it should be " + "{}".format(dataset, DATASETS.keys())) + return + dataset_info = DATASETS[dataset][0] + for info in dataset_info: + get_path(info[0], path, info[1]) + if dataset == 'voc': + _merge_voc_dir(path, DATASETS[dataset][1][0]) + logger.info("Download dataset {} finished.".format(dataset)) + + def _dataset_exists(path, annotation, image_dir): """ Check if user define dataset exists -- GitLab
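
Usage note (not part of the patch): the new per-dataset scripts are thin wrappers around the `download_dataset(path, dataset)` helper added in `ppdet/utils/download.py`, so the helper can also be called directly. A minimal sketch, assuming the PaddleDetection repository root is on `PYTHONPATH` (as the updated INSTALL/QUICK_STARTED docs describe) and that the target directory is writable:

```python
import logging

from ppdet.utils.download import download_dataset

logging.basicConfig(level=logging.INFO)

# Fetch, verify (md5) and extract the fruit demo dataset into dataset/fruit.
# 'coco' and 'voc' are the other supported names; for 'voc' the helper also
# merges VOC2007/VOC2012 into VOCdevkit/VOC_all via _merge_voc_dir.
download_dataset("dataset/fruit", "fruit")
```

Running `python dataset/fruit/download_fruit.py` from the repository root should be equivalent, since that script just resolves its own directory and passes it to `download_dataset`.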