提交 8116ac6f 编写于 作者: K Kaipeng Deng 提交者: GitHub

Refine ppdet download (#3628)

* refine download sh to py

* update QUICK_STARTED
上级 db627af7
# Download and extract the COCO 2014/2017 images and annotations into the
# directory that contains this script.

# Resolve the script's own directory. Abort when it cannot be resolved or
# entered, so the archives never land in an unrelated working directory.
DIR="$( cd "$(dirname "$0")" && pwd -P )" || exit 1
cd "$DIR" || exit 1

# Download the data.
echo "Downloading..."
wget http://images.cocodataset.org/zips/train2014.zip
wget http://images.cocodataset.org/zips/val2014.zip
wget http://images.cocodataset.org/zips/train2017.zip
wget http://images.cocodataset.org/zips/val2017.zip
wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip

# Extract the data.
echo "Extracting..."
unzip train2014.zip
unzip val2014.zip
unzip train2017.zip
unzip val2017.zip
unzip annotations_trainval2014.zip
unzip annotations_trainval2017.zip
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os.path as osp
import logging
from ppdet.utils.download import download_dataset
# Show INFO-level log records on the console while the script runs.
logging.basicConfig(level=logging.INFO)
# Use the directory holding this script (symlinks resolved) as the target,
# regardless of the working directory the script is launched from.
download_path = osp.dirname(osp.realpath(sys.argv[0]))
download_dataset(download_path, dataset='coco')
# Download and extract the fruit-detection demo dataset into the directory
# that contains this script.

# Resolve the script's own directory. Abort when it cannot be resolved or
# entered, so the archive never lands in an unrelated working directory.
DIR="$( cd "$(dirname "$0")" && pwd -P )" || exit 1
cd "$DIR" || exit 1

# Download the data.
echo "Downloading..."
wget https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar

# Extract the data.
echo "Extracting..."
# Only delete the archive once extraction succeeded; otherwise keep it so
# the failure can be inspected without downloading again.
tar xvf fruit-detection.tar && rm -rf fruit-detection.tar
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os.path as osp
import logging
from ppdet.utils.download import download_dataset
# Show INFO-level log records on the console while the script runs.
logging.basicConfig(level=logging.INFO)
# Use the directory holding this script (symlinks resolved) as the target,
# regardless of the working directory the script is launched from.
download_path = osp.dirname(osp.realpath(sys.argv[0]))
download_dataset(download_path, dataset='fruit')
# Download and extract Pascal VOC 2007/2012 into the directory that contains
# this script, then build the merged data lists under VOCdevkit/VOC_all.

# Resolve the script's own directory. Abort when it cannot be resolved or
# entered, so the archives never land in an unrelated working directory.
DIR="$( cd "$(dirname "$0")" && pwd -P )" || exit 1
cd "$DIR" || exit 1

# Download the data.
echo "Downloading..."
wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar

# Extract the data.
echo "Extracting..."
tar -xf VOCtrainval_11-May-2012.tar
tar -xf VOCtrainval_06-Nov-2007.tar
tar -xf VOCtest_06-Nov-2007.tar

# Requires the ppdet package to be importable (e.g. via PYTHONPATH).
echo "Creating data lists..."
python -c 'from ppdet.utils.voc_utils import merge_and_create_list; merge_and_create_list("VOCdevkit", ["2007", "2012"], "VOCdevkit/VOC_all")'
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os.path as osp
import logging
from ppdet.utils.download import download_dataset
# Show INFO-level log records on the console while the script runs.
logging.basicConfig(level=logging.INFO)
# Use the directory holding this script (symlinks resolved) as the target,
# regardless of the working directory the script is launched from.
download_path = osp.dirname(osp.realpath(sys.argv[0]))
download_dataset(download_path, dataset='voc')
......@@ -110,15 +110,15 @@ On the other hand, to download the datasets, run the following commands:
- COCO
```
cd dataset/coco
./download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/coco/download_coco.py
```
- Pascal VOC
```
cd dataset/voc
./download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/voc/download_voc.py
```
**Download datasets automatically:**
......
......@@ -109,15 +109,15 @@ ln -sf <path/to/voc> <path/to/paddle_detection>/dataset/voc
- COCO
```
cd dataset/coco
./download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/coco/download_coco.py
```
- Pascal VOC
```
cd dataset/voc
./download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/voc/download_voc.py
```
**自动下载数据集:**
......
......@@ -6,11 +6,11 @@ This tutorial fine-tunes a tiny dataset by pretrained detection model for users
## Data Preparation
Dataset refers to [Kaggle](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection), which contains 240 images in train dataset and 60 images in test dataset. Data categories are apple, orange and banana. Download [here](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar) and uncompress the dataset after download, script for data preparation is located at [download.sh](../dataset/fruit/download.sh). Command is as follows:
The dataset comes from [Kaggle](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection) and contains 240 training images and 60 test images in three categories: apple, orange and banana. Download it [here](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar) and uncompress it; the data preparation script is located at [download_fruit.py](../dataset/fruit/download_fruit.py). Run it as follows:
```bash
cd dataset/fruit
sh download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/fruit/download_fruit.py
```
- **Note: before starting, run the following command and specify the GPU**
......
......@@ -6,11 +6,11 @@
## 数据准备
数据集参考[Kaggle数据集](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection),其中训练数据集240张图片,测试数据集60张图片,数据类别为3类:苹果,橘子,香蕉。[下载链接](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar)。数据下载后分别解压即可, 数据准备脚本位于[download.sh](../dataset/fruit/download.sh)。下载数据方式如下:
数据集参考[Kaggle数据集](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection),其中训练数据集240张图片,测试数据集60张图片,数据类别为3类:苹果,橘子,香蕉。[下载链接](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar)。数据下载后分别解压即可, 数据准备脚本位于[download_fruit.py](../dataset/fruit/download_fruit.py)。下载数据方式如下:
```bash
cd dataset/fruit
sh download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/fruit/download_fruit.py
```
- **注:在开始前,运行如下命令并指定GPU**
......
......@@ -35,7 +35,7 @@ __all__ = ['get_weights_path', 'get_dataset_path']
WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights")
DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset")
# dict of {dataset_name: (downalod_info, sub_dirs)}
# dict of {dataset_name: (download_info, sub_dirs)}
# download info: (url, md5sum)
DATASETS = {
'coco': ([
......@@ -60,6 +60,11 @@ DATASETS = {
'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar',
'b6e924de25625d8de591ea690078ad9f', ),
], ["VOCdevkit/VOC_all"]),
'fruit': ([
(
'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar',
'374554a7633b1b68d6a5fbb7c061b8ba', ),
], ["fruit-detection"]),
}
DOWNLOAD_RETRY_LIMIT = 3
......@@ -103,25 +108,7 @@ def get_dataset_path(path, annotation, image_dir):
# voc should merge dir and create list after download
if name == 'voc':
logger.info("Download voc dataset successed, merge "
"VOC2007 and VOC2012 to VOC_all...")
output_dir = osp.join(data_dir, dataset[1][0])
devkit_dir = "/".join(output_dir.split('/')[:-1])
years = ['2007', '2012']
# merge dir in output_tmp_dir at first, move to
# output_dir after merge sucessed.
output_tmp_dir = osp.join(data_dir, 'tmp')
if osp.isdir(output_tmp_dir):
shutil.rmtree(output_tmp_dir)
# NOTE(dengkaipeng): since using auto download VOC
# dataset, VOC default label list should be used,
# do not generate label_list.txt here. For default
# label, see ../data/source/voc_loader.py
merge_and_create_list(devkit_dir, years, output_tmp_dir)
shutil.move(output_tmp_dir, output_dir)
# remove source directory VOC2007 and VOC2012
shutil.rmtree(osp.join(devkit_dir, "VOC2007"))
shutil.rmtree(osp.join(devkit_dir, "VOC2012"))
_merge_voc_dir(data_dir, dataset[1][0])
return data_dir
# not match any dataset in DATASETS
......@@ -130,6 +117,28 @@ def get_dataset_path(path, annotation, image_dir):
"'voc' and 'coco' currently".format(path, osp.split(path)[-1]))
def _merge_voc_dir(data_dir, output_subdir):
    """Merge the VOC2007 and VOC2012 trees into a single dataset directory.

    Args:
        data_dir: root directory under which the VOC data lives.
        output_subdir: location of the merged dataset, relative to
            ``data_dir``. Everything before its last ``/`` is used as the
            devkit directory that holds ``VOC2007`` and ``VOC2012``.
    """
    logger.info("Download voc dataset successed, merge "
                "VOC2007 and VOC2012 to VOC_all...")

    merged_dir = osp.join(data_dir, output_subdir)
    # Strip the last '/'-separated component to get the devkit directory.
    devkit_dir = merged_dir.rpartition('/')[0]

    # Build the merged dataset in a scratch directory first and only move
    # it into place once the merge has finished; a stale scratch directory
    # from an earlier run is discarded up front.
    staging_dir = osp.join(data_dir, 'tmp')
    if osp.isdir(staging_dir):
        shutil.rmtree(staging_dir)

    # NOTE: for the automatically downloaded VOC dataset the default VOC
    # label list is meant to be used, so no label_list.txt is requested
    # here. For the default labels, see ../data/source/voc_loader.py
    merge_and_create_list(devkit_dir, ['2007', '2012'], staging_dir)
    shutil.move(staging_dir, merged_dir)

    # The per-year source trees are removed after the merge.
    for year_dir in ("VOC2007", "VOC2012"):
        shutil.rmtree(osp.join(devkit_dir, year_dir))
def map_path(url, root_dir):
# parse path after download to decompress under root_dir
fname = url.split('/')[-1]
......@@ -173,6 +182,19 @@ def get_path(url, root_dir, md5sum=None):
return fullpath
def download_dataset(path, dataset=None):
    """Fetch all files registered for ``dataset`` in ``DATASETS`` into ``path``.

    Args:
        path: root directory handed to ``get_path`` for every entry.
        dataset: key into ``DATASETS``. An unknown name is logged as an
            error and the function returns without doing anything.

    Returns:
        None.
    """
    known_names = DATASETS.keys()
    if dataset not in known_names:
        logger.error("Unknown dataset {}, it should be {}".format(
            dataset, known_names))
        return

    # Each download entry is a (url, md5sum) pair.
    for url, md5sum in DATASETS[dataset][0]:
        get_path(url, path, md5sum)

    # VOC needs its yearly trees merged after download.
    if dataset == 'voc':
        sub_dirs = DATASETS[dataset][1]
        _merge_voc_dir(path, sub_dirs[0])

    logger.info("Download dataset {} finished.".format(dataset))
def _dataset_exists(path, annotation, image_dir):
"""
Check if user define dataset exists
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册