download.py 11.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import os.path as osp
import shutil
import requests
W
Wenyu 已提交
20
import subprocess
21
import hashlib
L
LielinJiang 已提交
22 23
import tarfile
import zipfile
24 25 26 27 28 29
import time

try:
    from tqdm import tqdm
except ImportError:

    class tqdm:
        """Minimal stand-in used when the real ``tqdm`` package is missing.

        Only what this module needs is implemented: construction with an
        optional ``total``, ``update(n)`` and the context-manager protocol.
        Progress is written to ``sys.stderr`` on a single rewritten line.
        """

        def __init__(self, total=None):
            self.total = total
            self.n = 0

        def update(self, n):
            """Advance the counter by ``n`` and redraw the progress line."""
            self.n += n
            if not self.total:
                # Unknown (or zero) total: a percentage cannot be computed,
                # so print the raw counter instead of dividing by zero.
                sys.stderr.write("\r{0:.1f} bytes".format(self.n))
            else:
                sys.stderr.write(
                    "\r{0:.1f}%".format(100 * self.n / float(self.total))
                )
            sys.stderr.flush()

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            # Finish the progress line; exceptions are not suppressed.
            sys.stderr.write('\n')


import logging

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Names exported by ``from <this module> import *``.
__all__ = ['get_weights_path_from_url']

# Directory (under the user's home) that get_weights_path_from_url uses as
# the download root for weights.
WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights")

# Maximum number of attempts made by _download; also handed to wget as its
# ``-t`` (tries) value.
DOWNLOAD_RETRY_LIMIT = 3


def is_url(path):
    """
    Tell whether ``path`` looks like an HTTP(S) URL.

    Args:
        path (string): candidate string to inspect.

    Returns:
        bool: True when ``path`` starts with ``http://`` or ``https://``.
    """
    return path.startswith(('http://', 'https://'))


def get_weights_path_from_url(url, md5sum=None):
    """Return the local path of the weights file referenced by ``url``.

    The lookup (and the download, when the file is not there yet) is
    delegated to ``get_path_from_url`` with ``WEIGHTS_HOME`` as root
    directory.

    Args:
        url (str): download url
        md5sum (str): md5 sum of download package

    Returns:
        str: a local path to save downloaded weights.

    Examples:
        .. code-block:: python

            from paddle.utils.download import get_weights_path_from_url

            resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams'
            local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url)

    """
    return get_path_from_url(url, WEIGHTS_HOME, md5sum)


def _map_path(url, root_dir):
    # parse path after download under root_dir
    fname = osp.split(url)[-1]
    fpath = fname
    return osp.join(root_dir, fpath)


103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
def _get_unique_endpoints(trainer_endpoints):
    # Sorting is to avoid different environmental variables for each card
    trainer_endpoints.sort()
    ips = set()
    unique_endpoints = set()
    for endpoint in trainer_endpoints:
        ip = endpoint.split(":")[0]
        if ip in ips:
            continue
        ips.add(ip)
        unique_endpoints.add(endpoint)
    logger.info("unique_endpoints {}".format(unique_endpoints))
    return unique_endpoints


118 119 120 121
def get_path_from_url(
    url, root_dir, md5sum=None, check_exist=True, decompress=True, method='get'
):
    """Download from given url to root_dir.

    If the file or directory specified by url already exists under
    root_dir, return the path directly; otherwise download it from url,
    decompress it when applicable, and return the path.

    Args:
        url (str): download url
        root_dir (str): root dir for downloading, it should be
                        WEIGHTS_HOME or DATASET_HOME
        md5sum (str): md5 sum of download package
        check_exist (bool): when True, an existing file that passes the md5
            check is reused instead of being downloaded again. Default is
            `True`
        decompress (bool): decompress zip or tar file. Default is `True`
        method (str): which download method to use. Support `wget` and `get`. Default is `get`.

    Returns:
        str: a local path to save downloaded models & weights & datasets.
    """

    from paddle.fluid.dygraph.parallel import ParallelEnv

    assert is_url(url), "downloading from {} not a url".format(url)
    # parse path after download to decompress under root_dir
    fullpath = _map_path(url, root_dir)
    # Mainly used to solve the problem of downloading data from different
    # machines in the case of multiple machines. Different ips will download
    # data, and the same ip will only download data once.
    # The parallel environment is read once and reused below.
    env = ParallelEnv()
    unique_endpoints = _get_unique_endpoints(env.trainer_endpoints[:])
    is_downloader = env.current_endpoint in unique_endpoints

    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
        logger.info("Found {}".format(fullpath))
    elif is_downloader:
        fullpath = _download(url, root_dir, md5sum, method=method)
    else:
        # Another process with the same ip is expected to fetch the file;
        # poll until it appears.
        while not os.path.exists(fullpath):
            time.sleep(1)

    # Only the downloading process decompresses; every other process
    # returns the path of the downloaded file unchanged.
    if (
        is_downloader
        and decompress
        and (tarfile.is_tarfile(fullpath) or zipfile.is_zipfile(fullpath))
    ):
        fullpath = _decompress(fullpath)

    return fullpath


W
Wenyu 已提交
165 166 167 168 169 170
def _get_download(url, fullname):
    """Download ``url`` to ``fullname`` using ``requests.get``.

    Args:
        url (str): download url
        fullname (str): destination file path

    Returns:
        str or bool: ``fullname`` on success; ``False`` when issuing the
        request raised, so that the caller can retry.

    Raises:
        RuntimeError: if the server answers with a status code other
            than 200.
    """
    fname = osp.basename(fullname)
    try:
        req = requests.get(url, stream=True)
    except Exception as e:  # requests.exceptions.ConnectionError
        logger.info(
            "Downloading {} from {} failed with exception {}".format(
                fname, url, str(e)
            )
        )
        return False

    try:
        if req.status_code != 200:
            raise RuntimeError(
                "Downloading from {} failed with code "
                "{}!".format(url, req.status_code)
            )

        # To survive an interrupted download, write to tmp_fullname first
        # and move it to fullname only after the download has finished.
        tmp_fullname = fullname + "_tmp"
        total_size = req.headers.get('content-length')
        with open(tmp_fullname, 'wb') as f:
            if total_size:
                # One progress tick per 1024-byte chunk.
                with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:
                    for chunk in req.iter_content(chunk_size=1024):
                        f.write(chunk)
                        pbar.update(1)
            else:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
    finally:
        # A streamed response holds on to its connection until it is fully
        # read or closed; close it on both the success and error paths.
        req.close()

    shutil.move(tmp_fullname, fullname)

    return fullname


def _wget_download(url, fullname):
    """Download ``url`` to ``fullname`` by running the ``wget`` program.

    Args:
        url (str): download url
        fullname (str): destination file path

    Returns:
        str: ``fullname``.

    Raises:
        RuntimeError: if ``wget`` cannot be started or exits with a
            non-zero status.
    """
    # Download to a temporary name first; it is moved into place on success.
    tmp_fullname = fullname + "_tmp"
    # NOTE: no custom user agent (--user-agent) is passed to wget.
    # The arguments are passed as a list, without a shell, so characters
    # such as `&`, `;` or spaces in the url or path are not interpreted.
    args = ['wget', '-O', tmp_fullname, '-t', str(DOWNLOAD_RETRY_LIMIT), url]
    # Human-readable form of the command, used only in the error message.
    command = 'wget -O {} -t {} {}'.format(
        tmp_fullname, DOWNLOAD_RETRY_LIMIT, url
    )
    failure_msg = (
        '{} failed. Please make sure `wget` is installed or {} exists'.format(
            command, url
        )
    )

    try:
        subprc = subprocess.Popen(
            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except OSError:
        # Without a shell a missing executable surfaces as OSError; report
        # it with the same exception type as a failed run.
        raise RuntimeError(failure_msg)
    _ = subprc.communicate()

    if subprc.returncode != 0:
        raise RuntimeError(failure_msg)

    shutil.move(tmp_fullname, fullname)

    return fullname


# Dispatch table: value of the ``method`` argument -> downloader function.
_download_methods = dict(get=_get_download, wget=_wget_download)


def _download(url, path, md5sum=None, method='get'):
    """
    Download from url, save to path.

    Args:
        url (str): download url
        path (str): download to given path
        md5sum (str): md5 sum of download package
        method (str): which download method to use. Support `wget` and `get`. Default is `get`.

    Returns:
        str: full path of the downloaded file.

    Raises:
        RuntimeError: if the file is still missing or fails the md5 check
            after DOWNLOAD_RETRY_LIMIT attempts.
    """
    assert method in _download_methods, 'make sure `{}` implemented'.format(
        method
    )

    # exist_ok avoids a crash when another process creates the directory
    # between an existence check and the creation.
    os.makedirs(path, exist_ok=True)

    fname = osp.split(url)[-1]
    fullname = osp.join(path, fname)
    retry_cnt = 0

    logger.info("Downloading {} from {}".format(fname, url))
    # Keep trying until the file exists and passes the md5 check.
    while not (osp.exists(fullname) and _md5check(fullname, md5sum)):
        if retry_cnt >= DOWNLOAD_RETRY_LIMIT:
            raise RuntimeError(
                "Download from {} failed. " "Retry limit reached".format(url)
            )
        retry_cnt += 1

        # A falsy result means the attempt could not be made; pause
        # briefly before the next one.
        if not _download_methods[method](url, fullname):
            time.sleep(1)

    return fullname


def _md5check(fullname, md5sum=None):
    if md5sum is None:
        return True

    logger.info("File {} md5 checking...".format(fullname))
    md5 = hashlib.md5()
    with open(fullname, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b""):
            md5.update(chunk)
    calc_md5sum = md5.hexdigest()

    if calc_md5sum != md5sum:
283 284 285 286
        logger.info(
            "File {} md5 check failed, {}(calc) != "
            "{}(base)".format(fullname, calc_md5sum, md5sum)
        )
287 288
        return False
    return True
L
LielinJiang 已提交
289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312


def _decompress(fname):
    """
    Decompress for zip and tar file
    """
    logger.info("Decompressing {}...".format(fname))

    # For protecting decompressing interupted,
    # decompress to fpath_tmp directory firstly, if decompress
    # successed, move decompress files to fpath and delete
    # fpath_tmp and remove download compress file.

    if tarfile.is_tarfile(fname):
        uncompressed_path = _uncompress_file_tar(fname)
    elif zipfile.is_zipfile(fname):
        uncompressed_path = _uncompress_file_zip(fname)
    else:
        raise TypeError("Unsupport compress file type {}".format(fname))

    return uncompressed_path


def _uncompress_file_zip(filepath):
    """Extract a zip archive next to itself and return the extracted path.

    Three layouts are distinguished:

    * a single top-level file: extracted into the archive's directory,
      the path of that file is returned;
    * everything under one top-level directory: extracted into the
      archive's directory, the path of that directory is returned;
    * anything else (including an empty archive): extracted into a
      directory named after the archive without its extension, which is
      created if needed and returned.

    Args:
        filepath (str): path of the zip file.

    Returns:
        str: path of the extracted file or directory.
    """
    with zipfile.ZipFile(filepath, 'r') as files:
        file_list = files.namelist()

        file_dir = os.path.dirname(filepath)

        if _is_a_single_file(file_list):
            uncompressed_path = os.path.join(file_dir, file_list[0])
            files.extractall(file_dir)

        elif file_list and _is_a_single_dir(file_list):
            # The shared root is the first real component of a member
            # name. Member names use '/', and the first member may be
            # either the directory itself or a file inside it, so neither
            # the last component nor a stripped extension is reliable.
            parts = [
                part
                for part in file_list[0].replace(os.sep, '/').split('/')
                if part not in ('', '.')
            ]
            rootpath = parts[0] if parts else ''
            uncompressed_path = os.path.join(file_dir, rootpath)

            files.extractall(file_dir)
        else:
            # Several top-level entries: give them a directory of their own.
            rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
            uncompressed_path = os.path.join(file_dir, rootpath)
            os.makedirs(uncompressed_path, exist_ok=True)
            files.extractall(uncompressed_path)

        return uncompressed_path


def _uncompress_file_tar(filepath, mode="r:*"):
    """Extract a tar archive next to itself and return the extracted path.

    Three layouts are distinguished:

    * a single top-level file: extracted into the archive's directory,
      the path of that file is returned;
    * everything under one top-level directory: extracted into the
      archive's directory, the path of that directory is returned;
    * anything else (including an empty archive): extracted into a
      directory named after the archive without its last extension,
      which is created if needed and returned.

    Args:
        filepath (str): path of the tar file.
        mode (str): mode passed to ``tarfile.open``. Default is ``"r:*"``.

    Returns:
        str: path of the extracted file or directory.
    """
    with tarfile.open(filepath, mode) as files:
        file_list = files.getnames()

        file_dir = os.path.dirname(filepath)

        def _extract_to(dest):
            # Archives come from downloads, so use the 'data' extraction
            # filter where this Python provides it; it refuses members
            # that would be written outside `dest`.
            if hasattr(tarfile, 'data_filter'):
                files.extractall(dest, filter='data')
            else:
                files.extractall(dest)

        if _is_a_single_file(file_list):
            uncompressed_path = os.path.join(file_dir, file_list[0])
            _extract_to(file_dir)
        elif file_list and _is_a_single_dir(file_list):
            # The shared root is the first real component of a member
            # name. The first member may be the directory itself or a
            # file inside it, so neither the last component nor a
            # stripped extension is reliable.
            parts = [
                part
                for part in file_list[0].replace(os.sep, '/').split('/')
                if part not in ('', '.')
            ]
            rootpath = parts[0] if parts else ''
            uncompressed_path = os.path.join(file_dir, rootpath)
            _extract_to(file_dir)
        else:
            # Several top-level entries: give them a directory of their own.
            rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
            uncompressed_path = os.path.join(file_dir, rootpath)
            os.makedirs(uncompressed_path, exist_ok=True)
            _extract_to(uncompressed_path)

        return uncompressed_path


def _is_a_single_file(file_list):
369
    if len(file_list) == 1 and file_list[0].find(os.sep) < 0:
L
LielinJiang 已提交
370 371 372 373 374
        return True
    return False


def _is_a_single_dir(file_list):
S
Steffy-zxf 已提交
375 376 377 378 379 380 381 382 383 384 385
    new_file_list = []
    for file_path in file_list:
        if '/' in file_path:
            file_path = file_path.replace('/', os.sep)
        elif '\\' in file_path:
            file_path = file_path.replace('\\', os.sep)
        new_file_list.append(file_path)

    file_name = new_file_list[0].split(os.sep)[0]
    for i in range(1, len(new_file_list)):
        if file_name != new_file_list[i].split(os.sep)[0]:
L
LielinJiang 已提交
386 387
            return False
    return True