From 7a5fd275b5b1def24dbae5bf303e8d3436703787 Mon Sep 17 00:00:00 2001 From: wangxiao1021 Date: Wed, 15 Apr 2020 15:20:38 +0800 Subject: [PATCH] update downloaders --- examples/classification/download.py | 38 +++++++++++---------------- examples/matching/download.py | 34 +++++++++++------------- examples/mrc/download.py | 37 +++++++++++--------------- examples/multi-task/download.py | 37 +++++++++++--------------- examples/predict/download.py | 38 +++++++++++---------------- examples/tagging/download.py | 38 +++++++++++---------------- paddlepalm/_downloader.py | 40 +++++++---------------------- 7 files changed, 99 insertions(+), 163 deletions(-) diff --git a/examples/classification/download.py b/examples/classification/download.py index fc6b52c..72435bb 100755 --- a/examples/classification/download.py +++ b/examples/classification/download.py @@ -1,31 +1,24 @@ # -*- coding: utf-8 -*- - +from __future__ import print_function import os -import requests import tarfile import shutil -from tqdm import tqdm - +import sys +import urllib +URLLIB=urllib +if sys.version_info >= (3, 0): + import urllib.request + URLLIB=urllib.request def download(src, url): - file_size = int(requests.head(url).headers['Content-Length']) - - header = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/' - '70.0.3538.67 Safari/537.36' - } - pbar = tqdm(total=file_size) - resp = requests.get(url, headers=header, stream=True) - - with open(src, 'ab') as f: - for chunk in resp.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - pbar.update(1024) - - pbar.close() - return file_size + def _reporthook(count, chunk_size, total_size): + bytes_so_far = count * chunk_size + percent = float(bytes_so_far) / float(total_size) + if percent > 1: + percent = 1 + print('\r>> Downloading... {:.1%}'.format(percent), end="") + URLLIB.urlretrieve(url, src, reporthook=_reporthook) abs_path = os.path.abspath(__file__) download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz" @@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')): shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir) shutil.rmtree(os.path.join(target_dir, 'task_data')) - - +print(" done!") diff --git a/examples/matching/download.py b/examples/matching/download.py index 8ee8200..d27997d 100644 --- a/examples/matching/download.py +++ b/examples/matching/download.py @@ -1,27 +1,22 @@ # -*- coding: utf-8 -*- - +from __future__ import print_function import os -import requests -from tqdm import tqdm - +import sys +import urllib +URLLIB=urllib +if sys.version_info >= (3, 0): + import urllib.request + URLLIB=urllib.request def download(src, url): - file_size = int(requests.head(url).headers['Content-Length']) - header = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/' - '70.0.3538.67 Safari/537.36' - } - pbar = tqdm(total=file_size) - resp = requests.get(url, headers=header, stream=True) - - with open(src, 'ab') as f: - for chunk in resp.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - pbar.update(1024) + def _reporthook(count, chunk_size, total_size): + bytes_so_far = count * chunk_size + percent = float(bytes_so_far) / float(total_size) + if percent > 1: + percent = 1 + print('\r>> Downloading... {:.1%}'.format(percent), end="") - pbar.close() - return file_size + URLLIB.urlretrieve(url, src, reporthook=_reporthook) abs_path = os.path.abspath(__file__) @@ -32,3 +27,4 @@ if not os.path.exists(data_dir) or not os.path.isdir(data_dir): download_url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv" downlaod_path = os.path.join(data_dir, "quora_duplicate_questions.tsv") download(downlaod_path, download_url) +print(" done!") diff --git a/examples/mrc/download.py b/examples/mrc/download.py index 9209003..f46ddca 100755 --- a/examples/mrc/download.py +++ b/examples/mrc/download.py @@ -1,31 +1,24 @@ # -*- coding: utf-8 -*- - +from __future__ import print_function import os -import requests import tarfile import shutil -from tqdm import tqdm - +import sys +import urllib +URLLIB=urllib +if sys.version_info >= (3, 0): + import urllib.request + URLLIB=urllib.request def download(src, url): - file_size = int(requests.head(url).headers['Content-Length']) - - header = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/' - '70.0.3538.67 Safari/537.36' - } - pbar = tqdm(total=file_size) - resp = requests.get(url, headers=header, stream=True) - - with open(src, 'ab') as f: - for chunk in resp.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - pbar.update(1024) - - pbar.close() - return file_size + def _reporthook(count, chunk_size, total_size): + bytes_so_far = count * chunk_size + percent = float(bytes_so_far) / float(total_size) + if percent > 1: + percent = 1 + print('\r>> Downloading... {:.1%}'.format(percent), end="") + URLLIB.urlretrieve(url, src, reporthook=_reporthook) abs_path = os.path.abspath(__file__) download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz" @@ -46,5 +39,5 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'cmrc2018')): shutil.move(os.path.join(target_dir, 'task_data', 'cmrc2018', file), dst_dir) shutil.rmtree(os.path.join(target_dir, 'task_data')) - +print(" done!") diff --git a/examples/multi-task/download.py b/examples/multi-task/download.py index 57eaee9..dbcda78 100755 --- a/examples/multi-task/download.py +++ b/examples/multi-task/download.py @@ -1,31 +1,24 @@ # -*- coding: utf-8 -*- - +from __future__ import print_function import os -import requests import tarfile import shutil -from tqdm import tqdm - +import sys +import urllib +URLLIB=urllib +if sys.version_info >= (3, 0): + import urllib.request + URLLIB=urllib.request def download(src, url): - file_size = int(requests.head(url).headers['Content-Length']) - - header = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/' - '70.0.3538.67 Safari/537.36' - } - pbar = tqdm(total=file_size) - resp = requests.get(url, headers=header, stream=True) - - with open(src, 'ab') as f: - for chunk in resp.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - pbar.update(1024) - - pbar.close() - return file_size + def _reporthook(count, chunk_size, total_size): + bytes_so_far = count * chunk_size + percent = float(bytes_so_far) / float(total_size) + if percent > 1: + percent = 1 + print('\r>> Downloading... {:.1%}'.format(percent), end="") + URLLIB.urlretrieve(url, src, reporthook=_reporthook) abs_path = os.path.abspath(__file__) download_url = "https://baidu-nlp.bj.bcebos.com/dmtk_data_1.0.0.tar.gz" @@ -42,4 +35,4 @@ shutil.rmtree(os.path.join(target_dir, 'data/mrda/')) shutil.rmtree(os.path.join(target_dir, 'data/multi-woz/')) shutil.rmtree(os.path.join(target_dir, 'data/swda/')) shutil.rmtree(os.path.join(target_dir, 'data/udc/')) - +print(" done!") diff --git a/examples/predict/download.py b/examples/predict/download.py index fc6b52c..72435bb 100755 --- a/examples/predict/download.py +++ b/examples/predict/download.py @@ -1,31 +1,24 @@ # -*- coding: utf-8 -*- - +from __future__ import print_function import os -import requests import tarfile import shutil -from tqdm import tqdm - +import sys +import urllib +URLLIB=urllib +if sys.version_info >= (3, 0): + import urllib.request + URLLIB=urllib.request def download(src, url): - file_size = int(requests.head(url).headers['Content-Length']) - - header = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/' - '70.0.3538.67 Safari/537.36' - } - pbar = tqdm(total=file_size) - resp = requests.get(url, headers=header, stream=True) - - with open(src, 'ab') as f: - for chunk in resp.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - pbar.update(1024) - - pbar.close() - return file_size + def _reporthook(count, chunk_size, total_size): + bytes_so_far = count * chunk_size + percent = float(bytes_so_far) / float(total_size) + if percent > 1: + percent = 1 + print('\r>> Downloading... {:.1%}'.format(percent), end="") + URLLIB.urlretrieve(url, src, reporthook=_reporthook) abs_path = os.path.abspath(__file__) download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz" @@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')): shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir) shutil.rmtree(os.path.join(target_dir, 'task_data')) - - +print(" done!") diff --git a/examples/tagging/download.py b/examples/tagging/download.py index 5e7bd5b..5969f5d 100755 --- a/examples/tagging/download.py +++ b/examples/tagging/download.py @@ -1,31 +1,24 @@ # -*- coding: utf-8 -*- - +from __future__ import print_function import os -import requests import tarfile import shutil -from tqdm import tqdm - +import sys +import urllib +URLLIB=urllib +if sys.version_info >= (3, 0): + import urllib.request + URLLIB=urllib.request def download(src, url): - file_size = int(requests.head(url).headers['Content-Length']) - - header = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/' - '70.0.3538.67 Safari/537.36' - } - pbar = tqdm(total=file_size) - resp = requests.get(url, headers=header, stream=True) - - with open(src, 'ab') as f: - for chunk in resp.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - pbar.update(1024) - - pbar.close() - return file_size + def _reporthook(count, chunk_size, total_size): + bytes_so_far = count * chunk_size + percent = float(bytes_so_far) / float(total_size) + if percent > 1: + percent = 1 + print('\r>> Downloading... {:.1%}'.format(percent), end="") + URLLIB.urlretrieve(url, src, reporthook=_reporthook) abs_path = os.path.abspath(__file__) download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz" @@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'msra_ner')): shutil.move(os.path.join(target_dir, 'task_data', 'msra_ner', file), dst_dir) shutil.rmtree(os.path.join(target_dir, 'task_data')) - - +print(" done!") diff --git a/paddlepalm/_downloader.py b/paddlepalm/_downloader.py index a846957..2fa5a48 100644 --- a/paddlepalm/_downloader.py +++ b/paddlepalm/_downloader.py @@ -15,23 +15,18 @@ from __future__ import print_function import os -import requests import tarfile import shutil -try: - from urllib.request import urlopen # Python 3 -except ImportError: - from urllib2 import urlopen # Python 2 from collections import OrderedDict -import ssl +import sys +import urllib +URLLIB=urllib +if sys.version_info >= (3, 0): + import urllib.request + URLLIB=urllib.request __all__ = ["download", "ls"] -# for https -ssl._create_default_https_context = ssl._create_unverified_context - - - _pretrain = (('RoBERTa-zh-base', 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz'), ('RoBERTa-zh-large', 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz'), ('ERNIE-v2-en-base', 'https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz'), @@ -76,32 +71,15 @@ def _download(item, scope, path, silent=False, convert=False): filename = data_dir + '/' + data_name # print process - def _chunk_report(bytes_so_far, total_size): + def _reporthook(count, chunk_size, total_size): + bytes_so_far = count * chunk_size percent = float(bytes_so_far) / float(total_size) if percent > 1: percent = 1 if not silent: print('\r>> Downloading... {:.1%}'.format(percent), end = "") - # copy to local - def _chunk_read(response, url, chunk_size = 16 * 1024, report_hook = None): - total_size = int(requests.head(url).headers['Content-Length']) - bytes_so_far = 0 - with open("%s" % filename, "wb") as f: - while 1: - chunk = response.read(chunk_size) - f.write(chunk) - f.flush() - bytes_so_far += len(chunk) - if not chunk: - break - if report_hook: - report_hook(bytes_so_far, total_size) - return bytes_so_far - - response = urlopen(data_url) - _chunk_read(response, data_url, report_hook=_chunk_report) - + URLLIB.urlretrieve(data_url, filename, reporthook=_reporthook) if not silent: print(' done!') -- GitLab