提交 7a5fd275 编写于 作者: W wangxiao1021

update downloaders

上级 1b9304dd
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
import sys
import urllib
URLLIB=urllib
if sys.version_info >= (3, 0):
import urllib.request
URLLIB=urllib.request
def download(src, url):
    """Download ``url`` to the local path ``src`` while printing progress.

    The transfer is delegated to ``URLLIB.urlretrieve`` (``urllib`` on
    Python 2, ``urllib.request`` on Python 3), which calls the nested
    progress hook once per block read.

    Args:
        src: Destination path of the downloaded file.
        url: Address of the file to fetch.

    Returns:
        Size in bytes of the file written to ``src``.
    """

    def _reporthook(count, chunk_size, total_size):
        bytes_so_far = count * chunk_size
        # urlretrieve reports a non-positive total when the server sends no
        # Content-Length or the file is empty; a percentage cannot be
        # computed then, so show the raw byte count instead of dividing.
        if total_size <= 0:
            print('\r>> Downloading... {} bytes'.format(bytes_so_far), end="")
            return
        # The final block is usually partial, so clamp the estimate at 100%.
        percent = min(float(bytes_so_far) / float(total_size), 1.0)
        print('\r>> Downloading... {:.1%}'.format(percent), end="")

    URLLIB.urlretrieve(url, src, reporthook=_reporthook)
    return os.path.getsize(src)
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
......@@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')):
shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
print(" done!")
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import requests
from tqdm import tqdm
import sys
import urllib
URLLIB=urllib
if sys.version_info >= (3, 0):
import urllib.request
URLLIB=urllib.request
def download(src, url):
    """Download ``url`` to the local path ``src`` while printing progress.

    The transfer is delegated to ``URLLIB.urlretrieve`` (``urllib`` on
    Python 2, ``urllib.request`` on Python 3), which calls the nested
    progress hook once per block read.

    Args:
        src: Destination path of the downloaded file.
        url: Address of the file to fetch.

    Returns:
        Size in bytes of the file written to ``src``.
    """

    def _reporthook(count, chunk_size, total_size):
        bytes_so_far = count * chunk_size
        # urlretrieve reports a non-positive total when the server sends no
        # Content-Length or the file is empty; a percentage cannot be
        # computed then, so show the raw byte count instead of dividing.
        if total_size <= 0:
            print('\r>> Downloading... {} bytes'.format(bytes_so_far), end="")
            return
        # The final block is usually partial, so clamp the estimate at 100%.
        percent = min(float(bytes_so_far) / float(total_size), 1.0)
        print('\r>> Downloading... {:.1%}'.format(percent), end="")

    URLLIB.urlretrieve(url, src, reporthook=_reporthook)
    return os.path.getsize(src)
abs_path = os.path.abspath(__file__)
......@@ -32,3 +27,4 @@ if not os.path.exists(data_dir) or not os.path.isdir(data_dir):
download_url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
downlaod_path = os.path.join(data_dir, "quora_duplicate_questions.tsv")
download(downlaod_path, download_url)
print(" done!")
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
import sys
import urllib
URLLIB=urllib
if sys.version_info >= (3, 0):
import urllib.request
URLLIB=urllib.request
def download(src, url):
    """Download ``url`` to the local path ``src`` while printing progress.

    The transfer is delegated to ``URLLIB.urlretrieve`` (``urllib`` on
    Python 2, ``urllib.request`` on Python 3), which calls the nested
    progress hook once per block read.

    Args:
        src: Destination path of the downloaded file.
        url: Address of the file to fetch.

    Returns:
        Size in bytes of the file written to ``src``.
    """

    def _reporthook(count, chunk_size, total_size):
        bytes_so_far = count * chunk_size
        # urlretrieve reports a non-positive total when the server sends no
        # Content-Length or the file is empty; a percentage cannot be
        # computed then, so show the raw byte count instead of dividing.
        if total_size <= 0:
            print('\r>> Downloading... {} bytes'.format(bytes_so_far), end="")
            return
        # The final block is usually partial, so clamp the estimate at 100%.
        percent = min(float(bytes_so_far) / float(total_size), 1.0)
        print('\r>> Downloading... {:.1%}'.format(percent), end="")

    URLLIB.urlretrieve(url, src, reporthook=_reporthook)
    return os.path.getsize(src)
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
......@@ -46,5 +39,5 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'cmrc2018')):
shutil.move(os.path.join(target_dir, 'task_data', 'cmrc2018', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
print(" done!")
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
import sys
import urllib
URLLIB=urllib
if sys.version_info >= (3, 0):
import urllib.request
URLLIB=urllib.request
def download(src, url):
    """Download ``url`` to the local path ``src`` while printing progress.

    The transfer is delegated to ``URLLIB.urlretrieve`` (``urllib`` on
    Python 2, ``urllib.request`` on Python 3), which calls the nested
    progress hook once per block read.

    Args:
        src: Destination path of the downloaded file.
        url: Address of the file to fetch.

    Returns:
        Size in bytes of the file written to ``src``.
    """

    def _reporthook(count, chunk_size, total_size):
        bytes_so_far = count * chunk_size
        # urlretrieve reports a non-positive total when the server sends no
        # Content-Length or the file is empty; a percentage cannot be
        # computed then, so show the raw byte count instead of dividing.
        if total_size <= 0:
            print('\r>> Downloading... {} bytes'.format(bytes_so_far), end="")
            return
        # The final block is usually partial, so clamp the estimate at 100%.
        percent = min(float(bytes_so_far) / float(total_size), 1.0)
        print('\r>> Downloading... {:.1%}'.format(percent), end="")

    URLLIB.urlretrieve(url, src, reporthook=_reporthook)
    return os.path.getsize(src)
abs_path = os.path.abspath(__file__)
download_url = "https://baidu-nlp.bj.bcebos.com/dmtk_data_1.0.0.tar.gz"
......@@ -42,4 +35,4 @@ shutil.rmtree(os.path.join(target_dir, 'data/mrda/'))
shutil.rmtree(os.path.join(target_dir, 'data/multi-woz/'))
shutil.rmtree(os.path.join(target_dir, 'data/swda/'))
shutil.rmtree(os.path.join(target_dir, 'data/udc/'))
print(" done!")
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
import sys
import urllib
URLLIB=urllib
if sys.version_info >= (3, 0):
import urllib.request
URLLIB=urllib.request
def download(src, url):
    """Download ``url`` to the local path ``src`` while printing progress.

    The transfer is delegated to ``URLLIB.urlretrieve`` (``urllib`` on
    Python 2, ``urllib.request`` on Python 3), which calls the nested
    progress hook once per block read.

    Args:
        src: Destination path of the downloaded file.
        url: Address of the file to fetch.

    Returns:
        Size in bytes of the file written to ``src``.
    """

    def _reporthook(count, chunk_size, total_size):
        bytes_so_far = count * chunk_size
        # urlretrieve reports a non-positive total when the server sends no
        # Content-Length or the file is empty; a percentage cannot be
        # computed then, so show the raw byte count instead of dividing.
        if total_size <= 0:
            print('\r>> Downloading... {} bytes'.format(bytes_so_far), end="")
            return
        # The final block is usually partial, so clamp the estimate at 100%.
        percent = min(float(bytes_so_far) / float(total_size), 1.0)
        print('\r>> Downloading... {:.1%}'.format(percent), end="")

    URLLIB.urlretrieve(url, src, reporthook=_reporthook)
    return os.path.getsize(src)
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
......@@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')):
shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
print(" done!")
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
import sys
import urllib
URLLIB=urllib
if sys.version_info >= (3, 0):
import urllib.request
URLLIB=urllib.request
def download(src, url):
    """Download ``url`` to the local path ``src`` while printing progress.

    The transfer is delegated to ``URLLIB.urlretrieve`` (``urllib`` on
    Python 2, ``urllib.request`` on Python 3), which calls the nested
    progress hook once per block read.

    Args:
        src: Destination path of the downloaded file.
        url: Address of the file to fetch.

    Returns:
        Size in bytes of the file written to ``src``.
    """

    def _reporthook(count, chunk_size, total_size):
        bytes_so_far = count * chunk_size
        # urlretrieve reports a non-positive total when the server sends no
        # Content-Length or the file is empty; a percentage cannot be
        # computed then, so show the raw byte count instead of dividing.
        if total_size <= 0:
            print('\r>> Downloading... {} bytes'.format(bytes_so_far), end="")
            return
        # The final block is usually partial, so clamp the estimate at 100%.
        percent = min(float(bytes_so_far) / float(total_size), 1.0)
        print('\r>> Downloading... {:.1%}'.format(percent), end="")

    URLLIB.urlretrieve(url, src, reporthook=_reporthook)
    return os.path.getsize(src)
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
......@@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'msra_ner')):
shutil.move(os.path.join(target_dir, 'task_data', 'msra_ner', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
print(" done!")
......@@ -15,23 +15,18 @@
from __future__ import print_function
import os
import requests
import tarfile
import shutil
try:
from urllib.request import urlopen # Python 3
except ImportError:
from urllib2 import urlopen # Python 2
from collections import OrderedDict
import ssl
import sys
import urllib
URLLIB=urllib
if sys.version_info >= (3, 0):
import urllib.request
URLLIB=urllib.request
__all__ = ["download", "ls"]
# for https
ssl._create_default_https_context = ssl._create_unverified_context
_pretrain = (('RoBERTa-zh-base', 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz'),
('RoBERTa-zh-large', 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz'),
('ERNIE-v2-en-base', 'https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz'),
......@@ -76,32 +71,15 @@ def _download(item, scope, path, silent=False, convert=False):
filename = data_dir + '/' + data_name
# print process
def _chunk_report(bytes_so_far, total_size):
def _reporthook(count, chunk_size, total_size):
bytes_so_far = count * chunk_size
percent = float(bytes_so_far) / float(total_size)
if percent > 1:
percent = 1
if not silent:
print('\r>> Downloading... {:.1%}'.format(percent), end = "")
# copy to local
def _chunk_read(response, url, chunk_size = 16 * 1024, report_hook = None):
total_size = int(requests.head(url).headers['Content-Length'])
bytes_so_far = 0
with open("%s" % filename, "wb") as f:
while 1:
chunk = response.read(chunk_size)
f.write(chunk)
f.flush()
bytes_so_far += len(chunk)
if not chunk:
break
if report_hook:
report_hook(bytes_so_far, total_size)
return bytes_so_far
response = urlopen(data_url)
_chunk_read(response, data_url, report_hook=_chunk_report)
URLLIB.urlretrieve(data_url, filename, reporthook=_reporthook)
if not silent:
print(' done!')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册