Unverified commit 82874d8f, authored by Xiaoyao Xi, committed by GitHub

Merge pull request #78 from wangxiao1021/api

update downloaders
 # -*- coding: utf-8 -*-
+from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-from tqdm import tqdm
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-                      '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
-    pbar.close()
-    return file_size
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")
+
+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)

 abs_path = os.path.abspath(__file__)
 download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
@@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')):
     shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir)
 shutil.rmtree(os.path.join(target_dir, 'task_data'))
 print(" done!")
 # -*- coding: utf-8 -*-
+from __future__ import print_function
 import os
-import requests
-from tqdm import tqdm
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-                      '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
-    pbar.close()
-    return file_size
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")
+
+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)

 abs_path = os.path.abspath(__file__)
@@ -32,3 +27,4 @@ if not os.path.exists(data_dir) or not os.path.isdir(data_dir):
 download_url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
 downlaod_path = os.path.join(data_dir, "quora_duplicate_questions.tsv")
 download(downlaod_path, download_url)
+print(" done!")
 # -*- coding: utf-8 -*-
+from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-from tqdm import tqdm
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-                      '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
-    pbar.close()
-    return file_size
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")
+
+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)

 abs_path = os.path.abspath(__file__)
 download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
@@ -46,5 +39,5 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'cmrc2018')):
     shutil.move(os.path.join(target_dir, 'task_data', 'cmrc2018', file), dst_dir)
 shutil.rmtree(os.path.join(target_dir, 'task_data'))
 print(" done!")
 # -*- coding: utf-8 -*-
+from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-from tqdm import tqdm
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-                      '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
-    pbar.close()
-    return file_size
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")
+
+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)

 abs_path = os.path.abspath(__file__)
 download_url = "https://baidu-nlp.bj.bcebos.com/dmtk_data_1.0.0.tar.gz"
@@ -42,4 +35,4 @@ shutil.rmtree(os.path.join(target_dir, 'data/mrda/'))
 shutil.rmtree(os.path.join(target_dir, 'data/multi-woz/'))
 shutil.rmtree(os.path.join(target_dir, 'data/swda/'))
 shutil.rmtree(os.path.join(target_dir, 'data/udc/'))
 print(" done!")
 # -*- coding: utf-8 -*-
+from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-from tqdm import tqdm
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-                      '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
-    pbar.close()
-    return file_size
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")
+
+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)

 abs_path = os.path.abspath(__file__)
 download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
@@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')):
     shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir)
 shutil.rmtree(os.path.join(target_dir, 'task_data'))
 print(" done!")
 # -*- coding: utf-8 -*-
+from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-from tqdm import tqdm
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-                      '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
-    pbar.close()
-    return file_size
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")
+
+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)

 abs_path = os.path.abspath(__file__)
 download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
@@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'msra_ner')):
     shutil.move(os.path.join(target_dir, 'task_data', 'msra_ner', file), dst_dir)
 shutil.rmtree(os.path.join(target_dir, 'task_data'))
 print(" done!")
@@ -15,23 +15,18 @@
 from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-try:
-    from urllib.request import urlopen  # Python 3
-except ImportError:
-    from urllib2 import urlopen  # Python 2
 from collections import OrderedDict
-import ssl
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 __all__ = ["download", "ls"]

-# for https
-ssl._create_default_https_context = ssl._create_unverified_context
-
 _pretrain = (('RoBERTa-zh-base', 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz'),
              ('RoBERTa-zh-large', 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz'),
              ('ERNIE-v2-en-base', 'https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz'),
@@ -76,32 +71,15 @@ def _download(item, scope, path, silent=False, convert=False):
     filename = data_dir + '/' + data_name

     # print process
-    def _chunk_report(bytes_so_far, total_size):
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
         percent = float(bytes_so_far) / float(total_size)
         if percent > 1:
             percent = 1
         if not silent:
             print('\r>> Downloading... {:.1%}'.format(percent), end = "")

-    # copy to local
-    def _chunk_read(response, url, chunk_size = 16 * 1024, report_hook = None):
-        total_size = int(requests.head(url).headers['Content-Length'])
-        bytes_so_far = 0
-        with open("%s" % filename, "wb") as f:
-            while 1:
-                chunk = response.read(chunk_size)
-                f.write(chunk)
-                f.flush()
-                bytes_so_far += len(chunk)
-                if not chunk:
-                    break
-                if report_hook:
-                    report_hook(bytes_so_far, total_size)
-        return bytes_so_far
-
-    response = urlopen(data_url)
-    _chunk_read(response, data_url, report_hook=_chunk_report)
+    URLLIB.urlretrieve(data_url, filename, reporthook=_reporthook)

     if not silent:
         print(' done!')
...
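In _download the hook stays nested so it can close over the silent flag. Stripped of the module's surrounding logic, the pattern looks roughly like the sketch below; fetch, the example URL, and the filename are illustrative names, not part of the repository:

from __future__ import print_function
import sys
import urllib

URLLIB = urllib
if sys.version_info >= (3, 0):
    import urllib.request
    URLLIB = urllib.request

def fetch(url, filename, silent=False):
    # the nested hook closes over `silent`, mirroring _download above
    def _reporthook(count, chunk_size, total_size):
        bytes_so_far = count * chunk_size
        percent = float(bytes_so_far) / float(total_size)
        if percent > 1:
            percent = 1
        if not silent:
            print('\r>> Downloading... {:.1%}'.format(percent), end="")

    URLLIB.urlretrieve(url, filename, reporthook=_reporthook)
    if not silent:
        print(' done!')

# example call (any reachable URL works):
# fetch("https://ernie.bj.bcebos.com/task_data_zh.tgz", "task_data_zh.tgz")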