Merge pull request #78 from wangxiao1021/api

update downloaders

Merge pull request #78 from wangxiao1021/api
update downloaders
82874d8f · Xiaoyao Xi · GitHub · 4dac1032 · 589a5f77 · 82874d8f
7 changed files
--- a/examples/classification/download.py
+++ b/examples/classification/download.py
 #  -*- coding: utf-8 -*-
-
+from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-from tqdm import tqdm
-
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-        '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
-
-    pbar.close()
-    return file_size
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")

+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)

 abs_path = os.path.abspath(__file__)
 download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
@@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')):
    shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir)

 shutil.rmtree(os.path.join(target_dir, 'task_data'))
-
-
+print(" done!")
--- a/examples/matching/download.py
+++ b/examples/matching/download.py
 #  -*- coding: utf-8 -*-
-
+from __future__ import print_function
 import os
-import requests
-from tqdm import tqdm
-
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-        '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")

-    pbar.close()
-    return file_size
+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)


 abs_path = os.path.abspath(__file__)
@@ -32,3 +27,4 @@ if not os.path.exists(data_dir) or not os.path.isdir(data_dir):
 download_url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
 downlaod_path = os.path.join(data_dir, "quora_duplicate_questions.tsv")
 download(downlaod_path, download_url)
+print(" done!")
--- a/examples/mrc/download.py
+++ b/examples/mrc/download.py
 #  -*- coding: utf-8 -*-
-
+from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-from tqdm import tqdm
-
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-        '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
-
-    pbar.close()
-    return file_size
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")

+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)

 abs_path = os.path.abspath(__file__)
 download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
@@ -46,5 +39,5 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'cmrc2018')):
    shutil.move(os.path.join(target_dir, 'task_data', 'cmrc2018', file), dst_dir)

 shutil.rmtree(os.path.join(target_dir, 'task_data'))
-
+print(" done!")

--- a/examples/multi-task/download.py
+++ b/examples/multi-task/download.py
 #  -*- coding: utf-8 -*-
-
+from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-from tqdm import tqdm
-
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-        '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
-
-    pbar.close()
-    return file_size
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")

+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)

 abs_path = os.path.abspath(__file__)
 download_url = "https://baidu-nlp.bj.bcebos.com/dmtk_data_1.0.0.tar.gz"
@@ -42,4 +35,4 @@ shutil.rmtree(os.path.join(target_dir, 'data/mrda/'))
 shutil.rmtree(os.path.join(target_dir, 'data/multi-woz/'))
 shutil.rmtree(os.path.join(target_dir, 'data/swda/'))
 shutil.rmtree(os.path.join(target_dir, 'data/udc/'))
-
+print(" done!")
--- a/examples/predict/download.py
+++ b/examples/predict/download.py
 #  -*- coding: utf-8 -*-
-
+from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-from tqdm import tqdm
-
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-        '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
-
-    pbar.close()
-    return file_size
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")

+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)

 abs_path = os.path.abspath(__file__)
 download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
@@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')):
    shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir)

 shutil.rmtree(os.path.join(target_dir, 'task_data'))
-
-
+print(" done!")
--- a/examples/tagging/download.py
+++ b/examples/tagging/download.py
 #  -*- coding: utf-8 -*-
-
+from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-from tqdm import tqdm
-
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 def download(src, url):
-    file_size = int(requests.head(url).headers['Content-Length'])
-
-    header = {
-        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
-        '70.0.3538.67 Safari/537.36'
-    }
-    pbar = tqdm(total=file_size)
-    resp = requests.get(url, headers=header, stream=True)
-
-    with open(src, 'ab') as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.update(1024)
-
-    pbar.close()
-    return file_size
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
+        percent = float(bytes_so_far) / float(total_size)
+        if percent > 1:
+            percent = 1
+        print('\r>> Downloading... {:.1%}'.format(percent), end="")

+    URLLIB.urlretrieve(url, src, reporthook=_reporthook)

 abs_path = os.path.abspath(__file__)
 download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
@@ -46,5 +39,4 @@ for file in os.listdir(os.path.join(target_dir, 'task_data', 'msra_ner')):
    shutil.move(os.path.join(target_dir, 'task_data', 'msra_ner', file), dst_dir)

 shutil.rmtree(os.path.join(target_dir, 'task_data'))
-
-
+print(" done!")
--- a/paddlepalm/_downloader.py
+++ b/paddlepalm/_downloader.py
@@ -15,23 +15,18 @@

 from __future__ import print_function
 import os
-import requests
 import tarfile
 import shutil
-try:
-    from urllib.request import urlopen # Python 3
-except ImportError:
-    from urllib2 import urlopen # Python 2
 from collections import OrderedDict
-import ssl
+import sys
+import urllib
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    import urllib.request
+    URLLIB=urllib.request

 __all__ = ["download", "ls"]

-# for https
-ssl._create_default_https_context = ssl._create_unverified_context
-
-
-
 _pretrain = (('RoBERTa-zh-base', 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz'),
            ('RoBERTa-zh-large', 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz'),
            ('ERNIE-v2-en-base', 'https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz'),
@@ -76,32 +71,15 @@ def _download(item, scope, path, silent=False, convert=False):
    filename = data_dir + '/' + data_name

    # print process
-    def _chunk_report(bytes_so_far, total_size):
+    def _reporthook(count, chunk_size, total_size):
+        bytes_so_far = count * chunk_size
        percent = float(bytes_so_far) / float(total_size)
        if percent > 1:
            percent = 1
        if not silent:
            print('\r>> Downloading... {:.1%}'.format(percent), end = "")
    
-    # copy to local
-    def _chunk_read(response, url, chunk_size = 16 * 1024, report_hook = None):
-        total_size = int(requests.head(url).headers['Content-Length'])
-        bytes_so_far = 0
-        with open("%s" % filename, "wb") as f:
-            while 1:
-                chunk = response.read(chunk_size)
-                f.write(chunk)
-                f.flush() 
-                bytes_so_far += len(chunk)
-                if not chunk:
-                    break
-                if report_hook:
-                    report_hook(bytes_so_far, total_size)
-        return bytes_so_far
-
-    response = urlopen(data_url)
-    _chunk_read(response, data_url, report_hook=_chunk_report)
-    
+    URLLIB.urlretrieve(data_url, filename, reporthook=_reporthook)
    if not silent:
        print(' done!')