Commit f3cf41f0, authored by SYSU_BOND, committed by pkpk

update downloads.py (#3672)

Parent 89077088
@@ -37,18 +37,18 @@ PaddlePaddle version requirements: Python 2 needs 2.7.15+, Python 3 needs
The **datasets** and **pretrained models** used in this project can be downloaded quickly by running the script below. If you only need part of the data, follow the per-item instructions below to download just that part.
```bash
python downloads.py all
```
Or, in an environment that can run shell scripts, execute:
```bash
sh downloads.sh
```
#### 2. Training dataset
Download the dataset archive; extracting it creates the `./data/` directory.
```bash
python downloads.py dataset
```
#### 3. Pretrained models
@@ -56,10 +56,10 @@ python download.py dataset
We have open-sourced a lexical analysis model trained on our in-house dataset. It can be used directly and is downloaded as follows:
```bash
# download baseline model
python downloads.py lac
# download ERNIE finetuned model
python downloads.py finetuned
```
Note: to run ERNIE fine-tuning, you need to download the released [ERNIE](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz) model yourself from [https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz) and extract it into the `./pretrained/` directory.
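The same step can also be driven through the `download()` helper in the `downloads.py` added by this commit (equivalent to `python downloads.py ernie`); a minimal sketch, assuming the script is importable from the project root:
```python
# Hypothetical programmatic equivalent of `python downloads.py ernie`:
# fetch ERNIE, verify its md5 and unpack it into ./pretrained/.
from downloads import download

download('ERNIE_MODEL', './pretrained')
```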
@@ -172,7 +172,7 @@ python inference_model.py \
1. Extract the sentences and labels from the raw data files and build the sentence and label sequences
2. Convert special characters in the sentence sequences
3. Look up each word's integer index in the vocabulary (a sketch of these steps follows the list)
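A minimal sketch of these three steps (hypothetical helper; the real logic lives in `reader.py`'s `Dataset.file_reader`, and the OOV token name is an assumption):
```python
# Hypothetical, simplified version of the preprocessing above.
def preprocess_line(line, word2id, oov_token=u"OOV"):
    # 1. split a raw "text_a<TAB>label" line into token and label sequences
    text, labels = line.rstrip(u"\n").split(u"\t")
    words, tags = text.split(u"\002"), labels.split(u"\002")
    # 2. convert special characters (here simply normalize stray whitespace)
    words = [w.strip() or u" " for w in words]
    # 3. map every token to its integer index, falling back to the OOV entry
    word_ids = [word2id.get(w, word2id[oov_token]) for w in words]
    return word_ids, tags

vocab = {u"OOV": 0, u"百": 1, u"度": 2, u"是": 3}
print(preprocess_line(u"百\002度\002是\tORG-B\002ORG-I\002v-B", vocab))
# -> ([1, 2, 3], [u'ORG-B', u'ORG-I', u'v-B'])
```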
### Code structure
```text
.
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Download script: fetches the dataset and pretrained models.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import io
import os
import sys
import time
import hashlib
import tarfile
import requests
FILE_INFO = {
    'BASE_URL': 'https://baidu-nlp.bj.bcebos.com/',
    'DATA': {
        'name': 'lexical_analysis-dataset-2.0.0.tar.gz',
        'md5': '71e4a9a36d0f0177929a1bccedca7dba'
    },
    'LAC_MODEL': {
        'name': 'lexical_analysis-2.0.0.tar.gz',
        'md5': "fc1daef00de9564083c7dc7b600504ca"
    },
    'ERNIE_MODEL': {
        'name': 'ERNIE_stable-1.0.1.tar.gz',
        'md5': "bab876a874b5374a78d7af93384d3bfa"
    },
    'FINETURN_MODEL': {
        'name': 'lexical_analysis_finetuned-1.0.0.tar.gz',
        'md5': "ee2c7614b06dcfd89561fbbdaac34342"
    }
}
def usage():
    desc = ("\nDownload datasets and pretrained models for LAC.\n"
            "Usage:\n"
            "   1. python downloads.py all\n"
            "   2. python downloads.py dataset\n"
            "   3. python downloads.py lac\n"
            "   4. python downloads.py finetuned\n"
            "   5. python downloads.py ernie\n")
    print(desc)
def md5file(fname):
    hash_md5 = hashlib.md5()
    with io.open(fname, "rb") as fin:
        for chunk in iter(lambda: fin.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def extract(fname, dir_path):
    """
    Extract tar.gz file
    """
    try:
        tar = tarfile.open(fname, "r:gz")
        file_names = tar.getnames()
        for file_name in file_names:
            tar.extract(file_name, dir_path)
            print(file_name)
        tar.close()
    except Exception as e:
        raise e
def _download(url, filename, md5sum):
    """
    Download file and check md5
    """
    retry = 0
    retry_limit = 3
    chunk_size = 4096
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError(
                "Cannot download dataset ({0}) with retry {1} times.".format(
                    url, retry_limit))
        try:
            start = time.time()
            size = 0
            res = requests.get(url, stream=True)
            filesize = int(res.headers['content-length'])
            if res.status_code == 200:
                print("[Filesize]: %0.2f MB" % (filesize / 1024 / 1024))
                # save by chunk
                with io.open(filename, "wb") as fout:
                    for chunk in res.iter_content(chunk_size=chunk_size):
                        if chunk:
                            fout.write(chunk)
                            size += len(chunk)
                            pr = '>' * int(size * 50 / filesize)
                            print(
                                '\r[Process ]: %s%.2f%%' %
                                (pr, float(size / filesize * 100)),
                                end='')
            end = time.time()
            print("\n[CostTime]: %.2f s" % (end - start))
        except Exception as e:
            print(e)
def download(name, dir_path):
    url = FILE_INFO['BASE_URL'] + FILE_INFO[name]['name']
    file_path = os.path.join(dir_path, FILE_INFO[name]['name'])
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    # download data
    print("Downloading : %s" % name)
    _download(url, file_path, FILE_INFO[name]['md5'])
    # extract data
    print("Extracting : %s" % file_path)
    extract(file_path, dir_path)
    os.remove(file_path)
if __name__ == '__main__':
    if len(sys.argv) != 2:
        usage()
        sys.exit(1)
    pwd = os.path.join(os.path.dirname(__file__), './')
    ernie_dir = os.path.join(os.path.dirname(__file__), './pretrained')
    if sys.argv[1] == 'all':
        download('DATA', pwd)
        download('LAC_MODEL', pwd)
        download('FINETURN_MODEL', pwd)
        download('ERNIE_MODEL', ernie_dir)
    elif sys.argv[1] == "dataset":
        download('DATA', pwd)
    elif sys.argv[1] == "lac":
        download('LAC_MODEL', pwd)
    elif sys.argv[1] == "finetuned":
        download('FINETURN_MODEL', pwd)
    elif sys.argv[1] == "ernie":
        download('ERNIE_MODEL', ernie_dir)
    else:
        usage()
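# Hypothetical reuse of the helpers above (assumes downloads.py is importable,
# e.g. run from the same directory): verify an already-downloaded archive
# against the md5 recorded in FILE_INFO before extracting it manually.
import os

from downloads import FILE_INFO, extract, md5file

info = FILE_INFO['DATA']
archive = info['name']  # lexical_analysis-dataset-2.0.0.tar.gz
if os.path.exists(archive) and md5file(archive) == info['md5']:
    extract(archive, './')  # unpacks the dataset into ./data/
else:
    print("missing or corrupted archive, run `python downloads.py dataset`")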
@@ -73,7 +73,7 @@ class Dataset(object):
    def get_num_examples(self, filename):
        """num of line of file"""
        return sum(1 for line in io.open(filename, "r", encoding='utf8'))

    def word_to_ids(self, words):
        """convert word to word index"""
@@ -107,16 +107,17 @@ class Dataset(object):
        fread = io.open(filename, "r", encoding="utf-8")
        if mode == "infer":
            for line in fread:
                words = line.strip()
                word_ids = self.word_to_ids(words)
                yield (word_ids[0:max_seq_len], )
        else:
            headline = next(fread)
            headline = headline.strip().split('\t')
            assert len(headline) == 2 and headline[
                0] == "text_a" and headline[1] == "label"
            for line in fread:
                words, labels = line.strip("\n").split("\t")
                if len(words) < 1:
                    continue
                word_ids = self.word_to_ids(words.split("\002"))
                label_ids = self.label_to_ids(labels.split("\002"))
......
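# Hypothetical example of the training-file format Dataset.file_reader above
# expects: a "text_a\tlabel" header line, then lines whose tokens and labels
# are joined by the "\002" separator (file name and tokens are illustrative).
import io

sample_lines = [
    u"text_a\tlabel",
    u"\002".join([u"百", u"度", u"是"]) + u"\t" + u"\002".join([u"ORG-B", u"ORG-I", u"v-B"]),
]
with io.open("demo_train.tsv", "w", encoding="utf-8") as fout:
    fout.write(u"\n".join(sample_lines) + u"\n")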
@@ -48,19 +48,21 @@ class ArgumentGroup(object):
            help=help + ' Default: %(default)s.',
            **kwargs)


def load_yaml(parser, file_name, **kwargs):
    with open(file_name) as f:
        args = yaml.load(f)
        for title in args:
            group = parser.add_argument_group(title=title, description='')
            for name in args[title]:
                _type = type(args[title][name]['val'])
                _type = str2bool if _type == bool else _type
                group.add_argument(
                    "--" + name,
                    default=args[title][name]['val'],
                    type=_type,
                    help=args[title][name]['meaning'] +
                    ' Default: %(default)s.',
                    **kwargs)
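# Hypothetical sketch of the YAML layout load_yaml above expects, inferred from
# the keys it reads: top-level group titles, and one entry per option carrying
# a 'val' default and a 'meaning' help string (option names here are invented).
import argparse
import io

from utils import load_yaml  # assumes this directory's utils.py is importable

conf_text = u"""
model:
  word_emb_dim:
    val: 128
    meaning: "Dimension of the word embeddings."
train:
  use_cuda:
    val: false
    meaning: "Whether to train on GPU."
"""

with io.open("conf_demo.yaml", "w", encoding="utf-8") as fout:
    fout.write(conf_text)

parser = argparse.ArgumentParser()
load_yaml(parser, "conf_demo.yaml")
args = parser.parse_args(["--word_emb_dim", "256", "--use_cuda", "true"])
print(args.word_emb_dim, args.use_cuda)  # e.g. 256 True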
@@ -115,7 +117,9 @@ def parse_result(words, crf_decode, dataset):
    for sent_index in range(batch_size):
        begin, end = offset_list[sent_index], offset_list[sent_index + 1]
        sent = [dataset.id2word_dict[str(id[0])] for id in words[begin:end]]
        tags = [
            dataset.id2label_dict[str(id[0])] for id in crf_decode[begin:end]
        ]

        sent_out = []
        tags_out = []
@@ -128,7 +132,7 @@ def parse_result(words, crf_decode, dataset):
                continue
            # for the beginning of word
            if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
                sent_out.append(parital_word)
                tags_out.append(tag.split('-')[0])
                parital_word = sent[ind]
@@ -137,12 +141,13 @@ def parse_result(words, crf_decode, dataset):
            parital_word += sent[ind]

        # append the last word, except for len(tags)=0
        if len(sent_out) < len(tags_out):
            sent_out.append(parital_word)
        batch_out.append([sent_out, tags_out])
    return batch_out
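# Simplified, standalone illustration of the B/I merging performed by
# parse_result above: a token tagged X-B starts a new word, a following X-I
# token is appended to it (the "O" handling of the real code is omitted).
def merge_tags(chars, tags):
    words, labels = [], []
    for ch, tag in zip(chars, tags):
        if tag.endswith("-B") or not words:
            words.append(ch)
            labels.append(tag.split("-")[0])
        else:
            words[-1] += ch
    return words, labels

print(merge_tags(list(u"百度是公司"), ["ORG-B", "ORG-I", "v-B", "n-B", "n-I"]))
# e.g. -> ([u'百度', u'是', u'公司'], ['ORG', 'v', 'n'])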
def init_checkpoint(exe, init_checkpoint_path, main_program):
    """
    Init CheckPoint
@@ -165,6 +170,7 @@ def init_checkpoint(exe, init_checkpoint_path, main_program):
        predicate=existed_persitables)
    print("Load model from {}".format(init_checkpoint_path))


def init_pretraining_params(exe,
                            pretraining_params_path,
                            main_program,
......