diff --git a/ernie/__init__.py b/ernie/__init__.py index 9171890f6a4c20827a09a78afbd829c30a9fca0f..f98ba817107815d2906bbba5f88eb91661794127 100644 --- a/ernie/__init__.py +++ b/ernie/__init__.py @@ -17,6 +17,9 @@ from __future__ import absolute_import from __future__ import print_function from __future__ import unicode_literals +import sys +import logging + import paddle if paddle.__version__ != '0.0.0' and paddle.__version__ < '2.0.0': raise RuntimeError('propeller 0.2 requires paddle 2.0+, got %s' % @@ -28,3 +31,10 @@ from ernie.modeling_ernie import ( ErnieModelForQuestionAnswering, ErnieModelForPretraining) from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer + +log = logging.getLogger(__name__) +formatter = logging.Formatter(fmt='[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]: %(message)s') +stream_hdl = logging.StreamHandler(stream=sys.stderr) +stream_hdl.setFormatter(formatter) +log.addHandler(stream_hdl) +log.propagate = False diff --git a/ernie/file_utils.py b/ernie/file_utils.py index 03e2784f78c9e0c94e51b430daf9688d730233ff..40b5f0b1558c350d6162934b3986abfb568b064d 100644 --- a/ernie/file_utils.py +++ b/ernie/file_utils.py @@ -21,7 +21,6 @@ import logging from tqdm import tqdm from pathlib import Path import six -import paddle as P import time if six.PY2: from pathlib2 import Path @@ -35,8 +34,6 @@ def _fetch_from_remote(url, force_download=False, cached_dir='~/.paddle-ernie-cache'): import hashlib, tempfile, requests, tarfile - env = P.distributed.ParallelEnv() - sig = hashlib.md5(url.encode('utf8')).hexdigest() cached_dir = Path(cached_dir).expanduser() try: @@ -44,34 +41,31 @@ def _fetch_from_remote(url, except OSError: pass cached_dir_model = cached_dir / sig - done_file = cached_dir_model / 'fetch_done' - if force_download or not done_file.exists(): - if env.dev_id == 0: - cached_dir_model.mkdir() - tmpfile = cached_dir_model / 'tmp' - with tmpfile.open('wb') as f: - #url = 'https://ernie.bj.bcebos.com/ERNIE_stable.tgz' - 
r = requests.get(url, stream=True) - total_len = int(r.headers.get('content-length')) - for chunk in tqdm( - r.iter_content(chunk_size=1024), - total=total_len // 1024, - desc='downloading %s' % url, - unit='KB'): - if chunk: - f.write(chunk) - f.flush() - log.debug('extacting... to %s' % tmpfile) - with tarfile.open(tmpfile.as_posix()) as tf: - tf.extractall(path=cached_dir_model.as_posix()) - os.remove(tmpfile.as_posix()) - f = done_file.open('wb') - f.close() - else: - while not done_file.exists(): - time.sleep(1) + from filelock import FileLock + with FileLock(str(cached_dir_model) + '.lock'): + donefile = cached_dir_model / 'done' + if (not force_download) and donefile.exists(): + log.debug('%s cached in %s' % (url, cached_dir_model)) + return cached_dir_model + cached_dir_model.mkdir(exist_ok=True) + tmpfile = cached_dir_model / 'tmp' + with tmpfile.open('wb') as f: + r = requests.get(url, stream=True) + total_len = int(r.headers.get('content-length')) + for chunk in tqdm( + r.iter_content(chunk_size=1024), + total=total_len // 1024, + desc='downloading %s' % url, + unit='KB'): + if chunk: + f.write(chunk) + f.flush() + log.debug('extracting... 
to %s' % tmpfile) + with tarfile.open(tmpfile.as_posix()) as tf: + tf.extractall(path=str(cached_dir_model)) + donefile.touch() + os.remove(tmpfile.as_posix()) - log.debug('%s cached in %s' % (url, cached_dir)) return cached_dir_model diff --git a/ernie/modeling_ernie.py b/ernie/modeling_ernie.py index 65e76b5765124346382f9e0c8e772ed4db472f1f..e0d5a9f91f4be5979600bcfad9932179ba13bbd9 100644 --- a/ernie/modeling_ernie.py +++ b/ernie/modeling_ernie.py @@ -272,7 +272,7 @@ class PretrainedModel(object): pretrain_dir = Path(pretrain_dir_or_url) if not pretrain_dir.exists(): - raise ValueError('pretrain dir not found: %s' % pretrain_dir) + raise ValueError('pretrain dir not found: %s, optional: %s' % (pretrain_dir, cls.resource_map.keys())) state_dict_path = pretrain_dir / 'saved_weights.pdparams' config_path = pretrain_dir / 'ernie_config.json' diff --git a/ernie/tokenizing_ernie.py b/ernie/tokenizing_ernie.py index 7b866d84c281f5421890bfa2ba21dc5643817a1a..a9984ddd38ac8abd38e75864bf69135539488a0f 100644 --- a/ernie/tokenizing_ernie.py +++ b/ernie/tokenizing_ernie.py @@ -107,7 +107,7 @@ class ErnieTokenizer(object): (pretrain_dir_or_url, repr(cls.resource_map))) pretrain_dir = Path(pretrain_dir_or_url) if not pretrain_dir.exists(): - raise ValueError('pretrain dir not found: %s' % pretrain_dir) + raise ValueError('pretrain dir not found: %s, optional: %s' % (pretrain_dir, cls.resource_map.keys())) vocab_path = pretrain_dir / 'vocab.txt' if not vocab_path.exists(): raise ValueError('no vocab file in pretrain dir: %s' % diff --git a/requirements.txt b/requirements.txt index f2894e616c691aedd7a0e7882dd7bc89ac653dc9..aed1dc9b4a25a3fe71e9ea2352b7992475db04ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,10 @@ numpy pyzmq==18.0.2 -six==1.11.0 +six>=1.11.0 sklearn==0.0 sentencepiece==0.1.8 jieba==0.39 visualdl>=2.0.0b7 pathlib2>=2.3.2 +filelock>=3.0.0 tqdm>=4.32.2