From 7860c6f0d835237cc603dce010425a6b912ecbc7 Mon Sep 17 00:00:00 2001
From: Meiyim
Date: Tue, 15 Sep 2020 17:17:09 +0800
Subject: [PATCH] use pathlib, fix #546 (#560)

* use pathib, fix #546

* use encoding

* Update feature_column.py

fix #559
---
 ernie/file_utils.py                     | 35 ++++++++++++++++++++++++-------
 ernie/modeling_ernie.py                 | 23 +++++++++-------
 ernie/tokenizing_ernie.py               | 18 ++++++-------
 propeller/paddle/data/feature_column.py |  2 +-
 requirements.txt                        |  1 +
 setup.py                                |  3 ++-
 6 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/ernie/file_utils.py b/ernie/file_utils.py
index 1d4fd90..b63d091 100644
--- a/ernie/file_utils.py
+++ b/ernie/file_utils.py
@@ -19,27 +19,44 @@ from __future__ import unicode_literals
 import os
 import logging
 from tqdm import tqdm
+from pathlib import Path
+import six
+if six.PY2:
+    from pathlib2 import Path
+else:
+    from pathlib import Path
 
 log = logging.getLogger(__name__)
 
-def _fetch_from_remote(url, force_download=False):
+def _fetch_from_remote(url, force_download=False, cached_dir='~/.paddle-ernie-cache'):
     import hashlib, tempfile, requests, tarfile
     sig = hashlib.md5(url.encode('utf8')).hexdigest()
-    cached_dir = os.path.join(tempfile.gettempdir(), sig)
-    if force_download or not os.path.exists(cached_dir):
-        with tempfile.NamedTemporaryFile() as f:
+    cached_dir = Path(cached_dir).expanduser()
+    try:
+        cached_dir.mkdir()
+    except OSError:
+        pass
+    cached_dir_model = cached_dir / sig
+    if force_download or not cached_dir_model.exists():
+        cached_dir_model.mkdir()
+        tmpfile = cached_dir_model / 'tmp'
+        with tmpfile.open('wb') as f:
             #url = 'https://ernie.bj.bcebos.com/ERNIE_stable.tgz'
             r = requests.get(url, stream=True)
             total_len = int(r.headers.get('content-length'))
-            for chunk in tqdm(r.iter_content(chunk_size=1024), total=total_len // 1024, desc='downloading %s' % url, unit='KB'):
+            for chunk in tqdm(r.iter_content(chunk_size=1024),
+                    total=total_len // 1024,
+                    desc='downloading %s' % url,
+                    unit='KB'):
                 if chunk:
                     f.write(chunk)
             f.flush()
-        log.debug('extacting... to %s' % f.name)
-        with tarfile.open(f.name) as tf:
-            tf.extractall(path=cached_dir)
+            log.debug('extacting... to %s' % tmpfile)
+            with tarfile.open(tmpfile.as_posix()) as tf:
+                tf.extractall(path=cached_dir_model.as_posix())
+            os.remove(tmpfile.as_posix())
     log.debug('%s cached in %s' % (url, cached_dir))
-    return cached_dir
+    return cached_dir_model
 
 
 def add_docstring(doc):
diff --git a/ernie/modeling_ernie.py b/ernie/modeling_ernie.py
index 3e51a9e..781c93b 100644
--- a/ernie/modeling_ernie.py
+++ b/ernie/modeling_ernie.py
@@ -24,6 +24,11 @@ import json
 import logging
 import logging
 from functools import partial
+import six
+if six.PY2:
+    from pathlib2 import Path
+else:
+    from pathlib import Path
 
 import paddle.fluid.dygraph as D
 import paddle.fluid as F
@@ -191,7 +196,7 @@ class PretrainedModel(object):
     }
     @classmethod
     def from_pretrained(cls, pretrain_dir_or_url, force_download=False, **kwargs):
-        if pretrain_dir_or_url in cls.resource_map:
+        if not Path(pretrain_dir_or_url).exists() and pretrain_dir_or_url in cls.resource_map:
             url = cls.resource_map[pretrain_dir_or_url]
             log.info('get pretrain dir from %s' % url)
             pretrain_dir = _fetch_from_remote(url, force_download)
@@ -199,16 +204,16 @@ class PretrainedModel(object):
             log.info('pretrain dir %s not in %s, read from local' % (pretrain_dir_or_url, repr(cls.resource_map)))
             pretrain_dir = pretrain_dir_or_url
-        if not os.path.exists(pretrain_dir):
+        if not pretrain_dir.exists():
             raise ValueError('pretrain dir not found: %s' % pretrain_dir)
-        param_path = os.path.join(pretrain_dir, 'params')
-        state_dict_path = os.path.join(pretrain_dir, 'saved_weights')
-        config_path = os.path.join(pretrain_dir, 'ernie_config.json')
+        param_path = pretrain_dir /'params'
+        state_dict_path = pretrain_dir / 'saved_weights'
+        config_path = pretrain_dir / 'ernie_config.json'
 
-        if not os.path.exists(config_path):
+        if not config_path.exists():
             raise ValueError('config path not found: %s' % config_path)
         name_prefix=kwargs.pop('name', None)
-        cfg_dict = dict(json.loads(open(config_path).read()), **kwargs)
+        cfg_dict = dict(json.loads(config_path.open().read()), **kwargs)
         model = cls(cfg_dict, name=name_prefix)
 
         log.info('loading pretrained model from %s' % pretrain_dir)
@@ -217,8 +222,8 @@ class PretrainedModel(object):
         # raise NotImplementedError()
         # log.debug('load pretrained weight from program state')
         # F.io.load_program_state(param_path) #buggy in dygraph.gurad, push paddle to fix
-        if os.path.exists(state_dict_path + '.pdparams'):
-            m, _ = D.load_dygraph(state_dict_path)
+        if state_dict_path.with_suffix('.pdparams').exists():
+            m, _ = D.load_dygraph(state_dict_path.as_posix())
         for k, v in model.state_dict().items():
             if k not in m:
                 log.warn('param:%s not set in pretrained model, skip' % k)
diff --git a/ernie/tokenizing_ernie.py b/ernie/tokenizing_ernie.py
index 25ff1f5..8feef46 100644
--- a/ernie/tokenizing_ernie.py
+++ b/ernie/tokenizing_ernie.py
@@ -91,12 +91,12 @@ class ErnieTokenizer(object):
         else:
             log.info('pretrain dir %s not in %s, read from local' % (pretrain_dir_or_url, repr(cls.resource_map)))
             pretrain_dir = pretrain_dir_or_url
-        if not os.path.exists(pretrain_dir):
+        if not pretrain_dir.exists():
             raise ValueError('pretrain dir not found: %s' % pretrain_dir)
-        vocab_path = os.path.join(pretrain_dir, 'vocab.txt')
-        if not os.path.exists(vocab_path):
+        vocab_path = pretrain_dir / 'vocab.txt'
+        if not vocab_path.exists():
             raise ValueError('no vocab file in pretrain dir: %s' % pretrain_dir)
-        vocab_dict = {j.strip().split('\t')[0]: i for i, j in enumerate(open(vocab_path).readlines())}
+        vocab_dict = {j.strip().split('\t')[0]: i for i, j in enumerate(vocab_path.open(encoding='utf8').readlines())}
         t = cls(vocab_dict, **kwargs)
         return t
 
@@ -207,14 +207,14 @@ class ErnieTinyTokenizer(ErnieTokenizer):
         else:
             log.info('pretrain dir %s not in %s, read from local' % (pretrain_dir_or_url, repr(cls.resource_map)))
             pretrain_dir = pretrain_dir_or_url
-        if not os.path.exists(pretrain_dir):
+        if not pretrain_dir.exists():
             raise ValueError('pretrain dir not found: %s' % pretrain_dir)
-        vocab_path = os.path.join(pretrain_dir, 'vocab.txt')
-        sp_model_path = os.path.join(pretrain_dir, 'subword/spm_cased_simp_sampled.model')
+        vocab_path = pretrain_dir / 'vocab.txt'
+        sp_model_path = pretrain_dir / 'subword/spm_cased_simp_sampled.model'
 
-        if not os.path.exists(vocab_path):
+        if not vocab_path.exists():
             raise ValueError('no vocab file in pretrain dir: %s' % pretrain_dir)
-        vocab_dict = {j.strip().split('\t')[0]: i for i, j in enumerate(open(vocab_path).readlines())}
+        vocab_dict = {j.strip().split('\t')[0]: i for i, j in enumerate(vocab_path.open(encoding='utf8').readlines())}
         t = cls(vocab_dict, sp_model_path, **kwargs)
         return t
 
diff --git a/propeller/paddle/data/feature_column.py b/propeller/paddle/data/feature_column.py
index 83fdb7d..b4bbdc0 100644
--- a/propeller/paddle/data/feature_column.py
+++ b/propeller/paddle/data/feature_column.py
@@ -125,7 +125,7 @@ class LabelColumn(Column):
             ids = int(raw)
         else:
             ids = self.vocab[raw]
-        return ids
+        return np.array(ids, dtype=np.int64)
 
 
 class TextColumn(Column):
diff --git a/requirements.txt b/requirements.txt
index b3d8356..6ad03e8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ sklearn==0.0
 sentencepiece==0.1.8
 jieba==0.39
 visualdl>=2.0.0b7
+pathlib2>=2.3.2
diff --git a/setup.py b/setup.py
index 8a91668..9251c36 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ with open("README.md", "r", encoding='utf-8') as fh:
 
 setuptools.setup(
     name="paddle-ernie", # Replace with your own username
-    version="0.0.4dev1",
+    version="0.0.5dev1",
     author="Baidu Ernie Team",
     author_email="ernieernie.team@gmail.com",
     description="A pretrained NLP model for every NLP tasks",
@@ -33,6 +33,7 @@ setuptools.setup(
     install_requires=[
        'requests',
        'tqdm',
+       'pathlib2',
     ],
     classifiers=[
         'Intended Audience :: Developers',
-- 
GitLab
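
A minimal usage sketch of the behavior this patch introduces, illustrative only and not part of the diff: downloaded archives are now cached under ~/.paddle-ernie-cache/<md5 of url> and reused, and from_pretrained falls back to reading a local directory when the argument is an existing path. The model name 'ernie-1.0' and the local directory below are assumed examples, and the dygraph guard call follows the project README of this release.

# Sketch only; 'ernie-1.0' and './my_pretrain_dir' are assumed example inputs.
from pathlib import Path

import paddle.fluid.dygraph as D
from ernie.modeling_ernie import ErnieModel
from ernie.tokenizing_ernie import ErnieTokenizer

D.guard().__enter__()  # activate paddle dygraph mode, as in the README

# Known model name: resolved via resource_map, downloaded once, cached under
# ~/.paddle-ernie-cache/<md5(url)>, and reused on subsequent calls.
model = ErnieModel.from_pretrained('ernie-1.0')
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

# Existing local directory: Path(...).exists() short-circuits the download and
# ernie_config.json / saved_weights / vocab.txt are read from the directory.
# With this commit the argument should be a Path object for the local case.
local_dir = Path('./my_pretrain_dir')
if local_dir.exists():
    model = ErnieModel.from_pretrained(local_dir)
    tokenizer = ErnieTokenizer.from_pretrained(local_dir)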