提交 3e0a6313 编写于 作者: A Amador Pahim

Asset fetcher improvements

Use the avocado.utils.filelock to avoid race conditions:
 - Download the files with a temporary unique name.
 - Lock the original file.
 - Move downloaded file to the original name.
 - Compute the hash, creating the hashfile.
 - Verify the file against the provided hash.
 - Unlock the original file.
 (While the lock is acquired, users trying to use the file will wait until
 the lock is released or, on wait timeout, receive a cache miss)

Drop EnvironmentError exceptions on cache miss. Instead, we now only
log an error message and return None.

Clean debug messages. Log was being polluted by asset fetcher. Let's
make it quieter.

Reference: https://trello.com/c/NeFPMkZY
Reference: https://trello.com/c/OWCprQpd

Signed-off-by: Amador Pahim <apahim@redhat.com>
上级 47799337
......@@ -20,13 +20,17 @@ import errno
import logging
import os
import re
import shutil
import stat
import sys
import time
import tempfile
import urlparse
from . import crypto
from . import path as utils_path
from .download import url_download
from .filelock import FileLock
log = logging.getLogger('avocado.test')
......@@ -40,8 +44,7 @@ class Asset(object):
def __init__(self, name, asset_hash, algorithm, locations, cache_dirs,
expire):
"""
Initialize the Asset() and fetches the asset file. The path for
the fetched file can be reached using the self.path attribute.
Initialize the Asset() class.
:param name: the asset filename. url is also supported
:param asset_hash: asset hash
......@@ -52,7 +55,10 @@ class Asset(object):
"""
self.name = name
self.asset_hash = asset_hash
self.algorithm = algorithm
if algorithm is None:
self.algorithm = 'sha1'
else:
self.algorithm = algorithm
self.locations = locations
self.cache_dirs = cache_dirs
self.nameobj = urlparse.urlparse(self.name)
......@@ -60,6 +66,13 @@ class Asset(object):
self.expire = expire
def fetch(self):
"""
Fetches the asset. First tries to find the asset on the provided
cache_dirs list. Then tries to download the asset from the locations
list provided.
:returns: The path for the file on the cache directory.
"""
urls = []
# If name is actually an url, it has to be included in urls list
......@@ -70,129 +83,129 @@ class Asset(object):
for cache_dir in self.cache_dirs:
cache_dir = os.path.expanduser(cache_dir)
self.asset_file = os.path.join(cache_dir, self.basename)
if (self._check_file(self.asset_file,
self.asset_hash, self.algorithm) and not
self._is_expired(self.asset_file, self.expire)):
return self.asset_file
# If we get to this point, file is not in any cache directory
# and we have to download it from a location. A rw cache
# directory is then needed. The first rw cache directory will be
# used.
log.debug("Looking for a writable cache dir.")
self.hashfile = '%s-CHECKSUM' % self.asset_file
# To use a cached file, it must:
# - Exists.
# - Be valid (not expired).
# - Be verified (hash check).
if (os.path.isfile(self.asset_file) and
not self._is_expired(self.asset_file, self.expire)):
try:
with FileLock(self.asset_file, 1):
if self._verify():
return self.asset_file
except:
exc_type, exc_value = sys.exc_info()[:2]
log.error('%s: %s' % (exc_type.__name__, exc_value))
# If we get to this point, we have to download it from a location.
# A writable cache directory is then needed. The first available
# writable cache directory will be used.
for cache_dir in self.cache_dirs:
cache_dir = os.path.expanduser(cache_dir)
self.asset_file = os.path.join(cache_dir, self.basename)
if not utils_path.usable_rw_dir(cache_dir):
log.debug("Read-only cache dir '%s'. Skiping." %
cache_dir)
continue
log.debug("Using %s as cache dir." % cache_dir)
# Adding the user defined locations to the urls list
# Now we have a writable cache_dir. Let's get the asset.
# Adding the user defined locations to the urls list:
if self.locations is not None:
for item in self.locations:
urls.append(item)
for url in urls:
urlobj = urlparse.urlparse(url)
if urlobj.scheme == 'http' or urlobj.scheme == 'https':
log.debug('Downloading from %s.' % url)
if urlobj.scheme in ['http', 'https', 'ftp']:
try:
url_download(url, self.asset_file)
except Exception as e:
log.error(e)
continue
if self._check_file(self.asset_file, self.asset_hash,
self.algorithm):
return self.asset_file
elif urlobj.scheme == 'ftp':
log.debug('Downloading from %s.' % url)
try:
url_download(url, self.asset_file)
except Exception as e:
log.error(e)
continue
if self._check_file(self.asset_file, self.asset_hash,
self.algorithm):
return self.asset_file
if self._download(url):
return self.asset_file
except:
exc_type, exc_value = sys.exc_info()[:2]
log.error('%s: %s' % (exc_type.__name__, exc_value))
elif urlobj.scheme == 'file':
# Being flexible with the urlparse result
if os.path.isdir(urlobj.path):
path = os.path.join(urlobj.path, self.name)
else:
path = urlobj.path
log.debug('Looking for file on %s.' % path)
if self._check_file(path):
try:
os.symlink(path, self.asset_file)
except OSError as e:
if e.errno == errno.EEXIST:
os.remove(self.asset_file)
os.symlink(path, self.asset_file)
log.debug('Symlink created %s -> %s.' %
(self.asset_file, path))
else:
continue
if self._check_file(self.asset_file, self.asset_hash,
self.algorithm):
return self.asset_file
raise EnvironmentError("Failed to fetch %s." % self.basename)
raise EnvironmentError("Can't find a writable cache dir.")
@staticmethod
def _check_file(path, filehash=None, algorithm=None):
"""
Checks if file exists and verifies the hash, when the hash is
provided. We try first to find a hash file to verify the hash
against and only if the hash file is not present we compute the
hash.
"""
if not os.path.isfile(path):
log.debug('Asset %s not found.' % path)
return False
if filehash is None:
try:
if self._get_local_file(path):
return self.asset_file
except:
exc_type, exc_value = sys.exc_info()[:2]
log.error('%s: %s' % (exc_type.__name__, exc_value))
# Despite our effort, we could not provide a healthy file. Sorry.
log.error("Failed to fetch %s." % self.basename)
return None
# Cannot find a writable cache_dir. Bye.
log.error("Can't find a writable cache dir.")
return None
def _download(self, url):
    """
    Download the asset from an url into the cache entry.

    :param url: the url to download the asset from.
    :returns: result of self._verify() -- True when the downloaded
              file matches the expected hash, False otherwise.
    """
    # Download to a temporary, unique name first so concurrent
    # fetchers never observe a partially written asset file.
    temp = '%s.%s' % (self.asset_file,
                      next(tempfile._get_candidate_names()))
    try:
        url_download(url, temp)
        # Acquire the lock only after the (potentially slow) download,
        # so other cache users are blocked as briefly as possible.
        with FileLock(self.asset_file, 1):
            shutil.copy(temp, self.asset_file)
            self._compute_hash()
            return self._verify()
    finally:
        # The download may have failed before the temporary file was
        # created; only remove it when it exists, so we do not mask
        # the original exception with a spurious OSError.
        if os.path.isfile(temp):
            os.remove(temp)
def _compute_hash(self):
    """
    Compute the hash of the asset file and (re)write the hashfile.

    The hashfile line format is '<algorithm> <hexdigest>\n', which is
    exactly what _get_hash_from_file() expects to parse back.
    """
    result = crypto.hash_file(self.asset_file, algorithm=self.algorithm)
    # NOTE(review): the original also computed os.path.basename() of
    # the asset file here, but never used it; dropped as dead code.
    with open(self.hashfile, 'w') as f:
        f.write('%s %s\n' % (self.algorithm, result))
def _get_hash_from_file(self):
discovered = None
if not os.path.isfile(self.hashfile):
self._compute_hash()
with open(self.hashfile, 'r') as f:
for line in f.readlines():
# md5 is 32 chars big and sha512 is 128 chars big.
# others supported algorithms are between those.
pattern = '%s [a-f0-9]{32,128}' % self.algorithm
if re.match(pattern, line):
discovered = line.split()[1]
break
return discovered
def _verify(self):
if not self.asset_hash:
return True
basename = os.path.basename(path)
discovered_hash = None
# Try to find a hashfile for the asset file
hashfile = '%s.%s' % (path, algorithm)
if os.path.isfile(hashfile):
with open(hashfile, 'r') as f:
for line in f.readlines():
# md5 is 32 chars big and sha512 is 128 chars big.
# others supported algorithms are between those.
pattern = '[a-f0-9]{32,128} %s' % basename
if re.match(pattern, line):
log.debug('Hashfile found for %s.' % path)
discovered_hash = line.split()[0]
break
# If no hashfile, lets calculate the hash by ourselves
if discovered_hash is None:
log.debug('No hashfile found for %s. Computing hash.' %
path)
discovered_hash = crypto.hash_file(path, algorithm=algorithm)
# Creating the hashfile for further usage.
log.debug('Creating hashfile %s.' % hashfile)
with open(hashfile, 'w') as f:
content = '%s %s\n' % (discovered_hash, basename)
f.write(content)
if filehash == discovered_hash:
log.debug('Asset %s verified.' % path)
if self._get_hash_from_file() == self.asset_hash:
return True
else:
log.error('Asset %s corrupted (hash expected:%s, hash found:%s).' %
(path, filehash, discovered_hash))
return False
def _get_local_file(self, path):
    """
    Link a local file into the cache entry and verify it.

    :param path: local filesystem path of the asset.
    :returns: result of self._verify() (True/False), or None when the
              symlink could not be created for a reason other than the
              cache entry already existing.
    """
    with FileLock(self.asset_file, 1):
        try:
            os.symlink(path, self.asset_file)
        except OSError as details:
            if details.errno != errno.EEXIST:
                # Same outcome as the original implicit fall-through:
                # report a miss instead of crashing.
                return None
            # A stale entry already exists in the cache; replace it.
            os.remove(self.asset_file)
            os.symlink(path, self.asset_file)
        # Keep hash computation/verification outside the except-clause
        # so their errors propagate to the caller instead of being
        # silently swallowed (the original 'except: raise' was a no-op).
        self._compute_hash()
        return self._verify()
@staticmethod
def _is_expired(path, expire):
if expire is None:
......
......@@ -4,6 +4,7 @@ import tempfile
import unittest
from avocado.utils import asset
from avocado.utils.filelock import FileLock
class TestAsset(unittest.TestCase):
......@@ -28,12 +29,6 @@ class TestAsset(unittest.TestCase):
expire=None).fetch()
expected_tarball = os.path.join(self.cache_dir, self.assetname)
self.assertEqual(foo_tarball, expected_tarball)
hashfile = '.'.join([expected_tarball, 'sha1'])
self.assertTrue(os.path.isfile(hashfile))
expected_content = '%s %s\n' % (self.assethash, self.assetname)
with open(hashfile, 'r') as f:
content = f.read()
self.assertEqual(content, expected_content)
def testFetch_location(self):
foo_tarball = asset.Asset(self.assetname,
......@@ -44,12 +39,6 @@ class TestAsset(unittest.TestCase):
expire=None).fetch()
expected_tarball = os.path.join(self.cache_dir, self.assetname)
self.assertEqual(foo_tarball, expected_tarball)
hashfile = '.'.join([expected_tarball, 'sha1'])
self.assertTrue(os.path.isfile(hashfile))
expected_content = '%s %s\n' % (self.assethash, self.assetname)
with open(hashfile, 'r') as f:
content = f.read()
self.assertEqual(content, expected_content)
def testFecth_expire(self):
foo_tarball = asset.Asset(self.assetname,
......@@ -64,37 +53,60 @@ class TestAsset(unittest.TestCase):
# Create the file in a different location with a different content
new_assetdir = tempfile.mkdtemp(dir=self.basedir)
new_localpath = os.path.join(new_assetdir, self.assetname)
new_hash = '9f1ad57044be4799f288222dc91d5eab152921e9'
new_url = 'file://%s' % new_localpath
with open(new_localpath, 'w') as f:
f.write('Changed!')
# Dont expire cached file
asset.Asset(self.assetname,
asset_hash=self.assethash,
algorithm='sha1',
locations=[new_url],
cache_dirs=[self.cache_dir],
expire=None).fetch()
foo_tarball = asset.Asset(self.assetname,
asset_hash=self.assethash,
algorithm='sha1',
locations=[new_url],
cache_dirs=[self.cache_dir],
expire=None).fetch()
with open(foo_tarball, 'r') as f:
content2 = f.read()
self.assertEqual(content1, content2)
# Expire cached file
asset.Asset(self.assetname,
asset_hash=self.assethash,
algorithm='sha1',
locations=[new_url],
cache_dirs=[self.cache_dir],
expire=-1).fetch()
foo_tarball = asset.Asset(self.assetname,
asset_hash=new_hash,
algorithm='sha1',
locations=[new_url],
cache_dirs=[self.cache_dir],
expire=-1).fetch()
with open(foo_tarball, 'r') as f:
content2 = f.read()
self.assertNotEqual(content1, content2)
def testException(self):
a = asset.Asset(name='bar.tgz', asset_hash=None, algorithm=None,
locations=None, cache_dirs=[self.cache_dir],
expire=None)
self.assertRaises(EnvironmentError, a.fetch)
def testFetch_error(self):
    """
    Fetching an asset that exists nowhere (no locations, nothing in
    the cache) must return None instead of raising.
    """
    missing = asset.Asset('bar.tgz',
                          asset_hash=self.assethash,
                          algorithm='sha1',
                          locations=None,
                          cache_dirs=[self.cache_dir],
                          expire=None)
    result = missing.fetch()
    self.assertEqual(result, None)
def testFetch_lockerror(self):
    """
    While the cache entry is locked by someone else, fetch() must
    report a miss (None); once the lock is released, the very same
    asset must be fetched successfully.
    """
    def fetch_asset():
        return asset.Asset(self.url,
                           asset_hash=self.assethash,
                           algorithm='sha1',
                           locations=None,
                           cache_dirs=[self.cache_dir],
                           expire=None).fetch()

    locked_entry = os.path.join(self.cache_dir, self.assetname)
    with FileLock(locked_entry):
        self.assertEqual(fetch_asset(), None)
    self.assertNotEqual(fetch_asset(), None)
def tearDown(self):
    # Remove the temporary base directory created in setUp (and with
    # it the cache dirs and any fetched assets) after each test.
    shutil.rmtree(self.basedir)
......
......@@ -395,3 +395,4 @@ isinstance
dkrcmd
flexmock
lockfile
urlparse
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册