提交 3e0a6313 编写于 作者: A Amador Pahim

Asset fetcher improvements

Use the avocado.utils.filelock to avoid race conditions:
 - Download the files with a temporary unique name.
 - Lock the original file.
 - Move downloaded file to the original name.
 - Compute the hash, creating the hashfile.
 - Verify the file against the provided hash.
 - Unlock the original file.
 (While the lock is acquired, users trying to use the file will wait until
 the lock is released or, on wait timeout, receive a cache miss)

Drop EnvironmentError exceptions on cache miss. Instead, we now only
log an error message and return None.

Clean debug messages. Log was being polluted by asset fetcher. Let's
make it quieter.

Reference: https://trello.com/c/NeFPMkZY
Reference: https://trello.com/c/OWCprQpd

Signed-off-by: Amador Pahim <apahim@redhat.com>
上级 47799337
......@@ -20,13 +20,17 @@ import errno
import logging
import os
import re
import shutil
import stat
import sys
import time
import tempfile
import urlparse
from . import crypto
from . import path as utils_path
from .download import url_download
from .filelock import FileLock
log = logging.getLogger('avocado.test')
......@@ -40,8 +44,7 @@ class Asset(object):
def __init__(self, name, asset_hash, algorithm, locations, cache_dirs,
expire):
"""
Initialize the Asset() and fetches the asset file. The path for
the fetched file can be reached using the self.path attribute.
Initialize the Asset() class.
:param name: the asset filename. url is also supported
:param asset_hash: asset hash
......@@ -52,7 +55,10 @@ class Asset(object):
"""
self.name = name
self.asset_hash = asset_hash
self.algorithm = algorithm
if algorithm is None:
self.algorithm = 'sha1'
else:
self.algorithm = algorithm
self.locations = locations
self.cache_dirs = cache_dirs
self.nameobj = urlparse.urlparse(self.name)
......@@ -60,6 +66,13 @@ class Asset(object):
self.expire = expire
def fetch(self):
"""
Fetches the asset. First tries to find the asset on the provided
cache_dirs list. Then tries to download the asset from the locations
list provided.
:returns: The path for the file on the cache directory.
"""
urls = []
# If name is actually an url, it has to be included in urls list
......@@ -70,129 +83,129 @@ class Asset(object):
for cache_dir in self.cache_dirs:
cache_dir = os.path.expanduser(cache_dir)
self.asset_file = os.path.join(cache_dir, self.basename)
if (self._check_file(self.asset_file,
self.asset_hash, self.algorithm) and not
self._is_expired(self.asset_file, self.expire)):
return self.asset_file
# If we get to this point, file is not in any cache directory
# and we have to download it from a location. A rw cache
# directory is then needed. The first rw cache directory will be
# used.
log.debug("Looking for a writable cache dir.")
self.hashfile = '%s-CHECKSUM' % self.asset_file
# To use a cached file, it must:
# - Exists.
# - Be valid (not expired).
# - Be verified (hash check).
if (os.path.isfile(self.asset_file) and
not self._is_expired(self.asset_file, self.expire)):
try:
with FileLock(self.asset_file, 1):
if self._verify():
return self.asset_file
except:
exc_type, exc_value = sys.exc_info()[:2]
log.error('%s: %s' % (exc_type.__name__, exc_value))
# If we get to this point, we have to download it from a location.
# A writable cache directory is then needed. The first available
# writable cache directory will be used.
for cache_dir in self.cache_dirs:
cache_dir = os.path.expanduser(cache_dir)
self.asset_file = os.path.join(cache_dir, self.basename)
if not utils_path.usable_rw_dir(cache_dir):
log.debug("Read-only cache dir '%s'. Skiping." %
cache_dir)
continue
log.debug("Using %s as cache dir." % cache_dir)
# Adding the user defined locations to the urls list
# Now we have a writable cache_dir. Let's get the asset.
# Adding the user defined locations to the urls list:
if self.locations is not None:
for item in self.locations:
urls.append(item)
for url in urls:
urlobj = urlparse.urlparse(url)
if urlobj.scheme == 'http' or urlobj.scheme == 'https':
log.debug('Downloading from %s.' % url)
if urlobj.scheme in ['http', 'https', 'ftp']:
try:
url_download(url, self.asset_file)
except Exception as e:
log.error(e)
continue
if self._check_file(self.asset_file, self.asset_hash,
self.algorithm):
return self.asset_file
elif urlobj.scheme == 'ftp':
log.debug('Downloading from %s.' % url)
try:
url_download(url, self.asset_file)
except Exception as e:
log.error(e)
continue
if self._check_file(self.asset_file, self.asset_hash,
self.algorithm):
return self.asset_file
if self._download(url):
return self.asset_file
except:
exc_type, exc_value = sys.exc_info()[:2]
log.error('%s: %s' % (exc_type.__name__, exc_value))
elif urlobj.scheme == 'file':
# Being flexible with the urlparse result
if os.path.isdir(urlobj.path):
path = os.path.join(urlobj.path, self.name)
else:
path = urlobj.path
log.debug('Looking for file on %s.' % path)
if self._check_file(path):
try:
os.symlink(path, self.asset_file)
except OSError as e:
if e.errno == errno.EEXIST:
os.remove(self.asset_file)
os.symlink(path, self.asset_file)
log.debug('Symlink created %s -> %s.' %
(self.asset_file, path))
else:
continue
if self._check_file(self.asset_file, self.asset_hash,
self.algorithm):
return self.asset_file
raise EnvironmentError("Failed to fetch %s." % self.basename)
raise EnvironmentError("Can't find a writable cache dir.")
@staticmethod
def _check_file(path, filehash=None, algorithm=None):
"""
Checks if file exists and verifies the hash, when the hash is
provided. We try first to find a hash file to verify the hash
against and only if the hash file is not present we compute the
hash.
"""
if not os.path.isfile(path):
log.debug('Asset %s not found.' % path)
return False
if filehash is None:
try:
if self._get_local_file(path):
return self.asset_file
except:
exc_type, exc_value = sys.exc_info()[:2]
log.error('%s: %s' % (exc_type.__name__, exc_value))
# Despite our effort, we could not provide a healthy file. Sorry.
log.error("Failed to fetch %s." % self.basename)
return None
# Cannot find a writable cache_dir. Bye.
log.error("Can't find a writable cache dir.")
return None
def _download(self, url):
    """
    Download the asset from an url into the cache entry.

    :param url: the url to download the asset from.
    :returns: result of self._verify() -- True when the downloaded
              file matches the expected hash, False otherwise.
    """
    # Download to a temporary, unique name first so concurrent
    # fetchers never observe a partially written asset file.
    temp = '%s.%s' % (self.asset_file,
                      next(tempfile._get_candidate_names()))
    try:
        url_download(url, temp)
        # Acquire the lock only after the (potentially slow) download,
        # so other cache users are blocked as briefly as possible.
        with FileLock(self.asset_file, 1):
            shutil.copy(temp, self.asset_file)
            self._compute_hash()
            return self._verify()
    finally:
        # The download may have failed before the temporary file was
        # created; only remove it when it exists, so we do not mask
        # the original exception with a spurious OSError.
        if os.path.isfile(temp):
            os.remove(temp)
def _compute_hash(self):
    """
    Compute the hash of the asset file and (re)write the hashfile.

    The hashfile line format is '<algorithm> <hexdigest>\n', which is
    exactly what _get_hash_from_file() expects to parse back.
    """
    result = crypto.hash_file(self.asset_file, algorithm=self.algorithm)
    # NOTE(review): the original also computed os.path.basename() of
    # the asset file here, but never used it; dropped as dead code.
    with open(self.hashfile, 'w') as f:
        f.write('%s %s\n' % (self.algorithm, result))
def _get_hash_from_file(self):
discovered = None
if not os.path.isfile(self.hashfile):
self._compute_hash()
with open(self.hashfile, 'r') as f:
for line in f.readlines():
# md5 is 32 chars big and sha512 is 128 chars big.
# others supported algorithms are between those.
pattern = '%s [a-f0-9]{32,128}' % self.algorithm
if re.match(pattern, line):
discovered = line.split()[1]
break
return discovered
def _verify(self):
if not self.asset_hash:
return True
basename = os.path.basename(path)
discovered_hash = None
# Try to find a hashfile for the asset file
hashfile = '%s.%s' % (path, algorithm)
if os.path.isfile(hashfile):
with open(hashfile, 'r') as f:
for line in f.readlines():
# md5 is 32 chars big and sha512 is 128 chars big.
# others supported algorithms are between those.
pattern = '[a-f0-9]{32,128} %s' % basename
if re.match(pattern, line):
log.debug('Hashfile found for %s.' % path)
discovered_hash = line.split()[0]
break
# If no hashfile, lets calculate the hash by ourselves
if discovered_hash is None:
log.debug('No hashfile found for %s. Computing hash.' %
path)
discovered_hash = crypto.hash_file(path, algorithm=algorithm)
# Creating the hashfile for further usage.
log.debug('Creating hashfile %s.' % hashfile)
with open(hashfile, 'w') as f:
content = '%s %s\n' % (discovered_hash, basename)
f.write(content)
if filehash == discovered_hash:
log.debug('Asset %s verified.' % path)
if self._get_hash_from_file() == self.asset_hash:
return True
else:
log.error('Asset %s corrupted (hash expected:%s, hash found:%s).' %
(path, filehash, discovered_hash))
return False
def _get_local_file(self, path):
    """
    Link a local file into the cache entry and verify it.

    :param path: local filesystem path of the asset.
    :returns: result of self._verify() (True/False), or None when the
              symlink could not be created for a reason other than the
              cache entry already existing.
    """
    with FileLock(self.asset_file, 1):
        try:
            os.symlink(path, self.asset_file)
        except OSError as details:
            if details.errno != errno.EEXIST:
                # Same outcome as the original implicit fall-through:
                # report a miss instead of crashing.
                return None
            # A stale entry already exists in the cache; replace it.
            os.remove(self.asset_file)
            os.symlink(path, self.asset_file)
        # Keep hash computation/verification outside the except-clause
        # so their errors propagate to the caller instead of being
        # silently swallowed (the original 'except: raise' was a no-op).
        self._compute_hash()
        return self._verify()
@staticmethod
def _is_expired(path, expire):
if expire is None:
......
......@@ -4,6 +4,7 @@ import tempfile
import unittest
from avocado.utils import asset
from avocado.utils.filelock import FileLock
class TestAsset(unittest.TestCase):
......@@ -28,12 +29,6 @@ class TestAsset(unittest.TestCase):
expire=None).fetch()
expected_tarball = os.path.join(self.cache_dir, self.assetname)
self.assertEqual(foo_tarball, expected_tarball)
hashfile = '.'.join([expected_tarball, 'sha1'])
self.assertTrue(os.path.isfile(hashfile))
expected_content = '%s %s\n' % (self.assethash, self.assetname)
with open(hashfile, 'r') as f:
content = f.read()
self.assertEqual(content, expected_content)
def testFetch_location(self):
foo_tarball = asset.Asset(self.assetname,
......@@ -44,12 +39,6 @@ class TestAsset(unittest.TestCase):
expire=None).fetch()
expected_tarball = os.path.join(self.cache_dir, self.assetname)
self.assertEqual(foo_tarball, expected_tarball)
hashfile = '.'.join([expected_tarball, 'sha1'])
self.assertTrue(os.path.isfile(hashfile))
expected_content = '%s %s\n' % (self.assethash, self.assetname)
with open(hashfile, 'r') as f:
content = f.read()
self.assertEqual(content, expected_content)
def testFecth_expire(self):
foo_tarball = asset.Asset(self.assetname,
......@@ -64,37 +53,60 @@ class TestAsset(unittest.TestCase):
# Create the file in a different location with a different content
new_assetdir = tempfile.mkdtemp(dir=self.basedir)
new_localpath = os.path.join(new_assetdir, self.assetname)
new_hash = '9f1ad57044be4799f288222dc91d5eab152921e9'
new_url = 'file://%s' % new_localpath
with open(new_localpath, 'w') as f:
f.write('Changed!')
# Dont expire cached file
asset.Asset(self.assetname,
asset_hash=self.assethash,
algorithm='sha1',
locations=[new_url],
cache_dirs=[self.cache_dir],
expire=None).fetch()
foo_tarball = asset.Asset(self.assetname,
asset_hash=self.assethash,
algorithm='sha1',
locations=[new_url],
cache_dirs=[self.cache_dir],
expire=None).fetch()
with open(foo_tarball, 'r') as f:
content2 = f.read()
self.assertEqual(content1, content2)
# Expire cached file
asset.Asset(self.assetname,
asset_hash=self.assethash,
algorithm='sha1',
locations=[new_url],
cache_dirs=[self.cache_dir],
expire=-1).fetch()
foo_tarball = asset.Asset(self.assetname,
asset_hash=new_hash,
algorithm='sha1',
locations=[new_url],
cache_dirs=[self.cache_dir],
expire=-1).fetch()
with open(foo_tarball, 'r') as f:
content2 = f.read()
self.assertNotEqual(content1, content2)
def testException(self):
a = asset.Asset(name='bar.tgz', asset_hash=None, algorithm=None,
locations=None, cache_dirs=[self.cache_dir],
expire=None)
self.assertRaises(EnvironmentError, a.fetch)
def testFetch_error(self):
    """
    Fetching an asset that exists nowhere (no locations, nothing in
    the cache) must return None instead of raising.
    """
    missing = asset.Asset('bar.tgz',
                          asset_hash=self.assethash,
                          algorithm='sha1',
                          locations=None,
                          cache_dirs=[self.cache_dir],
                          expire=None)
    result = missing.fetch()
    self.assertEqual(result, None)
def testFetch_lockerror(self):
    """
    While the cache entry is locked by someone else, fetch() must
    report a miss (None); once the lock is released, the very same
    asset must be fetched successfully.
    """
    def fetch_asset():
        return asset.Asset(self.url,
                           asset_hash=self.assethash,
                           algorithm='sha1',
                           locations=None,
                           cache_dirs=[self.cache_dir],
                           expire=None).fetch()

    locked_entry = os.path.join(self.cache_dir, self.assetname)
    with FileLock(locked_entry):
        self.assertEqual(fetch_asset(), None)
    self.assertNotEqual(fetch_asset(), None)
def tearDown(self):
    # Remove the temporary base directory created in setUp (and with
    # it the cache dirs and any fetched assets) after each test.
    shutil.rmtree(self.basedir)
......
......@@ -395,3 +395,4 @@ isinstance
dkrcmd
flexmock
lockfile
urlparse
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册