提交 111c2a80 编写于 作者: C Cleber Rosa

Asset Fetcher: avoid clashes by using namespace dirs

The current implementation fails to cache multiple files if their
name is the same.  The filename *only* will be considered when
looking at the cached directories, and if hashes are not given,
users will end up with the wrong files, even when they give
unique URLs to different files.

To give a real example, suppose we have one test that needs a specific
asset file (such as a specific kernel version):

   def test_x86_64(self):
      self.fetch_asset('https://avocado-project.org/data/linux/x86_64/vmlinuz',
                       asset_hash='0123456789abcdef...')

While a second test wants the daily version of a given asset:

  def test_aarch64(self):
      self.fetch_asset('https://avocado-project.org/data/linux/aarch64/vmlinuz',
                       expire='1d')

If test_x86_64() runs first, it will write to $CACHE_DIR/vmlinuz.  The
second test, test_aarch64(), may end up having the completely wrong
file.  Using `asset_hash` is not a possibility, given that it wants
the daily kernel.

These changes put asset files on a "cache relative directory".  If a
hash is given, it's safe to put files in a directory indexed "by
name".  If not, they're put in a directory named after the URL (minus
the filename).
Signed-off-by: Cleber Rosa <crosa@redhat.com>
上级 a1855314
......@@ -17,20 +17,22 @@ Asset fetcher from multiple locations
"""
import errno
import hashlib
import logging
import os
import re
import shutil
import stat
import sys
import time
import tempfile
import time
try:
import urlparse
except ImportError:
import urllib.parse as urlparse
from . import astring
from . import crypto
from . import path as utils_path
from .download import url_download
......@@ -102,6 +104,26 @@ class Asset(object):
"""
return '%s-CHECKSUM' % asset_file
def _get_relative_dir(self, parsed_url):
    """
    Returns the cache-relative directory an asset should live in.

    An asset that comes with a hash is intended to be unique *by
    name* (the hash lets corruption/expiry be detected), so it is
    stored under 'by_name'.  An asset without a hash is stored under
    'by_location/<digest>', where the digest is computed from the
    asset URL minus its filename, so that unrelated assets that
    happen to share a filename can coexist in the cache.

    :param parsed_url: result of urlparse() on the asset name/URL
    :returns: a relative directory path to join to a cache dir
    """
    if self.asset_hash:
        return 'by_name'
    # No hash given: index by the asset's location (URL without the
    # trailing filename), hashed to produce a filesystem-safe name.
    containing_dir = os.path.dirname(parsed_url.path)
    location = "%s://%s/%s" % (parsed_url.scheme,
                               parsed_url.netloc,
                               containing_dir)
    hasher = hashlib.new(DEFAULT_HASH_ALGORITHM)
    hasher.update(location.encode(astring.ENCODING))
    return os.path.join('by_location', hasher.hexdigest())
def fetch(self):
"""
Fetches the asset. First tries to find the asset on the provided
......@@ -114,6 +136,7 @@ class Asset(object):
urls = []
parsed_url = urlparse.urlparse(self.name)
basename = os.path.basename(parsed_url.path)
cache_relative_dir = self._get_relative_dir(parsed_url)
# If name is actually an url, it has to be included in urls list
if parsed_url.scheme:
......@@ -122,7 +145,7 @@ class Asset(object):
# First let's search for the file in each one of the cache locations
for cache_dir in self.cache_dirs:
cache_dir = os.path.expanduser(cache_dir)
asset_file = os.path.join(cache_dir, basename)
asset_file = os.path.join(cache_dir, cache_relative_dir, basename)
# To use a cached file, it must:
# - Exists.
......@@ -142,8 +165,6 @@ class Asset(object):
# A writable cache directory is then needed. The first available
# writable cache directory will be used.
cache_dir = self._get_writable_cache_dir()
asset_file = os.path.join(cache_dir, basename)
# Now we have a writable cache_dir. Let's get the asset.
# Adding the user defined locations to the urls list:
if self.locations is not None:
......@@ -159,6 +180,11 @@ class Asset(object):
else:
raise UnsupportedProtocolError("Unsupported protocol"
": %s" % urlobj.scheme)
cache_relative_dir = self._get_relative_dir(urlobj)
asset_file = os.path.join(cache_dir, cache_relative_dir, basename)
dirname = os.path.dirname(asset_file)
if not os.path.isdir(dirname):
os.makedirs(dirname)
try:
if fetch(urlobj, asset_file):
return asset_file
......
......@@ -27,7 +27,8 @@ class TestAsset(unittest.TestCase):
locations=None,
cache_dirs=[self.cache_dir],
expire=None).fetch()
expected_tarball = os.path.join(self.cache_dir, self.assetname)
expected_tarball = os.path.join(self.cache_dir, 'by_name',
self.assetname)
self.assertEqual(foo_tarball, expected_tarball)
def test_fetch_location(self):
......@@ -37,7 +38,8 @@ class TestAsset(unittest.TestCase):
locations=[self.url],
cache_dirs=[self.cache_dir],
expire=None).fetch()
expected_tarball = os.path.join(self.cache_dir, self.assetname)
expected_tarball = os.path.join(self.cache_dir, 'by_name',
self.assetname)
self.assertEqual(foo_tarball, expected_tarball)
def test_fetch_expire(self):
......@@ -88,7 +90,9 @@ class TestAsset(unittest.TestCase):
self.assertRaises(EnvironmentError, a.fetch)
def test_fetch_lockerror(self):
with FileLock(os.path.join(self.cache_dir, self.assetname)):
dirname = os.path.join(self.cache_dir, 'by_name')
os.makedirs(dirname)
with FileLock(os.path.join(dirname, self.assetname)):
a = asset.Asset(self.url,
asset_hash=self.assethash,
algorithm='sha1',
......@@ -102,6 +106,45 @@ class TestAsset(unittest.TestCase):
None, None, None, [self.cache_dir], None)
self.assertRaises(asset.UnsupportedProtocolError, invalid.fetch)
def test_fetch_different_files(self):
    """
    Checks that when different assets which happen to have the
    same *filename*, are properly stored in the cache directory
    and that the right one will be given to the user, no matter if
    a hash is used or not.
    """
    def make_origin(content):
        # Creates a same-named asset in a fresh origin directory and
        # returns a file:// URL pointing at it.
        origin_dir = tempfile.mkdtemp(dir=self.basedir)
        local_path = os.path.join(origin_dir, self.assetname)
        with open(local_path, 'w') as origin_file:
            origin_file.write(content)
        return 'file://%s' % local_path

    # First, a hashed asset: stored in the cache indexed by name
    asset.Asset(self.url, self.assethash, 'sha1', None,
                [self.cache_dir], None).fetch()

    # Second asset: same filename, no hash, different content
    second_content = 'This is not your first asset content!'
    second_url = make_origin(second_content)
    fetched_path = asset.Asset(second_url, None, None,
                               None, [self.cache_dir], None).fetch()
    with open(fetched_path, 'r') as fetched_file:
        self.assertEqual(fetched_file.read(), second_content)

    # Third asset: again the same filename, another location/content
    third_content = 'Another content!'
    third_url = make_origin(third_content)
    fetched_path = asset.Asset(third_url, None, None,
                               None, [self.cache_dir], None).fetch()
    with open(fetched_path, 'r') as fetched_file:
        self.assertEqual(fetched_file.read(), third_content)
def tearDown(self):
    """Removes the temporary base directory, including the cache dirs in it."""
    shutil.rmtree(self.basedir)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册