提交 111c2a80 编写于 作者: C Cleber Rosa

Asset Fetcher: avoid clashes by using namespace dirs

The current implementation fails to cache multiple files if their
name is the same.  The filename *only* will be considered when
looking at the cached directories, and if hashes are not given,
users will end up with the wrong files, even when they give
unique URLs to different files.

To give a real example, suppose we have one test that needs a specific
asset file (such as a specific kernel version):

   def test_x86_64(self):
      self.fetch_asset('https://avocado-project.org/data/linux/x86_64/vmlinuz',
                       asset_hash='0123456789abcdef...')

While a second test wants the daily version of a given asset:

  def test_aarch64(self):
      self.fetch_asset('https://avocado-project.org/data/linux/aarch64/vmlinuz',
                       expire='1d')

If test_x86_64() runs first, it will write to $CACHE_DIR/vmlinuz.  The
second test, test_aarch64(), may end up having the completely wrong
file.  Using `asset_hash` is not a possibility, given that it wants
the daily kernel.

These changes put asset files on a "cache relative directory".  If a
hash is given, it's safe to put files in a directory indexed "by
name".  If not, they're put in a directory named after the URL (minus
the filename).
Signed-off-by: Cleber Rosa <crosa@redhat.com>
上级 a1855314
......@@ -17,20 +17,22 @@ Asset fetcher from multiple locations
"""
import errno
import hashlib
import logging
import os
import re
import shutil
import stat
import sys
import time
import tempfile
import time
try:
import urlparse
except ImportError:
import urllib.parse as urlparse
from . import astring
from . import crypto
from . import path as utils_path
from .download import url_download
......@@ -102,6 +104,26 @@ class Asset(object):
"""
return '%s-CHECKSUM' % asset_file
def _get_relative_dir(self, parsed_url):
    """
    Returns the cache-relative directory an asset should live in.

    An asset that comes with a hash is intended to be unique *by
    name* (the hash lets corruption/expiry be detected), so it is
    stored under 'by_name'.  An asset without a hash is stored under
    'by_location/<digest>', where the digest is computed from the
    asset URL minus its filename, so that unrelated assets that
    happen to share a filename can coexist in the cache.

    :param parsed_url: result of urlparse() on the asset name/URL
    :returns: a relative directory path to join to a cache dir
    """
    if self.asset_hash:
        return 'by_name'
    # No hash given: index by the asset's location (URL without the
    # trailing filename), hashed to produce a filesystem-safe name.
    containing_dir = os.path.dirname(parsed_url.path)
    location = "%s://%s/%s" % (parsed_url.scheme,
                               parsed_url.netloc,
                               containing_dir)
    hasher = hashlib.new(DEFAULT_HASH_ALGORITHM)
    hasher.update(location.encode(astring.ENCODING))
    return os.path.join('by_location', hasher.hexdigest())
def fetch(self):
"""
Fetches the asset. First tries to find the asset on the provided
......@@ -114,6 +136,7 @@ class Asset(object):
urls = []
parsed_url = urlparse.urlparse(self.name)
basename = os.path.basename(parsed_url.path)
cache_relative_dir = self._get_relative_dir(parsed_url)
# If name is actually an url, it has to be included in urls list
if parsed_url.scheme:
......@@ -122,7 +145,7 @@ class Asset(object):
# First let's search for the file in each one of the cache locations
for cache_dir in self.cache_dirs:
cache_dir = os.path.expanduser(cache_dir)
asset_file = os.path.join(cache_dir, basename)
asset_file = os.path.join(cache_dir, cache_relative_dir, basename)
# To use a cached file, it must:
# - Exists.
......@@ -142,8 +165,6 @@ class Asset(object):
# A writable cache directory is then needed. The first available
# writable cache directory will be used.
cache_dir = self._get_writable_cache_dir()
asset_file = os.path.join(cache_dir, basename)
# Now we have a writable cache_dir. Let's get the asset.
# Adding the user defined locations to the urls list:
if self.locations is not None:
......@@ -159,6 +180,11 @@ class Asset(object):
else:
raise UnsupportedProtocolError("Unsupported protocol"
": %s" % urlobj.scheme)
cache_relative_dir = self._get_relative_dir(urlobj)
asset_file = os.path.join(cache_dir, cache_relative_dir, basename)
dirname = os.path.dirname(asset_file)
if not os.path.isdir(dirname):
os.makedirs(dirname)
try:
if fetch(urlobj, asset_file):
return asset_file
......
......@@ -27,7 +27,8 @@ class TestAsset(unittest.TestCase):
locations=None,
cache_dirs=[self.cache_dir],
expire=None).fetch()
expected_tarball = os.path.join(self.cache_dir, self.assetname)
expected_tarball = os.path.join(self.cache_dir, 'by_name',
self.assetname)
self.assertEqual(foo_tarball, expected_tarball)
def test_fetch_location(self):
......@@ -37,7 +38,8 @@ class TestAsset(unittest.TestCase):
locations=[self.url],
cache_dirs=[self.cache_dir],
expire=None).fetch()
expected_tarball = os.path.join(self.cache_dir, self.assetname)
expected_tarball = os.path.join(self.cache_dir, 'by_name',
self.assetname)
self.assertEqual(foo_tarball, expected_tarball)
def test_fetch_expire(self):
......@@ -88,7 +90,9 @@ class TestAsset(unittest.TestCase):
self.assertRaises(EnvironmentError, a.fetch)
def test_fetch_lockerror(self):
with FileLock(os.path.join(self.cache_dir, self.assetname)):
dirname = os.path.join(self.cache_dir, 'by_name')
os.makedirs(dirname)
with FileLock(os.path.join(dirname, self.assetname)):
a = asset.Asset(self.url,
asset_hash=self.assethash,
algorithm='sha1',
......@@ -102,6 +106,45 @@ class TestAsset(unittest.TestCase):
None, None, None, [self.cache_dir], None)
self.assertRaises(asset.UnsupportedProtocolError, invalid.fetch)
def test_fetch_different_files(self):
    """
    Checks that when different assets which happen to have the
    same *filename*, are properly stored in the cache directory
    and that the right one will be given to the user, no matter if
    a hash is used or not.
    """
    def make_origin(content):
        # Creates a same-named asset in a fresh origin directory and
        # returns a file:// URL pointing at it.
        origin_dir = tempfile.mkdtemp(dir=self.basedir)
        local_path = os.path.join(origin_dir, self.assetname)
        with open(local_path, 'w') as origin_file:
            origin_file.write(content)
        return 'file://%s' % local_path

    # First, a hashed asset: stored in the cache indexed by name
    asset.Asset(self.url, self.assethash, 'sha1', None,
                [self.cache_dir], None).fetch()

    # Second asset: same filename, no hash, different content
    second_content = 'This is not your first asset content!'
    second_url = make_origin(second_content)
    fetched_path = asset.Asset(second_url, None, None,
                               None, [self.cache_dir], None).fetch()
    with open(fetched_path, 'r') as fetched_file:
        self.assertEqual(fetched_file.read(), second_content)

    # Third asset: again the same filename, another location/content
    third_content = 'Another content!'
    third_url = make_origin(third_content)
    fetched_path = asset.Asset(third_url, None, None,
                               None, [self.cache_dir], None).fetch()
    with open(fetched_path, 'r') as fetched_file:
        self.assertEqual(fetched_file.read(), third_content)
def tearDown(self):
    """Removes the temporary base directory, including the cache dirs in it."""
    shutil.rmtree(self.basedir)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册