From 2862e1227a592e3c79413019ac15eaa83eace30a Mon Sep 17 00:00:00 2001
From: Amador Pahim <apahim@redhat.com>
Date: Fri, 19 Feb 2016 15:28:00 -0200
Subject: [PATCH] avocado.utils asset fetcher

Look for files in multiple locations, caching them when successfully fetched.
Available as fetch_asset() method in avocado.Test().

Reference: https://trello.com/c/KTeMIx0u
Signed-off-by: Amador Pahim <apahim@redhat.com>
---
 avocado/core/test.py               |  21 ++++
 avocado/utils/asset.py             | 180 +++++++++++++++++++++++++++++
 docs/source/WritingTests.rst       |  92 +++++++++++++++
 etc/avocado/avocado.conf           |   4 +
 selftests/unit/test_utils_asset.py |  61 ++++++++++
 5 files changed, 358 insertions(+)
 create mode 100644 avocado/utils/asset.py
 create mode 100644 selftests/unit/test_utils_asset.py

diff --git a/avocado/core/test.py b/avocado/core/test.py
index fa50d025..45a3d27a 100644
--- a/avocado/core/test.py
+++ b/avocado/core/test.py
@@ -30,12 +30,14 @@ from . import data_dir
 from . import exceptions
 from . import multiplexer
 from . import sysinfo
+from ..utils import asset
 from ..utils import astring
 from ..utils import data_structures
 from ..utils import genio
 from ..utils import path as utils_path
 from ..utils import process
 from ..utils import stacktrace
+from .settings import settings
 from .version import VERSION
 
 if sys.version_info[:2] == (2, 6):
@@ -579,6 +581,25 @@ class Test(unittest.TestCase):
         """
         raise exceptions.TestSkipError(message)
 
+    def fetch_asset(self, name, asset_hash=None, algorithm='sha1',
+                    locations=None):
+        """
+        Method to call the utils.asset in order to fetch an asset file
+        supporting hash check, caching and multiple locations.
+
+        :param name: the asset filename or URL
+        :param asset_hash: asset hash (optional)
+        :param algorithm: hash algorithm (optional, defaults to sha1)
+        :param locations: list of URLs from where the asset can be
+                          fetched (optional)
+        :returns: asset file local path
+        """
+        cache_dirs = settings.get_value('datadir.paths', 'cache_dirs',
+                                        key_type=list, default=[])
+        cache_dirs.append(os.path.join(data_dir.get_data_dir(), 'cache'))
+        return asset.Asset(name, asset_hash, algorithm, locations,
+                           cache_dirs).fetch()
+
 
 class SimpleTest(Test):
 
diff --git a/avocado/utils/asset.py b/avocado/utils/asset.py
new file mode 100644
index 00000000..ec615ea5
--- /dev/null
+++ b/avocado/utils/asset.py
@@ -0,0 +1,180 @@
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See LICENSE for more details.
+#
+# Copyright: Red Hat Inc. 2016
+# Author: Amador Pahim <apahim@redhat.com>
+
+"""
+Asset fetcher from multiple locations
+"""
+
+import logging
+import os
+import re
+import urlparse
+
+from . import crypto
+from . import path as utils_path
+from .download import url_download
+
+
+log = logging.getLogger('avocado.test')
+
+
+class Asset(object):
+    """
+    Try to fetch/verify an asset file from multiple locations.
+    """
+
+    def __init__(self, name, asset_hash, algorithm, locations, cache_dirs):
+        """
+        Initialize the Asset() without fetching anything. Call fetch() to
+        locate (or download) the asset file and get its local path.
+
+        :param name: the asset filename. url is also supported
+        :param asset_hash: asset hash
+        :param algorithm: hash algorithm
+        :param locations: list of locations to fetch the asset from
+        :param cache_dirs: list of cache directories
+        """
+        self.name = name
+        self.asset_hash = asset_hash
+        self.algorithm = algorithm
+        self.locations = locations
+        self.cache_dirs = cache_dirs
+        self.nameobj = urlparse.urlparse(self.name)
+        self.basename = os.path.basename(self.nameobj.path)
+
+    def fetch(self):  # returns local asset path, raises EnvironmentError
+        urls = []
+
+        # If name is actually an url, it has to be included in urls list
+        if self.nameobj.scheme:
+            urls.append(self.nameobj.geturl())
+
+        # First let's look for the file in all cache locations
+        for cache_dir in self.cache_dirs:
+            cache_dir = os.path.expanduser(cache_dir)
+            self.asset_file = os.path.join(cache_dir, self.basename)
+            if self._check_file(self.asset_file, self.asset_hash, self.algorithm):
+                return self.asset_file
+
+        # If we get to this point, file is not in any cache directory
+        # and we have to download it from a location. A rw cache
+        # directory is then needed. The first rw cache directory will be
+        # used.
+        log.debug("Looking for a writable cache dir.")
+        for cache_dir in self.cache_dirs:
+            cache_dir = os.path.expanduser(cache_dir)
+            self.asset_file = os.path.join(cache_dir, self.basename)
+            if not utils_path.usable_rw_dir(cache_dir):
+                log.debug("Read-only cache dir '%s'. Skiping." %
+                          cache_dir)
+                continue
+            log.debug("Using %s as cache dir." % cache_dir)
+
+            # Adding the user defined locations to the urls list
+            if self.locations is not None:
+                for item in self.locations:
+                    urls.append(item)
+
+            for url in urls:
+                urlobj = urlparse.urlparse(url)
+                if urlobj.scheme == 'http' or urlobj.scheme == 'https':
+                    log.debug('Downloading from %s.' % url)
+                    try:
+                        url_download(url, self.asset_file)
+                    except Exception as e:
+                        log.error(e)
+                        continue
+                    if self._check_file(self.asset_file, self.asset_hash,
+                                        self.algorithm):
+                        return self.asset_file
+
+                elif urlobj.scheme == 'ftp':
+                    log.debug('Downloading from %s.' % url)
+                    try:
+                        url_download(url, self.asset_file)
+                    except Exception as e:
+                        log.error(e)
+                        continue
+                    if self._check_file(self.asset_file, self.asset_hash,
+                                        self.algorithm):
+                        return self.asset_file
+
+                elif urlobj.scheme == 'file':
+                    if os.path.isdir(urlobj.path):
+                        path = os.path.join(urlobj.path, self.name)
+                    else:
+                        path = urlobj.path
+                    log.debug('Looking for file on %s.' % path)
+                    if self._check_file(path):
+                        os.symlink(path, self.asset_file)
+                        log.debug('Symlink created %s -> %s.' %
+                                  (self.asset_file, path))
+                    else:
+                        continue
+                    if self._check_file(self.asset_file, self.asset_hash,
+                                        self.algorithm):
+                        return self.asset_file
+
+            raise EnvironmentError("Failed to fetch %s." % self.basename)
+        raise EnvironmentError("Can't find a writable cache dir.")
+
+    @staticmethod
+    def _check_file(path, filehash=None, algorithm=None):
+        """
+        Check that the file exists and, when a hash is provided, verify it.
+        A '<path>.<algorithm>' hashfile is consulted first; only when no
+        entry for the file is found there do we compute the hash, which is
+        then written to that hashfile for further usage.
+        """
+        if not os.path.isfile(path):
+            log.debug('Asset %s not found.' % path)
+            return False
+
+        if filehash is None:
+            return True
+
+        basename = os.path.basename(path)
+        discovered_hash = None
+        # Try to find a hashfile for the asset file
+        hashfile = '%s.%s' % (path, algorithm)
+        if os.path.isfile(hashfile):
+            with open(hashfile, 'r') as f:
+                for line in f.readlines():
+                    # md5 digests are 32 hex chars long, sha512 ones are 128;
+                    # other supported algorithms fall between those.
+                    pattern = '[a-f0-9]{32,128} %s' % basename
+                    if re.match(pattern, line):
+                        log.debug('Hashfile found for %s.' % path)
+                        discovered_hash = line.split()[0]
+                        break
+
+        # If no hashfile, let's calculate the hash ourselves
+        if discovered_hash is None:
+            log.debug('No hashfile found for %s. Computing hash.' %
+                      path)
+            discovered_hash = crypto.hash_file(path, algorithm=algorithm)
+
+            # Creating the hashfile for further usage.
+            log.debug('Creating hashfile %s.' % hashfile)
+            with open(hashfile, 'w') as f:
+                content = '%s %s\n' % (discovered_hash, basename)
+                f.write(content)
+
+        if filehash == discovered_hash:
+            log.debug('Asset %s verified.' % path)
+            return True
+        else:
+            log.error('Asset %s corrupted (hash expected:%s, hash found:%s).' %
+                      (path, filehash, discovered_hash))
+            return False
diff --git a/docs/source/WritingTests.rst b/docs/source/WritingTests.rst
index 17c747aa..082b2a5c 100644
--- a/docs/source/WritingTests.rst
+++ b/docs/source/WritingTests.rst
@@ -345,6 +345,98 @@ In this example, the ``test`` method just gets into the base directory of
 the compiled suite  and executes the ``./synctest`` command, with appropriate
 parameters, using :func:`avocado.utils.process.system`.
 
+Fetching asset files
+====================
+To run third party test suites as mentioned above, or for any other purpose,
+we offer an asset fetcher as a method of Avocado Test class.
+The asset method looks for a list of directories in the ``cache_dirs`` key,
+inside the ``[datadir.paths]`` section from the configuration files. Read-only
+directories are also supported. When the asset file is not present in any of
+the provided directories, we will try to download the file from the provided
+locations, copying it to the first writable cache directory. Example::
+
+    cache_dirs = ['/usr/local/src/', '~/avocado/cache']
+
+In the example above, ``/usr/local/src/`` is a read-only directory. In that
+case, when we need to fetch the asset from the locations, it will be copied to
+the ``~/avocado/cache`` directory.
+
+If you don't provide a ``cache_dirs``, we will use the test temporary directory
+as the cache to put the fetched files. That directory is expected to be dropped
+by the end of the test. So, to take advantage of the cache feature, you have
+to configure the ``cache_dirs`` on your system.
+
+* Use case 1: no ``cache_dirs`` key in config files, only the asset name
+  provided in the full url format::
+
+    ...
+        def setUp(self):
+            stress = 'http://people.seas.harvard.edu/~apw/stress/stress-1.0.4.tar.gz'
+            tarball = self.fetch_asset(stress)
+            archive.extract(tarball, self.srcdir)
+    ...
+
+  In this case, ``fetch_asset()`` will download the file from the url provided,
+  copying it to the test temporary workdir. The ``tarball`` variable will
+  contain, for example, ``/var/tmp/avocado_BZXo2B/stress.py_Stress.test/cache/stress-1.0.4.tar.gz``.
+
+* Use case 2: Read-only cache directory provided. ``cache_dirs = ['/mnt/files']``::
+
+    ...
+        def setUp(self):
+            stress = 'http://people.seas.harvard.edu/~apw/stress/stress-1.0.4.tar.gz'
+            tarball = self.fetch_asset(stress)
+            archive.extract(tarball, self.srcdir)
+    ...
+
+  In this case, we try to find ``stress-1.0.4.tar.gz`` file in ``/mnt/files``
+  directory. If it's not there, since ``/mnt/files`` is read-only,  we will try
+  to download the asset file to the test temporary workdir.
+
+* Use case 3: Writable cache directory provided, along with a list of
+  locations. ``cache_dirs = ['~/avocado/cache']``::
+
+    ...
+        def setUp(self):
+            st_name = 'stress-1.0.4.tar.gz'
+            st_hash = 'e1533bc704928ba6e26a362452e6db8fd58b1f0b'
+            st_loc = ['http://people.seas.harvard.edu/~apw/stress/stress-1.0.4.tar.gz',
+                      'ftp://foo.bar/stress-1.0.4.tar.gz']
+            tarball = self.fetch_asset(st_name, asset_hash=st_hash,
+                                       locations=st_loc)
+            archive.extract(tarball, self.srcdir)
+    ...
+
+  In this case, we try to download ``stress-1.0.4.tar.gz`` from the provided
+  locations list (since it's not already in ``~/avocado/cache``). The hash was
+  also provided, so we will verify the hash. To do so, we first look for a
+  hashfile named ``stress-1.0.4.tar.gz.sha1`` in the same directory. If the
+  hashfile is not present we compute the hash and create the hashfile for
+  further usage.
+
+  The resulting ``tarball`` variable content will be ``~/avocado/cache/stress-1.0.4.tar.gz``.
+  An exception will take place if we fail to download or to verify the file.
+
+
+Detailing the ``fetch_asset()`` attributes:
+
+* ``name:`` The name used to name the fetched file. It can also contain a full
+  URL, which will be used as the first location to try (after searching the
+  cache directories).
+* ``asset_hash:`` (optional) The expected file hash. If missing, we skip the
+  check. If provided, before computing the hash, we look for a hashfile to
+  verify the asset. If the hashfile is not present, we compute the hash and
+  create the hashfile in the same cache directory for further usage.
+* ``algorithm:`` (optional) Provided hash algorithm format. Defaults to sha1.
+* ``locations:`` (optional) List of locations that will be used to try to fetch
+  the file from. The supported schemes are ``http://``, ``https://``,
+  ``ftp://`` and ``file://``. You're required to provide the full URL to the
+  file, including the file name. The first success will skip the next
+  locations. Notice that for ``file://`` we just create a symbolic link in the
+  cache directory, pointing to the file's original location.
+
+The expected ``return`` is the asset file path or an exception.
+
 Test Output Check and Output Record Mode
 ========================================
 
diff --git a/etc/avocado/avocado.conf b/etc/avocado/avocado.conf
index 4a5805e4..f3c2f131 100644
--- a/etc/avocado/avocado.conf
+++ b/etc/avocado/avocado.conf
@@ -7,6 +7,10 @@ test_dir = /usr/share/avocado/tests
 data_dir = /usr/share/avocado/data
 # You may override the specific job results directory with logs_dir
 logs_dir = ~/avocado/job-results
+# You can set a list of cache directories to be used by the avocado test
+# fetch_asset() with 'cache_dirs'. Read-only cache directories are also
+# supported.
+# cache_dirs = ['~/avocado/cache', '/mnt/cache']
 
 [sysinfo.collect]
 # Whether to collect system information during avocado jobs
diff --git a/selftests/unit/test_utils_asset.py b/selftests/unit/test_utils_asset.py
new file mode 100644
index 00000000..6dea1d35
--- /dev/null
+++ b/selftests/unit/test_utils_asset.py
@@ -0,0 +1,61 @@
+import os
+import shutil
+import tempfile
+import unittest
+
+from avocado.utils import asset
+
+
+class TestAsset(unittest.TestCase):
+
+    def setUp(self):  # temp tree: one source asset, one empty cache dir
+        self.basedir = tempfile.mkdtemp(prefix='avocado_' + __name__)
+        self.assetdir = tempfile.mkdtemp(dir=self.basedir)
+        self.assetname = 'foo.tgz'
+        self.assethash = '3a033a8938c1af56eeb793669db83bcbd0c17ea5'
+        self.localpath = os.path.join(self.assetdir, self.assetname)
+        with open(self.localpath, 'w') as f:
+            f.write('Test!')
+        self.url = 'file://%s' % self.localpath
+        self.cache_dir = tempfile.mkdtemp(dir=self.basedir)
+
+    def testFetch_urlname(self):  # asset name given as a file:// URL
+        foo_tarball = asset.Asset(self.url,
+                                  asset_hash=self.assethash,
+                                  algorithm='sha1',
+                                  locations=None,
+                                  cache_dirs=[self.cache_dir]).fetch()
+        expected_tarball = os.path.join(self.cache_dir, self.assetname)
+        self.assertEqual(foo_tarball, expected_tarball)
+        hashfile = '.'.join([expected_tarball, 'sha1'])
+        self.assertTrue(os.path.isfile(hashfile))
+        expected_content = '%s %s\n' % (self.assethash, self.assetname)
+        with open(hashfile, 'r') as f:
+            content = f.read()
+        self.assertEqual(content, expected_content)
+
+    def testFetch_location(self):  # plain name plus a file:// location
+        foo_tarball = asset.Asset(self.assetname,
+                                  asset_hash=self.assethash,
+                                  algorithm='sha1',
+                                  locations=[self.url],
+                                  cache_dirs=[self.cache_dir]).fetch()
+        expected_tarball = os.path.join(self.cache_dir, self.assetname)
+        self.assertEqual(foo_tarball, expected_tarball)
+        hashfile = '.'.join([expected_tarball, 'sha1'])
+        self.assertTrue(os.path.isfile(hashfile))
+        expected_content = '%s %s\n' % (self.assethash, self.assetname)
+        with open(hashfile, 'r') as f:
+            content = f.read()
+        self.assertEqual(content, expected_content)
+
+    def testException(self):  # no cached copy and no locations to try
+        a = asset.Asset(name='bar.tgz', asset_hash=None, algorithm=None,
+                        locations=None, cache_dirs=[self.cache_dir])
+        self.assertRaises(EnvironmentError, a.fetch)
+
+    def tearDown(self):  # drop the whole temp tree
+        shutil.rmtree(self.basedir)
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab