Commit c09354a6 authored by Andrea Frittoli, committed by tekton-robot

Rewrite the link-rewriting logic

The previous logic parsed URLs with a regex first and then with the
markdown library. It then used urlopen().read() to validate links.

We now use the markdown library only to extract the list of links,
and then urlparse to deconstruct, analyse, adapt and reconstruct
each link. We no longer attempt to fetch links, which means that
external links are not guaranteed to be working.

Absolute URLs are not changed (they may be external)
Fragments are relative to the page and do not need changes
Path only links should point to a file synced to the website
but sometimes the file may be missing (if it's not in the sync
configuration), so we follow this approach:
- prefix with base_path and check for the file locally
- if not found, prefix with base_url instead

Note that urlparse treats URLs without scheme like path only
URLs, so 'github.com' will be rewritten to base_url/github.com

Signed-off-by: Andrea Frittoli <andrea.frittoli@gmail.com>
Parent d1eca436
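
The note about scheme-less URLs reflects standard urlparse behaviour; a
minimal sketch, standard library only (the URLs are made up):

    from urllib.parse import urlparse, urljoin

    # An absolute URL parses with both a scheme and a netloc.
    parsed = urlparse('https://github.com/tektoncd/website')
    print(parsed.scheme, parsed.netloc, parsed.path)
    # -> https github.com /tektoncd/website

    # A scheme-less string parses as a bare path: scheme and netloc are empty.
    parsed = urlparse('github.com')
    print(parsed.scheme, parsed.netloc, parsed.path)
    # -> (empty) (empty) github.com

    # So a link like 'github.com' falls through to the base_url branch:
    print(urljoin('https://github.com/tektoncd/pipeline/tree/main/docs/',
                  'github.com'))
    # -> https://github.com/tektoncd/pipeline/tree/main/docs/github.com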
@@ -26,16 +26,13 @@ from multiprocessing import Pool
 import os
 import os.path
 from pathlib import Path
-import re
 import shutil
-from urllib.request import urlopen
-from urllib.error import HTTPError
 from urllib.error import URLError
+from urllib.parse import urlparse, urljoin, urlunparse
 import wget
 from absl import app
 from absl import flags
 import markdown
 from jinja2 import Environment
 from jinja2 import FileSystemLoader
 from lxml import etree
@@ -58,76 +55,77 @@ TEMPLATE_DIR = './templates'
 VAULT_DIR = './content/en/vault'
 BUCKET_NAME = 'tekton-website-assets'

-LINKS_RE = r'\[([^\]]*)\]\((?!.*://|/)([^)]*).md(#[^)]*)?\)'
-
 jinja_env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))


-def transform_text(link_prefix, dest_prefix, files, url):
+def transform_text(folder, files, base_path, base_url):
     """ change every link to point to a valid relative file or absolute url """
+    files_in_folder = [f'{folder}/{f}' for f in files.values()]

-    logging.info(f'Running: transforming files in {dest_prefix}')
-    set_lines(dest_prefix, files, url, link_prefix)
-    logging.info(f'Completed: transformed files in {dest_prefix}')
-
-
-def transform_links(line, url, link_prefix):
-    line, is_transformed = sanitize_text(link_prefix, line)
-    links = get_links(line)
-    if is_transformed:
-        for link in links:
-            link = link.get("href")
-            if not(os.path.isfile(link) or is_url(link) or is_ref(link)):
-                line = line.replace(link, github_link(url, link))
-    print(line)
-
-
-def set_lines(dest_prefix, files, url, link_prefix):
-    """ get all the text from the files and replace
-    each line of text with the list lines """
-    dest_files = [f'{dest_prefix}/{f}' for f in files.values()]
-
-    def process_file(dest_file):
-        for line in fileinput.input(dest_file, inplace=1):
+    def process_file(file_in_folder):
+        for line in fileinput.input(file_in_folder, inplace=1):
             # add a line of text to the payload
             # transform_links mutates text and set the lines provided
-            transform_links(line, url, link_prefix)
+            print(transform_line(line, folder, base_path, base_url))

-    with Pool() as pool:
-        pool.imap_unordered(process_file, dest_files)
+    for file_in_folder in files_in_folder:
+        process_file(file_in_folder)


-def github_link(url, link):
-    """ given a github raw link convert it to the main github link """
-    return f'{url.replace("raw", "tree", 1)}/{link}'
-
-
-def sanitize_text(link_prefix, text):
-    """ santize every line of text to exclude relative
-    links and to turn markdown file URL's to html """
-    old_line = text.rstrip()
-    new_line = re.sub(LINKS_RE, r'[\1](' + link_prefix + r'\2\3)', old_line)
-    return new_line, old_line == new_line
-
-
-def is_url(url):
-    """ check if it is a valid url """
-    try:
-        urlopen(url).read()
-    except (HTTPError, URLError):
-        return True
-    except ValueError:
-        return False
-
-    return True
-
-
-def is_ref(url):
+def transform_line(line, base_path, rewrite_path, rewrite_url):
+    """ transform all the links in one line """
+    line = line.rstrip()
+    links = get_links(line)
+    # If there are links in this line we may need to fix them
+    for link in links:
+        # link contains the text and href
+        href = link.get("href")
+        href_mod = transform_link(href, base_path, rewrite_path, rewrite_url)
+        line = line.replace(href, href_mod)
+    return line
+
+
+def transform_link(link, base_path, rewrite_path, rewrite_url):
+    """ Transform hrefs to be valid URLs on the web-site
+
+    Absolute URLs are not changed (they may be external)
+    Fragments are relative to the page and do not need changes
+    Path only links should point to a file synced to the website
+    but sometimes the file may be missing (if it's not in the sync
+    configuration), so we follow this approach:
+    - prefix with base_path and check for the file locally
+    - if not found, prefix with base_url instead
+
+    Note that urlparse treats URLs without scheme like path only
+    URLs, so 'github.com' will be rewritten to base_url/github.com
+    """
+    # ignore empty links
+    if not link:
+        return link
+    # urlparse returns a named tuple
+    parsed = urlparse(link)
+    if is_absolute_url(parsed) or is_fragment(parsed):
+        return link
+    path = os.path.normpath(parsed.path)
+    if os.path.isfile(os.path.join(base_path, path)):
+        filename, ext = os.path.splitext(path)
+        # md files links are in the format .../[md filename]/
+        if ext == '.md':
+            path = filename + '/'
+        return urlunparse(parsed._replace(path="/".join([rewrite_path, path])))
+    # when not found on disk, append to the base_url
+    return urljoin(rewrite_url, link)
+
+
+def is_absolute_url(parsed_url):
+    """ check if it is an absolute url """
+    return all([parsed_url.scheme, parsed_url.netloc])
+
+
+def is_fragment(parsed_url):
     """ determine if the url is an a link """
-    if not url:
-        return False
-    return url[0] == "#"
+    return len(parsed_url.fragment) > 0 and not any(parsed_url[:-1])


 def get_links(md):
@@ -146,7 +144,7 @@ def download_file(src_url, dest_path):
     os.makedirs(os.path.dirname(dest_path), exist_ok=True)
     logging.info(f'Downloading {src_url} to {dest_path}...\n')
     try:
-        wget.download(src_url, out=dest_path)
+        wget.download(src_url, out=dest_path, bar=None)
     except (FileExistsError, URLError):
         raise Exception(f'download failed for {src_url}')
@@ -179,18 +177,20 @@ def download_resources_to_project(yaml_list):
         doc_directory = remove_ending_forward_slash(entry['docDirectory'])

         for index, tag in enumerate(entry['tags']):
-            host_dir = f'{repository}/raw/{tag["name"]}/{doc_directory}'
+            logging.info(f'Syncing {component}@{tag["name"]}')
+            download_url = f'{repository}/raw/{tag["name"]}/{doc_directory}'
+            link_base_url = f'{repository}/tree/{tag["name"]}/{doc_directory}'
             if index == 0:
                 # first links belongs on the home page
-                download_dir = f'/docs/{component}/'
+                download_dir = f'/docs/{component}'.lower()
                 site_dir = f'{CONTENT_DIR}/{component}'
             else:
                 # the other links belong in the other versions a.k.a vault
-                download_dir = f'/vault/{component}-{tag["displayName"]}/'
+                download_dir = f'/vault/{component}-{tag["displayName"]}'
                 site_dir = f'{VAULT_DIR}/{component}-{tag["displayName"]}'

-            download_files(host_dir, site_dir, tag["files"])
-            transform_text(download_dir, site_dir, tag["files"], host_dir)
+            download_files(download_url, site_dir, tag["files"])
+            transform_text(site_dir, tag["files"], download_dir, link_base_url)


 def get_files(path, file_type):
...
@@ -19,17 +19,14 @@ import tempfile
 import shutil
 import ntpath
 import os
-from shutil import copytree
+from urllib.parse import urlparse

-from sync import get_links
-from sync import transform_text
-from sync import is_url
-from sync import is_ref
-from sync import remove_ending_forward_slash
-from sync import get_tags
-from sync import download_files
-from sync import load_config
-from sync import save_config
-from sync import get_files
+from sync import (
+    get_links, transform_text, is_absolute_url,
+    is_fragment, remove_ending_forward_slash,
+    get_tags, download_files, load_config, save_config,
+    get_files, transform_link)


 class TestSync(unittest.TestCase):
@@ -48,22 +45,12 @@ class TestSync(unittest.TestCase):
         return text

     # Tests
-    def test_multiple_get_links(self):
-        """ This will ensure that get links will
-        return a list of multiple md links """
-        expected = ["www.link.com", "./link"]
-        result = get_links("this is a [link](www.link.com) and [link](./link)")
-        for index, link in enumerate(result):
-            self.assertEqual(link.get("href"), expected[index])
-
-    def test_is_ref(self):
+    def test_is_fragment(self):
         """ Verify if a string is a reference. A reference is
         defined as a string where its first character is a hashtag """
-        self.assertEqual(is_ref(""), False)
-        self.assertEqual(is_ref("#footer"), True)
-        self.assertEqual(is_ref("www.google.com"), False)
+        self.assertFalse(is_fragment(urlparse("")))
+        self.assertTrue(is_fragment(urlparse("#footer")))
+        self.assertFalse(is_fragment(urlparse("www.google.com")))

     def test_remove_ending_forward_slash(self):
         """ Remove a slash if it is the last character in a string """
@@ -160,57 +147,87 @@ class TestSync(unittest.TestCase):
         expected = get_links("[link](www.link.com) this is a link")
         self.assertEqual(actual, expected[0].get("href"))

-    def test_is_url(self):
-        """This will return a test to see if the link is a valid url format"""
-        expected = is_url("http://www.fake.g00gl3.com")
-        self.assertEqual(True, expected)
-
-        expected = is_url("http://www.google.com")
-        self.assertEqual(True, expected)
-
-        expected = is_url("http://www.github.com")
-        self.assertEqual(True, expected)
-
-        expected = is_url("./sync.py")
-        self.assertEqual(False, expected)
-
-        expected = is_url("www.github.com")
-        self.assertEqual(False, expected)
+    def test_multiple_get_links(self):
+        """ This will ensure that get links will
+        return a list of multiple md links """
+        expected = ["www.link.com", "./link"]
+        result = get_links("this is a [link](www.link.com) and [link](./link)")
+        for index, link in enumerate(result):
+            self.assertEqual(link.get("href"), expected[index])
+
+    def test_is_absolute_url(self):
+        """This will return a test to see if the link is a valid url format"""
+        self.assertTrue(is_absolute_url(urlparse("http://www.fake.g00gl3.com")))
+        self.assertTrue(is_absolute_url(urlparse("http://www.google.com")))
+        self.assertFalse(is_absolute_url(urlparse("www.google.com")))
+        self.assertFalse(is_absolute_url(urlparse(".sync.py")))
+        self.assertFalse(is_absolute_url(urlparse("#fragment")))
+
+    def test_transform_link(self):
+        base_path = './test-content'
+        rewrite_path = '/docs/foo'
+        rewrite_url = 'https://foo.bar'
+        self.assertEqual(
+            transform_link("", base_path, rewrite_path, rewrite_url), "")
+        self.assertEqual(
+            transform_link("http://test.com", base_path, rewrite_path, rewrite_url),
+            "http://test.com")
+        self.assertEqual(
+            transform_link("test.txt", base_path, rewrite_path, rewrite_url),
+            "/docs/foo/test.txt")
+        self.assertEqual(
+            transform_link("content.md", base_path, rewrite_path, rewrite_url),
+            "/docs/foo/content/")
+        self.assertEqual(
+            transform_link("notthere.txt", base_path, rewrite_path, rewrite_url),
+            "https://foo.bar/notthere.txt")

     def test_transform_text(self):
         """Ensure that transform links will turns links to
         relative github link or existing file name"""
-        expected = """
-        [invalid-relative-link](test.com/./adw/a/d/awdrelative)
-        [valid-relative-link](./sync.py)
-        [valid-absolute-link](www.github.com)
-        [invalid-absolute-link](https://website-invalid-random321.net)
-        [valid-ref-link](#footer)
-        """
-        text = """
-        [invalid-relative-link](./adw/a/d/awdrelative)
-        [valid-relative-link](./sync.py)
-        [valid-absolute-link](www.github.com)
-        [invalid-absolute-link](https://website-invalid-random321.net)
-        [valid-ref-link](#footer)
-        """
+        expected = (
+            "[exists-relative-link](test-content/test.txt)\n"
+            "[exists-relative-link](test-content/content/)\n"
+            "[exists-relative-link-fragment](test-content/test.txt#fragment)\n"
+            "[notfound-relative-link](http://test.com/this/is/not/found)\n"
+            "[notfound-relative-link-fragment](http://test.com/this/is/not/found#fragment)\n"
+            "[invalid-absolute-link](http://test.com/www.github.com)\n"
+            "[valid-absolute-link](https://website-invalid-random321.net) "
+            "[valid-ref-link](#footer)"
+        )
+        text = (
+            "[exists-relative-link](./test.txt)\n"
+            "[exists-relative-link](./content.md)\n"
+            "[exists-relative-link-fragment](test.txt#fragment)\n"
+            "[notfound-relative-link](./this/is/not/found)\n"
+            "[notfound-relative-link-fragment](./this/is/not/found#fragment)\n"
+            "[invalid-absolute-link](www.github.com)\n"
+            "[valid-absolute-link](https://website-invalid-random321.net) "
+            "[valid-ref-link](#footer)"
+        )

-        actual = None
-        tmp_name = None
+        content_file = "content.md"

         # write to file
-        with tempfile.NamedTemporaryFile(dir='/tmp', delete=False) as tmp:
-            tmp_name = tmp.name
-            name = self.path_leaf(tmp_name)
-            tmp.write(text.strip().encode())
-
-        # mutate file
-        transform_text("", "/tmp", {name: name}, "test.com")
-        # read and delete file
-        actual = self.read_and_delete_file(tmp_name)
-
-        self.assertEqual(actual.strip(), expected.strip())
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with open(os.path.join(tmpdirname, content_file), 'w+') as content:
+                content.write(text.strip())
+            with open(os.path.join(tmpdirname, 'test.txt'), 'w+') as test:
+                test.write(text.strip())
+
+            # mutate file
+            transform_text(folder=tmpdirname,
+                           files={content_file: content_file},
+                           base_path="test-content",
+                           base_url="http://test.com")
+
+            # read the result
+            actual = ""
+            with open(os.path.join(tmpdirname, content_file), 'r') as result:
+                actual = result.read()
+
+            self.assertEqual(actual.strip(), expected.strip())


 if __name__ == '__main__':
...
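
For reference, the rewritten transform_link can be exercised directly the
way test_transform_link above does; a minimal sketch, assuming a local
'./test-content' fixture directory containing test.txt and content.md as
in the tests (paths and URLs are the tests' illustrative values):

    from sync import transform_link

    base_path = './test-content'     # local folder holding the synced files
    rewrite_path = '/docs/foo'       # path prefix used on the website
    rewrite_url = 'https://foo.bar'  # fallback base URL for missing files

    # A file present under base_path is re-rooted at rewrite_path.
    print(transform_link('test.txt', base_path, rewrite_path, rewrite_url))
    # -> /docs/foo/test.txt

    # A markdown file additionally becomes a trailing-slash page URL.
    print(transform_link('content.md', base_path, rewrite_path, rewrite_url))
    # -> /docs/foo/content/

    # A file not found locally is joined onto the fallback base URL.
    print(transform_link('notthere.txt', base_path, rewrite_path, rewrite_url))
    # -> https://foo.bar/notthere.txt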