From c09354a6df29a779285426881cebbefafed7767b Mon Sep 17 00:00:00 2001
From: Andrea Frittoli <andrea.frittoli@gmail.com>
Date: Fri, 23 Oct 2020 13:07:07 +0100
Subject: [PATCH] Rewrite the link re-writing logic

The previous logic parses URLs with a regex first and then using
the markdown library. It then uses urlopen().read() to validate
links.

We use now the markdown library only to extract the list of links,
and then urlparse to deconstruct, analyse, adapt and reconstract
the link. We do not attempt to fetch links anymore, which means
that external links are not guaranteed to be working.

Absolute URLs are not changed (they may be external)
Fragments are relative to the page and do not need changes
Path only links should point to a file synced to the website
but sometimes the file may be missing (if it's not in the sync
configuration), so we follow this approach:
- prefix with base_path and check for the file locally
- if not found, prefix with base_url instead

Note that urlparse treats URLs without scheme like path only
URLs, so 'github.com' will be rewritten to base_url/github.com

Signed-off-by: Andrea Frittoli <andrea.frittoli@gmail.com>
---
 sync/sync.py                 | 138 ++++++++++++++++----------------
 sync/test-content/content.md |   0
 sync/test-content/test.txt   |   0
 sync/test_sync.py            | 147 +++++++++++++++++++----------------
 4 files changed, 151 insertions(+), 134 deletions(-)
 create mode 100644 sync/test-content/content.md
 create mode 100644 sync/test-content/test.txt

diff --git a/sync/sync.py b/sync/sync.py
index 4faac29..a22628f 100755
--- a/sync/sync.py
+++ b/sync/sync.py
@@ -26,16 +26,13 @@ from multiprocessing import Pool
 import os
 import os.path
 from pathlib import Path
-import re
 import shutil
-from urllib.request import urlopen
-from urllib.error import HTTPError
 from urllib.error import URLError
+from urllib.parse import urlparse, urljoin, urlunparse
 import wget
 
 from absl import app
 from absl import flags
-import markdown
 from jinja2 import Environment
 from jinja2 import FileSystemLoader
 from lxml import etree
@@ -58,76 +55,77 @@ TEMPLATE_DIR = './templates'
 VAULT_DIR = './content/en/vault'
 BUCKET_NAME = 'tekton-website-assets'
 
-LINKS_RE = r'\[([^\]]*)\]\((?!.*://|/)([^)]*).md(#[^)]*)?\)'
-
 jinja_env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))
 
 
-def transform_text(link_prefix, dest_prefix, files, url):
+def transform_text(folder, files, base_path, base_url):
     """ change every link to point to a valid relative file or absolute url """
+    files_in_folder = [f'{folder}/{f}' for f in files.values()]
 
-    logging.info(f'Running: transforming files in {dest_prefix}')
-    set_lines(dest_prefix, files, url, link_prefix)
-    logging.info(f'Completed: transformed files in {dest_prefix}')
-
-
-def transform_links(line, url, link_prefix):
-    line, is_transformed = sanitize_text(link_prefix, line)
-    links = get_links(line)
-    if is_transformed:
-        for link in links:
-            link = link.get("href")
-            if not(os.path.isfile(link) or is_url(link) or is_ref(link)):
-                line = line.replace(link, github_link(url, link))
-    print(line)
-
-
-def set_lines(dest_prefix, files, url, link_prefix):
-    """ get all the text from the files and replace
-    each line of text with the list lines """
-    dest_files = [f'{dest_prefix}/{f}' for f in files.values()]
-
-    def process_file(dest_file):
-        for line in fileinput.input(dest_file, inplace=1):
+    def process_file(file_in_folder):
+        for line in fileinput.input(file_in_folder, inplace=1):
             # add a line of text to the payload
             # transform_links mutates text and set the lines provided
-            transform_links(line, url, link_prefix)
-
-    with Pool() as pool:
-        pool.imap_unordered(process_file, dest_files)
+            print(transform_line(line, folder, base_path, base_url))
 
+    for file_in_folder in files_in_folder:
+        process_file(file_in_folder)
 
-def github_link(url, link):
-    """ given a github raw link convert it to the main github link """
-    return f'{url.replace("raw", "tree", 1)}/{link}'
 
-
-def sanitize_text(link_prefix, text):
-    """ santize every line of text to exclude relative
-    links and to turn markdown file URL's to html """
-    old_line = text.rstrip()
-    new_line = re.sub(LINKS_RE, r'[\1](' + link_prefix + r'\2\3)', old_line)
-    return new_line, old_line == new_line
-
-
-def is_url(url):
-    """ check if it is a valid url """
-    try:
-        urlopen(url).read()
-    except (HTTPError, URLError):
-        return True
-    except ValueError:
-        return False
-
-    return True
-
-
-def is_ref(url):
+def transform_line(line, base_path, rewrite_path, rewrite_url):
+    """ transform all the links in one line """
+    line = line.rstrip()
+    links = get_links(line)
+    # If there are links in this line we may need to fix them
+    for link in links:
+        # link contains the text and href
+        href =link.get("href")
+        href_mod = transform_link(href, base_path, rewrite_path, rewrite_url)
+        line = line.replace(href, href_mod)
+    return line
+
+
+def transform_link(link, base_path, rewrite_path, rewrite_url):
+    """ Transform hrefs to be valid URLs on the web-site
+
+    Absolute URLs are not changed (they may be external)
+    Fragments are relative to the page and do not need changes
+    Path only links should point to a file synced to the website
+    but sometimes the file may be missing (if it's not in the sync
+    configuration), so we follow this approach:
+    - prefix with base_path and check for the file locally
+    - if not found, prefix with base_url instead
+
+    Note that urlparse treats URLs without scheme like path only
+    URLs, so 'github.com' will be rewritten to base_url/github.com
+    """
+
+    # ignore empty links
+    if not link:
+        return link
+    # urlparse returns a named tuple
+    parsed = urlparse(link)
+    if is_absolute_url(parsed) or is_fragment(parsed):
+        return link
+    path = os.path.normpath(parsed.path)
+    if os.path.isfile(os.path.join(base_path, path)):
+        filename, ext = os.path.splitext(path)
+        # md files links are in the format .../[md filename]/
+        if ext == '.md':
+            path = filename + '/'
+        return urlunparse(parsed._replace(path="/".join([rewrite_path, path])))
+    # when not found on disk, append to the base_url
+    return urljoin(rewrite_url, link)
+
+
+def is_absolute_url(parsed_url):
+    """ check if it is an absolute url """
+    return all([parsed_url.scheme, parsed_url.netloc])
+
+
+def is_fragment(parsed_url):
     """ determine if the url is an a link """
-    if not url:
-        return False
-
-    return url[0] == "#"
+    return len(parsed_url.fragment) > 0 and not any(parsed_url[:-1])
 
 
 def get_links(md):
@@ -146,7 +144,7 @@ def download_file(src_url, dest_path):
     os.makedirs(os.path.dirname(dest_path), exist_ok=True)
     logging.info(f'Downloading {src_url} to {dest_path}...\n')
     try:
-        wget.download(src_url, out=dest_path)
+        wget.download(src_url, out=dest_path, bar=None)
     except (FileExistsError, URLError):
         raise Exception(f'download failed for {src_url}')
 
@@ -179,18 +177,20 @@ def download_resources_to_project(yaml_list):
         doc_directory = remove_ending_forward_slash(entry['docDirectory'])
 
         for index, tag in enumerate(entry['tags']):
-            host_dir = f'{repository}/raw/{tag["name"]}/{doc_directory}'
+            logging.info(f'Syncing {component}@{tag["name"]}')
+            download_url = f'{repository}/raw/{tag["name"]}/{doc_directory}'
+            link_base_url = f'{repository}/tree/{tag["name"]}/{doc_directory}'
             if index == 0:
                 # first links belongs on the home page
-                download_dir = f'/docs/{component}/'
+                download_dir = f'/docs/{component}'.lower()
                 site_dir = f'{CONTENT_DIR}/{component}'
             else:
                 # the other links belong in the other versions a.k.a vault
-                download_dir = f'/vault/{component}-{tag["displayName"]}/'
+                download_dir = f'/vault/{component}-{tag["displayName"]}'
                 site_dir = f'{VAULT_DIR}/{component}-{tag["displayName"]}'
 
-            download_files(host_dir, site_dir, tag["files"])
-            transform_text(download_dir, site_dir, tag["files"], host_dir)
+            download_files(download_url, site_dir, tag["files"])
+            transform_text(site_dir, tag["files"], download_dir, link_base_url)
 
 
 def get_files(path, file_type):
diff --git a/sync/test-content/content.md b/sync/test-content/content.md
new file mode 100644
index 0000000..e69de29
diff --git a/sync/test-content/test.txt b/sync/test-content/test.txt
new file mode 100644
index 0000000..e69de29
diff --git a/sync/test_sync.py b/sync/test_sync.py
index f5601c0..ed45071 100644
--- a/sync/test_sync.py
+++ b/sync/test_sync.py
@@ -19,17 +19,14 @@ import tempfile
 import shutil
 import ntpath
 import os
+from shutil import copytree
+from urllib.parse import urlparse
 
-from sync import get_links
-from sync import transform_text
-from sync import is_url
-from sync import is_ref
-from sync import remove_ending_forward_slash
-from sync import get_tags
-from sync import download_files
-from sync import load_config
-from sync import save_config
-from sync import get_files
+from sync import (
+    get_links, transform_text, is_absolute_url,
+    is_fragment, remove_ending_forward_slash,
+    get_tags, download_files, load_config, save_config,
+    get_files, transform_link)
 
 
 class TestSync(unittest.TestCase):
@@ -48,22 +45,12 @@ class TestSync(unittest.TestCase):
         return text
 
     # Tests
-
-    def test_multiple_get_links(self):
-        """ This will ensure that get links will
-        return a list of multiple md links """
-        expected = ["www.link.com", "./link"]
-        result = get_links("this is a [link](www.link.com) and [link](./link)")
-
-        for index, link in enumerate(result):
-            self.assertEqual(link.get("href"), expected[index])
-
-    def test_is_ref(self):
+    def test_is_fragment(self):
         """ Verify if a string is a reference. A reference is
         defined as  a string where its first character is a hashtag """
-        self.assertEqual(is_ref(""), False)
-        self.assertEqual(is_ref("#footer"), True)
-        self.assertEqual(is_ref("www.google.com"), False)
+        self.assertFalse(is_fragment(urlparse("")))
+        self.assertTrue(is_fragment(urlparse("#footer")))
+        self.assertFalse(is_fragment(urlparse("www.google.com")))
 
     def test_remove_ending_forward_slash(self):
         """ Remove a slash if it is the last character in a string """
@@ -160,57 +147,87 @@ class TestSync(unittest.TestCase):
         expected = get_links("[link](www.link.com) this is a link")
         self.assertEqual(actual, expected[0].get("href"))
 
-    def test_is_url(self):
-        """This will return a test to see if the link is a valid url format"""
-        expected = is_url("http://www.fake.g00gl3.com")
-        self.assertEqual(True, expected)
-
-        expected = is_url("http://www.google.com")
-        self.assertEqual(True, expected)
-
-        expected = is_url("http://www.github.com")
-        self.assertEqual(True, expected)
+    def test_multiple_get_links(self):
+        """ This will ensure that get links will
+        return a list of multiple md links """
+        expected = ["www.link.com", "./link"]
+        result = get_links("this is a [link](www.link.com) and [link](./link)")
 
-        expected = is_url("./sync.py")
-        self.assertEqual(False, expected)
+        for index, link in enumerate(result):
+            self.assertEqual(link.get("href"), expected[index])
 
-        expected = is_url("www.github.com")
-        self.assertEqual(False, expected)
+    def test_is_absolute_url(self):
+        """This will return a test to see if the link is a valid url format"""
+        self.assertTrue(is_absolute_url(urlparse("http://www.fake.g00gl3.com")))
+        self.assertTrue(is_absolute_url(urlparse("http://www.google.com")))
+        self.assertFalse(is_absolute_url(urlparse("www.google.com")))
+        self.assertFalse(is_absolute_url(urlparse(".sync.py")))
+        self.assertFalse(is_absolute_url(urlparse("#fragment")))
+
+    def test_transform_link(self):
+        base_path = './test-content'
+        rewrite_path = '/docs/foo'
+        rewrite_url = 'https://foo.bar'
+        self.assertEqual(
+            transform_link("", base_path, rewrite_path, rewrite_url), "")
+        self.assertEqual(
+            transform_link("http://test.com", base_path, rewrite_path, rewrite_url),
+            "http://test.com")
+        self.assertEqual(
+            transform_link("test.txt", base_path, rewrite_path, rewrite_url),
+            "/docs/foo/test.txt")
+        self.assertEqual(
+            transform_link("content.md", base_path, rewrite_path, rewrite_url),
+            "/docs/foo/content/")
+        self.assertEqual(
+            transform_link("notthere.txt", base_path, rewrite_path, rewrite_url),
+            "https://foo.bar/notthere.txt")
 
     def test_transform_text(self):
         """Ensure that transform links will turns links to
         relative github link or existing file name"""
 
-        expected = """
-        [invalid-relative-link](test.com/./adw/a/d/awdrelative)
-        [valid-relative-link](./sync.py)
-        [valid-absolute-link](www.github.com)
-        [invalid-absolute-link](https://website-invalid-random321.net)
-        [valid-ref-link](#footer)
-        """
-        text = """
-        [invalid-relative-link](./adw/a/d/awdrelative)
-        [valid-relative-link](./sync.py)
-        [valid-absolute-link](www.github.com)
-        [invalid-absolute-link](https://website-invalid-random321.net)
-        [valid-ref-link](#footer)
-        """
+        expected = (
+            "[exists-relative-link](test-content/test.txt)\n"
+            "[exists-relative-link](test-content/content/)\n"
+            "[exists-relative-link-fragment](test-content/test.txt#fragment)\n"
+            "[notfound-relative-link](http://test.com/this/is/not/found)\n"
+            "[notfound-relative-link-fragment](http://test.com/this/is/not/found#fragment)\n"
+            "[invalid-absolute-link](http://test.com/www.github.com)\n"
+            "[valid-absolute-link](https://website-invalid-random321.net) "
+            "[valid-ref-link](#footer)"
+        )
+        text = (
+            "[exists-relative-link](./test.txt)\n"
+            "[exists-relative-link](./content.md)\n"
+            "[exists-relative-link-fragment](test.txt#fragment)\n"
+            "[notfound-relative-link](./this/is/not/found)\n"
+            "[notfound-relative-link-fragment](./this/is/not/found#fragment)\n"
+            "[invalid-absolute-link](www.github.com)\n"
+            "[valid-absolute-link](https://website-invalid-random321.net) "
+            "[valid-ref-link](#footer)"
+        )
 
-        actual = None
-        tmp_name = None
+        content_file = "content.md"
 
         # write to file
-        with tempfile.NamedTemporaryFile(dir='/tmp', delete=False) as tmp:
-            tmp_name = tmp.name
-            name = self.path_leaf(tmp_name)
-            tmp.write(text.strip().encode())
-
-        # mutate file
-        transform_text("", "/tmp", {name: name}, "test.com")
-        # read and delete file
-        actual = self.read_and_delete_file(tmp_name)
-
-        self.assertEqual(actual.strip(), expected.strip())
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with open(os.path.join(tmpdirname, content_file), 'w+') as content:
+                content.write(text.strip())
+            with open(os.path.join(tmpdirname, 'test.txt'), 'w+') as test:
+                test.write(text.strip())
+
+            # mutate file
+            transform_text(folder=tmpdirname,
+                           files={content_file: content_file},
+                           base_path="test-content",
+                           base_url="http://test.com")
+            # read the result
+            actual = ""
+            with open(os.path.join(tmpdirname, content_file), 'r') as result:
+                actual = result.read()
+
+            self.assertEqual(actual.strip(), expected.strip())
 
 
 if __name__ == '__main__':
-- 
GitLab