From c09354a6df29a779285426881cebbefafed7767b Mon Sep 17 00:00:00 2001 From: Andrea Frittoli Date: Fri, 23 Oct 2020 13:07:07 +0100 Subject: [PATCH] Rewrite the link re-writing logic The previous logic parses URLs with a regex first and then using the markdown library. It then uses urlopen().read() to validate links. We use now the markdown library only to extract the list of links, and then urlparse to deconstruct, analyse, adapt and reconstract the link. We do not attempt to fetch links anymore, which means that external links are not guaranteed to be working. Absolute URLs are not changed (they may be external) Fragments are relative to the page and do not need changes Path only links should point to a file synced to the website but sometimes the file may be missing (if it's not in the sync configuration), so we follow this approach: - prefix with base_path and check for the file locally - if not found, prefix with base_url instead Note that urlparse treats URLs without scheme like path only URLs, so 'github.com' will be rewritten to base_url/github.com Signed-off-by: Andrea Frittoli --- sync/sync.py | 138 ++++++++++++++++---------------- sync/test-content/content.md | 0 sync/test-content/test.txt | 0 sync/test_sync.py | 147 +++++++++++++++++++---------------- 4 files changed, 151 insertions(+), 134 deletions(-) create mode 100644 sync/test-content/content.md create mode 100644 sync/test-content/test.txt diff --git a/sync/sync.py b/sync/sync.py index 4faac29..a22628f 100755 --- a/sync/sync.py +++ b/sync/sync.py @@ -26,16 +26,13 @@ from multiprocessing import Pool import os import os.path from pathlib import Path -import re import shutil -from urllib.request import urlopen -from urllib.error import HTTPError from urllib.error import URLError +from urllib.parse import urlparse, urljoin, urlunparse import wget from absl import app from absl import flags -import markdown from jinja2 import Environment from jinja2 import FileSystemLoader from lxml import etree @@ -58,76 +55,77 @@ TEMPLATE_DIR = './templates' VAULT_DIR = './content/en/vault' BUCKET_NAME = 'tekton-website-assets' -LINKS_RE = r'\[([^\]]*)\]\((?!.*://|/)([^)]*).md(#[^)]*)?\)' - jinja_env = Environment(loader=FileSystemLoader(TEMPLATE_DIR)) -def transform_text(link_prefix, dest_prefix, files, url): +def transform_text(folder, files, base_path, base_url): """ change every link to point to a valid relative file or absolute url """ + files_in_folder = [f'{folder}/{f}' for f in files.values()] - logging.info(f'Running: transforming files in {dest_prefix}') - set_lines(dest_prefix, files, url, link_prefix) - logging.info(f'Completed: transformed files in {dest_prefix}') - - -def transform_links(line, url, link_prefix): - line, is_transformed = sanitize_text(link_prefix, line) - links = get_links(line) - if is_transformed: - for link in links: - link = link.get("href") - if not(os.path.isfile(link) or is_url(link) or is_ref(link)): - line = line.replace(link, github_link(url, link)) - print(line) - - -def set_lines(dest_prefix, files, url, link_prefix): - """ get all the text from the files and replace - each line of text with the list lines """ - dest_files = [f'{dest_prefix}/{f}' for f in files.values()] - - def process_file(dest_file): - for line in fileinput.input(dest_file, inplace=1): + def process_file(file_in_folder): + for line in fileinput.input(file_in_folder, inplace=1): # add a line of text to the payload # transform_links mutates text and set the lines provided - transform_links(line, url, link_prefix) - - with Pool() as pool: - pool.imap_unordered(process_file, dest_files) + print(transform_line(line, folder, base_path, base_url)) + for file_in_folder in files_in_folder: + process_file(file_in_folder) -def github_link(url, link): - """ given a github raw link convert it to the main github link """ - return f'{url.replace("raw", "tree", 1)}/{link}' - -def sanitize_text(link_prefix, text): - """ santize every line of text to exclude relative - links and to turn markdown file URL's to html """ - old_line = text.rstrip() - new_line = re.sub(LINKS_RE, r'[\1](' + link_prefix + r'\2\3)', old_line) - return new_line, old_line == new_line - - -def is_url(url): - """ check if it is a valid url """ - try: - urlopen(url).read() - except (HTTPError, URLError): - return True - except ValueError: - return False - - return True - - -def is_ref(url): +def transform_line(line, base_path, rewrite_path, rewrite_url): + """ transform all the links in one line """ + line = line.rstrip() + links = get_links(line) + # If there are links in this line we may need to fix them + for link in links: + # link contains the text and href + href =link.get("href") + href_mod = transform_link(href, base_path, rewrite_path, rewrite_url) + line = line.replace(href, href_mod) + return line + + +def transform_link(link, base_path, rewrite_path, rewrite_url): + """ Transform hrefs to be valid URLs on the web-site + + Absolute URLs are not changed (they may be external) + Fragments are relative to the page and do not need changes + Path only links should point to a file synced to the website + but sometimes the file may be missing (if it's not in the sync + configuration), so we follow this approach: + - prefix with base_path and check for the file locally + - if not found, prefix with base_url instead + + Note that urlparse treats URLs without scheme like path only + URLs, so 'github.com' will be rewritten to base_url/github.com + """ + + # ignore empty links + if not link: + return link + # urlparse returns a named tuple + parsed = urlparse(link) + if is_absolute_url(parsed) or is_fragment(parsed): + return link + path = os.path.normpath(parsed.path) + if os.path.isfile(os.path.join(base_path, path)): + filename, ext = os.path.splitext(path) + # md files links are in the format .../[md filename]/ + if ext == '.md': + path = filename + '/' + return urlunparse(parsed._replace(path="/".join([rewrite_path, path]))) + # when not found on disk, append to the base_url + return urljoin(rewrite_url, link) + + +def is_absolute_url(parsed_url): + """ check if it is an absolute url """ + return all([parsed_url.scheme, parsed_url.netloc]) + + +def is_fragment(parsed_url): """ determine if the url is an a link """ - if not url: - return False - - return url[0] == "#" + return len(parsed_url.fragment) > 0 and not any(parsed_url[:-1]) def get_links(md): @@ -146,7 +144,7 @@ def download_file(src_url, dest_path): os.makedirs(os.path.dirname(dest_path), exist_ok=True) logging.info(f'Downloading {src_url} to {dest_path}...\n') try: - wget.download(src_url, out=dest_path) + wget.download(src_url, out=dest_path, bar=None) except (FileExistsError, URLError): raise Exception(f'download failed for {src_url}') @@ -179,18 +177,20 @@ def download_resources_to_project(yaml_list): doc_directory = remove_ending_forward_slash(entry['docDirectory']) for index, tag in enumerate(entry['tags']): - host_dir = f'{repository}/raw/{tag["name"]}/{doc_directory}' + logging.info(f'Syncing {component}@{tag["name"]}') + download_url = f'{repository}/raw/{tag["name"]}/{doc_directory}' + link_base_url = f'{repository}/tree/{tag["name"]}/{doc_directory}' if index == 0: # first links belongs on the home page - download_dir = f'/docs/{component}/' + download_dir = f'/docs/{component}'.lower() site_dir = f'{CONTENT_DIR}/{component}' else: # the other links belong in the other versions a.k.a vault - download_dir = f'/vault/{component}-{tag["displayName"]}/' + download_dir = f'/vault/{component}-{tag["displayName"]}' site_dir = f'{VAULT_DIR}/{component}-{tag["displayName"]}' - download_files(host_dir, site_dir, tag["files"]) - transform_text(download_dir, site_dir, tag["files"], host_dir) + download_files(download_url, site_dir, tag["files"]) + transform_text(site_dir, tag["files"], download_dir, link_base_url) def get_files(path, file_type): diff --git a/sync/test-content/content.md b/sync/test-content/content.md new file mode 100644 index 0000000..e69de29 diff --git a/sync/test-content/test.txt b/sync/test-content/test.txt new file mode 100644 index 0000000..e69de29 diff --git a/sync/test_sync.py b/sync/test_sync.py index f5601c0..ed45071 100644 --- a/sync/test_sync.py +++ b/sync/test_sync.py @@ -19,17 +19,14 @@ import tempfile import shutil import ntpath import os +from shutil import copytree +from urllib.parse import urlparse -from sync import get_links -from sync import transform_text -from sync import is_url -from sync import is_ref -from sync import remove_ending_forward_slash -from sync import get_tags -from sync import download_files -from sync import load_config -from sync import save_config -from sync import get_files +from sync import ( + get_links, transform_text, is_absolute_url, + is_fragment, remove_ending_forward_slash, + get_tags, download_files, load_config, save_config, + get_files, transform_link) class TestSync(unittest.TestCase): @@ -48,22 +45,12 @@ class TestSync(unittest.TestCase): return text # Tests - - def test_multiple_get_links(self): - """ This will ensure that get links will - return a list of multiple md links """ - expected = ["www.link.com", "./link"] - result = get_links("this is a [link](www.link.com) and [link](./link)") - - for index, link in enumerate(result): - self.assertEqual(link.get("href"), expected[index]) - - def test_is_ref(self): + def test_is_fragment(self): """ Verify if a string is a reference. A reference is defined as a string where its first character is a hashtag """ - self.assertEqual(is_ref(""), False) - self.assertEqual(is_ref("#footer"), True) - self.assertEqual(is_ref("www.google.com"), False) + self.assertFalse(is_fragment(urlparse(""))) + self.assertTrue(is_fragment(urlparse("#footer"))) + self.assertFalse(is_fragment(urlparse("www.google.com"))) def test_remove_ending_forward_slash(self): """ Remove a slash if it is the last character in a string """ @@ -160,57 +147,87 @@ class TestSync(unittest.TestCase): expected = get_links("[link](www.link.com) this is a link") self.assertEqual(actual, expected[0].get("href")) - def test_is_url(self): - """This will return a test to see if the link is a valid url format""" - expected = is_url("http://www.fake.g00gl3.com") - self.assertEqual(True, expected) - - expected = is_url("http://www.google.com") - self.assertEqual(True, expected) - - expected = is_url("http://www.github.com") - self.assertEqual(True, expected) + def test_multiple_get_links(self): + """ This will ensure that get links will + return a list of multiple md links """ + expected = ["www.link.com", "./link"] + result = get_links("this is a [link](www.link.com) and [link](./link)") - expected = is_url("./sync.py") - self.assertEqual(False, expected) + for index, link in enumerate(result): + self.assertEqual(link.get("href"), expected[index]) - expected = is_url("www.github.com") - self.assertEqual(False, expected) + def test_is_absolute_url(self): + """This will return a test to see if the link is a valid url format""" + self.assertTrue(is_absolute_url(urlparse("http://www.fake.g00gl3.com"))) + self.assertTrue(is_absolute_url(urlparse("http://www.google.com"))) + self.assertFalse(is_absolute_url(urlparse("www.google.com"))) + self.assertFalse(is_absolute_url(urlparse(".sync.py"))) + self.assertFalse(is_absolute_url(urlparse("#fragment"))) + + def test_transform_link(self): + base_path = './test-content' + rewrite_path = '/docs/foo' + rewrite_url = 'https://foo.bar' + self.assertEqual( + transform_link("", base_path, rewrite_path, rewrite_url), "") + self.assertEqual( + transform_link("http://test.com", base_path, rewrite_path, rewrite_url), + "http://test.com") + self.assertEqual( + transform_link("test.txt", base_path, rewrite_path, rewrite_url), + "/docs/foo/test.txt") + self.assertEqual( + transform_link("content.md", base_path, rewrite_path, rewrite_url), + "/docs/foo/content/") + self.assertEqual( + transform_link("notthere.txt", base_path, rewrite_path, rewrite_url), + "https://foo.bar/notthere.txt") def test_transform_text(self): """Ensure that transform links will turns links to relative github link or existing file name""" - expected = """ - [invalid-relative-link](test.com/./adw/a/d/awdrelative) - [valid-relative-link](./sync.py) - [valid-absolute-link](www.github.com) - [invalid-absolute-link](https://website-invalid-random321.net) - [valid-ref-link](#footer) - """ - text = """ - [invalid-relative-link](./adw/a/d/awdrelative) - [valid-relative-link](./sync.py) - [valid-absolute-link](www.github.com) - [invalid-absolute-link](https://website-invalid-random321.net) - [valid-ref-link](#footer) - """ + expected = ( + "[exists-relative-link](test-content/test.txt)\n" + "[exists-relative-link](test-content/content/)\n" + "[exists-relative-link-fragment](test-content/test.txt#fragment)\n" + "[notfound-relative-link](http://test.com/this/is/not/found)\n" + "[notfound-relative-link-fragment](http://test.com/this/is/not/found#fragment)\n" + "[invalid-absolute-link](http://test.com/www.github.com)\n" + "[valid-absolute-link](https://website-invalid-random321.net) " + "[valid-ref-link](#footer)" + ) + text = ( + "[exists-relative-link](./test.txt)\n" + "[exists-relative-link](./content.md)\n" + "[exists-relative-link-fragment](test.txt#fragment)\n" + "[notfound-relative-link](./this/is/not/found)\n" + "[notfound-relative-link-fragment](./this/is/not/found#fragment)\n" + "[invalid-absolute-link](www.github.com)\n" + "[valid-absolute-link](https://website-invalid-random321.net) " + "[valid-ref-link](#footer)" + ) - actual = None - tmp_name = None + content_file = "content.md" # write to file - with tempfile.NamedTemporaryFile(dir='/tmp', delete=False) as tmp: - tmp_name = tmp.name - name = self.path_leaf(tmp_name) - tmp.write(text.strip().encode()) - - # mutate file - transform_text("", "/tmp", {name: name}, "test.com") - # read and delete file - actual = self.read_and_delete_file(tmp_name) - - self.assertEqual(actual.strip(), expected.strip()) + with tempfile.TemporaryDirectory() as tmpdirname: + with open(os.path.join(tmpdirname, content_file), 'w+') as content: + content.write(text.strip()) + with open(os.path.join(tmpdirname, 'test.txt'), 'w+') as test: + test.write(text.strip()) + + # mutate file + transform_text(folder=tmpdirname, + files={content_file: content_file}, + base_path="test-content", + base_url="http://test.com") + # read the result + actual = "" + with open(os.path.join(tmpdirname, content_file), 'r') as result: + actual = result.read() + + self.assertEqual(actual.strip(), expected.strip()) if __name__ == '__main__': -- GitLab