diff --git a/sync/sync.py b/sync/sync.py index 4faac29a2e5a9144719139980e63260499bf86fb..a22628f87b04a9c31da35aa98b618d10c6414471 100755 --- a/sync/sync.py +++ b/sync/sync.py @@ -26,16 +26,13 @@ from multiprocessing import Pool import os import os.path from pathlib import Path -import re import shutil -from urllib.request import urlopen -from urllib.error import HTTPError from urllib.error import URLError +from urllib.parse import urlparse, urljoin, urlunparse import wget from absl import app from absl import flags -import markdown from jinja2 import Environment from jinja2 import FileSystemLoader from lxml import etree @@ -58,76 +55,77 @@ TEMPLATE_DIR = './templates' VAULT_DIR = './content/en/vault' BUCKET_NAME = 'tekton-website-assets' -LINKS_RE = r'\[([^\]]*)\]\((?!.*://|/)([^)]*).md(#[^)]*)?\)' - jinja_env = Environment(loader=FileSystemLoader(TEMPLATE_DIR)) -def transform_text(link_prefix, dest_prefix, files, url): +def transform_text(folder, files, base_path, base_url): """ change every link to point to a valid relative file or absolute url """ + files_in_folder = [f'{folder}/{f}' for f in files.values()] - logging.info(f'Running: transforming files in {dest_prefix}') - set_lines(dest_prefix, files, url, link_prefix) - logging.info(f'Completed: transformed files in {dest_prefix}') - - -def transform_links(line, url, link_prefix): - line, is_transformed = sanitize_text(link_prefix, line) - links = get_links(line) - if is_transformed: - for link in links: - link = link.get("href") - if not(os.path.isfile(link) or is_url(link) or is_ref(link)): - line = line.replace(link, github_link(url, link)) - print(line) - - -def set_lines(dest_prefix, files, url, link_prefix): - """ get all the text from the files and replace - each line of text with the list lines """ - dest_files = [f'{dest_prefix}/{f}' for f in files.values()] - - def process_file(dest_file): - for line in fileinput.input(dest_file, inplace=1): + def process_file(file_in_folder): + for line in fileinput.input(file_in_folder, inplace=1): # add a line of text to the payload # transform_links mutates text and set the lines provided - transform_links(line, url, link_prefix) - - with Pool() as pool: - pool.imap_unordered(process_file, dest_files) + print(transform_line(line, folder, base_path, base_url)) + for file_in_folder in files_in_folder: + process_file(file_in_folder) -def github_link(url, link): - """ given a github raw link convert it to the main github link """ - return f'{url.replace("raw", "tree", 1)}/{link}' - -def sanitize_text(link_prefix, text): - """ santize every line of text to exclude relative - links and to turn markdown file URL's to html """ - old_line = text.rstrip() - new_line = re.sub(LINKS_RE, r'[\1](' + link_prefix + r'\2\3)', old_line) - return new_line, old_line == new_line - - -def is_url(url): - """ check if it is a valid url """ - try: - urlopen(url).read() - except (HTTPError, URLError): - return True - except ValueError: - return False - - return True - - -def is_ref(url): +def transform_line(line, base_path, rewrite_path, rewrite_url): + """ transform all the links in one line """ + line = line.rstrip() + links = get_links(line) + # If there are links in this line we may need to fix them + for link in links: + # link contains the text and href + href =link.get("href") + href_mod = transform_link(href, base_path, rewrite_path, rewrite_url) + line = line.replace(href, href_mod) + return line + + +def transform_link(link, base_path, rewrite_path, rewrite_url): + """ Transform hrefs to be valid URLs on the web-site + + Absolute URLs are not changed (they may be external) + Fragments are relative to the page and do not need changes + Path only links should point to a file synced to the website + but sometimes the file may be missing (if it's not in the sync + configuration), so we follow this approach: + - prefix with base_path and check for the file locally + - if not found, prefix with base_url instead + + Note that urlparse treats URLs without scheme like path only + URLs, so 'github.com' will be rewritten to base_url/github.com + """ + + # ignore empty links + if not link: + return link + # urlparse returns a named tuple + parsed = urlparse(link) + if is_absolute_url(parsed) or is_fragment(parsed): + return link + path = os.path.normpath(parsed.path) + if os.path.isfile(os.path.join(base_path, path)): + filename, ext = os.path.splitext(path) + # md files links are in the format .../[md filename]/ + if ext == '.md': + path = filename + '/' + return urlunparse(parsed._replace(path="/".join([rewrite_path, path]))) + # when not found on disk, append to the base_url + return urljoin(rewrite_url, link) + + +def is_absolute_url(parsed_url): + """ check if it is an absolute url """ + return all([parsed_url.scheme, parsed_url.netloc]) + + +def is_fragment(parsed_url): """ determine if the url is an a link """ - if not url: - return False - - return url[0] == "#" + return len(parsed_url.fragment) > 0 and not any(parsed_url[:-1]) def get_links(md): @@ -146,7 +144,7 @@ def download_file(src_url, dest_path): os.makedirs(os.path.dirname(dest_path), exist_ok=True) logging.info(f'Downloading {src_url} to {dest_path}...\n') try: - wget.download(src_url, out=dest_path) + wget.download(src_url, out=dest_path, bar=None) except (FileExistsError, URLError): raise Exception(f'download failed for {src_url}') @@ -179,18 +177,20 @@ def download_resources_to_project(yaml_list): doc_directory = remove_ending_forward_slash(entry['docDirectory']) for index, tag in enumerate(entry['tags']): - host_dir = f'{repository}/raw/{tag["name"]}/{doc_directory}' + logging.info(f'Syncing {component}@{tag["name"]}') + download_url = f'{repository}/raw/{tag["name"]}/{doc_directory}' + link_base_url = f'{repository}/tree/{tag["name"]}/{doc_directory}' if index == 0: # first links belongs on the home page - download_dir = f'/docs/{component}/' + download_dir = f'/docs/{component}'.lower() site_dir = f'{CONTENT_DIR}/{component}' else: # the other links belong in the other versions a.k.a vault - download_dir = f'/vault/{component}-{tag["displayName"]}/' + download_dir = f'/vault/{component}-{tag["displayName"]}' site_dir = f'{VAULT_DIR}/{component}-{tag["displayName"]}' - download_files(host_dir, site_dir, tag["files"]) - transform_text(download_dir, site_dir, tag["files"], host_dir) + download_files(download_url, site_dir, tag["files"]) + transform_text(site_dir, tag["files"], download_dir, link_base_url) def get_files(path, file_type): diff --git a/sync/test-content/content.md b/sync/test-content/content.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sync/test-content/test.txt b/sync/test-content/test.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sync/test_sync.py b/sync/test_sync.py index f5601c0839ecc259cb543f4679065c4abf370bc6..ed450710683150eba699c51bc3cca77fdc602321 100644 --- a/sync/test_sync.py +++ b/sync/test_sync.py @@ -19,17 +19,14 @@ import tempfile import shutil import ntpath import os +from shutil import copytree +from urllib.parse import urlparse -from sync import get_links -from sync import transform_text -from sync import is_url -from sync import is_ref -from sync import remove_ending_forward_slash -from sync import get_tags -from sync import download_files -from sync import load_config -from sync import save_config -from sync import get_files +from sync import ( + get_links, transform_text, is_absolute_url, + is_fragment, remove_ending_forward_slash, + get_tags, download_files, load_config, save_config, + get_files, transform_link) class TestSync(unittest.TestCase): @@ -48,22 +45,12 @@ class TestSync(unittest.TestCase): return text # Tests - - def test_multiple_get_links(self): - """ This will ensure that get links will - return a list of multiple md links """ - expected = ["www.link.com", "./link"] - result = get_links("this is a [link](www.link.com) and [link](./link)") - - for index, link in enumerate(result): - self.assertEqual(link.get("href"), expected[index]) - - def test_is_ref(self): + def test_is_fragment(self): """ Verify if a string is a reference. A reference is defined as a string where its first character is a hashtag """ - self.assertEqual(is_ref(""), False) - self.assertEqual(is_ref("#footer"), True) - self.assertEqual(is_ref("www.google.com"), False) + self.assertFalse(is_fragment(urlparse(""))) + self.assertTrue(is_fragment(urlparse("#footer"))) + self.assertFalse(is_fragment(urlparse("www.google.com"))) def test_remove_ending_forward_slash(self): """ Remove a slash if it is the last character in a string """ @@ -160,57 +147,87 @@ class TestSync(unittest.TestCase): expected = get_links("[link](www.link.com) this is a link") self.assertEqual(actual, expected[0].get("href")) - def test_is_url(self): - """This will return a test to see if the link is a valid url format""" - expected = is_url("http://www.fake.g00gl3.com") - self.assertEqual(True, expected) - - expected = is_url("http://www.google.com") - self.assertEqual(True, expected) - - expected = is_url("http://www.github.com") - self.assertEqual(True, expected) + def test_multiple_get_links(self): + """ This will ensure that get links will + return a list of multiple md links """ + expected = ["www.link.com", "./link"] + result = get_links("this is a [link](www.link.com) and [link](./link)") - expected = is_url("./sync.py") - self.assertEqual(False, expected) + for index, link in enumerate(result): + self.assertEqual(link.get("href"), expected[index]) - expected = is_url("www.github.com") - self.assertEqual(False, expected) + def test_is_absolute_url(self): + """This will return a test to see if the link is a valid url format""" + self.assertTrue(is_absolute_url(urlparse("http://www.fake.g00gl3.com"))) + self.assertTrue(is_absolute_url(urlparse("http://www.google.com"))) + self.assertFalse(is_absolute_url(urlparse("www.google.com"))) + self.assertFalse(is_absolute_url(urlparse(".sync.py"))) + self.assertFalse(is_absolute_url(urlparse("#fragment"))) + + def test_transform_link(self): + base_path = './test-content' + rewrite_path = '/docs/foo' + rewrite_url = 'https://foo.bar' + self.assertEqual( + transform_link("", base_path, rewrite_path, rewrite_url), "") + self.assertEqual( + transform_link("http://test.com", base_path, rewrite_path, rewrite_url), + "http://test.com") + self.assertEqual( + transform_link("test.txt", base_path, rewrite_path, rewrite_url), + "/docs/foo/test.txt") + self.assertEqual( + transform_link("content.md", base_path, rewrite_path, rewrite_url), + "/docs/foo/content/") + self.assertEqual( + transform_link("notthere.txt", base_path, rewrite_path, rewrite_url), + "https://foo.bar/notthere.txt") def test_transform_text(self): """Ensure that transform links will turns links to relative github link or existing file name""" - expected = """ - [invalid-relative-link](test.com/./adw/a/d/awdrelative) - [valid-relative-link](./sync.py) - [valid-absolute-link](www.github.com) - [invalid-absolute-link](https://website-invalid-random321.net) - [valid-ref-link](#footer) - """ - text = """ - [invalid-relative-link](./adw/a/d/awdrelative) - [valid-relative-link](./sync.py) - [valid-absolute-link](www.github.com) - [invalid-absolute-link](https://website-invalid-random321.net) - [valid-ref-link](#footer) - """ + expected = ( + "[exists-relative-link](test-content/test.txt)\n" + "[exists-relative-link](test-content/content/)\n" + "[exists-relative-link-fragment](test-content/test.txt#fragment)\n" + "[notfound-relative-link](http://test.com/this/is/not/found)\n" + "[notfound-relative-link-fragment](http://test.com/this/is/not/found#fragment)\n" + "[invalid-absolute-link](http://test.com/www.github.com)\n" + "[valid-absolute-link](https://website-invalid-random321.net) " + "[valid-ref-link](#footer)" + ) + text = ( + "[exists-relative-link](./test.txt)\n" + "[exists-relative-link](./content.md)\n" + "[exists-relative-link-fragment](test.txt#fragment)\n" + "[notfound-relative-link](./this/is/not/found)\n" + "[notfound-relative-link-fragment](./this/is/not/found#fragment)\n" + "[invalid-absolute-link](www.github.com)\n" + "[valid-absolute-link](https://website-invalid-random321.net) " + "[valid-ref-link](#footer)" + ) - actual = None - tmp_name = None + content_file = "content.md" # write to file - with tempfile.NamedTemporaryFile(dir='/tmp', delete=False) as tmp: - tmp_name = tmp.name - name = self.path_leaf(tmp_name) - tmp.write(text.strip().encode()) - - # mutate file - transform_text("", "/tmp", {name: name}, "test.com") - # read and delete file - actual = self.read_and_delete_file(tmp_name) - - self.assertEqual(actual.strip(), expected.strip()) + with tempfile.TemporaryDirectory() as tmpdirname: + with open(os.path.join(tmpdirname, content_file), 'w+') as content: + content.write(text.strip()) + with open(os.path.join(tmpdirname, 'test.txt'), 'w+') as test: + test.write(text.strip()) + + # mutate file + transform_text(folder=tmpdirname, + files={content_file: content_file}, + base_path="test-content", + base_url="http://test.com") + # read the result + actual = "" + with open(os.path.join(tmpdirname, content_file), 'r') as result: + actual = result.read() + + self.assertEqual(actual.strip(), expected.strip()) if __name__ == '__main__':