Commit c09354a6 authored by Andrea Frittoli, committed by tekton-robot

Rewrite the link-rewriting logic

The previous logic parsed URLs with a regex first and then with the
markdown library. It then used urlopen().read() to validate links.

We now use the markdown library only to extract the list of links,
and then urlparse to deconstruct, analyse, adapt and reconstruct
each link. We no longer attempt to fetch links, which means that
external links are not guaranteed to be working.

Absolute URLs are not changed (they may be external)
Fragments are relative to the page and do not need changes
Path only links should point to a file synced to the website
but sometimes the file may be missing (if it's not in the sync
configuration), so we follow this approach:
- prefix with base_path and check for the file locally
- if not found, prefix with base_url instead

Note that urlparse treats URLs without scheme like path only
URLs, so 'github.com' will be rewritten to base_url/github.com

Signed-off-by: Andrea Frittoli <andrea.frittoli@gmail.com>
Parent d1eca436
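
The note about scheme-less URLs reflects standard urlparse behaviour; a
minimal sketch, standard library only (the URLs are made up):

    from urllib.parse import urlparse, urljoin

    # An absolute URL parses with both a scheme and a netloc.
    parsed = urlparse('https://github.com/tektoncd/website')
    print(parsed.scheme, parsed.netloc, parsed.path)
    # -> https github.com /tektoncd/website

    # A scheme-less string parses as a bare path: scheme and netloc are empty.
    parsed = urlparse('github.com')
    print(parsed.scheme, parsed.netloc, parsed.path)
    # -> (empty) (empty) github.com

    # So a link like 'github.com' falls through to the base_url branch:
    print(urljoin('https://github.com/tektoncd/pipeline/tree/main/docs/',
                  'github.com'))
    # -> https://github.com/tektoncd/pipeline/tree/main/docs/github.com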
@@ -26,16 +26,13 @@ from multiprocessing import Pool
 import os
 import os.path
 from pathlib import Path
-import re
 import shutil
-from urllib.request import urlopen
-from urllib.error import HTTPError
 from urllib.error import URLError
+from urllib.parse import urlparse, urljoin, urlunparse
 import wget
 from absl import app
 from absl import flags
 import markdown
 from jinja2 import Environment
 from jinja2 import FileSystemLoader
 from lxml import etree
@@ -58,76 +55,77 @@ TEMPLATE_DIR = './templates'
 VAULT_DIR = './content/en/vault'
 BUCKET_NAME = 'tekton-website-assets'

-LINKS_RE = r'\[([^\]]*)\]\((?!.*://|/)([^)]*).md(#[^)]*)?\)'
-
 jinja_env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))


-def transform_text(link_prefix, dest_prefix, files, url):
+def transform_text(folder, files, base_path, base_url):
     """ change every link to point to a valid relative file or absolute url """
+    files_in_folder = [f'{folder}/{f}' for f in files.values()]

-    logging.info(f'Running: transforming files in {dest_prefix}')
-    set_lines(dest_prefix, files, url, link_prefix)
-    logging.info(f'Completed: transformed files in {dest_prefix}')
-
-
-def transform_links(line, url, link_prefix):
-    line, is_transformed = sanitize_text(link_prefix, line)
-    links = get_links(line)
-    if is_transformed:
-        for link in links:
-            link = link.get("href")
-            if not(os.path.isfile(link) or is_url(link) or is_ref(link)):
-                line = line.replace(link, github_link(url, link))
-    print(line)
-
-
-def set_lines(dest_prefix, files, url, link_prefix):
-    """ get all the text from the files and replace
-    each line of text with the list lines """
-    dest_files = [f'{dest_prefix}/{f}' for f in files.values()]
-
-    def process_file(dest_file):
-        for line in fileinput.input(dest_file, inplace=1):
+    def process_file(file_in_folder):
+        for line in fileinput.input(file_in_folder, inplace=1):
             # add a line of text to the payload
             # transform_links mutates text and set the lines provided
-            transform_links(line, url, link_prefix)
+            print(transform_line(line, folder, base_path, base_url))

-    with Pool() as pool:
-        pool.imap_unordered(process_file, dest_files)
+    for file_in_folder in files_in_folder:
+        process_file(file_in_folder)


-def github_link(url, link):
-    """ given a github raw link convert it to the main github link """
-    return f'{url.replace("raw", "tree", 1)}/{link}'
-
-
-def sanitize_text(link_prefix, text):
-    """ santize every line of text to exclude relative
-    links and to turn markdown file URL's to html """
-    old_line = text.rstrip()
-    new_line = re.sub(LINKS_RE, r'[\1](' + link_prefix + r'\2\3)', old_line)
-    return new_line, old_line == new_line
-
-
-def is_url(url):
-    """ check if it is a valid url """
-    try:
-        urlopen(url).read()
-    except (HTTPError, URLError):
-        return True
-    except ValueError:
-        return False
-
-    return True
-
-
-def is_ref(url):
+def transform_line(line, base_path, rewrite_path, rewrite_url):
+    """ transform all the links in one line """
+    line = line.rstrip()
+    links = get_links(line)
+    # If there are links in this line we may need to fix them
+    for link in links:
+        # link contains the text and href
+        href = link.get("href")
+        href_mod = transform_link(href, base_path, rewrite_path, rewrite_url)
+        line = line.replace(href, href_mod)
+    return line
+
+
+def transform_link(link, base_path, rewrite_path, rewrite_url):
+    """ Transform hrefs to be valid URLs on the web-site
+
+    Absolute URLs are not changed (they may be external)
+    Fragments are relative to the page and do not need changes
+    Path only links should point to a file synced to the website
+    but sometimes the file may be missing (if it's not in the sync
+    configuration), so we follow this approach:
+    - prefix with base_path and check for the file locally
+    - if not found, prefix with base_url instead
+
+    Note that urlparse treats URLs without scheme like path only
+    URLs, so 'github.com' will be rewritten to base_url/github.com
+    """
+    # ignore empty links
+    if not link:
+        return link
+    # urlparse returns a named tuple
+    parsed = urlparse(link)
+    if is_absolute_url(parsed) or is_fragment(parsed):
+        return link
+    path = os.path.normpath(parsed.path)
+    if os.path.isfile(os.path.join(base_path, path)):
+        filename, ext = os.path.splitext(path)
+        # md files links are in the format .../[md filename]/
+        if ext == '.md':
+            path = filename + '/'
+        return urlunparse(parsed._replace(path="/".join([rewrite_path, path])))
+    # when not found on disk, append to the base_url
+    return urljoin(rewrite_url, link)
+
+
+def is_absolute_url(parsed_url):
+    """ check if it is an absolute url """
+    return all([parsed_url.scheme, parsed_url.netloc])
+
+
+def is_fragment(parsed_url):
     """ determine if the url is an a link """
-    if not url:
-        return False
-    return url[0] == "#"
+    return len(parsed_url.fragment) > 0 and not any(parsed_url[:-1])


 def get_links(md):
@@ -146,7 +144,7 @@ def download_file(src_url, dest_path):
     os.makedirs(os.path.dirname(dest_path), exist_ok=True)
     logging.info(f'Downloading {src_url} to {dest_path}...\n')
     try:
-        wget.download(src_url, out=dest_path)
+        wget.download(src_url, out=dest_path, bar=None)
     except (FileExistsError, URLError):
         raise Exception(f'download failed for {src_url}')
@@ -179,18 +177,20 @@ def download_resources_to_project(yaml_list):
         doc_directory = remove_ending_forward_slash(entry['docDirectory'])

         for index, tag in enumerate(entry['tags']):
-            host_dir = f'{repository}/raw/{tag["name"]}/{doc_directory}'
+            logging.info(f'Syncing {component}@{tag["name"]}')
+            download_url = f'{repository}/raw/{tag["name"]}/{doc_directory}'
+            link_base_url = f'{repository}/tree/{tag["name"]}/{doc_directory}'
             if index == 0:
                 # first links belongs on the home page
-                download_dir = f'/docs/{component}/'
+                download_dir = f'/docs/{component}'.lower()
                 site_dir = f'{CONTENT_DIR}/{component}'
             else:
                 # the other links belong in the other versions a.k.a vault
-                download_dir = f'/vault/{component}-{tag["displayName"]}/'
+                download_dir = f'/vault/{component}-{tag["displayName"]}'
                 site_dir = f'{VAULT_DIR}/{component}-{tag["displayName"]}'

-            download_files(host_dir, site_dir, tag["files"])
-            transform_text(download_dir, site_dir, tag["files"], host_dir)
+            download_files(download_url, site_dir, tag["files"])
+            transform_text(site_dir, tag["files"], download_dir, link_base_url)


 def get_files(path, file_type):
...
@@ -19,17 +19,14 @@ import tempfile
 import shutil
 import ntpath
 import os
-from shutil import copytree
+from urllib.parse import urlparse

-from sync import get_links
-from sync import transform_text
-from sync import is_url
-from sync import is_ref
-from sync import remove_ending_forward_slash
-from sync import get_tags
-from sync import download_files
-from sync import load_config
-from sync import save_config
-from sync import get_files
+from sync import (
+    get_links, transform_text, is_absolute_url,
+    is_fragment, remove_ending_forward_slash,
+    get_tags, download_files, load_config, save_config,
+    get_files, transform_link)


 class TestSync(unittest.TestCase):
@@ -48,22 +45,12 @@ class TestSync(unittest.TestCase):
         return text

     # Tests
-    def test_multiple_get_links(self):
-        """ This will ensure that get links will
-        return a list of multiple md links """
-        expected = ["www.link.com", "./link"]
-        result = get_links("this is a [link](www.link.com) and [link](./link)")
-        for index, link in enumerate(result):
-            self.assertEqual(link.get("href"), expected[index])
-
-    def test_is_ref(self):
+    def test_is_fragment(self):
         """ Verify if a string is a reference. A reference is
         defined as a string where its first character is a hashtag """
-        self.assertEqual(is_ref(""), False)
-        self.assertEqual(is_ref("#footer"), True)
-        self.assertEqual(is_ref("www.google.com"), False)
+        self.assertFalse(is_fragment(urlparse("")))
+        self.assertTrue(is_fragment(urlparse("#footer")))
+        self.assertFalse(is_fragment(urlparse("www.google.com")))

     def test_remove_ending_forward_slash(self):
         """ Remove a slash if it is the last character in a string """
@@ -160,57 +147,87 @@ class TestSync(unittest.TestCase):
         expected = get_links("[link](www.link.com) this is a link")
         self.assertEqual(actual, expected[0].get("href"))

-    def test_is_url(self):
-        """This will return a test to see if the link is a valid url format"""
-        expected = is_url("http://www.fake.g00gl3.com")
-        self.assertEqual(True, expected)
-
-        expected = is_url("http://www.google.com")
-        self.assertEqual(True, expected)
-
-        expected = is_url("http://www.github.com")
-        self.assertEqual(True, expected)
-
-        expected = is_url("./sync.py")
-        self.assertEqual(False, expected)
-
-        expected = is_url("www.github.com")
-        self.assertEqual(False, expected)
+    def test_multiple_get_links(self):
+        """ This will ensure that get links will
+        return a list of multiple md links """
+        expected = ["www.link.com", "./link"]
+        result = get_links("this is a [link](www.link.com) and [link](./link)")
+        for index, link in enumerate(result):
+            self.assertEqual(link.get("href"), expected[index])
+
+    def test_is_absolute_url(self):
+        """This will return a test to see if the link is a valid url format"""
+        self.assertTrue(is_absolute_url(urlparse("http://www.fake.g00gl3.com")))
+        self.assertTrue(is_absolute_url(urlparse("http://www.google.com")))
+        self.assertFalse(is_absolute_url(urlparse("www.google.com")))
+        self.assertFalse(is_absolute_url(urlparse(".sync.py")))
+        self.assertFalse(is_absolute_url(urlparse("#fragment")))
+
+    def test_transform_link(self):
+        base_path = './test-content'
+        rewrite_path = '/docs/foo'
+        rewrite_url = 'https://foo.bar'
+        self.assertEqual(
+            transform_link("", base_path, rewrite_path, rewrite_url), "")
+        self.assertEqual(
+            transform_link("http://test.com", base_path, rewrite_path, rewrite_url),
+            "http://test.com")
+        self.assertEqual(
+            transform_link("test.txt", base_path, rewrite_path, rewrite_url),
+            "/docs/foo/test.txt")
+        self.assertEqual(
+            transform_link("content.md", base_path, rewrite_path, rewrite_url),
+            "/docs/foo/content/")
+        self.assertEqual(
+            transform_link("notthere.txt", base_path, rewrite_path, rewrite_url),
+            "https://foo.bar/notthere.txt")

     def test_transform_text(self):
         """Ensure that transform links will turns links to
         relative github link or existing file name"""
-        expected = """
-        [invalid-relative-link](test.com/./adw/a/d/awdrelative)
-        [valid-relative-link](./sync.py)
-        [valid-absolute-link](www.github.com)
-        [invalid-absolute-link](https://website-invalid-random321.net)
-        [valid-ref-link](#footer)
-        """
-        text = """
-        [invalid-relative-link](./adw/a/d/awdrelative)
-        [valid-relative-link](./sync.py)
-        [valid-absolute-link](www.github.com)
-        [invalid-absolute-link](https://website-invalid-random321.net)
-        [valid-ref-link](#footer)
-        """
+        expected = (
+            "[exists-relative-link](test-content/test.txt)\n"
+            "[exists-relative-link](test-content/content/)\n"
+            "[exists-relative-link-fragment](test-content/test.txt#fragment)\n"
+            "[notfound-relative-link](http://test.com/this/is/not/found)\n"
+            "[notfound-relative-link-fragment](http://test.com/this/is/not/found#fragment)\n"
+            "[invalid-absolute-link](http://test.com/www.github.com)\n"
+            "[valid-absolute-link](https://website-invalid-random321.net) "
+            "[valid-ref-link](#footer)"
+        )
+        text = (
+            "[exists-relative-link](./test.txt)\n"
+            "[exists-relative-link](./content.md)\n"
+            "[exists-relative-link-fragment](test.txt#fragment)\n"
+            "[notfound-relative-link](./this/is/not/found)\n"
+            "[notfound-relative-link-fragment](./this/is/not/found#fragment)\n"
+            "[invalid-absolute-link](www.github.com)\n"
+            "[valid-absolute-link](https://website-invalid-random321.net) "
+            "[valid-ref-link](#footer)"
+        )

-        actual = None
-        tmp_name = None
+        content_file = "content.md"

         # write to file
-        with tempfile.NamedTemporaryFile(dir='/tmp', delete=False) as tmp:
-            tmp_name = tmp.name
-            name = self.path_leaf(tmp_name)
-            tmp.write(text.strip().encode())
-
-        # mutate file
-        transform_text("", "/tmp", {name: name}, "test.com")
-        # read and delete file
-        actual = self.read_and_delete_file(tmp_name)
-
-        self.assertEqual(actual.strip(), expected.strip())
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with open(os.path.join(tmpdirname, content_file), 'w+') as content:
+                content.write(text.strip())
+            with open(os.path.join(tmpdirname, 'test.txt'), 'w+') as test:
+                test.write(text.strip())
+
+            # mutate file
+            transform_text(folder=tmpdirname,
+                           files={content_file: content_file},
+                           base_path="test-content",
+                           base_url="http://test.com")
+
+            # read the result
+            actual = ""
+            with open(os.path.join(tmpdirname, content_file), 'r') as result:
+                actual = result.read()
+
+            self.assertEqual(actual.strip(), expected.strip())


 if __name__ == '__main__':
...
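
For reference, the rewritten transform_link can be exercised directly the
way test_transform_link above does; a minimal sketch, assuming a local
'./test-content' fixture directory containing test.txt and content.md as
in the tests (paths and URLs are the tests' illustrative values):

    from sync import transform_link

    base_path = './test-content'     # local folder holding the synced files
    rewrite_path = '/docs/foo'       # path prefix used on the website
    rewrite_url = 'https://foo.bar'  # fallback base URL for missing files

    # A file present under base_path is re-rooted at rewrite_path.
    print(transform_link('test.txt', base_path, rewrite_path, rewrite_url))
    # -> /docs/foo/test.txt

    # A markdown file additionally becomes a trailing-slash page URL.
    print(transform_link('content.md', base_path, rewrite_path, rewrite_url))
    # -> /docs/foo/content/

    # A file not found locally is joined onto the fallback base URL.
    print(transform_link('notthere.txt', base_path, rewrite_path, rewrite_url))
    # -> https://foo.bar/notthere.txt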