Commit c09354a6 authored by Andrea Frittoli, committed by tekton-robot

Rewrite the link re-writing logic

The previous logic parsed URLs with a regex first and then with
the markdown library. It then used urlopen().read() to validate
links.

We now use the markdown library only to extract the list of links,
and then urlparse to deconstruct, analyse, adapt and reconstruct
each link. We no longer attempt to fetch links, which means
that external links are not guaranteed to be working.

Absolute URLs are not changed (they may be external).
Fragments are relative to the page and do not need changes.
Path-only links should point to a file synced to the website,
but sometimes the file may be missing (if it's not in the sync
configuration), so we follow this approach:
- prefix with base_path and check for the file locally
- if not found, prefix with base_url instead

Note that urlparse treats URLs without a scheme as path-only
URLs, so 'github.com' will be rewritten to base_url/github.com.
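
As an illustration (not part of this change), urlparse leaves the
scheme and netloc of a scheme-less link empty, so only links with
both a scheme and a netloc count as absolute:

    >>> from urllib.parse import urlparse
    >>> urlparse('github.com').path
    'github.com'
    >>> parsed = urlparse('https://github.com')
    >>> (parsed.scheme, parsed.netloc)
    ('https', 'github.com')
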
Signed-off-by: Andrea Frittoli <andrea.frittoli@gmail.com>
Parent d1eca436
@@ -26,16 +26,13 @@ from multiprocessing import Pool
import os
import os.path
from pathlib import Path
import re
import shutil
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.parse import urlparse, urljoin, urlunparse
import wget
from absl import app
from absl import flags
import markdown
from jinja2 import Environment
from jinja2 import FileSystemLoader
from lxml import etree
@@ -58,76 +55,77 @@ TEMPLATE_DIR = './templates'
VAULT_DIR = './content/en/vault'
BUCKET_NAME = 'tekton-website-assets'
LINKS_RE = r'\[([^\]]*)\]\((?!.*://|/)([^)]*).md(#[^)]*)?\)'
jinja_env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))
def transform_text(link_prefix, dest_prefix, files, url):
def transform_text(folder, files, base_path, base_url):
""" change every link to point to a valid relative file or absolute url """
files_in_folder = [f'{folder}/{f}' for f in files.values()]
logging.info(f'Running: transforming files in {dest_prefix}')
set_lines(dest_prefix, files, url, link_prefix)
logging.info(f'Completed: transformed files in {dest_prefix}')
def transform_links(line, url, link_prefix):
line, is_transformed = sanitize_text(link_prefix, line)
links = get_links(line)
if is_transformed:
for link in links:
link = link.get("href")
if not(os.path.isfile(link) or is_url(link) or is_ref(link)):
line = line.replace(link, github_link(url, link))
print(line)
def set_lines(dest_prefix, files, url, link_prefix):
""" get all the text from the files and replace
each line of text with the list lines """
dest_files = [f'{dest_prefix}/{f}' for f in files.values()]
def process_file(dest_file):
for line in fileinput.input(dest_file, inplace=1):
def process_file(file_in_folder):
for line in fileinput.input(file_in_folder, inplace=1):
# add a line of text to the payload
# transform_links mutates text and set the lines provided
transform_links(line, url, link_prefix)
with Pool() as pool:
pool.imap_unordered(process_file, dest_files)
print(transform_line(line, folder, base_path, base_url))
for file_in_folder in files_in_folder:
process_file(file_in_folder)
def github_link(url, link):
""" given a github raw link convert it to the main github link """
return f'{url.replace("raw", "tree", 1)}/{link}'
def sanitize_text(link_prefix, text):
""" santize every line of text to exclude relative
links and to turn markdown file URL's to html """
old_line = text.rstrip()
new_line = re.sub(LINKS_RE, r'[\1](' + link_prefix + r'\2\3)', old_line)
return new_line, old_line == new_line
def is_url(url):
""" check if it is a valid url """
try:
urlopen(url).read()
except (HTTPError, URLError):
return True
except ValueError:
return False
return True
def is_ref(url):
def transform_line(line, base_path, rewrite_path, rewrite_url):
""" transform all the links in one line """
line = line.rstrip()
links = get_links(line)
# If there are links in this line we may need to fix them
for link in links:
# link contains the text and href
href = link.get("href")
href_mod = transform_link(href, base_path, rewrite_path, rewrite_url)
line = line.replace(href, href_mod)
return line
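# Illustrative example (not part of the commit): given the line
# 'See the [guide](./install.md)', if install.md exists under base_path and
# rewrite_path is '/docs/foo', the line comes back as
# 'See the [guide](/docs/foo/install/)'; only the href changes, the link
# text is left untouched.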
def transform_link(link, base_path, rewrite_path, rewrite_url):
""" Transform hrefs to be valid URLs on the web-site
Absolute URLs are not changed (they may be external)
Fragments are relative to the page and do not need changes
Path-only links should point to a file synced to the website
but sometimes the file may be missing (if it's not in the sync
configuration), so we follow this approach:
- prefix with base_path and check for the file locally
- if not found, prefix with base_url instead
Note that urlparse treats URLs without a scheme as path-only
URLs, so 'github.com' will be rewritten to base_url/github.com
"""
# ignore empty links
if not link:
return link
# urlparse returns a named tuple
parsed = urlparse(link)
if is_absolute_url(parsed) or is_fragment(parsed):
return link
path = os.path.normpath(parsed.path)
if os.path.isfile(os.path.join(base_path, path)):
filename, ext = os.path.splitext(path)
# md files links are in the format .../[md filename]/
if ext == '.md':
path = filename + '/'
return urlunparse(parsed._replace(path="/".join([rewrite_path, path])))
# when not found on disk, append to the base_url
return urljoin(rewrite_url, link)
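# Illustrative example (not part of the commit), assuming base_path points at
# a synced copy that contains install.md, rewrite_path='/docs/foo' and
# rewrite_url='https://foo.bar':
#   'install.md'  -> '/docs/foo/install/'           (found under base_path)
#   'missing.txt' -> 'https://foo.bar/missing.txt'  (not found, joined to rewrite_url)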
def is_absolute_url(parsed_url):
""" check if it is an absolute url """
return all([parsed_url.scheme, parsed_url.netloc])
def is_fragment(parsed_url):
""" determine if the url is an a link """
if not url:
return False
return url[0] == "#"
return len(parsed_url.fragment) > 0 and not any(parsed_url[:-1])
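# Illustrative (not part of the commit): urlparse('#footer') yields an empty
# scheme, netloc, path, params and query with fragment='footer', so
# parsed_url[:-1] is all falsy and the link counts as a pure fragment, while
# urlparse('page.md#footer') has a non-empty path and does not.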
def get_links(md):
@@ -146,7 +144,7 @@ def download_file(src_url, dest_path):
os.makedirs(os.path.dirname(dest_path), exist_ok=True)
logging.info(f'Downloading {src_url} to {dest_path}...\n')
try:
wget.download(src_url, out=dest_path)
wget.download(src_url, out=dest_path, bar=None)
except (FileExistsError, URLError):
raise Exception(f'download failed for {src_url}')
@@ -179,18 +177,20 @@ def download_resources_to_project(yaml_list):
doc_directory = remove_ending_forward_slash(entry['docDirectory'])
for index, tag in enumerate(entry['tags']):
host_dir = f'{repository}/raw/{tag["name"]}/{doc_directory}'
logging.info(f'Syncing {component}@{tag["name"]}')
download_url = f'{repository}/raw/{tag["name"]}/{doc_directory}'
link_base_url = f'{repository}/tree/{tag["name"]}/{doc_directory}'
if index == 0:
# the first tag's links belong on the home page
download_dir = f'/docs/{component}/'
download_dir = f'/docs/{component}'.lower()
site_dir = f'{CONTENT_DIR}/{component}'
else:
# the other links belong to the other versions, a.k.a. the vault
download_dir = f'/vault/{component}-{tag["displayName"]}/'
download_dir = f'/vault/{component}-{tag["displayName"]}'
site_dir = f'{VAULT_DIR}/{component}-{tag["displayName"]}'
download_files(host_dir, site_dir, tag["files"])
transform_text(download_dir, site_dir, tag["files"], host_dir)
download_files(download_url, site_dir, tag["files"])
transform_text(site_dir, tag["files"], download_dir, link_base_url)
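# Illustrative values (not from the commit): with
# repository='https://github.com/tektoncd/pipeline', tag name 'v0.10.0' and
# docDirectory 'docs', the f-strings above yield
#   download_url  = 'https://github.com/tektoncd/pipeline/raw/v0.10.0/docs'
#   link_base_url = 'https://github.com/tektoncd/pipeline/tree/v0.10.0/docs'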
def get_files(path, file_type):
@@ -19,17 +19,14 @@ import tempfile
import shutil
import ntpath
import os
from shutil import copytree
from urllib.parse import urlparse
from sync import get_links
from sync import transform_text
from sync import is_url
from sync import is_ref
from sync import remove_ending_forward_slash
from sync import get_tags
from sync import download_files
from sync import load_config
from sync import save_config
from sync import get_files
from sync import (
get_links, transform_text, is_absolute_url,
is_fragment, remove_ending_forward_slash,
get_tags, download_files, load_config, save_config,
get_files, transform_link)
class TestSync(unittest.TestCase):
@@ -48,22 +45,12 @@ class TestSync(unittest.TestCase):
return text
# Tests
def test_multiple_get_links(self):
""" This will ensure that get links will
return a list of multiple md links """
expected = ["www.link.com", "./link"]
result = get_links("this is a [link](www.link.com) and [link](./link)")
for index, link in enumerate(result):
self.assertEqual(link.get("href"), expected[index])
def test_is_ref(self):
def test_is_fragment(self):
""" Verify if a string is a reference. A reference is
defined as a string where its first character is a hashtag """
self.assertEqual(is_ref(""), False)
self.assertEqual(is_ref("#footer"), True)
self.assertEqual(is_ref("www.google.com"), False)
self.assertFalse(is_fragment(urlparse("")))
self.assertTrue(is_fragment(urlparse("#footer")))
self.assertFalse(is_fragment(urlparse("www.google.com")))
def test_remove_ending_forward_slash(self):
""" Remove a slash if it is the last character in a string """
@@ -160,57 +147,87 @@ class TestSync(unittest.TestCase):
expected = get_links("[link](www.link.com) this is a link")
self.assertEqual(actual, expected[0].get("href"))
def test_is_url(self):
"""This will return a test to see if the link is a valid url format"""
expected = is_url("http://www.fake.g00gl3.com")
self.assertEqual(True, expected)
expected = is_url("http://www.google.com")
self.assertEqual(True, expected)
expected = is_url("http://www.github.com")
self.assertEqual(True, expected)
def test_multiple_get_links(self):
""" This will ensure that get links will
return a list of multiple md links """
expected = ["www.link.com", "./link"]
result = get_links("this is a [link](www.link.com) and [link](./link)")
expected = is_url("./sync.py")
self.assertEqual(False, expected)
for index, link in enumerate(result):
self.assertEqual(link.get("href"), expected[index])
expected = is_url("www.github.com")
self.assertEqual(False, expected)
def test_is_absolute_url(self):
"""This will return a test to see if the link is a valid url format"""
self.assertTrue(is_absolute_url(urlparse("http://www.fake.g00gl3.com")))
self.assertTrue(is_absolute_url(urlparse("http://www.google.com")))
self.assertFalse(is_absolute_url(urlparse("www.google.com")))
self.assertFalse(is_absolute_url(urlparse(".sync.py")))
self.assertFalse(is_absolute_url(urlparse("#fragment")))
def test_transform_link(self):
base_path = './test-content'
rewrite_path = '/docs/foo'
rewrite_url = 'https://foo.bar'
self.assertEqual(
transform_link("", base_path, rewrite_path, rewrite_url), "")
self.assertEqual(
transform_link("http://test.com", base_path, rewrite_path, rewrite_url),
"http://test.com")
self.assertEqual(
transform_link("test.txt", base_path, rewrite_path, rewrite_url),
"/docs/foo/test.txt")
self.assertEqual(
transform_link("content.md", base_path, rewrite_path, rewrite_url),
"/docs/foo/content/")
self.assertEqual(
transform_link("notthere.txt", base_path, rewrite_path, rewrite_url),
"https://foo.bar/notthere.txt")
def test_transform_text(self):
"""Ensure that transform links will turns links to
relative github link or existing file name"""
expected = """
[invalid-relative-link](test.com/./adw/a/d/awdrelative)
[valid-relative-link](./sync.py)
[valid-absolute-link](www.github.com)
[invalid-absolute-link](https://website-invalid-random321.net)
[valid-ref-link](#footer)
"""
text = """
[invalid-relative-link](./adw/a/d/awdrelative)
[valid-relative-link](./sync.py)
[valid-absolute-link](www.github.com)
[invalid-absolute-link](https://website-invalid-random321.net)
[valid-ref-link](#footer)
"""
expected = (
"[exists-relative-link](test-content/test.txt)\n"
"[exists-relative-link](test-content/content/)\n"
"[exists-relative-link-fragment](test-content/test.txt#fragment)\n"
"[notfound-relative-link](http://test.com/this/is/not/found)\n"
"[notfound-relative-link-fragment](http://test.com/this/is/not/found#fragment)\n"
"[invalid-absolute-link](http://test.com/www.github.com)\n"
"[valid-absolute-link](https://website-invalid-random321.net) "
"[valid-ref-link](#footer)"
)
text = (
"[exists-relative-link](./test.txt)\n"
"[exists-relative-link](./content.md)\n"
"[exists-relative-link-fragment](test.txt#fragment)\n"
"[notfound-relative-link](./this/is/not/found)\n"
"[notfound-relative-link-fragment](./this/is/not/found#fragment)\n"
"[invalid-absolute-link](www.github.com)\n"
"[valid-absolute-link](https://website-invalid-random321.net) "
"[valid-ref-link](#footer)"
)
actual = None
tmp_name = None
content_file = "content.md"
# write to file
with tempfile.NamedTemporaryFile(dir='/tmp', delete=False) as tmp:
tmp_name = tmp.name
name = self.path_leaf(tmp_name)
tmp.write(text.strip().encode())
# mutate file
transform_text("", "/tmp", {name: name}, "test.com")
# read and delete file
actual = self.read_and_delete_file(tmp_name)
self.assertEqual(actual.strip(), expected.strip())
with tempfile.TemporaryDirectory() as tmpdirname:
with open(os.path.join(tmpdirname, content_file), 'w+') as content:
content.write(text.strip())
with open(os.path.join(tmpdirname, 'test.txt'), 'w+') as test:
test.write(text.strip())
# mutate file
transform_text(folder=tmpdirname,
files={content_file: content_file},
base_path="test-content",
base_url="http://test.com")
# read the result
actual = ""
with open(os.path.join(tmpdirname, content_file), 'r') as result:
actual = result.read()
self.assertEqual(actual.strip(), expected.strip())
if __name__ == '__main__':