#!/usr/bin/env python

# Copyright 2020 The Tekton Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

M
michaelawyu 已提交
17 18 19 20
# This script helps synchronize content from its respective source of
# truth (usually the GitHub repository of each Tekton
# component, such as tektoncd/pipelines) to tektoncd/website.

21 22
import copy
import fnmatch
A
Andrea Frittoli 已提交
23 24
import json
import logging
25 26
import markdown
from multiprocessing import Pool
M
michaelawyu 已提交
27
import os
A
Andrea Frittoli 已提交
28
import os.path
29
import re
30
import sys
31
from urllib.error import URLError
32
from urllib.parse import urlparse, urljoin, urlunparse
M
michaelawyu 已提交
33

34
from bs4 import BeautifulSoup
35
import click
36
import git
B
Billy Lynch 已提交
37 38
from jinja2 import Environment
from jinja2 import FileSystemLoader
39
from ruamel.yaml import YAML
P
popcor255 已提交
40

M
michaelawyu 已提交
41 42

# Site folders, relative to the current working directory.
# Docs of the first configured tag of each component are written here
CONTENT_DIR = './content/en/docs'
# Docs of every other configured tag are written here
VAULT_DIR = './content/en/vault'
# Destination of the generated version switcher script
JS_ASSET_DIR = './assets/js'
# Folder the jinja templates are loaded from
TEMPLATE_DIR = './templates'

# Folders resolved relative to this script rather than the working directory
BASE_FOLDER = os.path.dirname(os.path.abspath(__file__))
DEFAULT_CONFIG_FOLDER = os.path.join(BASE_FOLDER, 'config')
DEFAULT_CACHE_FOLDER = os.path.join(BASE_FOLDER, '.cache')

jinja_env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))

# A front matter boundary is a line made of three or more dashes, optionally
# preceded by a `<!--` line and/or followed by a `-->` line
FM_BOUNDARY = re.compile(r"^(?:<!--\n)?-{3,}\s*$(?:\n-->)?", re.MULTILINE)
YAML_SEPARATOR = "---\n"
# Written around the front matter emitted by `write_front_matter`
FM_WRAPPER_OPEN = "<!--\n"
FM_WRAPPER_CLOSE = "-->\n"

# Name of the index document of a folder on the site
FOLDER_INDEX = '_index.md'


def doc_config(doc, folder_config, weight=None):
    """ Work out where a doc is published and which header it gets

    :param doc: the doc as a gitpython Blob
    :param folder_config: a dict with the configuration of the folder the doc
      was found in, as specified in the sync config file under `folders`
    :params weight: optional weight of the doc. When it is not None and a
      header is configured, it is stored in the header under `weight`
    :returns: a tuple (target_filename, target_folder, header). `header` is
      a deep copy of the configured header, or None when the folder has no
      `header` entry
    """
    # The doc configured as index is always published as FOLDER_INDEX
    if doc.name == folder_config.get('index', FOLDER_INDEX):
        target_filename = FOLDER_INDEX
    else:
        target_filename = doc.name
    target_folder = folder_config.get('target', '')

    # Without a configured header there is nothing to attach the weight to
    if 'header' not in folder_config:
        return target_filename, target_folder, None

    # Copy the header so that the shared folder configuration is not altered
    header = copy.deepcopy(folder_config['header'])
    if weight is not None:
        header['weight'] = weight
    return target_filename, target_folder, header


def docs_from_tree(tree, include=('*',), exclude=()):
    """ Get matching docs (git blobs) from a git tree

    Filter all blobs directly under a tree based on include and
    exclude lists. Filters are specified as sequences of unix style
    filename patterns:
    (https://docs.python.org/3/library/fnmatch.html)

    :param tree: an object exposing the candidate blobs as `tree.blobs`;
      only the `name` of each blob is inspected
    :param include: patterns a blob name must match at least one of
    :param exclude: patterns a blob name must match none of
    :returns: a lazy iterator over the matching blobs, in `tree.blobs` order
    """
    def selected(blob):
        if not any(fnmatch.fnmatch(blob.name, i) for i in include):
            return False
        return not any(fnmatch.fnmatch(blob.name, e) for e in exclude)

    return filter(selected, tree.blobs)


def transform_docs(git_repo, tag, folders, site_folder, base_path, base_url):
    """ Transform all folders configured for a tag

    Exits the process with status 1 when `tag` matches neither a tag nor
    a remote branch of `git_repo`.

    :param git_repo: a gitpython Repo object, that points to the source git repo
    :param tag: a string that represent the git tag to be used
    :param folders: a dict of folder names with a dict config each, loaded from
      sync config file
    :param site_folder: the root folder on disk where files shall be written to
    :param base_path: used to rewrite relative links to sync'ed files
    :param base_url: used to rewrite relative links to unknown files
    :returns: the list of values returned by `transform_doc`, one per doc
    """

    # Get the root tree for the requested version from the repo
    try:
        tag = next(x for x in git_repo.tags if x.name == tag)
    except StopIteration:
        # When no tag is found try to match a branch (remote heads)
        try:
            tag = next(x for x in git_repo.remote().refs if x.remote_head == tag)
        except StopIteration:
            logging.error(f'No tag or branch {tag} found in {git_repo}')
            sys.exit(1)

    # List all relevant blobs based on the folder config
    files = []
    for folder, folder_config in folders.items():
        if folder == '.':
            root = tag.commit.tree
        else:
            root = tag.commit.tree.join(folder)
        docs = docs_from_tree(
            tree=root, include=folder_config.get('include', ['*']),
            exclude=folder_config.get('exclude', []))
        # zip doc, folder, target and header so we can process them in
        # parallel later. The position of the doc is used as its weight.
        files.extend([(doc, folder, *doc_config(doc, folder_config, idx))
            for idx, doc in enumerate(docs)])

    # Build a dict of all valid local links
    # This is used by `transform_link` to identify local links
    local_files = {doc.path: (target, target_folder) for
                    doc, _, target, target_folder, _ in files}

    # Build a list of tuple of `transform_doc` parameters
    transform_args = [
        (*f, local_files, base_path, base_url, site_folder) for f in files]

    with Pool() as pool:
        results = pool.starmap(transform_doc, transform_args)

    # Return the list of files transformed
    return results
P
popcor255 已提交
149

150 151 152 153 154 155

def safe_makedirs(path):
    """ Create `path` and any missing parent folder, best effort

    A FileExistsError raised by `os.makedirs` is silently ignored.

    :param path: the folder to be created
    """
    try:
        os.makedirs(path, exist_ok=True)
    except FileExistsError:
        # Something already lives at `path`: nothing left to create
        return None
P
popcor255 已提交
156 157


158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
def transform_doc(doc, source_folder, target, target_folder, header,
                  local_files, base_path, base_url, site_folder):
    """ Transform a single doc to the target file

    Read a doc (git blob), transform links in it
    and writes the results in to a target file. Docs whose mime type is
    neither text/plain nor text/markdown are copied unchanged.

    :param doc: The source doc as gitpython Blob
    :param source_folder: the name of the folder in the source repo where
    the file comes from
    :param target: the name of the file the transformed doc shall be written to
    :param target_folder: the folder within `site_folder` where the transformed
      doc shall be written to
    :param header: a dict with the content of a header (if any) to be prepended
      in the transformed doc
    :param local_files: a dict source file -> target used to rewrite
        relative links to sync'ed files
    :param base_path: used to rewrite relative links to sync'ed files
    :param base_url: used to rewrite relative links to unknown files
    :param site_folder: the root folder on disk where files shall be written to
    :returns: the path of the file that was written
    """
    site_target_folder = os.path.normpath(os.path.join(site_folder, target_folder))
    safe_makedirs(site_target_folder)
    target = os.path.join(site_target_folder, target)
    # Look for markdown files.
    # Some machines seem to use text/plain (e.g. running on a mac) and some use
    # text/markdown (e.g. running in a fresh ubuntu container)
    if doc.mime_type == 'text/plain' or doc.mime_type == 'text/markdown':
        # The encoding is explicit so that the output does not depend on the
        # locale of the machine running the sync
        with open(target, 'w+', encoding='utf-8') as target_doc:
            doc_all = decode(doc.data_stream.read())
            doc_markdown, fm = read_front_matter(doc_all)
            # Update the doc front matter with the configured header (if any)
            # and write it (in YAML)
            write_front_matter(target_doc, fm, header)
            doc_markdown = transform_links_doc(
                doc_markdown, source_folder, local_files, base_path, base_url)
            target_doc.write(doc_markdown)
        return target
    # Pass-through for other mime types
    with open(target, 'bw+') as target_doc:
        logging.info(f'Pass-through {doc.mime_type} file {doc.path}')
        target_doc.write(doc.data_stream.read())
    return target


203
def decode(s, encodings=('utf8', 'latin1', 'ascii')):
    """ Decode bytes to text, trying each encoding in order

    :param s: the bytes to be decoded
    :param encodings: the encodings to try, in order of preference. Note that
      latin1 can decode any byte sequence, so with the default value the
      entries after it are never reached
    :returns: the text from the first encoding that decodes `s`. When none
      does, `s` decoded as ascii with undecodable bytes dropped
    """
    for encoding in encodings:
        try:
            return s.decode(encoding)
        except UnicodeDecodeError:
            continue
    return s.decode('ascii', 'ignore')


212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
def read_front_matter(text):
    """ Split a doc into its body and its front matter

    :param text: the full content of the doc
    :returns: a tuple (content, front matter). The front matter is whatever
      the YAML loader returns for the text between the first two boundaries.
      When `text` does not start with a boundary, or holds a single one,
      the tuple is (text, None)
    """
    if not FM_BOUNDARY.match(text):
        return text, None

    pieces = FM_BOUNDARY.split(text, 2)
    if len(pieces) != 3:
        # The opening boundary was never closed
        return text, None

    raw_fm, body = pieces[1], pieces[2]
    # Drop the line break that follows the closing boundary
    if body.startswith('\n'):
        body = body[1:]
    return body, YAML().load(raw_fm)

def write_front_matter(target_doc, fm_doc, fm_config):
    """ Write the merged front matter to the target doc

    The values in `fm_config` take precedence over those in `fm_doc`.
    Nothing is written when the merged front matter is empty.

    :param target_doc: a writable text stream
    :param fm_doc: the front matter found in the doc, or None. It is
      updated in place with `fm_config`
    :param fm_config: the front matter configured for the doc, or None
    """
    fm_doc = fm_doc or {}
    fm_config = fm_config or {}
    fm_doc.update(fm_config)
    if not fm_doc:
        return
    # The YAML block is wrapped in the FM_WRAPPER_OPEN / FM_WRAPPER_CLOSE
    # markers, a form that FM_BOUNDARY recognises as a boundary too
    target_doc.write(FM_WRAPPER_OPEN)
    target_doc.write(YAML_SEPARATOR)
    YAML().dump(fm_doc, target_doc)
    target_doc.write(YAML_SEPARATOR)
    target_doc.write(FM_WRAPPER_CLOSE)
236 237 238 239 240 241 242 243

def transform_links_doc(text, base_path, local_files, rewrite_path, rewrite_url):
    """ Transform all the links in the text

    :param text: the markdown content of the doc
    :param base_path: the folder where the source document lives
    :param local_files: a dict source file -> (target file, folder) of all
      the sync'ed files
    :param rewrite_path: the path links to sync'ed files are rewritten to
    :param rewrite_url: the URL links to other files are rewritten to
    :returns: `text` with every `(href)` replaced by `(rewritten href)`
    """
    links = get_links(text)
    # Rewrite map, only use links with an href
    rewrite_map = {
        x.get("href"): transform_link(
            x.get("href"), base_path, local_files, rewrite_path, rewrite_url)
        for x in links if x.get("href")}
    # Only the `(href)` form is replaced, so that text that happens to match
    # an href outside of a markdown link target is left alone
    for source, target in rewrite_map.items():
        text = text.replace(f'({source})', f'({target})')
    return text
246 247


248 249 250
def get_links(md):
    """ Return a list of all the links in a string formatted in markdown

    The markdown is rendered to HTML first, and the `a` elements are then
    collected from the HTML.

    :param md: the markdown text
    :returns: the `a` elements found, as returned by BeautifulSoup
    """
    html = markdown.markdown(md)
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find_all("a")
253 254 255


def transform_link(link, base_path, local_files, rewrite_path, rewrite_url):
    """ Transform hrefs to be valid URLs on the web-site

    Relative URLs are rewritten to `rewrite_path` when `link`
    points to a sync'ed file. Else they're rewritten to `rewrite_url`.
    Absolute URLs are not changed (they may be external)
    Fragments are relative to the page and do not need changes,
    except for lower() on local files because hugo generated
    anchors are always lower case.

    :param link: the link to be re-written
    :param base_path: the folder where the source document that contains
      the link lives
    :param local_files: a dict source file -> (target file, folder) that
      maps sync'ed files from their fully qualified source name into their
      filename in the site folder
    :param rewrite_path: the file local (sync'ed) files are rewritten to
    :param rewrite_url: the URL remote files are rewritten to
    :returns: the rewritten link

    :note: urlparse treats URLs without scheme like path only URLs,
      so 'github.com' will be rewritten to 'rewrite_url/github.com'
    """
    # ignore empty links
    if not link:
        return link
    # urlparse returns a named tuple
    parsed = urlparse(link)
    if is_absolute_url(parsed):
        return link
    if is_fragment(parsed):
        # A fragment only link points to an .md file
        return urlunparse(parsed._replace(fragment=parsed.fragment.lower()))

    # The list of local_files includes paths based on the root of the git
    # repo, so we need to join base_path and normalize to fq_path to find
    # the link in the list of local files
    fq_path = os.path.normpath(os.path.join(base_path, parsed.path))
    if fq_path in local_files:
        target_file = local_files[fq_path][0]
        target_folder = local_files[fq_path][1]
        is_index = (target_file == FOLDER_INDEX)
        filename, ext = os.path.splitext(target_file)
        # Special handling for md files
        if ext == '.md':
            # Links to the index file are rendered as base_path/
            if is_index:
                target_file = ''
            # links to other md files are rendered as .../[md filename]/
            else:
                target_file = filename + '/'
            # for .md files, lower the case of fragments to match hugo's behaviour
            parsed = parsed._replace(fragment=parsed.fragment.lower())
        if target_folder:
            new_path = [rewrite_path, target_folder, target_file]
        else:
            new_path = [rewrite_path, target_file]
        return parsed._replace(path="/".join(new_path)).geturl()
    # when not found on disk, append to the base_url
    return urljoin(rewrite_url, parsed._replace(path=fq_path).geturl())
314 315 316 317 318 319 320 321


def is_absolute_url(parsed_url):
    """ Tell whether a parsed URL has both a scheme and a network location

    :param parsed_url: a URL as returned by `urlparse`
    :returns: True when both `scheme` and `netloc` are set, False otherwise
    """
    return bool(parsed_url.scheme) and bool(parsed_url.netloc)


def is_fragment(parsed_url):
    """ Determine if the URL is a fragment-only link (e.g. `#anchor`)

    :param parsed_url: a URL as returned by `urlparse`
    :returns: True when the fragment is set and every other component of
      the URL is empty, False otherwise
    """
    # The fragment is the last component of the parsed URL, so the slice
    # covers all the components that must be empty
    return bool(parsed_url.fragment) and not any(parsed_url[:-1])
324

M
michaelawyu 已提交
325

326 327
def download_resources_to_project(yaml_list, clones):
    """ Sync the docs of every configured component from its local clone

    The YAML sync spec can be found in sync/config/README.md
    Exits the process with status 1 when a repository has no clone.

    :param yaml_list: a list of sync configs, one per component
    :param clones: a dict repository -> local clone of the repository
    """
    for entry in yaml_list:
        component = entry['component']
        repository = entry['repository']
        local_clone = clones.get(repository)
        if not local_clone:
            logging.error(f'No git clone found for {repository} in {clones}')
            sys.exit(1)

        for index, tag in enumerate(entry['tags']):
            logging.info(f'Syncing {component}@{tag["name"]}')
            link_base_url = f'{repository}/tree/{tag["name"]}/'
            if index == 0:
                # the first tag belongs on the home page
                base_path = f'/docs/{component}'.lower()
                site_dir = f'{CONTENT_DIR}/{component}'
            else:
                # the other tags belong in the other versions a.k.a vault
                base_path = f'/vault/{component}-{tag["displayName"]}'
                site_dir = f'{VAULT_DIR}/{component}-{tag["displayName"]}'
            os.makedirs(site_dir, exist_ok=True)

            results = transform_docs(
                git_repo=local_clone,
                tag=tag['name'],
                folders=tag['folders'],
                site_folder=site_dir,
                base_path=base_path,
                base_url=link_base_url)
            logging.debug(f'Finished syncing {component}@{tag["name"]}: ')
            logging.debug(f'{results}')
P
popcor255 已提交
360 361


362 363
def get_files_in_path(path, file_type):
    """ Return a list of all the files in path that match the file_type

    :param path: the folder to search, including all its sub folders
    :param file_type: the suffix a file name must end with, e.g. `.yaml`
    :returns: the list of matching files, each joined to the folder it was
      found in
    """
    return [
        os.path.join(root, name)
        for root, _, names in os.walk(path)
        for name in names
        if name.endswith(file_type)
    ]
M
michaelawyu 已提交
374 375


376
def load_config(files):
    """ Load a list of YAML files

    :param files: the paths of the YAML files to be loaded
    :returns: a list with one dict per file, holding the path under
      `filename` and the loaded YAML under `content`
    """
    yaml = YAML()
    dic_list = []

    for file in files:
        with open(file, 'r') as text:
            dic_list.append({
                "filename": file,
                "content": yaml.load(text)
            })

    return dic_list


392 393 394 395
def save_config(config):
    """ Save config files back to YAML

    :param config: a list of dicts, each with the path to write to under
      `filename` and the data to dump under `content`
    """
    yaml = YAML()
    for c in config:
        with open(c['filename'], 'w') as out:
            yaml.dump(c['content'], out)
398 399


A
Andrea Frittoli 已提交
400
def get_tags(sync_config):
    """ Return the list of tags of a component, with their name and displayName

    :param sync_config: the sync config of a component
    :returns: a list with one dict per entry in `sync_config['tags']`,
      reduced to the `name` and `displayName` keys
    """
    return [
        {'name': tag['name'], 'displayName': tag['displayName']}
        for tag in sync_config['tags']
    ]


def get_versions(sync_configs):
    """ Return the list of all the components with their tags and archive

    :param sync_configs: a list of sync configs, one per component
    :returns: a list with one dict per component, holding the component
      under `name`, its tags (see `get_tags`) under `tags` and the
      configured archive under `archive`
    """
    return [
        {
            'name': sync_config['component'],
            'tags': get_tags(sync_config),
            'archive': sync_config['archive']
        }
        for sync_config in sync_configs
    ]


P
popcor255 已提交
420 421 422
def create_resource(dest_prefix, file, versions):
    """ Create a site resource based on the versions and file

    The resource is rendered from the template named `<file>.template`.
    Only .js and .md files are supported: any other file is skipped with
    a warning.

    :param dest_prefix: the folder the resource is written to
    :param file: the name of the resource
    :param versions: the component versions made available to the template,
      serialized to JSON for .js files
    """
    resource_template = jinja_env.get_template(f'{file}.template')
    if file.endswith(".js"):
        serialize = json.dumps(versions)
        resource = resource_template.render(component_versions_json=serialize)
    elif file.endswith(".md"):
        resource = resource_template.render(component_versions=versions)
    else:
        logging.warning(f'Cannot create resource for {file}. Only .js and .md supported')
        return

    with open(f'{dest_prefix}/{file}', 'w') as f:
        f.write(resource)

435

A
Andrea Frittoli 已提交
436
def clone_repo(repo, update):
    """ Make a local clone of a repo available in the cache folder

    :param repo: the URL of the repo; its last path segment names the clone
      folder under DEFAULT_CACHE_FOLDER
    :param update: when the clone folder exists already, whether to update
      it. !! Updating resets the clone and removes untracked files !!
    :returns: a tuple (repo, gitpython Repo of the local clone)
    """
    project = repo.split('/')[-1]
    clone_dir = os.path.join(DEFAULT_CACHE_FOLDER, project)

    if os.path.isdir(clone_dir):
        if not update:
            print(f'{project}: Cache folder {clone_dir} found, skipping clone.')
            return repo, git.Repo(clone_dir)
        # Cleanup and update via fetch --all
        print(f'{project}: updating started')
        cloned_repo = git.Repo(clone_dir)
        cloned_repo.git.reset('--hard')
        cloned_repo.git.clean('-xdf')
        cloned_repo.git.fetch('--all')
        print(f'{project}: updating completed')
        return repo, cloned_repo

    # Clone the repo
    print(f'{project}: cloning started')
    cloned_repo = git.Repo.clone_from(repo, clone_dir)
    print(f'{project}: cloning completed')
    return repo, cloned_repo


A
Andrea Frittoli 已提交
460
def clone_repos(sync_configs, update):
    """ Clone (or update) the repos of all the components, in parallel

    :param sync_configs: a list of sync configs, one per component; the
      repo is read from `repository`
    :param update: passed to `clone_repo` for every repo
    :returns: a dict repository -> clone, built from the tuples returned
      by `clone_repo`
    """
    # Make sure the cache folder exists
    safe_makedirs(DEFAULT_CACHE_FOLDER)

    with Pool() as pool:
        results = pool.starmap(clone_repo, [(x['repository'], update) for x in sync_configs])
    return dict(results)


469 470 471
@click.command()
@click.option('--config-folder', default=DEFAULT_CONFIG_FOLDER,
              help='the folder that contains the config files')
@click.option('--update-cache/--no-update-cache', default=False,
              help='update clone caches. !! This will force cleanup caches !!')
def sync(config_folder, update_cache):
    """ fetch all the files and sync it to the website """
    # load every .yaml sync config found under the config folder
    config_files = get_files_in_path(config_folder, ".yaml")
    config = [x["content"] for x in load_config(config_files)]
    # clone all relevant repos
    clones = clone_repos(config, update_cache)
    # download resources from the clone cache
    download_resources_to_project(config, clones)
    versions = get_versions(config)
    # create version switcher script
    create_resource(JS_ASSET_DIR, "version-switcher.js", versions)
    # create index for vault
    create_resource(VAULT_DIR, FOLDER_INDEX, versions)
popcor255 已提交
488

B
Billy Lynch 已提交
489 490

# Run the sync only when executed as a script, not when imported
if __name__ == '__main__':
    sync()