md-split.py 7.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
#! /usr/bin/env python

# A script that splits a Markdown file into plain text (for spell checking) and C++ files.


from __future__ import absolute_import, print_function, unicode_literals

import os
import shutil
import io
import argparse

import re, cgi
# Matches an HTML comment (non-greedy) or any single <...> tag.
TAG_REGEX = re.compile(r'(<!--.*?-->|<[^>]*>)')
# Captures the quoted value of a name="..." attribute (optional single space
# around '='). The leading '.*' is greedy, so when a line carries several
# name attributes the last one is captured.
NAMED_A_TAG_REGEX = re.compile(r'.*name ?= ?"([^"]*)"')

def main():
    """
    Split a Markdown file into a spell-checkable text file and C++ snippet files.

    This script ended up ugly, so in case somebody wants to reimplement it,
    here is the spec that grew over time.

    It takes a markdown file and splits it into several files. The targetfile
    should have the same number of lines as the original, with source code
    snippets and markdown non-words removed, for spell-checking.

    Each code snippet should go into a separate file in codedir.

    Each code snippet should get additional C++ code around it to help compile
    the lines in context, with some heuristic guessing of what is needed
    around. The wrapping code should have a token in each line allowing other
    tools to filter out these lines.

    The name chosen for each file consists of the section id in the markdown
    document and a counter for the snippet inside the section.

    Snippets without code (only comments) or containing lines starting with
    ??? should not yield files, but the counter for naming snippets should
    still increment.
    """
    parser = argparse.ArgumentParser(description='Split md file into plain text and code blocks')
    parser.add_argument('sourcefile', help='which file to read')
    parser.add_argument('targetfile', help='where to put plain text')
    parser.add_argument('codedir', help='where to put codeblocks')
    args = parser.parse_args()

    # the snippet directory must exist before any snippet file is written
    if not os.path.exists(args.codedir):
        os.makedirs(args.codedir)

    # drop any text file left over from a previous run
    if os.path.exists(args.targetfile):
        os.remove(args.targetfile)

    snippet_counter = 0
    current_section = ''
    lines_read = 0
    with io.open(args.sourcefile, 'r') as markdown, io.open(args.targetfile, 'w') as plain_text:
        for line in markdown:
            lines_read += 1
            depth = is_code(line)
            if depth:
                # process_code consumes the whole snippet and hands back the
                # first line it did not treat as snippet content
                line, lines_read = process_code(markdown, plain_text,
                                                line, lines_read,
                                                args.sourcefile, args.codedir,
                                                current_section, snippet_counter,
                                                depth)
                snippet_counter += 1

            # Either the line was never code, or a snippet was just consumed
            # and `line` is now whatever followed it. A line that still counts
            # as code at this depth is not copied to the text output.
            if depth >= 4 and is_code(line, depth):
                continue

            # remember the section id so snippets can be named after it
            marker = get_marker(line)
            if marker is not None:
                snippet_counter = 0
                current_section = marker
            plain_text.write(stripped(line))

    assert line_length(args.sourcefile) == line_length(args.targetfile)

81 82 83 84 85

def process_code(read_filehandle, text_filehandle, line, linenum, sourcefile, codedir, name, index, indent_depth):
    """Consume one code snippet from the markdown source.

    For every source line consumed, exactly one line is written to
    text_filehandle (the text after a '//' comment marker, or an empty line),
    so the text file keeps the same line numbering as the source. The snippet
    lines are collected and, if they contain real code and no '???' / '...'
    placeholder line, written to '<codedir>/<name><index>.cpp' via
    write_with_harness.

    Args:
        read_filehandle: open source file, positioned just after `line`.
        text_filehandle: open plain-text output file.
        line: the first line of the snippet (already read by the caller).
        linenum: number of source lines read so far, including `line`.
        sourcefile: source file name, recorded in the generated file.
        codedir: directory that receives the snippet file.
        name: section id used as the snippet file name prefix.
        index: snippet counter within the section, appended to `name`.
        indent_depth: indentation depth of `line` as reported by is_code.

    Returns:
        A tuple (line, linenum): the first line that was read but is not part
        of the snippet body ('' at end of file, the closing fence for a
        fenced snippet) and the updated count of source lines read.
    """
    fenced = (line.strip() == '```')
    if fenced:
        # The opening fence occupies one source line: mirror it with an empty
        # line so the text output stays aligned, then move on to the body.
        text_filehandle.write('\n')
        line = read_filehandle.readline()
        if not line:
            # readline() reports end of file by returning '', it never raises
            # StopIteration.
            return ('', linenum)
        linenum += 1
    start_linenum = linenum
    has_actual_code = False
    has_question_marks = False
    linebuffer = []
    # `line` is '' only at end of file; stop there instead of spinning on the
    # empty string, which both loop conditions would otherwise accept forever.
    while line and ((fenced and line.strip() != '```')
                    or (not fenced and is_inside_code(line, indent_depth))):
        stripped_line = line.strip()
        # copy comments to plain text for spell check
        comment_idx = line.find('//')
        if comment_idx >= 0:
            no_comment_line = line[:comment_idx].strip()
            text_filehandle.write(line[comment_idx + 2:])
        else:
            # Strip here as well: the raw line carries indentation and a
            # newline and could never equal a bare placeholder.
            no_comment_line = stripped_line
            # write empty line so line numbers stay stable
            text_filehandle.write('\n')

        if (not has_actual_code
                and stripped_line
                and not stripped_line.startswith(('//', '???'))):
            has_actual_code = True

        if stripped_line != '```':
            if no_comment_line in ('???', '...'):
                has_question_marks = True
            linebuffer.append(dedent(line, indent_depth) if not fenced else line)

        line = read_filehandle.readline()
        if line:
            linenum += 1

    codefile = os.path.join(codedir, '%s%s.cpp' % (name, index))
    if fenced and line:
        # Empty line standing in for the closing fence. Skipped at end of
        # file, where there is no closing fence line to account for.
        text_filehandle.write('\n')

    if has_actual_code and not has_question_marks:
        linebuffer = clean_trailing_newlines(linebuffer)
        write_with_harness(codefile, sourcefile, start_linenum, linebuffer)
    return (line, linenum)


132 133 134 135 136 137 138 139 140 141 142 143 144
def clean_trailing_newlines(linebuffer):
    """Return a copy of linebuffer without its trailing newline-only lines.

    Only entries that are exactly a single newline character are dropped;
    lines holding other whitespace are kept. The input list is left
    untouched (it is not reversed or otherwise modified in place).
    """
    end = len(linebuffer)
    while end > 0 and linebuffer[end - 1] == '\n':
        end -= 1
    return linebuffer[:end]


145 146 147 148 149 150 151
def write_with_harness(codefile, sourcefile, start_linenum, linebuffer):
    '''Write the snippet to codefile, preceded by a fixed C++ preamble.

    The preamble pulls in commonly used headers so that the snippet lines are
    more likely to compile, and ends with a comment recording the source file
    and the line number the snippet started at. Each preamble line carries
    the "by md-split" token, except the final source-location comment.
    '''
    # This is work in progress, the main issue remains handling class
    # declarations in in-function code differently
    preamble = '''\
#include<stdio.h>      // by md-split
#include<stdlib.h>     // by md-split
#include<tuple>        // by md-split
#include<utility>      // by md-split
#include<limits>       // by md-split
#include<functional>   // by md-split
#include<string>       // by md-split
#include<map>          // by md-split
#include<iostream>     // by md-split
#include<vector>       // by md-split
#include<algorithm>    // by md-split
#include<memory>       // by md-split
using namespace std;   // by md-split
// %s : %s
''' % (sourcefile, start_linenum)
    with io.open(codefile, 'w') as out:
        out.write(preamble)
        # TODO: if not toplevel code, wrap inside class
        out.writelines(linebuffer)
170 171 172


def is_code(line, indent_depth = 4):
    '''Return how many spaces the line is indented by, or 0 if it is not code.

    A line counts as code in the markup when it begins with at least
    indent_depth space characters; tabs do not count.
    '''
    required_prefix = ' ' * indent_depth
    if not line.startswith(required_prefix):
        return 0
    without_indent = line.lstrip(' ')
    return len(line) - len(without_indent)

def is_inside_code(line, indent_depth):
    '''Tell whether the line still belongs to an indented code snippet.

    Blank (whitespace-only or empty) lines count as inside, as do lines that
    is_code accepts at the given indent depth.
    '''
    if not line.strip():
        return True
    return is_code(line, indent_depth) > 0
180 181 182 183 184 185 186

def stripped(line):
    '''Return the line prepared for spell checking.

    HTML tags and comments matched by TAG_REGEX are removed, then the markdown
    punctuation characters ( ) [ ] # * are each replaced by a space.
    '''
    # Remove well-formed html tags, fixing mistakes by legitimate users
    sline = TAG_REGEX.sub('', line)
    # Continue from the tag-free text; substituting on the raw line again
    # would throw away the tag removal above.
    sline = re.sub(r'[()\[\]#*]', ' ', sline)
    return sline

187 188 189
def dedent(line, indent_depth):
    '''Strip one level of snippet indentation from the line.

    Removes indent_depth leading spaces when the line starts with that many;
    otherwise removes a single leading tab if there is one; otherwise returns
    the line unchanged.
    '''
    if line.startswith(' ' * indent_depth):
        cut = indent_depth
    elif line.startswith('\t'):
        cut = 1
    else:
        cut = 0
    return line[cut:]

def get_marker(line):
    '''Return the name="..." value found on a line containing a tag, else None.

    The line must contain something TAG_REGEX matches (an HTML tag or
    comment); the value is then taken from NAMED_A_TAG_REGEX matched against
    the whole line.
    '''
    if TAG_REGEX.search(line) is None:
        return None
    named = NAMED_A_TAG_REGEX.match(line)
    # group 0 would be the full match; group 1 is the attribute value
    return named.group(1) if named else None

203 204 205
def line_length(filename):
    '''Return the number of lines in the file called filename.

    A final line without a trailing newline still counts as a line. The file
    is closed before returning.
    '''
    with open(filename) as handle:
        return sum(1 for _ in handle)

206 207
# Run the splitter only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()