gh-md-toc 10.6 KB
Newer Older
李启龙 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
#!/usr/bin/env bash

#
# Steps:
#
#  1. Download corresponding html file for some README.md:
#       curl -s $1
#
#  2. Discard rows that do not contain the substring 'user-content-' (GitHub's markup):
#       awk '/user-content-/ { ...
#
#  3.1 Get the last character of each row like ' ... </span></a>sitemap.js</h1'.
#      It is the level of the current header:
#       substr($0, length($0), 1)
#
#  3.2 Take the level from 3.1 and insert the corresponding number of spaces before '*':
#       sprintf("%*s", (level-1)*3, "")
#
#  4. Find the header's text and insert it inside "* [ ... ]":
#       substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5)
#
#  5. Find the anchor and insert it inside "(...)":
#       substr($0, match($0, "href=\"[^\"]+?\"")+6, RLENGTH-7)
#

# Release number printed by the --help and --version handlers.
gh_toc_version='0.7.0'

# User-Agent string handed to curl/wget for the HTTP requests below.
gh_user_agent="gh-md-toc v${gh_toc_version}"

#
# Download the html rendering of a README.md by its url.
#
# Globals:   gh_user_agent (read) - sent as the User-Agent header
# Arguments: $1 - url of the document
# Outputs:   the response body on stdout; a diagnostic on stderr when
#            neither curl nor wget can be found
# Exits:     1 when neither curl nor wget is installed
#
gh_toc_load() {
    local gh_url=$1

    if type curl &>/dev/null; then
        curl --user-agent "$gh_user_agent" -s "$gh_url"
    elif type wget &>/dev/null; then
        wget --user-agent="$gh_user_agent" -qO- "$gh_url"
    else
        # The caller pipes stdout into the TOC parser, so the message has to
        # go to stderr; otherwise it would be swallowed and never displayed.
        echo "Please, install 'curl' or 'wget' and try again." >&2
        exit 1
    fi
}

#
# Converts a local md file into html through the GitHub API.
#
# Example of the underlying API:
# -> curl -X POST --data '{"text": "Hello world github/linguist#1 **cool**, and #1!"}' https://api.github.com/markdown
# <p>Hello world github/linguist#1 <strong>cool</strong>, and #1!</p>
#
# Globals:   GH_TOC_TOKEN (read)  - optional GitHub auth token; when it is
#                                   empty the token is read from a token.txt
#                                   file located next to this script, if any
#            gh_user_agent (read) - sent as the User-Agent header
# Arguments: $1 - path to the markdown file
# Outputs:   the rendered html on stdout, or exactly one of the marker
#            strings "XXNetworkErrorXX" / "XXRateLimitXX"
#
gh_toc_md2html() {
    local gh_file_md=$1
    local url=https://api.github.com/markdown/raw
    # Start from an empty token so that nothing left over from a previous
    # call or inherited from the environment is sent to the API.
    local token=""
    local token_file
    local output
    # Extra curl arguments; stays empty without a token so that curl is not
    # handed an empty header.
    local -a auth_args=()

    if [ -n "$GH_TOC_TOKEN" ]; then
        token=$GH_TOC_TOKEN
    else
        token_file="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/token.txt"
        if [ -f "$token_file" ]; then
            # Quoted: the script may live in a directory containing spaces.
            token="$(cat "$token_file")"
        fi
    fi
    if [ -n "$token" ]; then
        auth_args=(-H "Authorization: token ${token}")
    fi

    if ! output=$(curl -s \
        --user-agent "$gh_user_agent" \
        --data-binary @"$gh_file_md" \
        -H "Content-Type:text/plain" \
        "${auth_args[@]}" \
        "$url"); then
        # Emit only the marker: callers compare the whole output against it,
        # so any partial response must not be appended.
        echo "XXNetworkErrorXX"
        return
    fi

    case "$output" in
        *"API rate limit exceeded"*)
            echo "XXRateLimitXX";;
        *)
            printf '%s\n' "$output";;
    esac
}


#
# Tells whether the passed string is an http(s) url.
#
# Arguments: $1 - string to check
# Outputs:   "yes" when $1 starts with http:// or https://, "no" otherwise
#
gh_is_url() {
    case "$1" in
        # Require the scheme separator so that local files whose names merely
        # start with "http" (e.g. http-notes.md) are not taken for urls.
        http://* | https://*)
            echo "yes";;
        *)
            echo "no";;
    esac
}

#
# TOC generator
#
# Prints the TOC of one document and, on request, inserts it into the local
# source file between the <!--ts--> and <!--te--> marker lines.
#
# Arguments: $1 - url or local path of the markdown document
#            $2 - total number of documents being processed; the
#                 "Table of Contents" heading is printed only when it is 1
#            $3 - "yes" to insert the TOC into the (local) file
#            $4 - "yes" to delete the backup and the TOC file after inserting
#            $5 - "yes" to leave out the "Added by" footer
# Outputs:   the TOC and progress messages on stdout, fatal errors on stderr
# Exits:     1 on a missing source, a load/convert failure or missing markers
#
gh_toc(){
    local gh_src=$1
    local gh_src_copy=$1
    local gh_ttl_docs=$2
    local need_replace=$3
    local no_backup=$4
    local no_footer=$5

    if [ "$gh_src" = "" ]; then
        echo "Please, enter URL or local path for a README.md" >&2
        exit 1
    fi

    # Show "TOC" string only if working with one document
    if [ "$gh_ttl_docs" = "1" ]; then
        echo "Table of Contents"
        echo "================="
        echo ""
        # With a single document the links are not prefixed with its url.
        gh_src_copy=""
    fi

    if [ "$(gh_is_url "$gh_src")" == "yes" ]; then
        gh_toc_load "$gh_src" | gh_toc_grab "$gh_src_copy"
        # PIPESTATUS has to be read right after the pipeline; [0] is the loader.
        if [ "${PIPESTATUS[0]}" != "0" ]; then
            echo "Could not load remote document." >&2
            echo "Please check your url or network connectivity" >&2
            exit 1
        fi
        if [ "$need_replace" = "yes" ]; then
            echo
            echo "!! '$gh_src' is not a local file"
            echo "!! Can't insert the TOC into it."
            echo
        fi
    else
        local rawhtml
        rawhtml=$(gh_toc_md2html "$gh_src")
        if [ "$rawhtml" == "XXNetworkErrorXX" ]; then
            echo "Parsing local markdown file requires access to github API" >&2
            echo "Please make sure curl is installed and check your network connectivity" >&2
            exit 1
        fi
        if [ "$rawhtml" == "XXRateLimitXX" ]; then
            local token_file
            token_file="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/token.txt"
            echo "Parsing local markdown file requires access to github API" >&2
            echo "Error: You exceeded the hourly limit. See: https://developer.github.com/v3/#rate-limiting" >&2
            echo "or place GitHub auth token here: ${token_file}" >&2
            exit 1
        fi
        local toc
        toc=$(echo "$rawhtml" | gh_toc_grab "$gh_src_copy")
        echo "$toc"
        if [ "$need_replace" = "yes" ]; then
            # The path is quoted everywhere below so that files with spaces
            # in their names can be processed.
            if grep -Fxq "<!--ts-->" "$gh_src" && grep -Fxq "<!--te-->" "$gh_src"; then
                echo "Found markers"
            else
                echo "You don't have <!--ts--> or <!--te--> in your file...exiting" >&2
                exit 1
            fi
            local ts="<\!--ts-->"
            local te="<\!--te-->"
            local dt
            dt=$(date +'%F_%H%M%S')
            local ext=".orig.${dt}"
            local toc_path="${gh_src}.toc.${dt}"
            local toc_footer
            toc_footer="<!-- Added by: $(whoami), at: $(date) -->"
            # http://fahdshariff.blogspot.ru/2012/12/sed-mutli-line-replacement-between-two.html
            # clear old TOC (sed keeps the original as "${gh_src}${ext}")
            sed -i"${ext}" "/${ts}/,/${te}/{//!d;}" "$gh_src"
            # create toc file
            echo "${toc}" > "${toc_path}"
            if [ "${no_footer}" != "yes" ]; then
                echo -e "\n${toc_footer}\n" >> "$toc_path"
            fi

            # insert toc file (BSD sed on macOS needs an explicit empty
            # backup suffix after -i)
            if [[ "$(uname)" == "Darwin" ]]; then
                sed -i "" "/${ts}/r ${toc_path}" "$gh_src"
            else
                sed -i "/${ts}/r ${toc_path}" "$gh_src"
            fi
            echo
            if [ "${no_backup}" = "yes" ]; then
                rm -- "${toc_path}" "${gh_src}${ext}"
            fi
            echo "!! TOC was added into: '$gh_src'"
            if [ -z "${no_backup}" ]; then
                echo "!! Origin version of the file: '${gh_src}${ext}'"
                echo "!! TOC added into a separate file: '${toc_path}'"
            fi
            echo
        fi
    fi
}

#
# Grabber of the TOC from rendered html
#
# Reads html on stdin and prints one markdown list item per header line in
# the form "* [text](url + anchor)", indented by 3 spaces per header level
# below the first.
#
# $1 - a source url of the document; it is put in front of every anchor.
# It is needed if the TOC is generated for multiple documents.
#
gh_toc_grab() {
    # Everything below is private to this function.
    local common_awk_script grepcmd echoargs awkscript

    # Shared tail of the awk program: rewrites the anchor ("+" becomes a
    # space, "%" becomes "\x") and prints the finished list item.
    common_awk_script='
                     modified_href = ""
                     split(href, chars, "")
                     for (i=1;i <= length(href); i++) {
                         c = chars[i]
                         res = ""
                         if (c == "+") {
                             res = " "
                         } else {
                             if (c == "%") {
                                 res = "\\x"
                             } else {
                                 res = c ""
                             }
                         }
                         modified_href = modified_href res
                    }
                    print sprintf("%*s", (level-1)*3, "") "* [" text "](" gh_url  modified_href ")"
                    '
    if [ "$(uname -s)" == "OS/390" ]; then
        grepcmd="pcregrep -o"
        echoargs=""
        awkscript='{
                     level = substr($0, length($0), 1)
                     text = substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5)
                     href = substr($0, match($0, "href=\"([^\"]+)?\"")+6, RLENGTH-7)
                     '"$common_awk_script"'
                }'
    else
        grepcmd="grep -Eo"
        # -e makes echo expand the "\x.." sequences produced by the awk script
        echoargs="-e"
        awkscript='{
                     level = substr($0, length($0), 1)
                     text = substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5)
                     href = substr($0, match($0, "href=\"[^\"]+?\"")+6, RLENGTH-7)
                     '"$common_awk_script"'
                }'
    fi

    # if closed <h[1-6]> is on the new line, then move it on the prev line
    # for example:
    #   was: The command <code>foo1</code>
    #        </h1>
    #   became: The command <code>foo1</code></h1>
    sed -e ':a' -e 'N' -e '$!ba' -e 's/\n<\/h/<\/h/g' |
    
    # find strings that corresponds to template
    # ($grepcmd is left unquoted on purpose: it holds a command plus an option)
    $grepcmd '<a.*id="user-content-[^"]*".*</h[1-6]' |
    
    # remove code tags
    sed 's/<code>//g' | sed 's/<\/code>//g' |

    # remove g-emoji
    sed 's/<g-emoji[^>]*[^<]*<\/g-emoji> //g' |
    
    # now all rows are like:
    #   <a id="user-content-..." href="..."><span ...></span></a> ... </h1
    # format result line
    #   * $0 - whole string
    #   * last element of each row: "</hN" where N in (1,2,3,...)
    echo $echoargs "$(awk -v "gh_url=$1" "$awkscript")"
}

        # perl -lpE 's/(\[[^\]]*\]\()(.*?)(\))/my ($pre, $in, $post)=($1, $2, $3) ; $in =~ s{\+}{ }g; $in =~ s{%}{\\x}g; $pre.$in.$post/ems')"

#
# Prints the last path component of a full path or url,
# i.e. everything after the final "/".
#
gh_toc_get_filename() {
    local full_path=$1
    local file_name=${full_path##*/}

    echo "$file_name"
}

#
# Options handlers
#
# Parses the command line and runs the requested action.
#
# Arguments: --help | --version | - | [--insert] [--no-backup] [--hide-footer] src...
#            With no arguments the help text is shown. The three leading
#            options may be given in any order.
# Outputs:   help/version text, or the TOC of every src followed by a
#            "Created by" line
#
gh_toc_app() {
    local need_replace="no"
    local no_backup=""
    local no_footer=""

    if [ "$1" = '--help' ] || [ $# -eq 0 ] ; then
        local app_name
        app_name=$(basename "$0")
        echo "GitHub TOC generator ($app_name): $gh_toc_version"
        echo ""
        echo "Usage:"
        echo "  $app_name [--insert] [--hide-footer] src [src]  Create TOC for a README file (url or local path)"
        echo "  $app_name [--no-backup] [--hide-footer] src [src]  Create TOC without backup, requires <!--ts--> / <!--te--> placeholders"
        echo "  $app_name -                     Create TOC for markdown from STDIN"
        echo "  $app_name --help                Show help"
        echo "  $app_name --version             Show version"
        return
    fi

    if [ "$1" = '--version' ]; then
        echo "$gh_toc_version"
        echo
        echo "os:     $(lsb_release -d | cut -f 2)"
        echo "kernel: $(cat /proc/version)"
        echo "shell:  $("$SHELL" --version)"
        echo
        local tool
        for tool in curl wget grep awk sed; do
            printf "%-5s: " "$tool"
            echo "$("$tool" --version | head -n 1)"
        done
        return
    fi

    if [ "$1" = "-" ]; then
        # Work on a local copy so that the caller's TMPDIR is not modified.
        local tmp_dir=${TMPDIR:-/tmp}
        if [ ! -d "$tmp_dir" ]; then
            mkdir -p "$tmp_dir"
        fi
        local gh_tmp_md
        if [ "$(uname -s)" == "OS/390" ]; then
            local timestamp
            timestamp=$(date +%m%d%Y%H%M%S)
            gh_tmp_md="$tmp_dir/tmp.$timestamp"
        else
            gh_tmp_md=$(mktemp "$tmp_dir/tmp.XXXXXX")
        fi
        # Copy stdin verbatim: a plain `read` loop would strip indentation
        # (turning indented code into headers), eat backslashes and drop a
        # last line that has no trailing newline.
        cat > "$gh_tmp_md"
        gh_toc_md2html "$gh_tmp_md" | gh_toc_grab ""
        local rc=$?
        # The temporary copy is not needed once the TOC has been printed.
        rm -f -- "$gh_tmp_md"
        return "$rc"
    fi

    # Leading options; parsing stops at the first argument that is not one
    # of them, and everything from there on is treated as a source.
    while [ $# -gt 0 ]; do
        case "$1" in
            --insert)
                need_replace="yes";;
            --no-backup)
                need_replace="yes"
                no_backup="yes";;
            --hide-footer)
                need_replace="yes"
                no_footer="yes";;
            *)
                break;;
        esac
        shift
    done

    local md
    for md in "$@"
    do
        echo ""
        gh_toc "$md" "$#" "$need_replace" "$no_backup" "$no_footer"
    done

    echo ""
    echo "Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)"
}

#
# Entry point: hand all command-line arguments over to the options handler.
#
gh_toc_app "$@"