#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-

from urllib.parse import urljoin
import sys
import json
from pyquery import PyQuery as pq
import time
from os import path
import re
from concurrent.futures import ThreadPoolExecutor
from GenEpub import gen_epub
from . import *
from .util import *
from .img import *
from .config import config

def get_toc_from_cfg():
    """Return the table of contents described by the active configuration.

    A non-empty ``config['list']`` is returned unchanged.  Otherwise the page
    at ``config['url']`` is fetched with ``request_retry``, decoded using
    ``config['encoding']`` (undecodable bytes are ignored) and passed to
    ``get_toc`` together with that URL.

    If neither a list nor a URL is configured, a message is printed and the
    process is terminated through ``sys.exit()``.
    """
    # Truthiness covers both "not set" and "empty list".
    if config['list']:
        return config['list']

    if not config['url']:
        print('URL not specified')
        sys.exit()

    html = request_retry(
        'GET', config['url'],
        retry=config['retry'],
        headers=config['headers'],
        timeout=config['timeout'],
        proxies=config['proxy'],
    ).content.decode(config['encoding'], 'ignore')
    # get_toc is resolved at call time, so a module-level rebinding of the
    # name is honoured here.
    return get_toc(html, config['url'])
    
def get_toc(html, base):
    """Extract the ordered table of contents from an index page.

    Args:
        html: Markup of the index page.
        base: URL against which hrefs are resolved; when falsy the hrefs
            are used as found.

    Returns:
        A list of strings in document order.  Every element matched by
        ``config['link']`` that has no ``href`` contributes its stripped
        text.  Every other element contributes its URL with the
        ``#fragment`` removed; URLs that do not start with ``http`` and
        URLs that were already emitted are skipped.
    """
    root = pq(html)

    if config['remove']:
        root(config['remove']).remove()

    seen = set()
    toc = []
    for el_link in root(config['link']).items():
        href = el_link.attr('href')
        if not href:
            # No target: keep the element's text as a plain entry.
            toc.append(el_link.text().strip())
            continue

        # Drop the fragment so anchors into the same page collapse to one URL.
        url = re.sub(r'#.*$', '', href)
        if base:
            url = urljoin(base, url)
        if not url.startswith('http') or url in seen:
            continue
        seen.add(url)
        toc.append(url)

    return toc
    
def get_article(html, url):
    """Extract the title and body markup of one article page.

    Args:
        html: Markup of the article page.
        url: Address of the page; only used for the optional credit line.

    Returns:
        A dict with two keys: ``'title'`` is the stripped text of the first
        element matching ``config['title']``, and ``'content'`` is the inner
        HTML of every element matching ``config['content']`` joined with
        CRLF.  When ``config['credit']`` is truthy the content is prefixed
        with a blockquote linking back to ``url``.
    """
    # Imports the stdlib module; the ``html`` parameter is not involved.
    from html import escape as escape_html

    root = pq(html)

    if config['remove']:
        root(config['remove']).remove()

    el_title = root(config['title']).eq(0)
    title = el_title.text().strip()
    # Remove the title element from the tree before the content is serialized.
    el_title.remove()

    # NOTE(review): .html() may return None for an element without content
    # in some pyquery versions; fall back to '' so join() cannot fail.
    co = '\r\n'.join(
        el.html() or ''
        for el in root(config['content']).items()
    )

    if config['credit']:
        # The URL comes from scraped markup: escape it so '&' and quotes
        # cannot produce malformed markup or break out of the attribute.
        safe_url = escape_html(str(url), quote=True)
        credit = f"<blockquote>原文:<a href='{safe_url}'>{safe_url}</a></blockquote>"
        co = credit + co

    return {'title': title, 'content': co}
    
def tr_download_page(url, art, imgs):
    """Download one article page and fill ``art`` in place.

    Args:
        url: Address of the article page.
        art: Dict updated with the result of ``get_article``; its
            ``'content'`` entry is then replaced by the output of
            ``process_img``.
        imgs: Passed through to ``process_img`` unchanged.

    Any ``Exception`` is printed and swallowed.  ``art`` is then left as it
    was when the failure happened, which is empty if the download or parsing
    failed.  After a successful page the function sleeps for
    ``config['wait']`` seconds.
    """
    try:
        html = request_retry(
            'GET', url,
            retry=config['retry'],
            headers=config['headers'],
            timeout=config['timeout'],
            proxies=config['proxy'],
        ).content.decode(config['encoding'], 'ignore')
        art.update(get_article(html, url))
        art['content'] = process_img(
            art['content'], imgs,
            page_url=url,
            img_prefix='../Images/',
        )
        time.sleep(config['wait'])
    except Exception as ex:
        # Best effort: one failing page must not abort the other downloads.
        print(ex)

def main():
    """Entry point: load the configuration, download all pages, build the EPUB.

    The configuration file name is taken from ``sys.argv[1]`` and defaults
    to ``config.json``.  If the file does not exist a message is printed and
    the function returns.  Otherwise the user settings are merged into
    ``config``, the table of contents is resolved, every ``http`` entry is
    downloaded on a thread pool and the collected articles and images are
    passed to ``gen_epub``.
    """
    # These module-level names may be rebound below, so that the other
    # functions in this module pick up the externally supplied versions.
    global get_toc
    global get_article

    cfg_fname = sys.argv[1] if len(sys.argv) > 1 else 'config.json'
    if not path.exists(cfg_fname):
        print('please provide config file')
        return

    # Read through a context manager so the file handle is closed promptly.
    with open(cfg_fname, encoding='utf-8') as f:
        user_cfg = json.load(f)
    config.update(user_cfg)
    if config['proxy']:
        # Replace the single proxy value with a per-scheme mapping.
        config['proxy'] = {
            'http': config['proxy'],
            'https': config['proxy'],
        }
    set_img_pool(ThreadPoolExecutor(config['imgThreads']))
    if config['external']:
        mod = load_module(config['external'])
        get_toc = getattr(mod, 'get_toc', get_toc)
        get_article = getattr(mod, 'get_article', get_article)

    toc = get_toc_from_cfg()
    articles = []
    imgs = {}
    if config['name']:
        articles.append({
            'title': config['name'],
            'content': f"<p>来源:<a href='{config['url']}'>{config['url']}</a></p>",
        })

    # The context manager shuts the pool down once all pages are handled.
    with ThreadPoolExecutor(config['textThreads']) as text_pool:
        hdls = []
        for url in toc:
            print(f'page: {url}')
            if url.startswith('http'):
                # Reserve the slot now so the articles keep the TOC order;
                # the worker fills the dict in place.
                art = {}
                articles.append(art)
                hdls.append(
                    text_pool.submit(tr_download_page, url, art, imgs)
                )
            else:
                # Entries that are not URLs become pages with an empty body.
                articles.append({'title': url, 'content': ''})

        for h in hdls:
            h.result()

    # Dicts that are still empty belong to pages whose download failed.
    articles = [art for art in articles if art]

    gen_epub(articles, imgs)
    print('done...')
    
# Run the scraper only when this module is executed, not when it is imported.
if __name__ == '__main__': main()