__main__.py 5.3 KB
Newer Older
W
init  
wizardforcel 已提交
1 2 3 4 5 6 7 8 9 10 11
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-

from urllib.parse import urljoin
import sys
import json
from pyquery import PyQuery as pq
import time
from os import path
import re
from concurrent.futures import ThreadPoolExecutor
W
wizardforcel 已提交
12
import hashlib
W
wizardforcel 已提交
13
from readability import Document
W
init  
wizardforcel 已提交
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
from GenEpub import gen_epub
from . import *
from .util import *
from .img import *
from .config import config

def get_toc_from_cfg():
    """Return the table of contents described by the global ``config``.

    A non-empty ``config['list']`` is returned unchanged. Otherwise the page
    at ``config['url']`` is fetched with ``request_retry``, decoded with
    ``config['encoding']`` (undecodable bytes are ignored) and handed to
    ``get_toc``.

    Returns:
        ``config['list']`` when it is non-empty, else the result of
        ``get_toc`` for the downloaded page.

    Raises:
        SystemExit: with status 1 when neither a list nor a URL is configured.
    """
    # An explicit list takes priority over crawling the index page.
    if config['list']:
        return config['list']

    if not config['url']:
        print('URL not specified')
        # Exit non-zero so calling scripts can tell this was a failure.
        sys.exit(1)

    resp = request_retry(
        'GET', config['url'],
        retry=config['retry'],
        check_status=config['checkStatus'],
        headers=config['headers'],
        timeout=config['timeout'],
        proxies=config['proxy'],
    )
    html = resp.content.decode(config['encoding'], 'ignore')
    return get_toc(html, config['url'])
    
def get_toc(html, base):
    """Extract the ordered table of contents from an index page.

    Elements matching ``config['remove']`` are deleted first; the elements
    matching ``config['link']`` are then walked in document order.

    Args:
        html: Markup of the index page.
        base: URL used to resolve relative links; when falsy, links are
            taken as written.

    Returns:
        A list of strings. A link without ``href`` contributes its stripped
        text. A link with ``href`` contributes its URL with the ``#fragment``
        removed, provided it is an ``http(s)://`` URL not already seen.
    """
    root = pq(html)

    if config['remove']:
        root(config['remove']).remove()

    seen = set()
    toc = []
    for el_link in root(config['link']).items():
        href = el_link.attr('href')
        if not href:
            # No target: keep the link text as a plain entry.
            toc.append(el_link.text().strip())
            continue

        # Fragments point inside a page, so drop them before deduplicating.
        url = re.sub(r'#.*$', '', href)
        if base:
            url = urljoin(base, url)
        # Same test main() uses to tell URLs from plain titles; a looser
        # prefix check here could let through an entry that main() would
        # then treat as a title.
        if not re.search(r'^https?://', url):
            continue
        if url in seen:
            continue
        seen.add(url)
        toc.append(url)

    return toc
    
def get_article(html, url):
    """Extract the title and body markup of one article page.

    Args:
        html: Markup of the downloaded page.
        url: Address the page came from; only used for the credit line.

    Returns:
        A dict with the keys ``'title'`` (stripped text of the first element
        matching ``config['title']``) and ``'content'`` (HTML string).
    """
    # Imported locally: the ``html`` parameter shadows the module name.
    from html import escape

    # Strip the XML declaration and xmlns attributes before parsing
    # (presumably so the selectors below are not affected by XML
    # namespaces -- TODO confirm).
    html = re.sub(r'<\?xml[^>]*\?>', '', html)
    html = re.sub(r'xmlns=".+?"', '', html)
    root = pq(html)

    if config['remove']:
        root(config['remove']).remove()

    el_title = root(config['title']).eq(0)
    title = el_title.text().strip()
    # Remove the title element so it is not repeated inside the content.
    el_title.remove()

    if config['content']:
        # Concatenate the inner HTML of every element matching the selector.
        co = '\r\n'.join(
            el.html() for el in root(config['content']).items()
        )
    else:
        # No selector configured: let readability pick the main content.
        co = Document(str(root)).summary()
        # html() is None when no <body> is found; fall back to an empty
        # string so the credit concatenation below cannot fail.
        co = pq(co).find('body').html() or ''

    if config['credit']:
        # Escape the URL: a raw '&' or "'" would break the attribute/markup.
        safe_url = escape(url, quote=True)
        credit = f"<blockquote>原文:<a href='{safe_url}'>{safe_url}</a></blockquote>"
        co = credit + co

    return {'title': title, 'content': co}
    
W
wizardforcel 已提交
96
def tr_download_page_safe(url, art, imgs):
    """Run ``tr_download_page`` and report, rather than propagate, failures.

    Args:
        url: Address of the page to download.
        art: Dict that ``tr_download_page`` fills in place.
        imgs: Image mapping forwarded unchanged to ``tr_download_page``.

    Any ``Exception`` is caught and printed together with the URL, so one
    failing page does not propagate out of this call.
    """
    try:
        tr_download_page(url, art, imgs)
    except Exception as ex:
        # Deliberate best-effort: log the failure and carry on.
        print(f'{url} 下载失败:{ex}')
W
wizardforcel 已提交
101 102 103 104

def tr_download_page(url, art, imgs):
    """Fill ``art`` with the article at ``url``, using the local cache if allowed.

    Args:
        url: Address of the page.
        art: Dict updated in place with the article fields; its
            ``'content'`` is finally replaced by the output of ``process_img``.
        imgs: Image mapping passed through to ``process_img``.

    The cache key is the MD5 hex digest of the UTF-8 encoded URL. When a
    cached article exists and ``config['cache']`` is truthy it is used as-is;
    otherwise the page is downloaded, parsed with ``get_article`` and stored
    with ``save_article``. ``config['wait']`` seconds are slept only after an
    actual download.
    """
    url_key = hashlib.md5(url.encode('utf-8')).hexdigest()
    cached = load_article(url_key)
    from_cache = cached is not None and config['cache']

    if from_cache:
        print(f'{url} 已存在于本地缓存中')
        art.update(cached)
    else:
        resp = request_retry(
            'GET', url,
            retry=config['retry'],
            check_status=config['checkStatus'],
            headers=config['headers'],
            timeout=config['timeout'],
            proxies=config['proxy'],
        )
        html = resp.content.decode(config['encoding'], 'ignore')
        print(f'{url} 下载成功')
        art.update(get_article(html, url))
        # Saved before image processing, so the stored article keeps the
        # content exactly as get_article returned it.
        save_article(url_key, art)

    # Image processing is needed on both paths.
    art['content'] = process_img(
        art['content'], imgs,
        page_url=url,
        img_prefix='../Images/',
    )

    if not from_cache:
        # Pause between real network requests only.
        time.sleep(config['wait'])
    
W
init  
wizardforcel 已提交
133

W
wizardforcel 已提交
134
def update_config(user_cfg):
    """Merge ``user_cfg`` into the global ``config`` and normalise it.

    Args:
        user_cfg: Mapping of user-supplied settings; applied with
            ``config.update``.

    After the merge this function:
      * defaults ``config['title']`` to the selector ``'title'``;
      * expands a scalar ``config['proxy']`` into an ``http``/``https``
        mapping;
      * passes a new ``ThreadPoolExecutor`` sized by ``config['imgThreads']``
        to ``set_img_pool``;
      * when ``config['external']`` is set, loads it with ``load_module`` and
        rebinds the module-level ``get_toc`` / ``get_article`` to that
        module's versions if it defines them;
      * defaults ``config['timeout']`` to the pair
        ``(config['connTimeout'], config['readTimeout'])``.
    """
    global get_toc
    global get_article

    config.update(user_cfg)

    if not config['title']:
        config['title'] = 'title'

    proxy = config['proxy']
    # Leave an existing mapping alone: wrapping it again (a dict supplied by
    # the user, or a second call to this function) would nest the mapping
    # inside itself.
    if proxy and not isinstance(proxy, dict):
        config['proxy'] = {
            'http': proxy,
            'https': proxy,
        }

    set_img_pool(ThreadPoolExecutor(config['imgThreads']))

    if config['external']:
        mod = load_module(config['external'])
        # Keep the built-in implementation for whatever the module omits.
        get_toc = getattr(mod, 'get_toc', get_toc)
        get_article = getattr(mod, 'get_article', get_article)

    if not config['timeout']:
        config['timeout'] = (
            config['connTimeout'],
            config['readTimeout'],
        )
W
wizardforcel 已提交
162 163 164 165 166 167 168 169 170 171 172

def main():
    """Command-line entry point: crawl the configured pages and build an EPUB.

    The config file path is taken from ``sys.argv[1]`` and defaults to
    ``config.json``. If the file does not exist a message is printed and the
    function returns. Otherwise the JSON config is applied, the table of
    contents is resolved, every URL entry is downloaded on a thread pool of
    ``config['textThreads']`` workers, and the collected articles and images
    are passed to ``gen_epub``.
    """
    cfg_fname = sys.argv[1] if len(sys.argv) > 1 else 'config.json'
    if not path.exists(cfg_fname):
        print('please provide config file')
        return

    # Context manager so the config file handle is always closed.
    with open(cfg_fname, encoding='utf-8') as f:
        user_cfg = json.load(f)
    update_config(user_cfg)

    toc = get_toc_from_cfg()
    articles = []
    imgs = {}
    if config['name']:
        # Leading page carrying the book name and a link to the source.
        articles.append({
            'title': config['name'],
            'content': f"<p>来源:<a href='{config['url']}'>{config['url']}</a></p>",
        })

    # The with-block shuts the pool down once all downloads have finished.
    with ThreadPoolExecutor(config['textThreads']) as text_pool:
        hdls = []
        for url in toc:
            print(f'page: {url}')
            if not re.search(r'^https?://', url):
                # Not a URL: keep the entry as a title-only article.
                articles.append({'title': url, 'content': ''})
                continue

            # Reserve the slot now so articles keep the TOC order; the
            # worker fills the dict in place.
            art = {}
            articles.append(art)
            hdls.append(
                text_pool.submit(tr_download_page_safe, url, art, imgs)
            )

        for h in hdls:
            h.result()

    # Dicts that are still empty belong to pages that failed to download.
    articles = [art for art in articles if art]

    gen_epub(articles, imgs)
    print('done...')


if __name__ == '__main__': main()