__main__.py 5.0 KB
Newer Older
W
init  
wizardforcel 已提交
1 2 3 4 5 6 7 8 9 10 11
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-

from urllib.parse import urljoin
import sys
import json
from pyquery import PyQuery as pq
import time
from os import path
import re
from concurrent.futures import ThreadPoolExecutor
W
wizardforcel 已提交
12
import hashlib
W
init  
wizardforcel 已提交
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
from GenEpub import gen_epub
from . import *
from .util import *
from .img import *
from .config import config

def get_toc_from_cfg():
    """Return the table of contents described by the global ``config``.

    A non-empty ``config['list']`` is returned unchanged. Otherwise the
    page at ``config['url']`` is requested through ``request_retry``,
    its body is decoded with ``config['encoding']`` (undecodable bytes
    are ignored) and passed to ``get_toc`` with the same URL as the base.

    Exits the process with status 1 when neither a list nor a URL is
    configured.
    """
    # Truthiness already covers both "unset" and "empty list".
    if config['list']:
        return config['list']

    if not config['url']:
        print('URL not specified')
        # A bare sys.exit() reports success (status 0) to the caller;
        # a missing URL is a configuration error, so signal failure.
        sys.exit(1)

    resp = request_retry(
        'GET', config['url'],
        retry=config['retry'],
        check_status=config['checkStatus'],
        headers=config['headers'],
        timeout=config['timeout'],
        proxies=config['proxy'],
    )
    html = resp.content.decode(config['encoding'], 'ignore')
    return get_toc(html, config['url'])
    
def get_toc(html, base):
    """Collect the ordered table-of-contents entries from an index page.

    ``html`` is parsed with PyQuery. Nodes matching ``config['remove']``
    (when set) are removed first, then every node matching
    ``config['link']`` is visited in order.

    * A node without an ``href`` contributes its stripped text.
    * A node with an ``href`` contributes that URL with any ``#fragment``
      removed and, when ``base`` is truthy, resolved against ``base``.
      URLs that do not start with ``http`` or were already collected
      are skipped.

    Returns a list of strings mixing plain-text entries and URLs.
    """
    doc = pq(html)

    unwanted = config['remove']
    if unwanted:
        doc(unwanted).remove()

    anchors = doc(config['link'])
    seen = set()
    entries = []

    for anchor in map(anchors.eq, range(len(anchors))):
        href = anchor.attr('href')

        if not href:
            # No link target: keep the node's text as a plain entry.
            entries.append(anchor.text().strip())
            continue

        href = re.sub(r'#.*$', '', href)
        target = urljoin(base, href) if base else href

        if target.startswith('http') and target not in seen:
            seen.add(target)
            entries.append(target)

    return entries
    
def get_article(html, url):
    """Extract the title and body HTML of one article page.

    ``html`` is parsed with PyQuery and nodes matching
    ``config['remove']`` (when set) are removed. The title is the
    stripped text of the first node matching ``config['title']``; that
    node is then removed from the document. The body is the inner HTML
    of every node matching ``config['content']``, joined with CRLF.
    When ``config['credit']`` is truthy, a blockquote linking to ``url``
    is prepended to the body.

    Returns a dict with the keys ``'title'`` and ``'content'``.
    """
    doc = pq(html)

    unwanted = config['remove']
    if unwanted:
        doc(unwanted).remove()

    # Read the title first, then remove its node before the content
    # nodes are serialised.
    title_node = doc(config['title']).eq(0)
    title = title_node.text().strip()
    title_node.remove()

    body_nodes = doc(config['content'])
    fragments = []
    for idx in range(len(body_nodes)):
        fragments.append(body_nodes.eq(idx).html())
    content = '\r\n'.join(fragments)

    if config['credit']:
        content = (
            f"<blockquote>原文:<a href='{url}'>{url}</a></blockquote>"
            + content
        )

    return {'title': title, 'content': content}
    
W
wizardforcel 已提交
88
def tr_download_page_safe(url, art, imgs):
    """Run ``tr_download_page`` and report, rather than raise, a failure.

    Any ``Exception`` raised by the download is caught and printed
    together with ``url``; nothing is re-raised and nothing is returned.
    """
    try:
        tr_download_page(url, art, imgs)
    except Exception as err:
        print(f'{url} 下载失败:{err}')
W
wizardforcel 已提交
93 94 95 96

def tr_download_page(url, art, imgs):
    """Fill ``art`` with the article at ``url``, using the cache if allowed.

    The cache key is the MD5 hex digest of the UTF-8 encoded ``url``.
    When ``load_article`` returns an entry and ``config['cache']`` is
    truthy, that entry is merged into ``art``. Otherwise the page is
    requested through ``request_retry``, parsed by ``get_article``,
    merged into ``art`` and stored with ``save_article``.

    In both cases ``art['content']`` is then passed through
    ``process_img`` along with ``imgs``. After a fresh download the
    function sleeps for ``config['wait']`` seconds.
    """
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    cached = load_article(digest)
    from_cache = cached is not None and config['cache']

    if from_cache:
        print(f'{url} 已存在于本地缓存中')
        art.update(cached)
    else:
        resp = request_retry(
            'GET', url,
            retry=config['retry'],
            check_status=config['checkStatus'],
            headers=config['headers'],
            timeout=config['timeout'],
            proxies=config['proxy'],
        )
        html = resp.content.decode(config['encoding'], 'ignore')
        print(f'{url} 下载成功')
        art.update(get_article(html, url))
        save_article(digest, art)

    # Image processing runs on the stored/parsed content for both paths.
    art['content'] = process_img(
        art['content'], imgs,
        page_url=url,
        img_prefix='../Images/',
    )

    # Only pause after an actual network fetch.
    if not from_cache:
        time.sleep(config['wait'])
    
W
init  
wizardforcel 已提交
125

W
wizardforcel 已提交
126
def update_config(user_cfg):
    """Merge ``user_cfg`` into the global ``config`` and derive settings.

    After the merge this function:

    * replaces a truthy ``config['proxy']`` with a dict that maps both
      ``'http'`` and ``'https'`` to that value;
    * passes a ``ThreadPoolExecutor`` with ``config['imgThreads']``
      workers to ``set_img_pool``;
    * when ``config['external']`` is truthy, loads it with
      ``load_module`` and rebinds the module-level ``get_toc`` and
      ``get_article`` to that module's attributes of the same name,
      where present;
    * fills a falsy ``config['timeout']`` with the tuple
      ``(config['connTimeout'], config['readTimeout'])``.
    """
    global get_toc, get_article

    config.update(user_cfg)

    proxy = config['proxy']
    if proxy:
        config['proxy'] = {'http': proxy, 'https': proxy}

    set_img_pool(ThreadPoolExecutor(config['imgThreads']))

    external = config['external']
    if external:
        mod = load_module(external)
        # Fall back to the current implementations for missing names.
        get_toc = getattr(mod, 'get_toc', get_toc)
        get_article = getattr(mod, 'get_article', get_article)

    if not config['timeout']:
        config['timeout'] = (config['connTimeout'], config['readTimeout'])
W
wizardforcel 已提交
151 152 153 154 155 156 157 158 159 160 161

def main():
    """Command-line entry point: crawl the configured pages into an EPUB.

    The config file name is taken from ``sys.argv[1]`` and defaults to
    ``config.json``. If the file does not exist, a message is printed
    and the function returns. Otherwise the JSON config is applied with
    ``update_config`` and the table of contents is obtained from
    ``get_toc_from_cfg``. Each TOC entry that is an ``http://`` or
    ``https://`` URL is downloaded on a thread pool of
    ``config['textThreads']`` workers; any other entry becomes a
    title-only article. The articles and collected images are finally
    passed to ``gen_epub``.
    """
    cfg_fname = sys.argv[1] if len(sys.argv) > 1 else 'config.json'
    if not path.exists(cfg_fname):
        print('please provide config file')
        return

    # Use a context manager so the config file handle is always closed.
    with open(cfg_fname, encoding='utf-8') as cfg_file:
        user_cfg = json.load(cfg_file)
    update_config(user_cfg)

    toc = get_toc_from_cfg()
    articles = []
    imgs = {}
    if config['name']:
        articles.append({
            'title': config['name'],
            'content': f"<p>来源:<a href='{config['url']}'>{config['url']}</a></p>",
        })

    hdls = []
    # The with-block shuts the pool down once all downloads finished.
    with ThreadPoolExecutor(config['textThreads']) as text_pool:
        for url in toc:
            print(f'page: {url}')
            # Accept both http:// and https://; get_toc keeps any URL
            # starting with "http", so matching only https:// would turn
            # plain-http pages into empty title entries.
            if not re.search(r'^https?://', url):
                articles.append({'title': url, 'content': ''})
                continue

            # Placeholder keeps the article in TOC order; the worker
            # fills it in place.
            art = {}
            articles.append(art)
            hdls.append(
                text_pool.submit(tr_download_page_safe, url, art, imgs)
            )

        for h in hdls:
            h.result()

    # Placeholders that are still empty belong to failed downloads.
    articles = [art for art in articles if art]

    gen_epub(articles, imgs)
    print('done...')
    
# Run the crawler only when this module is executed as a script.
if __name__ == '__main__':
    main()