__main__.py 4.6 KB
Newer Older
W
init  
wizardforcel 已提交
1 2 3 4 5 6 7 8 9 10 11
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-

from urllib.parse import urljoin
import sys
import json
from pyquery import PyQuery as pq
import time
from os import path
import re
from concurrent.futures import ThreadPoolExecutor
W
wizardforcel 已提交
12
import hashlib
W
wizardforcel 已提交
13
from readability import Document
W
init  
wizardforcel 已提交
14 15 16 17 18
from GenEpub import gen_epub
from . import *
from .util import *
from .img import *
from .config import config
W
wizardforcel 已提交
19
from .sele_crawler import crawl_sele
W
wizardforcel 已提交
20
from .common import *
W
init  
wizardforcel 已提交
21 22 23 24 25 26 27 28 29 30 31 32

def get_toc_from_cfg():
    """Return the table of contents described by the global ``config``.

    A non-empty ``config['list']`` is returned unchanged. Otherwise the page
    at ``config['url']`` is fetched through ``request_retry``, decoded with
    ``config['encoding']`` (undecodable bytes are ignored) and passed to
    ``get_toc`` together with the URL.

    Returns:
        ``config['list']`` or the value returned by ``get_toc``.

    Raises:
        SystemExit: with status 1 when neither a list nor a URL is configured.
    """
    # An explicit list in the config takes precedence over crawling an index.
    if config['list']:
        return config['list']

    if not config['url']:
        print('URL not specified')
        # Exit non-zero so the calling shell can detect the failure.
        sys.exit(1)

    resp = request_retry(
        'GET', config['url'],
        retry=config['retry'],
        check_status=config['checkStatus'],
        headers=config['headers'],
        timeout=config['timeout'],
        proxies=config['proxy'],
    )
    html = resp.content.decode(config['encoding'], 'ignore')
    return get_toc(html, config['url'])
    
def get_toc(html, base):
    """Extract table-of-contents entries from an index page.

    Elements matching the ``config['link']`` selector are visited in document
    order, after anything matching ``config['remove']`` has been dropped.

    * An element without an ``href`` contributes its stripped text.
    * An element with an ``href`` contributes its URL with the ``#fragment``
      removed and, when ``base`` is truthy, resolved against ``base``.
      URLs that do not start with ``http`` or were already emitted are
      skipped.

    Args:
        html: markup of the index page.
        base: URL used to resolve relative links; falsy to skip resolution.

    Returns:
        A list of strings mixing plain-text entries and URLs.
    """
    root = pq(html)

    if config['remove']:
        root(config['remove']).remove()

    seen = set()
    toc = []
    # .items() yields each match wrapped as a PyQuery object.
    for el_link in root(config['link']).items():
        url = el_link.attr('href')
        if not url:
            # No target: keep the visible text as a plain entry.
            toc.append(el_link.text().strip())
            continue

        # Links differing only by fragment point at the same page.
        url = re.sub(r'#.*$', '', url)
        if base:
            url = urljoin(base, url)
        if not url.startswith('http') or url in seen:
            continue
        seen.add(url)
        toc.append(url)

    return toc
W
wizardforcel 已提交
68

W
init  
wizardforcel 已提交
69
    
W
wizardforcel 已提交
70
def tr_download_page_safe(url, art, imgs):
    """Call ``tr_download_page`` and report any failure instead of raising.

    Any ``Exception`` is printed together with the URL and then swallowed,
    so a single failing page does not propagate to the caller.

    Args:
        url: page URL, forwarded to ``tr_download_page``.
        art: dict forwarded to ``tr_download_page``.
        imgs: image mapping forwarded to ``tr_download_page``.
    """
    try:
        tr_download_page(url, art, imgs)
    except Exception as ex:
        print(f'{url} 下载失败:{ex}')
W
wizardforcel 已提交
75 76 77 78

def tr_download_page(url, art, imgs):
    """Fill ``art`` with the article at ``url``, using the local cache if allowed.

    The cache key is the MD5 hex digest of the UTF-8 encoded URL. When a
    cached article exists and ``config['cache']`` is truthy it is copied into
    ``art``. Otherwise the page is downloaded with ``request_retry``, parsed
    by ``get_article``, stored via ``save_article``, and the function sleeps
    for ``config['wait']`` seconds afterwards. In both cases
    ``art['content']`` is finally passed through ``process_img``.

    Args:
        url: page URL to download.
        art: dict updated in place with the article fields.
        imgs: image mapping handed to ``process_img``.
    """
    url_hash = hashlib.md5(url.encode('utf-8')).hexdigest()
    cache = load_article(url_hash)
    from_cache = cache is not None and config['cache']

    if from_cache:
        print(f'{url} 已存在于本地缓存中')
        art.update(cache)
    else:
        resp = request_retry(
            'GET', url,
            retry=config['retry'],
            check_status=config['checkStatus'],
            headers=config['headers'],
            timeout=config['timeout'],
            proxies=config['proxy'],
        )
        html = resp.content.decode(config['encoding'], 'ignore')
        print(f'{url} 下载成功')
        art.update(get_article(html, url))
        # The article is saved before process_img touches its content.
        save_article(url_hash, art)

    art['content'] = process_img(
        art['content'], imgs,
        page_url=url,
        img_prefix='../Images/',
    )

    if not from_cache:
        # Pause only after a real network request.
        time.sleep(config['wait'])
    
W
init  
wizardforcel 已提交
107

W
wizardforcel 已提交
108
def update_config(user_cfg):
    """Merge ``user_cfg`` into the global ``config`` and normalise the result.

    After the merge this function:

    * falls back to the title ``'title'`` when none is set;
    * expands a proxy given as a single string into a mapping with ``http``
      and ``https`` keys;
    * installs a ``ThreadPoolExecutor`` of ``config['imgThreads']`` workers
      through ``set_img_pool``;
    * when ``config['external']`` is set, loads that module and lets its
      ``get_toc`` / ``get_article`` (if present) replace the module-level ones;
    * builds ``config['timeout']`` from ``connTimeout`` and ``readTimeout``
      when no timeout is set.

    Args:
        user_cfg: mapping of user-supplied settings.
    """
    global get_toc
    global get_article

    config.update(user_cfg)

    if not config['title']:
        config['title'] = 'title'

    # Wrap only a bare proxy string: a mapping supplied by the user, or one
    # built by an earlier call, must not be nested inside another mapping.
    proxy = config['proxy']
    if proxy and isinstance(proxy, str):
        config['proxy'] = {
            'http': proxy,
            'https': proxy,
        }

    set_img_pool(ThreadPoolExecutor(config['imgThreads']))

    if config['external']:
        mod = load_module(config['external'])
        # Hooks missing from the external module keep the current implementation.
        get_toc = getattr(mod, 'get_toc', get_toc)
        get_article = getattr(mod, 'get_article', get_article)

    if not config['timeout']:
        config['timeout'] = (
            config['connTimeout'],
            config['readTimeout'],
        )
W
wizardforcel 已提交
136 137 138 139 140 141 142 143 144 145 146

def main():
    """Command-line entry point: crawl the configured pages and build an EPUB.

    The config file name is taken from the first command-line argument and
    defaults to ``config.json``. If the file does not exist a message is
    printed and the function returns. When ``config['selenium']`` is truthy
    the work is delegated to ``crawl_sele``. Otherwise every URL in the table
    of contents is downloaded on a thread pool of ``config['textThreads']``
    workers and the collected articles and images are passed to ``gen_epub``.
    """
    cfg_fname = sys.argv[1] if len(sys.argv) > 1 else 'config.json'
    if not path.exists(cfg_fname):
        print('please provide config file')
        return

    # Close the config file deterministically instead of leaking the handle.
    with open(cfg_fname, encoding='utf-8') as f:
        user_cfg = json.load(f)
    update_config(user_cfg)

    if config['selenium']:
        crawl_sele()
        return

    toc = get_toc_from_cfg()
    articles = []
    imgs = {}
    if config['name']:
        articles.append({
            'title': config['name'],
            'content': f"<p>来源:<a href='{config['url']}'>{config['url']}</a></p>",
        })

    hdls = []
    # The context manager shuts the pool down once all downloads finished.
    with ThreadPoolExecutor(config['textThreads']) as text_pool:
        for url in toc:
            print(f'page: {url}')
            if not re.search(r'^https?://', url):
                # Non-URL entries become title-only chapters.
                articles.append({'title': url, 'content': ''})
                continue

            # Append the placeholder now so chapter order follows the TOC;
            # the worker fills it in place.
            art = {}
            articles.append(art)
            hdls.append(
                text_pool.submit(tr_download_page_safe, url, art, imgs)
            )

        for h in hdls:
            h.result()

    # Placeholders that were never filled are still empty dicts; drop them.
    articles = [art for art in articles if art]

    gen_epub(articles, imgs)
    print('done...')
    
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()