...
 
Commits (7)

2022-03-16T14:09:55+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/815225183cdc489c876611ad7be1e5a4917505f7
2022-03-16T14:16:09+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/b179ad976cd8da679dda4d7ad669dc31e02341a4
2022-03-16T14:23:32+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/276be0763032417fd136f70dff5a60169fe9de53
2022-03-16T14:24:58+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/1a84f740da87c3aba9770c2b9d7270f2952bdbc2
2022-03-16T14:56:48+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/47dd859ddefa06889d124e606e601d6d4dd0e887
2022-03-16T15:48:46+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/b9a35693c8f2e96e5e0402592eec2a2e590495c0
2022-03-16T15:49:28+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/aa8246325df937a0ab6d46f6e012cb0cb1a2fb41
@@ -89,26 +89,32 @@ def tr_download_page_safe(url, art, imgs):
     try:
         tr_download_page(url, art, imgs)
     except Exception as ex:
-        print(ex)
+        print(f'{url} download failed: {ex}')

 def tr_download_page(url, art, imgs):
     hash = hashlib.md5(url.encode('utf-8')).hexdigest()
     cache = load_article(hash)
-    if cache is None:
-        html = request_retry(
-            'GET', url,
-            retry=config['retry'],
-            check_status=config['checkStatus'],
-            headers=config['headers'],
-            timeout=config['timeout'],
-            proxies=config['proxy'],
-        ).content.decode(config['encoding'], 'ignore')
-        art.update(get_article(html, url))
-        save_article(hash, art)
-    else:
-        print(f'{url} already in local cache')
+    if cache is not None and config['cache']:
+        print(f'{url} already in local cache')
         art.update(cache)
+        art['content'] = process_img(
+            art['content'], imgs,
+            page_url=url,
+            img_prefix='../Images/',
+        )
+        return
+    html = request_retry(
+        'GET', url,
+        retry=config['retry'],
+        check_status=config['checkStatus'],
+        headers=config['headers'],
+        timeout=config['timeout'],
+        proxies=config['proxy'],
+    ).content.decode(config['encoding'], 'ignore')
+    print(f'{url} downloaded successfully')
+    art.update(get_article(html, url))
+    save_article(hash, art)
     art['content'] = process_img(
         art['content'], imgs,
         page_url=url,
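Worth noting about the new cache branch: the cache key is simply the MD5 hex digest of the page URL, so repeated runs deterministically hit the same entry. A minimal standalone sketch of the keying scheme (the storage behind load_article/save_article is not part of this diff):

import hashlib

def cache_key(url):
    # Same derivation tr_download_page and tr_download_img use:
    # the MD5 hex digest of the UTF-8 encoded URL, a stable
    # 32-character hex string.
    return hashlib.md5(url.encode('utf-8')).hexdigest()

print(cache_key('https://example.com/post/1'))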
@@ -117,30 +123,42 @@ def tr_download_page(url, art, imgs):
     time.sleep(config['wait'])

-def main():
+def update_config(user_cfg):
     global get_toc
     global get_article
-    cfg_fname = sys.argv[1] \
-        if len(sys.argv) > 1 \
-        else 'config.json'
-    if not path.exists(cfg_fname):
-        print('please provide config file')
-        return
-    user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
     config.update(user_cfg)
     if config['proxy']:
         proxies = {
             'http': config['proxy'],
             'https': config['proxy'],
         }
         config['proxy'] = proxies
     set_img_pool(ThreadPoolExecutor(config['imgThreads']))
     if config['external']:
         mod = load_module(config['external'])
         get_toc = getattr(mod, 'get_toc', get_toc)
         get_article = getattr(mod, 'get_article', get_article)
+    if not config['timeout']:
+        config['timeout'] = (
+            config['connTimeout'],
+            config['readTimeout'],
+        )
+
+def main():
+    cfg_fname = sys.argv[1] \
+        if len(sys.argv) > 1 \
+        else 'config.json'
+    if not path.exists(cfg_fname):
+        print('please provide config file')
+        return
+    user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
+    update_config(user_cfg)
     toc = get_toc_from_cfg()
     articles = []

...
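The new update_config step is what makes the split timeouts take effect: when 'timeout' is left unset, it is replaced by a (connTimeout, readTimeout) pair, which the requests library interprets as separate connect and read timeouts (assuming request_retry forwards the value to requests, as its keyword arguments suggest). A minimal sketch of the same behavior against a placeholder URL:

import requests

# timeout may be a single float or a (connect, read) tuple:
# give up after 1 s if the TCP connection cannot be established,
# but allow up to 60 s of silence while reading the response.
resp = requests.get('https://example.com', timeout=(1, 60))
print(resp.status_code)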
@@ -17,11 +17,14 @@ config = {
     'list': [],
     'optiMode': 'quant',
     'colors': 8,
-    'timeout': 8,
+    'timeout': None,
+    'connTimeout': 1,
+    'readTimeout': 60,
     'imgSrc': ['data-src', 'data-original-src', 'src'],
     'proxy': '',
     'textThreads': 5,
     'imgThreads': 5,
     'external': None,
     'checkStatus': False,
+    'cache': True,
 }
\ No newline at end of file
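Because main() merges the user's JSON file over these defaults, every new knob is overridable per book. A hypothetical config.json exercising only the options touched by this commit (values are illustrative, not recommendations):

{
    "cache": false,
    "connTimeout": 3,
    "readTimeout": 120,
    "checkStatus": true,
    "proxy": "http://127.0.0.1:1080"
}

Leaving "timeout" out keeps the default None, so update_config builds the (3, 120) tuple from the two values above; setting "timeout" to a number would bypass the split entirely.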
@@ -29,27 +29,28 @@ def tr_download_img_safe(url, imgs, picname):
     try:
         tr_download_img(url, imgs, picname)
     except Exception as ex:
-        print(ex)
+        print(f'{url} download failed: {ex}')
+        imgs[picname] = b''

 def tr_download_img(url, imgs, picname):
     hash = hashlib.md5(url.encode('utf-8')).hexdigest()
     cache = load_img(hash, config['optiMode'])
-    if cache is None:
-        data = request_retry(
-            'GET', url,
-            headers=config['headers'],
-            check_status=config['checkStatus'],
-            retry=config['retry'],
-            timeout=config['timeout'],
-            proxies=config['proxy'],
-        ).content
-        data = opti_img(data, config['optiMode'], config['colors']) or b''
-        save_img(hash, config['optiMode'], data)
-    else:
-        print(f'{url} already in local cache')
-        data = cache
+    if cache is not None and config['cache']:
+        print(f'{url} already in local cache')
+        imgs[picname] = cache
+        return
+    data = request_retry(
+        'GET', url,
+        headers=config['headers'],
+        check_status=config['checkStatus'],
+        retry=config['retry'],
+        timeout=config['timeout'],
+        proxies=config['proxy'],
+    ).content
+    print(f'{url} downloaded successfully')
+    data = opti_img(data, config['optiMode'], config['colors']) or b''
     imgs[picname] = data
+    save_img(hash, config['optiMode'], data)
     time.sleep(config['wait'])

 def process_img_data_url(url, el_img, imgs, **kw):

...
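Both downloaders now share the same cache-or-fetch shape: short-circuit on a hit, otherwise fetch, post-process, publish the result, then persist it. Note also the hardened error path: a failed image now records b'' under its picname instead of leaving the key missing. A generic sketch of the pattern in isolation, with load/fetch/save as hypothetical stand-ins for load_img/request_retry/save_img:

from typing import Callable, Optional

def cached_fetch(key: str,
                 load: Callable[[str], Optional[bytes]],
                 fetch: Callable[[], bytes],
                 save: Callable[[str, bytes], None],
                 use_cache: bool = True) -> bytes:
    # Cache hit: return immediately, mirroring the early `return`
    # added to tr_download_page and tr_download_img.
    cached = load(key)
    if cached is not None and use_cache:
        return cached
    # Miss (or caching disabled): do the network round trip,
    # persist the result for the next run, and hand it back.
    data = fetch()
    save(key, data)
    return data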
# Change Log

v????.??.??.0

+ Added caching
+ Split connection and read timeouts
+ Improved download success and failure messages

v2022.2.24.0

+ Added HTTP status code checking

...