...
 
Commits (7)

2022-03-16T14:09:55+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/815225183cdc489c876611ad7be1e5a4917505f7
2022-03-16T14:16:09+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/b179ad976cd8da679dda4d7ad669dc31e02341a4
2022-03-16T14:23:32+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/276be0763032417fd136f70dff5a60169fe9de53
2022-03-16T14:24:58+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/1a84f740da87c3aba9770c2b9d7270f2952bdbc2
2022-03-16T14:56:48+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/47dd859ddefa06889d124e606e601d6d4dd0e887
2022-03-16T15:48:46+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/b9a35693c8f2e96e5e0402592eec2a2e590495c0
2022-03-16T15:49:28+08:00 wizardforcel <562826179@qq.com> https://gitcode.net/OpenDocCN/epub-crawler/-/commit/aa8246325df937a0ab6d46f6e012cb0cb1a2fb41
@@ -89,26 +89,32 @@ def tr_download_page_safe(url, art, imgs):
     try:
         tr_download_page(url, art, imgs)
     except Exception as ex:
-        print(ex)
+        print(f'{url} download failed: {ex}')

 def tr_download_page(url, art, imgs):
     hash = hashlib.md5(url.encode('utf-8')).hexdigest()
     cache = load_article(hash)
-    if cache is None:
-        html = request_retry(
-            'GET', url,
-            retry=config['retry'],
-            check_status=config['checkStatus'],
-            headers=config['headers'],
-            timeout=config['timeout'],
-            proxies=config['proxy'],
-        ).content.decode(config['encoding'], 'ignore')
-        art.update(get_article(html, url))
-        save_article(hash, art)
-    else:
-        print(f'{url} already in local cache')
+    if cache is not None and config['cache']:
+        print(f'{url} already in local cache')
         art.update(cache)
+        art['content'] = process_img(
+            art['content'], imgs,
+            page_url=url,
+            img_prefix='../Images/',
+        )
+        return
+    html = request_retry(
+        'GET', url,
+        retry=config['retry'],
+        check_status=config['checkStatus'],
+        headers=config['headers'],
+        timeout=config['timeout'],
+        proxies=config['proxy'],
+    ).content.decode(config['encoding'], 'ignore')
+    print(f'{url} downloaded successfully')
+    art.update(get_article(html, url))
+    save_article(hash, art)
     art['content'] = process_img(
         art['content'], imgs,
         page_url=url,
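Worth noting about the new cache branch: the cache key is simply the MD5 hex digest of the page URL, so repeated runs deterministically hit the same entry. A minimal standalone sketch of the keying scheme (the storage behind load_article/save_article is not part of this diff):

import hashlib

def cache_key(url):
    # Same derivation tr_download_page and tr_download_img use:
    # the MD5 hex digest of the UTF-8 encoded URL, a stable
    # 32-character hex string.
    return hashlib.md5(url.encode('utf-8')).hexdigest()

print(cache_key('https://example.com/post/1'))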
@@ -117,30 +123,42 @@ def tr_download_page(url, art, imgs):
     time.sleep(config['wait'])

-def main():
+def update_config(user_cfg):
     global get_toc
     global get_article
-    cfg_fname = sys.argv[1] \
-        if len(sys.argv) > 1 \
-        else 'config.json'
-    if not path.exists(cfg_fname):
-        print('please provide config file')
-        return
-    user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
     config.update(user_cfg)
     if config['proxy']:
         proxies = {
             'http': config['proxy'],
             'https': config['proxy'],
         }
         config['proxy'] = proxies
     set_img_pool(ThreadPoolExecutor(config['imgThreads']))
     if config['external']:
         mod = load_module(config['external'])
         get_toc = getattr(mod, 'get_toc', get_toc)
         get_article = getattr(mod, 'get_article', get_article)
+    if not config['timeout']:
+        config['timeout'] = (
+            config['connTimeout'],
+            config['readTimeout'],
+        )
+
+def main():
+    cfg_fname = sys.argv[1] \
+        if len(sys.argv) > 1 \
+        else 'config.json'
+    if not path.exists(cfg_fname):
+        print('please provide config file')
+        return
+    user_cfg = json.loads(open(cfg_fname, encoding='utf-8').read())
+    update_config(user_cfg)
     toc = get_toc_from_cfg()
     articles = []

...
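The new update_config step is what makes the split timeouts take effect: when 'timeout' is left unset, it is replaced by a (connTimeout, readTimeout) pair, which the requests library interprets as separate connect and read timeouts (assuming request_retry forwards the value to requests, as its keyword arguments suggest). A minimal sketch of the same behavior against a placeholder URL:

import requests

# timeout may be a single float or a (connect, read) tuple:
# give up after 1 s if the TCP connection cannot be established,
# but allow up to 60 s of silence while reading the response.
resp = requests.get('https://example.com', timeout=(1, 60))
print(resp.status_code)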
@@ -17,11 +17,14 @@ config = {
     'list': [],
     'optiMode': 'quant',
     'colors': 8,
-    'timeout': 8,
+    'timeout': None,
+    'connTimeout': 1,
+    'readTimeout': 60,
     'imgSrc': ['data-src', 'data-original-src', 'src'],
     'proxy': '',
     'textThreads': 5,
     'imgThreads': 5,
     'external': None,
     'checkStatus': False,
+    'cache': True,
 }
\ No newline at end of file
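Because main() merges the user's JSON file over these defaults, every new knob is overridable per book. A hypothetical config.json exercising only the options touched by this commit (values are illustrative, not recommendations):

{
    "cache": false,
    "connTimeout": 3,
    "readTimeout": 120,
    "checkStatus": true,
    "proxy": "http://127.0.0.1:1080"
}

Leaving "timeout" out keeps the default None, so update_config builds the (3, 120) tuple from the two values above; setting "timeout" to a number would bypass the split entirely.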
@@ -29,27 +29,28 @@ def tr_download_img_safe(url, imgs, picname):
     try:
         tr_download_img(url, imgs, picname)
     except Exception as ex:
-        print(ex)
+        print(f'{url} download failed: {ex}')
+        imgs[picname] = b''

 def tr_download_img(url, imgs, picname):
     hash = hashlib.md5(url.encode('utf-8')).hexdigest()
     cache = load_img(hash, config['optiMode'])
-    if cache is None:
-        data = request_retry(
-            'GET', url,
-            headers=config['headers'],
-            check_status=config['checkStatus'],
-            retry=config['retry'],
-            timeout=config['timeout'],
-            proxies=config['proxy'],
-        ).content
-        data = opti_img(data, config['optiMode'], config['colors']) or b''
-        save_img(hash, config['optiMode'], data)
-    else:
-        print(f'{url} already in local cache')
-        data = cache
+    if cache is not None and config['cache']:
+        print(f'{url} already in local cache')
+        imgs[picname] = cache
+        return
+    data = request_retry(
+        'GET', url,
+        headers=config['headers'],
+        check_status=config['checkStatus'],
+        retry=config['retry'],
+        timeout=config['timeout'],
+        proxies=config['proxy'],
+    ).content
+    print(f'{url} downloaded successfully')
+    data = opti_img(data, config['optiMode'], config['colors']) or b''
     imgs[picname] = data
+    save_img(hash, config['optiMode'], data)
     time.sleep(config['wait'])

 def process_img_data_url(url, el_img, imgs, **kw):

...
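Both downloaders now share the same cache-or-fetch shape: short-circuit on a hit, otherwise fetch, post-process, publish the result, then persist it. Note also the hardened error path: a failed image now records b'' under its picname instead of leaving the key missing. A generic sketch of the pattern in isolation, with load/fetch/save as hypothetical stand-ins for load_img/request_retry/save_img:

from typing import Callable, Optional

def cached_fetch(key: str,
                 load: Callable[[str], Optional[bytes]],
                 fetch: Callable[[], bytes],
                 save: Callable[[str, bytes], None],
                 use_cache: bool = True) -> bytes:
    # Cache hit: return immediately, mirroring the early `return`
    # added to tr_download_page and tr_download_img.
    cached = load(key)
    if cached is not None and use_cache:
        return cached
    # Miss (or caching disabled): do the network round trip,
    # persist the result for the next run, and hand it back.
    data = fetch()
    save(key, data)
    return data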
# Change Log

v????.??.??.0

+ Added caching
+ Split connection and read timeouts
+ Improved download success and failure messages

v2022.2.24.0

+ Added HTTP status code checking

...