Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenDocCN
epub-crawler
比较版本
61438e71b8769cdddb5e6db88cf80781dadb7c55...aa8246325df937a0ab6d46f6e012cb0cb1a2fb41
E
epub-crawler
项目概览
OpenDocCN
/
epub-crawler
9 个月 前同步成功
通知
1
Star
22
Fork
6
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
E
epub-crawler
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
源分支
aa8246325df937a0ab6d46f6e012cb0cb1a2fb41
选择Git版本
...
目标分支
61438e71b8769cdddb5e6db88cf80781dadb7c55
选择Git版本
比较
Commits (7)
https://gitcode.net/OpenDocCN/epub-crawler/-/commit/815225183cdc489c876611ad7be1e5a4917505f7
2022-03-16 14:09:55
2022-03-16T14:09:55+08:00
wizardforcel
562826179@qq.com
https://gitcode.net/OpenDocCN/epub-crawler/-/commit/b179ad976cd8da679dda4d7ad669dc31e02341a4
2022-03-16 14:16:09
2022-03-16T14:16:09+08:00
wizardforcel
562826179@qq.com
https://gitcode.net/OpenDocCN/epub-crawler/-/commit/276be0763032417fd136f70dff5a60169fe9de53
2022-03-16 14:23:32
2022-03-16T14:23:32+08:00
wizardforcel
562826179@qq.com
https://gitcode.net/OpenDocCN/epub-crawler/-/commit/1a84f740da87c3aba9770c2b9d7270f2952bdbc2
2022-03-16 14:24:58
2022-03-16T14:24:58+08:00
wizardforcel
562826179@qq.com
https://gitcode.net/OpenDocCN/epub-crawler/-/commit/47dd859ddefa06889d124e606e601d6d4dd0e887
2022-03-16 14:56:48
2022-03-16T14:56:48+08:00
wizardforcel
562826179@qq.com
https://gitcode.net/OpenDocCN/epub-crawler/-/commit/b9a35693c8f2e96e5e0402592eec2a2e590495c0
2022-03-16 15:48:46
2022-03-16T15:48:46+08:00
wizardforcel
562826179@qq.com
https://gitcode.net/OpenDocCN/epub-crawler/-/commit/aa8246325df937a0ab6d46f6e012cb0cb1a2fb41
2022-03-16 15:49:28
2022-03-16T15:49:28+08:00
wizardforcel
562826179@qq.com
隐藏空白更改
内联
并排
Showing
4 changed file
with
68 addition
and
40 deletion
+68
-40
EpubCrawler/__main__.py
EpubCrawler/__main__.py
+42
-24
EpubCrawler/config.py
EpubCrawler/config.py
+4
-1
EpubCrawler/img.py
EpubCrawler/img.py
+16
-15
history.md
history.md
+6
-0
未找到文件。
EpubCrawler/__main__.py
浏览文件 @
aa824632
...
@@ -89,26 +89,32 @@ def tr_download_page_safe(url, art, imgs):
...
@@ -89,26 +89,32 @@ def tr_download_page_safe(url, art, imgs):
try
:
try
:
tr_download_page
(
url
,
art
,
imgs
)
tr_download_page
(
url
,
art
,
imgs
)
except
Exception
as
ex
:
except
Exception
as
ex
:
print
(
ex
)
print
(
f
'
{
url
}
下载失败:
{
ex
}
'
)
def
tr_download_page
(
url
,
art
,
imgs
):
def
tr_download_page
(
url
,
art
,
imgs
):
hash
=
hashlib
.
md5
(
url
.
encode
(
'utf-8'
)).
hexdigest
()
hash
=
hashlib
.
md5
(
url
.
encode
(
'utf-8'
)).
hexdigest
()
cache
=
load_article
(
hash
)
cache
=
load_article
(
hash
)
if
cache
is
None
:
if
cache
is
not
None
and
config
[
'cache'
]:
html
=
request_retry
(
'GET'
,
url
,
retry
=
config
[
'retry'
],
check_status
=
config
[
'checkStatus'
],
headers
=
config
[
'headers'
],
timeout
=
config
[
'timeout'
],
proxies
=
config
[
'proxy'
],
).
content
.
decode
(
config
[
'encoding'
],
'ignore'
)
art
.
update
(
get_article
(
html
,
url
))
save_article
(
hash
,
art
)
else
:
print
(
f
'
{
url
}
已存在于本地缓存中'
)
print
(
f
'
{
url
}
已存在于本地缓存中'
)
art
.
update
(
cache
)
art
.
update
(
cache
)
art
[
'content'
]
=
process_img
(
art
[
'content'
],
imgs
,
page_url
=
url
,
img_prefix
=
'../Images/'
,
)
return
html
=
request_retry
(
'GET'
,
url
,
retry
=
config
[
'retry'
],
check_status
=
config
[
'checkStatus'
],
headers
=
config
[
'headers'
],
timeout
=
config
[
'timeout'
],
proxies
=
config
[
'proxy'
],
).
content
.
decode
(
config
[
'encoding'
],
'ignore'
)
print
(
f
'
{
url
}
下载成功'
)
art
.
update
(
get_article
(
html
,
url
))
save_article
(
hash
,
art
)
art
[
'content'
]
=
process_img
(
art
[
'content'
]
=
process_img
(
art
[
'content'
],
imgs
,
art
[
'content'
],
imgs
,
page_url
=
url
,
page_url
=
url
,
...
@@ -117,30 +123,42 @@ def tr_download_page(url, art, imgs):
...
@@ -117,30 +123,42 @@ def tr_download_page(url, art, imgs):
time
.
sleep
(
config
[
'wait'
])
time
.
sleep
(
config
[
'wait'
])
def
main
(
):
def
update_config
(
user_cfg
):
global
get_toc
global
get_toc
global
get_article
global
get_article
cfg_fname
=
sys
.
argv
[
1
]
\
if
len
(
sys
.
argv
)
>
1
\
else
'config.json'
if
not
path
.
exists
(
cfg_fname
):
print
(
'please provide config file'
)
return
user_cfg
=
json
.
loads
(
open
(
cfg_fname
,
encoding
=
'utf-8'
).
read
())
config
.
update
(
user_cfg
)
config
.
update
(
user_cfg
)
if
config
[
'proxy'
]:
if
config
[
'proxy'
]:
proxies
=
{
proxies
=
{
'http'
:
config
[
'proxy'
],
'http'
:
config
[
'proxy'
],
'https'
:
config
[
'proxy'
],
'https'
:
config
[
'proxy'
],
}
}
config
[
'proxy'
]
=
proxies
config
[
'proxy'
]
=
proxies
set_img_pool
(
ThreadPoolExecutor
(
config
[
'imgThreads'
]))
set_img_pool
(
ThreadPoolExecutor
(
config
[
'imgThreads'
]))
if
config
[
'external'
]:
if
config
[
'external'
]:
mod
=
load_module
(
config
[
'external'
])
mod
=
load_module
(
config
[
'external'
])
get_toc
=
getattr
(
mod
,
'get_toc'
,
get_toc
)
get_toc
=
getattr
(
mod
,
'get_toc'
,
get_toc
)
get_article
=
getattr
(
mod
,
'get_article'
,
get_article
)
get_article
=
getattr
(
mod
,
'get_article'
,
get_article
)
if
not
config
[
'timeout'
]:
config
[
'timeout'
]
=
(
config
[
'connTimeout'
],
config
[
'readTimeout'
],
)
def
main
():
cfg_fname
=
sys
.
argv
[
1
]
\
if
len
(
sys
.
argv
)
>
1
\
else
'config.json'
if
not
path
.
exists
(
cfg_fname
):
print
(
'please provide config file'
)
return
user_cfg
=
json
.
loads
(
open
(
cfg_fname
,
encoding
=
'utf-8'
).
read
())
update_config
(
user_cfg
)
toc
=
get_toc_from_cfg
()
toc
=
get_toc_from_cfg
()
articles
=
[]
articles
=
[]
...
...
EpubCrawler/config.py
浏览文件 @
aa824632
...
@@ -17,11 +17,14 @@ config = {
...
@@ -17,11 +17,14 @@ config = {
'list'
:
[],
'list'
:
[],
'optiMode'
:
'quant'
,
'optiMode'
:
'quant'
,
'colors'
:
8
,
'colors'
:
8
,
'timeout'
:
8
,
'timeout'
:
None
,
'connTimeout'
:
1
,
'readTimeout'
:
60
,
'imgSrc'
:
[
'data-src'
,
'data-original-src'
,
'src'
],
'imgSrc'
:
[
'data-src'
,
'data-original-src'
,
'src'
],
'proxy'
:
''
,
'proxy'
:
''
,
'textThreads'
:
5
,
'textThreads'
:
5
,
'imgThreads'
:
5
,
'imgThreads'
:
5
,
'external'
:
None
,
'external'
:
None
,
'checkStatus'
:
False
,
'checkStatus'
:
False
,
'cache'
:
True
,
}
}
\ No newline at end of file
EpubCrawler/img.py
浏览文件 @
aa824632
...
@@ -29,27 +29,28 @@ def tr_download_img_safe(url, imgs, picname):
...
@@ -29,27 +29,28 @@ def tr_download_img_safe(url, imgs, picname):
try
:
try
:
tr_download_img
(
url
,
imgs
,
picname
)
tr_download_img
(
url
,
imgs
,
picname
)
except
Exception
as
ex
:
except
Exception
as
ex
:
print
(
ex
)
print
(
f
'
{
url
}
下载失败:
{
ex
}
'
)
imgs
[
picname
]
=
b
''
def
tr_download_img
(
url
,
imgs
,
picname
):
def
tr_download_img
(
url
,
imgs
,
picname
):
hash
=
hashlib
.
md5
(
url
.
encode
(
'utf-8'
)).
hexdigest
()
hash
=
hashlib
.
md5
(
url
.
encode
(
'utf-8'
)).
hexdigest
()
cache
=
load_img
(
hash
,
config
[
'optiMode'
])
cache
=
load_img
(
hash
,
config
[
'optiMode'
])
if
cache
is
None
:
if
cache
is
not
None
and
config
[
'cache'
]:
data
=
request_retry
(
'GET'
,
url
,
headers
=
config
[
'headers'
],
check_status
=
config
[
'checkStatus'
],
retry
=
config
[
'retry'
],
timeout
=
config
[
'timeout'
],
proxies
=
config
[
'proxy'
],
).
content
data
=
opti_img
(
data
,
config
[
'optiMode'
],
config
[
'colors'
])
or
b
''
save_img
(
hash
,
config
[
'optiMode'
],
data
)
else
:
print
(
f
'
{
url
}
已存在于本地缓存中'
)
print
(
f
'
{
url
}
已存在于本地缓存中'
)
data
=
cache
imgs
[
picname
]
=
cache
return
data
=
request_retry
(
'GET'
,
url
,
headers
=
config
[
'headers'
],
check_status
=
config
[
'checkStatus'
],
retry
=
config
[
'retry'
],
timeout
=
config
[
'timeout'
],
proxies
=
config
[
'proxy'
],
).
content
print
(
f
'
{
url
}
下载成功'
)
data
=
opti_img
(
data
,
config
[
'optiMode'
],
config
[
'colors'
])
or
b
''
imgs
[
picname
]
=
data
imgs
[
picname
]
=
data
save_img
(
hash
,
config
[
'optiMode'
],
data
)
time
.
sleep
(
config
[
'wait'
])
time
.
sleep
(
config
[
'wait'
])
def
process_img_data_url
(
url
,
el_img
,
imgs
,
**
kw
):
def
process_img_data_url
(
url
,
el_img
,
imgs
,
**
kw
):
...
...
history.md
浏览文件 @
aa824632
# 历史记录
# 历史记录
v????.??.??.0
+
新增缓存功能
+
拆分连接和读取超时
+
优化下载成功和失败提示
v2022.2.24.0
v2022.2.24.0
+
新增检查 HTTP 状态码的功能
+
新增检查 HTTP 状态码的功能
...
...