Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
CSDN 技术社区
content
深入剖析Nginx
提交
fd576066
深
深入剖析Nginx
项目概览
CSDN 技术社区
/
content
/
深入剖析Nginx
通知
5
Star
3
Fork
2
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
深
深入剖析Nginx
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
fd576066
编写于
12月 27, 2021
作者:
ToTensor
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
modify ignore
上级
39af7cd8
变更
7
展开全部
隐藏空白更改
内联
并排
Showing
7 changed file
with
4 addition
and
357 deletion
+4
-357
.gitignore
.gitignore
+4
-1
data/深入剖析Nginx.json
data/深入剖析Nginx.json
+0
-0
main.py
main.py
+0
-9
src/ebook/community.py
src/ebook/community.py
+0
-128
src/ebook/ebook_get_request.py
src/ebook/ebook_get_request.py
+0
-44
src/ebook/extract_book_code.py
src/ebook/extract_book_code.py
+0
-157
src/ebook/get_book_chapter_id_list.py
src/ebook/get_book_chapter_id_list.py
+0
-18
未找到文件。
.gitignore
浏览文件 @
fd576066
__pycache__
__pycache__
\ No newline at end of file
src
main.py
data/深入剖析Nginx.json
\ No newline at end of file
data/深入剖析Nginx.json
已删除
100644 → 0
浏览文件 @
39af7cd8
此差异已折叠。
点击以展开。
main.py
已删除
100644 → 0
浏览文件 @
39af7cd8
from
src.ebook.extract_book_code
import
extract_code
from
src.ebook.community
import
send_topic
if __name__ == "__main__":
    # The extraction step is currently switched off; only the posting step runs.
    # extract_code()
    web_url = 'https://gitcode.net/csdn/content/book_id_08fd0c7025a4a34a97a29897b067d24/-/tree/master/'
    separator = '-------' * 20
    print(separator)
    print('开始向社区发帖')
    # Every topic link is built as web_url followed by the local file path.
    send_topic(web_url)
\ No newline at end of file
src/ebook/community.py
已删除
100644 → 0
浏览文件 @
39af7cd8
import
os
import
json
import
html
import
requests
import
logging
logger
=
logging
.
getLogger
(
__name__
)
def get_files_path(file_dir, filetype='.txt'):
    """Collect the paths of files under ``file_dir`` that carry a given suffix.

    Args:
        file_dir: Directory that is walked recursively.
        filetype: Extension to keep, including the leading dot (for example
            ``'.txt'``). Pass ``None`` to keep every file. The comparison is
            case-sensitive.

    Returns:
        A list with one entry per matching file, each built by joining the
        walked directory with the file name. The entries are absolute only
        when ``file_dir`` itself is absolute.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(file_dir)
        for name in names
        if filetype is None or os.path.splitext(name)[1] == filetype
    ]
def get_all_files(current_address):
    """Return the path of every file found beneath ``current_address``.

    The tree is walked recursively with ``os.walk``; each entry of the result
    is the containing directory joined with the file name. Directories
    themselves are not listed.
    """
    collected = []
    for folder, _subdirs, names in os.walk(current_address):
        collected.extend(os.path.join(folder, name) for name in names)
    return collected
def post(url, params, retry=3, headers=None):
    """POST ``params`` to ``url``, retrying when an attempt fails.

    When ``headers`` is ``None`` the payload is serialised with ``json.dumps``
    and sent with a JSON ``Content-Type``; otherwise ``params`` is sent
    unchanged together with the supplied headers.

    Args:
        url: Target URL.
        params: Payload to send.
        retry: Maximum number of attempts.
        headers: Optional request headers; also switches off JSON encoding.

    Returns:
        The decoded JSON body of the first successful response, or ``None``
        when the attempts are used up and the last one ended with a falsy
        (error-status) response.

    Raises:
        Exception: Whatever the last attempt raised, when the attempts are
            used up by an exception. Serialisation errors from
            ``json.dumps`` are raised immediately.
    """
    if headers is None:
        hdrs = {"Content-Type": "application/json"}
        # Serialising once is enough; the payload does not change per attempt.
        data = json.dumps(params)
    else:
        hdrs = headers
        data = params

    fails = 0
    while fails < retry:
        try:
            logger.debug(f"will post {data} to {url}")
            resp = requests.post(url, data, headers=hdrs, timeout=10)
            # A requests Response is truthy only for non-error status codes.
            if resp:
                logger.info(f"resp {resp.content}")
                return resp.json()
            logger.error(f"resp: [{resp}]")
            fails += 1
        except Exception as error:
            logger.error(f"post {params} to {url} failed {error}")
            fails += 1
            # The previous check (`fails > retry`) could never be true inside
            # the loop, so the last error was silently dropped.
            if fails >= retry:
                raise
    return None
def _save_chapter_mapping(mapping_path, mapping):
    """Write ``mapping`` to ``mapping_path`` as indented UTF-8 JSON."""
    with open(mapping_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(mapping, ensure_ascii=False, indent=2))


def send_topic(web_url):
    """Post one community topic per extracted file and record the topic links.

    Every file below ``data/深入剖析Nginx/`` is turned into a topic whose body
    is a link made of ``web_url`` followed by the local file path. The link
    returned for a newly created topic is stored in
    ``data/深入剖析Nginx.json`` (file path -> topic URL) right after each
    post, so an interrupted run can resume. Files that already have an entry
    are posted again with an ``id`` taken from the last path segment of the
    stored URL.

    Args:
        web_url: URL prefix that is prepended to each local file path.

    Raises:
        RuntimeError: If ``post`` returns no response body for a new topic,
            because no link can be recorded in that case.
    """
    book_dir = 'data/深入剖析Nginx/'
    # web_url = "https://codechina.csdn.net/csdn/book_code_c798a5992a654857867ec15660e1c32a/-/blob/master/"
    request_url = 'http://ccloud.internal.csdn.net/v1/internal/community/content/sendTopic'
    mapping_path = 'data/深入剖析Nginx.json'

    files = get_all_files(book_dir)
    print(files)

    # The mapping is written with ensure_ascii=False and its keys contain
    # non-ASCII path names, so the file is always read and written as UTF-8
    # instead of relying on the platform default encoding.
    if not os.path.exists(mapping_path):
        _save_chapter_mapping(mapping_path, {})
    with open(mapping_path, 'r', encoding='utf-8') as f:
        chapter_code_mapping = json.load(f)

    for file in files:
        # Title: path relative to the book directory, with separators and
        # spaces replaced.
        topic_title = file.replace(book_dir, '')
        topic_title = topic_title.replace('/', '|')
        topic_title = topic_title.replace(' ', '.')
        # topic_title = html.escape(topic_title)
        topic_content = "代码:<a href=\"{}\">{}</a>".format(web_url + file, topic_title)
        print(topic_title)

        send_topic_request_param = {
            "type": "long_text",
            "cateId": 20966,
            "content": topic_content,
            "topicTitle": topic_title,
            "mdContent": topic_content,
            "communityId": 3821,
            "loginUserName": "BBS_Assistant",
            "bizNo": "ebook",
        }

        if chapter_code_mapping.get(file) is None:
            resp = post(request_url, send_topic_request_param)
            if resp is None:
                raise RuntimeError('sendTopic returned no response for {}'.format(file))
            topic_link = resp['data']['content']['url']
            chapter_code_mapping[file] = topic_link
            print('{}:{}'.format(file, topic_link))
            # Persist after every new topic so progress survives a crash.
            _save_chapter_mapping(mapping_path, chapter_code_mapping)
        else:
            # NOTE(review): passing the stored topic id presumably makes the
            # endpoint update the existing topic instead of creating a new
            # one - confirm against the service API.
            send_topic_request_param['id'] = int(chapter_code_mapping[file].split('/')[-1])
            resp = post(request_url, send_topic_request_param)
            print('{}:{}'.format(file, chapter_code_mapping.get(file)))
src/ebook/ebook_get_request.py
已删除
100644 → 0
浏览文件 @
39af7cd8
import
json
import
requests
import
logging
logger
=
logging
.
getLogger
(
__name__
)
def get_chapter_content(params):
    """Fetch one chapter from the e-book service.

    Args:
        params: Query parameters forwarded to the request.

    Returns:
        The ``data`` member of the JSON response when the service answers
        with HTTP 200, otherwise an empty dict.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed, including when no answer arrives within 10 seconds.
    """
    url = 'http://192.168.50.117:9003/v1/chapter/content'
    # NOTE(review): a session cookie is hard-coded here; it should come from
    # configuration or the environment rather than from source control.
    headers = {
        "Cookie": "UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
    }
    # Without a timeout requests waits indefinitely for an unresponsive host.
    result = requests.get(url=url, params=params, headers=headers, timeout=10)
    if result.status_code != 200:
        logger.error('request failed!!!!!')
        return {}
    ret = json.loads(result.text)
    logger.info('request success')
    return ret['data']
def get_chapter_list(params):
    """Fetch the chapter list of a book from the e-book service.

    Args:
        params: Query parameters forwarded to the request.

    Returns:
        The ``data`` member of the JSON response when the service answers
        with HTTP 200, otherwise an empty dict.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed, including when no answer arrives within 10 seconds.
    """
    url = 'http://192.168.50.117:9003/inner/v1/chapter/list'
    # NOTE(review): a session cookie is hard-coded here; it should come from
    # configuration or the environment rather than from source control.
    headers = {
        "Cookie": "UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
    }
    # Without a timeout requests waits indefinitely for an unresponsive host.
    result = requests.get(url=url, params=params, headers=headers, timeout=10)
    if result.status_code != 200:
        logger.error('request failed!!!!!')
        return {}
    ret = json.loads(result.text)
    logger.info('request success')
    return ret['data']
\ No newline at end of file
src/ebook/extract_book_code.py
已删除
100644 → 0
浏览文件 @
39af7cd8
import
json
import
os
import
re
import
html
from
bs4
import
BeautifulSoup
from
.get_book_chapter_id_list
import
get_chapter_id_list
from
.ebook_get_request
import
get_chapter_content
def _pad_chapter_number(book_name, chapter_name):
    """Zero-pad the chapter number inside ``chapter_name`` to two characters.

    The name is returned unchanged when it carries no chapter number.
    """
    unit = '课' if book_name == "零基础学机器学习" else '章'
    pattern = r'第(.*)' + unit
    numbers = re.findall(pattern, chapter_name)
    if not numbers:
        return chapter_name
    padded = '第{}{}'.format(numbers[0].zfill(2), unit)
    # A callable replacement keeps backslashes in the title from being
    # interpreted as regex escapes.
    return re.sub(pattern, lambda _match: padded, chapter_name)


def _write_code_snippet(code, target_dir):
    """Strip markup and line numbers from one snippet and save it in ``target_dir``."""
    # An explicit parser keeps the result independent of which optional
    # parsers happen to be installed.
    clean_code = BeautifulSoup(html.unescape(code), 'html.parser').get_text()
    print(clean_code)
    print('-------' * 10)

    pianduan_name = re.findall(r'(代码片段.*),', clean_code)
    pianduan_name_str = pianduan_name[0] if pianduan_name else ''

    file_name_list = re.findall(r'文件名: (.*)\n', clean_code)
    print(file_name_list)
    file_name = file_name_list[0] if file_name_list else '.txt'
    file_name = file_name.replace('/', '-')

    # NOTE(review): snippets of one section that share the same name and file
    # name overwrite each other - confirm whether that is intended.
    code_save_path = os.path.join(target_dir, pianduan_name_str + '-' + file_name)

    res_code_list = []
    for line in clean_code.split('\n'):
        if '文件名' in line or '代码片段' in line or line == '':
            continue
        numbered = re.findall(r'^\d{1,5}\: *(.*)', line)
        # Lines without a "<number>:" prefix are kept verbatim instead of
        # aborting the whole extraction with an IndexError.
        res_code_list.append(numbered[0] if numbered else line)

    with open(code_save_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(res_code_list))


def extract_code():
    """Download the chapters of each configured book and save their code listings.

    For every book in the built-in mapping a directory ``data/<book name>`` is
    created, with one sub-directory per chapter. For the book "深入剖析Nginx"
    the chapter HTML is additionally split at its ``<h2>`` headings: every
    heading gets its own directory, and each numbered code listing found in a
    part is written there as a text file with markup and line numbers removed.
    Listings that appear before the first heading are stored in the chapter
    directory.

    Raises:
        IndexError: If a chapter contains more ``<h2>`` parts than headings of
            the form ``<h2 ...><a>title</a></h2>``.
    """
    # book_mapping_path = "data/book_mapping.json"
    # with open(book_mapping_path, "r") as f:
    #     book_mapping = json.load(f)
    book_mapping = {
        "深入剖析Nginx": "608fd0c7025a4a34a97a29897b067d24",
    }
    for book_name, book_id in book_mapping.items():
        book_dir = os.path.join('data', book_name)
        # makedirs also creates a missing "data" directory.
        os.makedirs(book_dir, exist_ok=True)

        chapter_id_list = get_chapter_id_list({"bookId": book_id, "is_main": 1})
        for chapter_id in chapter_id_list:
            chapter_resp = get_chapter_content({'bookId': book_id, 'chapterId': chapter_id})
            if not chapter_resp:
                # An empty result means the request failed; skip this chapter
                # rather than crash on the missing keys.
                print('skip chapter {}: empty response'.format(chapter_id))
                continue

            chapter_name = _pad_chapter_number(book_name, chapter_resp['name'])
            chapter_dir = os.path.join(book_dir, chapter_name)
            os.makedirs(chapter_dir, exist_ok=True)

            chapter_content = html.unescape(chapter_resp['content'])
            if book_name != "深入剖析Nginx":
                continue

            section_list = re.findall(r'<h2.*?><a>(.*?)</a></h2>', chapter_content, flags=re.S)
            section_content_list = re.split(r'<h2.*?>.*?</h2>', chapter_content, flags=re.S)

            section_dir_list = []
            for section in section_list:
                # NOTE(review): as visible here both arguments are a plain
                # space; the first one is presumably a different whitespace
                # character (e.g. a non-breaking space) - confirm.
                section = section.replace(' ', ' ')
                section = section.replace('/', '')
                section_dir = os.path.join(chapter_dir, section)
                os.makedirs(section_dir, exist_ok=True)
                section_dir_list.append(section_dir)

            for idx, section_content in enumerate(section_content_list):
                # Part 0 precedes the first heading; part i (i > 0) follows
                # heading i - 1.
                target_dir = chapter_dir if idx == 0 else section_dir_list[idx - 1]
                code_list = re.findall(
                    r'(?:(?:<p class="left">\d{1,5}\:.*? \n).*?)*',
                    section_content,
                    flags=re.S,
                )
                for code in code_list:
                    # The pattern also yields empty matches between listings.
                    if code != '':
                        _write_code_snippet(code, target_dir)
src/ebook/get_book_chapter_id_list.py
已删除
100644 → 0
浏览文件 @
39af7cd8
import
json
import
re
import
html
import
nltk
import
html2text
import
os
import
pandas
as
pd
from
bs4
import
BeautifulSoup
from
.ebook_get_request
import
get_chapter_list
def get_chapter_id_list(param):
    """Return the ``chapterid`` of every entry reported by ``get_chapter_list``.

    Args:
        param: Query parameters handed straight to ``get_chapter_list``.

    Returns:
        A list of ``chapterid`` values in the order the entries were returned.
    """
    return [entry['chapterid'] for entry in get_chapter_list(param)]
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录