Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
CSDN 技术社区
content
前端体验设计
提交
3b227029
前
前端体验设计
项目概览
CSDN 技术社区
/
content
/
前端体验设计
通知
1
Star
2
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
前
前端体验设计
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
3b227029
编写于
12月 27, 2021
作者:
ToTensor
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
modify ignore
上级
3ca29f69
变更
7
展开全部
显示空白变更内容
内联
并排
Showing
7 changed files
with
4 additions
and
362 deletions
+4
-362
.gitignore
.gitignore
+4
-1
data/前端体验设计——HTML5+CSS3终极修炼.json
data/前端体验设计——HTML5+CSS3终极修炼.json
+0
-0
main.py
main.py
+0
-16
src/ebook/community.py
src/ebook/community.py
+0
-125
src/ebook/ebook_get_request.py
src/ebook/ebook_get_request.py
+0
-44
src/ebook/extract_book_code.py
src/ebook/extract_book_code.py
+0
-158
src/ebook/get_book_chapter_id_list.py
src/ebook/get_book_chapter_id_list.py
+0
-18
未找到文件。
.gitignore
浏览文件 @
3b227029
__pycache__
src
main.py
data/前端体验设计——HTML5+CSS3终极修炼.json
\ No newline at end of file
data/前端体验设计——HTML5+CSS3终极修炼.json
已删除
100644 → 0
浏览文件 @
3ca29f69
此差异已折叠。
点击以展开。
main.py
已删除
100644 → 0
浏览文件 @
3ca29f69
from
src.ebook.extract_book_code
import
extract_code
from
src.ebook.community
import
send_topic
if __name__ == "__main__":
    # Maps each book title (also used as its directory / mapping-file name
    # under data/) to the id that is passed to extract_code and embedded in
    # the repository URL below.
    book_mapping = {
        "前端体验设计——HTML5+CSS3终极修炼": "c4eeb42b07f54b42a9fd1568b8ec4b98",
    }
    for key, book_id in book_mapping.items():
        # Extract only the current book. Passing the whole mapping here
        # would re-extract every book once per loop iteration.
        extract_code({key: book_id})

        web_url = 'https://gitcode.net/csdn/content/book_code_{}/-/tree/master/'.format(book_id)
        print('-------' * 20)
        print('开始向社区发帖')
        book_dir = 'data/{}/'.format(key)
        mapping_path = 'data/{}.json'.format(key)
        send_topic(web_url, book_dir, mapping_path)
\ No newline at end of file
src/ebook/community.py
已删除
100644 → 0
浏览文件 @
3ca29f69
import
os
import
json
import
html
import
requests
import
logging
logger
=
logging
.
getLogger
(
__name__
)
def get_files_path(file_dir, filetype='.txt'):
    """Collect the paths of files under a directory tree, filtered by extension.

    Args:
        file_dir: Directory that is walked recursively.
        filetype: Extension to keep, compared against the second element of
            ``os.path.splitext`` (so it includes the leading dot). ``None``
            keeps every file.

    Returns:
        A list of paths, each built by joining the walked directory with the
        file name.
    """
    matched = []
    for folder, _subdirs, names in os.walk(file_dir):
        matched.extend(
            os.path.join(folder, name)
            for name in names
            if filetype is None or os.path.splitext(name)[1] == filetype
        )
    return matched
def get_all_files(current_address):
    """Return the path of every file found by walking ``current_address``.

    Each path is the walked directory joined with the file name; directories
    themselves are not included.
    """
    collected = []
    for folder, _subdirs, names in os.walk(current_address):
        collected.extend(os.path.join(folder, name) for name in names)
    return collected
def post(url, params, retry=3, headers=None):
    """POST ``params`` to ``url``, retrying when an attempt fails.

    Args:
        url: Target URL.
        params: Request payload. It is serialized with ``json.dumps`` when
            ``headers`` is None; otherwise it is sent unchanged.
        retry: Maximum number of attempts.
        headers: Optional request headers. When None, a JSON
            ``Content-Type`` header is used.

    Returns:
        The decoded JSON body of the first successful response, or None when
        the final attempt produced an unsuccessful response without raising
        (or when ``retry`` allows no attempt at all).

    Raises:
        Exception: The error raised by the final attempt, once all attempts
            are used up.
    """
    if headers is None:
        hdrs = {"Content-Type": "application/json"}
        # Serialize once up front: a payload that cannot be serialized will
        # not become serializable on a later attempt.
        data = json.dumps(params)
    else:
        hdrs = headers
        data = params

    # The name bound by ``except ... as`` is unbound when the except clause
    # ends, so the error is kept here to be re-raised after the loop.
    last_error = None
    for _attempt in range(retry):
        try:
            logger.debug(f"will post {data} to {url}")
            resp = requests.post(url, data, headers=hdrs, timeout=10)
            if resp:
                logger.info(f"resp {resp.content}")
                return resp.json()
            logger.error(f"resp: [{resp}]")
            last_error = None
        except Exception as error:
            logger.error(f"post {params} to {url} failed {error}")
            last_error = error

    if last_error is not None:
        raise last_error
    return None
def _save_chapter_mapping(mapping_path, chapter_code_mapping):
    """Write the file-path -> topic-URL mapping to ``mapping_path`` as JSON."""
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding is fixed explicitly instead of depending on the locale.
    with open(mapping_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(chapter_code_mapping, ensure_ascii=False, indent=2))


def send_topic(web_url, book_dir, mapping_path):
    """Post one community topic per file found under ``book_dir``.

    Each topic's content is an HTML link whose target is ``web_url``
    concatenated with the file path. The URL returned for a newly created
    topic is stored in the JSON mapping at ``mapping_path`` and the mapping
    is saved after every new topic. Files already present in the mapping are
    posted again with an ``id`` taken from the last ``/``-separated segment
    of their stored URL.

    Args:
        web_url: URL prefix prepended to each file path to form the link.
        book_dir: Directory whose files are posted; this string is also
            removed from each file path to form the topic title.
        mapping_path: Path of the JSON mapping file. An empty mapping is
            written there first if the file does not exist.

    Raises:
        RuntimeError: If posting a new topic yields no response body.
    """
    request_url = 'http://ccloud.internal.csdn.net/v1/internal/community/content/sendTopic'
    files = get_all_files(book_dir)
    print(files)

    if not os.path.exists(mapping_path):
        _save_chapter_mapping(mapping_path, {})
    with open(mapping_path, 'r', encoding='utf-8') as f:
        chapter_code_mapping = json.load(f)

    for file in files:
        # Title: path relative to book_dir, with '/' and ' ' substituted.
        topic_title = file.replace(book_dir, '').replace('/', '|').replace(' ', '.')
        topic_content = "代码:<a href=\"{}\">{}</a>".format(web_url + file, topic_title)
        print(topic_title)

        send_topic_request_param = {
            "type": "long_text",
            "cateId": 20967,
            "content": topic_content,
            "topicTitle": topic_title,
            "mdContent": topic_content,
            "communityId": 3823,
            "loginUserName": "BBS_Assistant",
            "bizNo": "ebook",
        }

        topic_link = chapter_code_mapping.get(file)
        if topic_link is None:
            # No stored URL for this file: create a new topic and record it.
            resp = post(request_url, send_topic_request_param)
            if resp is None:
                raise RuntimeError(
                    'sendTopic returned no response for {}'.format(file))
            topic_link = resp['data']['content']['url']
            chapter_code_mapping[file] = topic_link
            print('{}:{}'.format(file, topic_link))
            _save_chapter_mapping(mapping_path, chapter_code_mapping)
        else:
            # Stored URL found: send the same payload with the topic id
            # parsed from the end of that URL.
            send_topic_request_param['id'] = int(topic_link.split('/')[-1])
            post(request_url, send_topic_request_param)
            print('{}:{}'.format(file, topic_link))
src/ebook/ebook_get_request.py
已删除
100644 → 0
浏览文件 @
3ca29f69
import
json
import
requests
import
logging
logger
=
logging
.
getLogger
(
__name__
)
def get_chapter_content(params):
    """Request chapter content and return the ``data`` field of the reply.

    Args:
        params: Query parameters sent with the GET request.

    Returns:
        The value under ``'data'`` in the JSON response when the status code
        is 200, otherwise an empty dict.
    """
    url = 'http://192.168.50.117:9003/v1/chapter/content'
    # NOTE(review): this session cookie is hard-coded in source control;
    # it should be loaded from configuration and the token rotated.
    headers = {
        "Cookie": "UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
    }
    # Without a timeout requests waits indefinitely for an unresponsive host.
    result = requests.get(url=url, params=params, headers=headers, timeout=10)
    if result.status_code != 200:
        logger.error('request failed!!!!! status_code=%s', result.status_code)
        return {}
    ret = json.loads(result.text)
    logger.info('request success')
    return ret['data']
def get_chapter_list(params):
    """Request the chapter list and return the ``data`` field of the reply.

    Args:
        params: Query parameters sent with the GET request.

    Returns:
        The value under ``'data'`` in the JSON response when the status code
        is 200, otherwise an empty dict.
    """
    url = 'http://192.168.50.117:9003/inner/v1/chapter/list'
    # NOTE(review): this session cookie is hard-coded in source control;
    # it should be loaded from configuration and the token rotated.
    headers = {
        "Cookie": "UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
    }
    # Without a timeout requests waits indefinitely for an unresponsive host.
    result = requests.get(url=url, params=params, headers=headers, timeout=10)
    if result.status_code != 200:
        logger.error('request failed!!!!! status_code=%s', result.status_code)
        return {}
    ret = json.loads(result.text)
    logger.info('request success')
    return ret['data']
\ No newline at end of file
src/ebook/extract_book_code.py
已删除
100644 → 0
浏览文件 @
3ca29f69
import
json
import
os
import
re
import
html
from
bs4
import
BeautifulSoup
from
.get_book_chapter_id_list
import
get_chapter_id_list
from
.ebook_get_request
import
get_chapter_content
def _zero_pad_chapter_number(book_name, chapter_name):
    """Return ``chapter_name`` with its chapter number left-padded to two characters.

    Raises:
        IndexError: If ``chapter_name`` contains no chapter marker.
    """
    if book_name == "零基础学机器学习":
        pattern, template = r'第(.*)课', '第{}课'
    else:
        pattern, template = r'第(.*)章', '第{}章'
    # NOTE(review): the capture group is greedy, so a title with a second
    # marker character later on the line captures more than the number and
    # the padding has no effect. Left as is to keep directory names stable.
    chapter_num = re.findall(pattern, chapter_name)[0]
    replacement = template.format(chapter_num.zfill(2))
    # A callable replacement inserts the text literally, so a backslash in
    # the captured text is not interpreted as a regex escape.
    return re.sub(pattern, lambda _match: replacement, chapter_name)


def _create_section_dirs(chapter_dir, section_titles):
    """Create one numbered directory per section title and return their paths."""
    section_dirs = []
    for number, title in enumerate(section_titles, start=1):
        # NOTE(review): the first argument is assumed to be a non-breaking
        # space (what html.unescape yields for "&nbsp;"), because replacing
        # a plain space with a plain space would do nothing. Confirm
        # against the original file.
        title = title.replace('\xa0', ' ').replace('/', '')
        section_dir = os.path.join(chapter_dir, '{}.{}'.format(number, title))
        print(section_dir)
        os.makedirs(section_dir, exist_ok=True)
        section_dirs.append(section_dir)
    return section_dirs


def _write_code_snippets(target_dir, section_content, is_preamble):
    """Write each multi-line ``<code>`` snippet of ``section_content`` into ``target_dir``."""
    count = 0
    for raw in re.findall(r'<code>(.*?)</code>', section_content, re.S):
        code = raw.strip()
        # Only snippets spanning at least two lines are saved; this also
        # drops snippets that are empty after stripping.
        if '\n' not in code:
            continue
        if is_preamble:
            # NOTE(review): every preamble snippet goes to the same file, so
            # only the last one is kept. Behavior retained from the original.
            file_name = 'code_0.css'
        else:
            count += 1
            file_name = 'code_{}.css'.format(count)
        with open(os.path.join(target_dir, file_name), 'w', encoding='utf-8') as f:
            f.write(code)


def extract_code(book_mapping):
    """Save the code snippets of every chapter of every book under ``data/``.

    For each book, the chapter ids are fetched with ``get_chapter_id_list``
    and each chapter with ``get_chapter_content``. A chapter's HTML is split
    at its ``<h2>`` headings. Snippets found before the first heading are
    written into the chapter directory; snippets of the N-th section are
    written into a directory named ``N.<heading text>`` inside it.

    Args:
        book_mapping: Dict mapping a book name (used as the directory name
            under ``data``) to the book id sent in the requests.

    Raises:
        RuntimeError: If a chapter request returns no content.
    """
    for book_name, book_id in book_mapping.items():
        book_dir = os.path.join('data', book_name)
        # makedirs also creates the parent 'data' directory when it is missing.
        os.makedirs(book_dir, exist_ok=True)

        chapter_id_list = get_chapter_id_list({"bookId": book_id, "is_main": 1})
        print(chapter_id_list)

        for chapter_id in chapter_id_list:
            print('当前章节id: {}'.format(chapter_id))
            chapter_resp = get_chapter_content(
                {'bookId': book_id, 'chapterId': chapter_id})
            if not chapter_resp:
                raise RuntimeError(
                    'no content returned for chapter {} of book {}'.format(
                        chapter_id, book_id))
            chapter_name = chapter_resp['name']
            chapter_content = chapter_resp['content']

            try:
                chapter_name = _zero_pad_chapter_number(book_name, chapter_name)
                print(chapter_name)
            except IndexError:
                # No chapter marker in the title: keep the name unchanged.
                print('该章节没有章节序号: {}'.format(chapter_name))

            chapter_dir = os.path.join(book_dir, chapter_name)
            os.makedirs(chapter_dir, exist_ok=True)

            chapter_content = html.unescape(chapter_content)
            section_list = re.findall(
                r'<h2.*?>(.*?)</h2>', chapter_content, flags=re.S)
            print(section_list)
            # Splitting on the same headings yields one more piece than there
            # are headings: piece 0 is the text before the first heading and
            # piece i (i >= 1) belongs to heading i - 1.
            section_content_list = re.split(
                r'<h2.*?>.*?</h2>', chapter_content, flags=re.S)
            section_dir_list = _create_section_dirs(chapter_dir, section_list)

            for idx, section_content in enumerate(section_content_list):
                if idx == 0:
                    _write_code_snippets(chapter_dir, section_content, True)
                else:
                    _write_code_snippets(
                        section_dir_list[idx - 1], section_content, False)
src/ebook/get_book_chapter_id_list.py
已删除
100644 → 0
浏览文件 @
3ca29f69
import
json
import
re
import
html
import
nltk
import
html2text
import
os
import
pandas
as
pd
from
bs4
import
BeautifulSoup
from
.ebook_get_request
import
get_chapter_list
def get_chapter_id_list(param):
    """Return the ``'chapterid'`` value of every item yielded by ``get_chapter_list(param)``."""
    return [entry['chapterid'] for entry in get_chapter_list(param)]
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录