提交 fd576066 编写于 作者: ToTensor's avatar ToTensor

modify ignore

上级 39af7cd8
__pycache__ __pycache__
\ No newline at end of file src
main.py
data/深入剖析Nginx.json
\ No newline at end of file
此差异已折叠。
from src.ebook.extract_book_code import extract_code
from src.ebook.community import send_topic
if __name__ == "__main__":
    # extract_code()
    # Base URL of the repository hosting the extracted code files; it is
    # prefixed to each file path when building topic links.
    repo_base = 'https://gitcode.net/csdn/content/book_id_08fd0c7025a4a34a97a29897b067d24/-/tree/master/'
    print('-------' * 20)
    print('开始向社区发帖')
    send_topic(repo_base)
\ No newline at end of file
import os
import json
import html
import requests
import logging
logger = logging.getLogger(__name__)
def get_files_path(file_dir, filetype='.txt'):
    """Recursively collect file paths under *file_dir*.

    Args:
        file_dir: root directory to walk.
        filetype: extension filter such as '.txt'; pass None to keep
            every file regardless of extension.

    Returns:
        List of paths (as produced by os.path.join on the walk results)
        for every matching file.
    """
    matched = []
    for root, _dirs, names in os.walk(file_dir):
        matched.extend(
            os.path.join(root, name)
            for name in names
            if filetype is None or os.path.splitext(name)[1] == filetype
        )
    return matched
def get_all_files(current_address):
    """Walk *current_address* recursively and return every file path found."""
    # Equivalent to get_files_path with no extension filter: one entry per
    # file, joined onto the directory os.walk reported it under.
    return [
        os.path.join(folder, entry)
        for folder, _subdirs, names in os.walk(current_address)
        for entry in names
    ]
def post(url, params, retry=3, headers=None):
    """POST *params* to *url* with retries.

    Args:
        url: target endpoint.
        params: payload; JSON-serialized when no custom headers are given,
            sent as-is otherwise (caller prepared the body).
        retry: maximum number of attempts.
        headers: optional header dict; defaults to a JSON content type.

    Returns:
        Decoded JSON response on success, or None when every attempt got a
        non-2xx/3xx response without raising.

    Raises:
        Exception: the last request error, once all attempts have raised.
    """
    hdrs = {"Content-Type": "application/json"} if headers is None else headers
    fails = 0
    while fails < retry:
        try:
            data = json.dumps(params) if headers is None else params
            logger.debug(f"will post {data} to {url}")
            resp = requests.post(url, data, headers=hdrs, timeout=10)
            if resp:  # Response truthiness: True for status codes < 400.
                logger.info(f"resp {resp.content}")
                return resp.json()
            logger.error(f"resp: [{resp}]")
            fails += 1
        except Exception as error:
            logger.error(f"post {params} to {url} failed {error}")
            fails += 1
            # BUGFIX: the original checked `fails > retry` after the loop,
            # which could never hold (fails caps at retry), and referenced
            # `error` outside the except block where it is unbound in
            # Python 3 — exhausted retries silently returned None.
            if fails >= retry:
                raise
    return None
def send_topic(web_url):
    """Create (or update) one community topic per code file of the book.

    Args:
        web_url: base URL of the code repository; each file's relative path
            is appended to build the link embedded in the topic body.

    Side effects:
        Posts to the internal community service and persists the
        file -> topic-link mapping as JSON so reruns update existing
        topics instead of creating duplicates.
    """
    book_dir = 'data/深入剖析Nginx/'
    # web_url = "https://codechina.csdn.net/csdn/book_code_c798a5992a654857867ec15660e1c32a/-/blob/master/"
    request_url = 'http://ccloud.internal.csdn.net/v1/internal/community/content/sendTopic'
    files = get_all_files(book_dir)
    print(files)
    mapping_path = 'data/深入剖析Nginx.json'
    # Seed an empty mapping file on first run so the load below always works.
    # BUGFIX: all opens now pass encoding='utf-8' — the mapping holds
    # non-ASCII text (ensure_ascii=False) and the platform default
    # encoding is not guaranteed to be UTF-8.
    if not os.path.exists(mapping_path):
        with open(mapping_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps({}, ensure_ascii=False, indent=2))
    with open(mapping_path, 'r', encoding='utf-8') as f:
        chapter_code_mapping = json.load(f)
    for file in files:
        # Topic titles must not contain '/' or spaces; substitute them.
        topic_title = file.replace(book_dir, '')
        topic_title = topic_title.replace('/', '|')
        topic_title = topic_title.replace(' ', '.')
        topic_content = web_url + file
        topic_content = "代码:<a href=\"{}\">{}</a>".format(
            topic_content, topic_title)
        print(topic_title)
        send_topic_request_param = {
            "type": "long_text",
            "cateId": 20966,
            "content": topic_content,
            "topicTitle": topic_title,
            "mdContent": topic_content,
            "communityId": 3821,
            "loginUserName": "BBS_Assistant",
            "bizNo": "ebook"
        }
        if chapter_code_mapping.get(file) is None:
            # New file: create the topic and persist its link immediately so
            # a crash mid-run does not lose progress.
            resp = post(request_url, send_topic_request_param)
            topic_link = resp['data']['content']['url']
            chapter_code_mapping[file] = topic_link
            print('{}:{}'.format(file, topic_link))
            with open(mapping_path, 'w', encoding='utf-8') as f:
                f.write(json.dumps(chapter_code_mapping,
                                   ensure_ascii=False,
                                   indent=2))
        else:
            # Known file: resend with the recorded topic id (last URL
            # segment) so the service updates the existing topic.
            send_topic_request_param['id'] = int(
                chapter_code_mapping[file].split('/')[-1])
            resp = post(request_url, send_topic_request_param)
            print('{}:{}'.format(file, chapter_code_mapping.get(file)))
import json
import requests
import logging
logger = logging.getLogger(__name__)
def get_chapter_content(params):
    """Fetch one chapter's content from the internal ebook service.

    Args:
        params: query parameters for the request (callers pass
            bookId/chapterId).

    Returns:
        The service's `data` payload on HTTP 200, otherwise {}.
    """
    url = 'http://192.168.50.117:9003/v1/chapter/content'
    # NOTE(review): hardcoded session token — should be loaded from
    # configuration or environment, not committed to source.
    headers = {
        "Cookie": "UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
    }
    result = requests.get(url=url, params=params, headers=headers)
    if result.status_code != 200:
        # BUGFIX: failures were logged at INFO level; use ERROR.
        logger.error('request failed!!!!!')
        return {}
    logger.info('request success')
    ret = json.loads(result.text)
    return ret['data']
def get_chapter_list(params):
    """Fetch the chapter list of a book from the internal ebook service.

    Args:
        params: query parameters for the request (callers pass bookId).

    Returns:
        The service's `data` payload on HTTP 200, otherwise {}.
    """
    url = 'http://192.168.50.117:9003/inner/v1/chapter/list'
    # NOTE(review): hardcoded session token — should be loaded from
    # configuration or environment, not committed to source.
    headers = {
        "Cookie": "UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
    }
    result = requests.get(url=url, params=params, headers=headers)
    if result.status_code != 200:
        # BUGFIX: failures were logged at INFO level; use ERROR.
        logger.error('request failed!!!!!')
        return {}
    logger.info('request success')
    ret = json.loads(result.text)
    return ret['data']
\ No newline at end of file
import json
import os
import re
import html
from bs4 import BeautifulSoup
from .get_book_chapter_id_list import get_chapter_id_list
from .ebook_get_request import get_chapter_content
def extract_code():
    """Download each configured book chapter by chapter and extract the
    numbered code listings into per-chapter / per-section directories
    under data/<book name>/.

    Uses the internal ebook service via get_chapter_id_list /
    get_chapter_content and writes one text file per code listing.
    """
    # book_mapping_path = "data/book_mapping.json"
    # with open(book_mapping_path, "r") as f:
    #     book_mapping = json.load(f)
    # Maps book title -> internal book id.
    book_mapping = {
        "深入剖析Nginx": "608fd0c7025a4a34a97a29897b067d24",
    }
    for book_idx, book_name in enumerate(book_mapping.keys()):
        book_dir_name = book_name
        book_dir = os.path.join('data', book_dir_name)
        if not os.path.exists(book_dir):
            os.mkdir(book_dir)
        # print(book_dir_name)
        book_id = book_mapping[book_name]
        request_get_chapter_id_list_params = {"bookId": book_id, "is_main": 1}
        chapter_id_list = get_chapter_id_list(
            request_get_chapter_id_list_params)
        # print(chapter_id_list)
        for chapter_id in chapter_id_list:
            request_get_chapter_content_params = {
                'bookId': book_id,
                'chapterId': chapter_id
            }
            chapter_resp = get_chapter_content(
                request_get_chapter_content_params)
            chapter_name = chapter_resp['name']
            chapter_content = chapter_resp['content']
            # Zero-pad the chapter number (e.g. 第1章 -> 第01章) so that
            # directory names sort in reading order. 零基础学机器学习 uses
            # 课 ("lesson") instead of 章 ("chapter") in its titles.
            try:
                if book_name == "零基础学机器学习":
                    chapter_num = re.findall(r'第(.*)课', chapter_name)[0]
                    chapter_name_modify = re.sub(
                        r'第(.*)课', r'第{}课'.format(chapter_num.zfill(2)),
                        chapter_name)
                else:
                    chapter_num = re.findall(r'第(.*)章', chapter_name)[0]
                    chapter_name_modify = re.sub(
                        r'第(.*)章', r'第{}章'.format(chapter_num.zfill(2)),
                        chapter_name)
                chapter_name = chapter_name_modify
            except:
                # Chapter title carries no number; keep the name unchanged.
                # NOTE(review): bare except also hides unrelated errors.
                pass
            chapter_dir = os.path.join(book_dir, chapter_name)
            if not os.path.exists(chapter_dir):
                os.mkdir(chapter_dir)
                # print('创建文件夹: {}'.format(chapter_dir))
            chapter_content = html.unescape(chapter_content)
            # print(chapter_content)
            if book_name == "深入剖析Nginx":
                # Section titles are the <h2><a>...</a></h2> headings; the
                # matching split yields the content between headings, with
                # index 0 being pre-heading chapter text.
                section_list = re.findall(r'<h2.*?><a>(.*?)</a></h2>',
                                          chapter_content,
                                          flags=re.S)
                # print(section_list)
                section_content_list = re.split(r'<h2.*?>.*?</h2>',
                                                chapter_content,
                                                flags=re.S)
                section_dir_list = []
                for section in section_list:
                    # Normalize the title into a valid directory name.
                    # NOTE(review): the first replace argument is presumably
                    # a non-breaking space mapped to a plain space — confirm
                    # the original byte.
                    section = section.replace(' ', ' ')
                    if section.find(r'/') != -1:
                        section = section.replace('/', '')
                    section_dir = os.path.join(chapter_dir, section)
                    # print(section_dir)
                    if not os.path.exists(section_dir):
                        os.mkdir(section_dir)
                    section_dir_list.append(section_dir)
                for idx, section_content in enumerate(section_content_list):
                    # idx == 0 is the chapter intro (before any heading);
                    # later indices map onto section_dir_list.
                    if idx == 0:
                        html_save_path = os.path.join(chapter_dir, 'text.html')
                    else:
                        html_save_path = os.path.join(
                            section_dir_list[idx - 1], 'text.html')
                    # with open(html_save_path, 'w', encoding='utf-8') as f:
                    #     f.write(section_content)
                    # Code listings appear as runs of numbered
                    # <p class="left">NN: ... lines; the * quantifier also
                    # produces empty matches, filtered out below.
                    code_list = re.findall(
                        r'(?:(?:<p class="left">\d{1,5}\:.*? \n).*?)*',
                        section_content,
                        flags=re.S)
                    res_codelist = []
                    for code in code_list:
                        if code != '':
                            res_codelist.append(code)
                    # print(res_codelist)
                    # break
                    count = 0
                    for code in res_codelist:
                        code = html.unescape(code)
                        # NOTE(review): BeautifulSoup without an explicit
                        # parser emits a warning and may vary by platform.
                        soup = BeautifulSoup(code)
                        clean_code = soup.get_text()
                        print(clean_code)
                        print('-------' * 10)
                        # File name pattern: "代码片段<n>,文件名: <path>".
                        pianduan_name = re.findall(r'(代码片段.*),', clean_code)
                        if pianduan_name == []:
                            pianduan_name_str = ''
                        else:
                            pianduan_name_str = pianduan_name[0]
                        file_name_list = re.findall(r'文件名: (.*)\n', clean_code)
                        print(file_name_list)
                        if file_name_list == []:
                            file_name = '.txt'
                        else:
                            file_name = file_name_list[0]
                        file_name = file_name.replace('/', '-')
                        save_file_name = pianduan_name_str + '-' + file_name
                        # print(save_file_name)
                        if idx == 0:
                            code_save_path = os.path.join(
                                chapter_dir, save_file_name)
                        else:
                            count += 1
                            code_save_path = os.path.join(
                                section_dir_list[idx - 1], save_file_name)
                        # Strip header lines and the "NN: " line-number
                        # prefix, keeping only the code text itself.
                        res_code_list = []
                        for line in clean_code.split('\n'):
                            if line.find('文件名') != -1 or line.find(
                                    '代码片段') != -1 or line == '':
                                continue
                            clean_line = re.findall(r'^\d{1,5}\: *(.*)',
                                                    line)[0]
                            res_code_list.append(clean_line)
                        res_code = '\n'.join(res_code_list)
                        with open(code_save_path, 'w', encoding='utf-8') as f:
                            f.write(res_code)
                        # clean_text_list = []
                        # for line in res_str.split('\n'):
                        #     if line == '':
                        #         continue
                        #     if line[0].isdigit():
                        #         line = re.findall(r'^[0-9]+ {0,2}(.*)',
                        #                           line)[0]
                        #         # print(line)
                        #     else:
                        #         if line.startswith('>>'):
                        #             break
                        #     clean_text_list.append(line)
                        # clean_code = '\n'.join(clean_text_list)
                        # print(clean_code)
import json
import re
import html
import nltk
import html2text
import os
import pandas as pd
from bs4 import BeautifulSoup
from .ebook_get_request import get_chapter_list
def get_chapter_id_list(param):
    """Return the chapter ids reported by the chapter-list service for
    the given request parameters."""
    return [record['chapterid'] for record in get_chapter_list(param)]
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册