提交 cd3eedc5,作者:ToTensor

extract success

上级 1ac1d443
......@@ -3,10 +3,13 @@ from src.ebook.community import send_topic
if __name__ == "__main__":
book_mapping = {
"前端体验设计": "c4eeb42b07f54b42a9fd1568b8ec4b98",
"前端体验设计——HTML5+CSS3终极修炼": "c4eeb42b07f54b42a9fd1568b8ec4b98",
}
extract_code(book_mapping)
web_url = 'https://gitcode.net/csdn/content/book_id_c4eeb42b07f54b42a9fd1568b8ec4b98/-/tree/master/'
print('-------' * 20)
print('开始向社区发帖')
# send_topic(web_url)
\ No newline at end of file
for key in book_mapping.keys():
extract_code(book_mapping)
web_url = 'https://gitcode.net/csdn/content/book_id_{}/-/tree/master/'.format(
book_mapping[key])
print('-------' * 20)
print('开始向社区发帖')
book_dir = 'data/{}/'.format(key)
# send_topic(web_url, book_dir)
\ No newline at end of file
......@@ -65,9 +65,8 @@ def post(url, params, retry=3, headers=None):
raise error
def send_topic(web_url):
def send_topic(web_url, book_dir):
data_dir = 'data'
book_dir = 'data/深入剖析Nginx/'
# web_url = "https://codechina.csdn.net/csdn/book_code_c798a5992a654857867ec15660e1c32a/-/blob/master/"
request_url = 'http://ccloud.internal.csdn.net/v1/internal/community/content/sendTopic'
......
......@@ -57,103 +57,102 @@ def extract_code(book_mapping):
chapter_content = html.unescape(chapter_content)
# print(chapter_content)
if book_name == "前端体验设计":
section_list = re.findall(r'<h2.*?>(.*?)</h2>',
chapter_content,
flags=re.S)
section_list = re.findall(r'<h2.*?>(.*?)</h2>',
chapter_content,
flags=re.S)
print(section_list)
section_content_list = re.split(r'<h2.*?>.*?</h2>',
chapter_content,
flags=re.S)
section_dir_list = []
for idx, section in enumerate(section_list):
section = section.replace(' ', ' ')
if section.find(r'/') != -1:
section = section.replace('/', '')
section_dir = os.path.join(
chapter_dir, '{}.{}'.format(idx + 1, section))
print(section_dir)
if not os.path.exists(section_dir):
os.mkdir(section_dir)
section_dir_list.append(section_dir)
for idx, section_content in enumerate(section_content_list):
if idx == 0:
html_save_path = os.path.join(chapter_dir, 'text.html')
else:
html_save_path = os.path.join(
section_dir_list[idx - 1], 'text.html')
# with open(html_save_path, 'w', encoding='utf-8') as f:
# f.write(section_content)
print(section_list)
section_content_list = re.split(r'<h2.*?>.*?</h2>',
chapter_content,
flags=re.S)
section_dir_list = []
for idx, section in enumerate(section_list):
section = section.replace(' ', ' ')
if section.find(r'/') != -1:
section = section.replace('/', '')
section_dir = os.path.join(chapter_dir,
'{}.{}'.format(idx + 1, section))
print(section_dir)
if not os.path.exists(section_dir):
os.mkdir(section_dir)
section_dir_list.append(section_dir)
for idx, section_content in enumerate(section_content_list):
if idx == 0:
html_save_path = os.path.join(chapter_dir, 'text.html')
else:
html_save_path = os.path.join(section_dir_list[idx - 1],
'text.html')
# with open(html_save_path, 'w', encoding='utf-8') as f:
# f.write(section_content)
code_list = re.findall(r'<code>(.*?)</code>',
section_content, re.S)
code_list = re.findall(r'<code>(.*?)</code>', section_content,
re.S)
res_codelist = []
for code in code_list:
code = code.strip()
if code != '':
res_codelist.append(code)
# print(res_codelist)
# break
count = 0
for code in res_codelist:
if len(code.split('\n')) < 2:
continue
# code = html.unescape(code)
# soup = BeautifulSoup(code)
# clean_code = soup.get_text()
# print(clean_code)
res_codelist = []
for code in code_list:
code = code.strip()
if code != '':
res_codelist.append(code)
# print(res_codelist)
# break
count = 0
for code in res_codelist:
if len(code.split('\n')) < 2:
continue
# code = html.unescape(code)
# soup = BeautifulSoup(code)
# clean_code = soup.get_text()
# print(clean_code)
# print('-------' * 10)
# pianduan_name = re.findall(r'(代码片段.*),', clean_code)
# if pianduan_name == []:
# pianduan_name_str = ''
# else:
# pianduan_name_str = pianduan_name[0]
# file_name_list = re.findall(r'文件名: (.*)\n', clean_code)
# print(file_name_list)
# if file_name_list == []:
# file_name = '.txt'
# else:
# file_name = file_name_list[0]
# file_name = file_name.replace('/', '-')
# save_file_name = pianduan_name_str + '-' + file_name
# print(save_file_name)
# print('-------' * 10)
# pianduan_name = re.findall(r'(代码片段.*),', clean_code)
# if pianduan_name == []:
# pianduan_name_str = ''
# else:
# pianduan_name_str = pianduan_name[0]
# file_name_list = re.findall(r'文件名: (.*)\n', clean_code)
# print(file_name_list)
# if file_name_list == []:
# file_name = '.txt'
# else:
# file_name = file_name_list[0]
# file_name = file_name.replace('/', '-')
# save_file_name = pianduan_name_str + '-' + file_name
# print(save_file_name)
if idx == 0:
code_save_path = os.path.join(
chapter_dir, 'code_0.css')
else:
count += 1
code_save_path = os.path.join(
section_dir_list[idx - 1],
'code_{}.css'.format(count))
if idx == 0:
code_save_path = os.path.join(chapter_dir,
'code_0.css')
else:
count += 1
code_save_path = os.path.join(
section_dir_list[idx - 1],
'code_{}.css'.format(count))
# res_code_list = []
# for line in clean_code.split('\n'):
# if line.find('文件名') != -1 or line.find(
# '代码片段') != -1 or line == '':
# continue
# clean_line = re.findall(r'^\d{1,5}\: *(.*)',
# line)[0]
# res_code_list.append(clean_line)
# res_code = '\n'.join(res_code_list)
# res_code_list = []
# for line in clean_code.split('\n'):
# if line.find('文件名') != -1 or line.find(
# '代码片段') != -1 or line == '':
# continue
# clean_line = re.findall(r'^\d{1,5}\: *(.*)',
# line)[0]
# res_code_list.append(clean_line)
# res_code = '\n'.join(res_code_list)
with open(code_save_path, 'w', encoding='utf-8') as f:
f.write(code)
with open(code_save_path, 'w', encoding='utf-8') as f:
f.write(code)
# clean_text_list = []
# for line in res_str.split('\n'):
# if line == '':
# continue
# if line[0].isdigit():
# line = re.findall(r'^[0-9]+ {0,2}(.*)',
# line)[0]
# # print(line)
# else:
# if line.startswith('>>'):
# break
# clean_text_list.append(line)
# clean_code = '\n'.join(clean_text_list)
# print(clean_code)
# clean_text_list = []
# for line in res_str.split('\n'):
# if line == '':
# continue
# if line[0].isdigit():
# line = re.findall(r'^[0-9]+ {0,2}(.*)',
# line)[0]
# # print(line)
# else:
# if line.startswith('>>'):
# break
# clean_text_list.append(line)
# clean_code = '\n'.join(clean_text_list)
# print(clean_code)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册