# NOTE(review): this file's formatting is damaged and it does not parse as
# Python in its current form. The imports and most of extract_code() sit on a
# single physical line, and the lines after it have lost their indentation.
#
# What the visible text shows extract_code() doing:
#   1. Iterates over a hard-coded book_mapping (book name -> book id); the
#      JSON-file loading of that mapping is commented out. book_idx from
#      enumerate() is never used.
#   2. Creates data/<book name>/ if it does not exist.
#   3. Fetches the chapter id list with get_chapter_id_list() and, for every
#      chapter id, the chapter dict with get_chapter_content(); reads its
#      'name' and 'content' keys.
#   4. Rewrites the chapter name so the text captured between the two marker
#      characters of the chapter-number pattern is zero-padded with zfill(2).
#      One specific book title uses a different closing marker character.
#      If the name has no such pattern the bare `except:` swallows the error
#      and the original name is kept.
#   5. Creates a directory for the chapter, HTML-unescapes the content, then
#      extracts snippets and prints each one with its intended save path.
#
# NOTE(review): source text appears to be missing between the regex literal
# opened at the end of the long line and the `(.*?)` fragment that follows.
# The literal is split across physical lines (a syntax error for a
# single-quoted raw string), and section_content, code_list, idx and
# section_dir_list are all used below without being assigned anywhere in the
# visible text. Presumably markup inside the patterns was stripped together
# with the statements in between -- restore from the original file before
# relying on this code.
import json import os import re import html from bs4 import BeautifulSoup from .get_book_chapter_id_list import get_chapter_id_list from .ebook_get_request import get_chapter_content def extract_code(): # book_mapping_path = "data/book_mapping.json" # with open(book_mapping_path, "r") as f: # book_mapping = json.load(f) book_mapping = { "深入剖析Nginx": "608fd0c7025a4a34a97a29897b067d24", } for book_idx, book_name in enumerate(book_mapping.keys()): book_dir_name = book_name book_dir = os.path.join('data', book_dir_name) if not os.path.exists(book_dir): os.mkdir(book_dir) print(book_dir_name) book_id = book_mapping[book_name] request_get_chapter_id_list_params = {"bookId": book_id, "is_main": 1} chapter_id_list = get_chapter_id_list( request_get_chapter_id_list_params) print(chapter_id_list) for chapter_id in chapter_id_list: request_get_chapter_content_params = { 'bookId': book_id, 'chapterId': chapter_id } chapter_resp = get_chapter_content( request_get_chapter_content_params) chapter_name = chapter_resp['name'] chapter_content = chapter_resp['content'] try: if book_name == "零基础学机器学习": chapter_num = re.findall(r'第(.*)课', chapter_name)[0] chapter_name_modify = re.sub( r'第(.*)课', r'第{}课'.format(chapter_num.zfill(2)), chapter_name) else: chapter_num = re.findall(r'第(.*)章', chapter_name)[0] chapter_name_modify = re.sub( r'第(.*)章', r'第{}章'.format(chapter_num.zfill(2)), chapter_name) chapter_name = chapter_name_modify except: # print('该章节没有章节序号: {}'.format(chapter_name)) pass chapter_dir = os.path.join(book_dir, chapter_name) if not os.path.exists(chapter_dir): os.mkdir(chapter_dir) # print('创建文件夹: {}'.format(chapter_dir)) chapter_content = html.unescape(chapter_content) if book_name == "深入剖析Nginx": section_list = re.findall(r'
(.*?)
',
section_content, re.S)
# NOTE(review): res_code_list is initialised here but never appended to or
# read in the visible text.
res_code_list = []
# count numbers only the snippets that survive the length filter below.
count = 0
for i in code_list:
# Skip fragments that contain no newline (fewer than two lines).
if len(i.split('\n')) < 2:
continue
count += 1
# Decode HTML entities, then use BeautifulSoup.get_text() to keep only the
# text of the fragment.
# NOTE(review): no parser is passed to BeautifulSoup here -- confirm which
# parser is intended and name it explicitly.
i = html.unescape(i)
soup = BeautifulSoup(i)
res_str = soup.get_text()
# idx == 0: target is the fixed name code_0.java directly in the chapter
# directory (every snippet in this branch gets the same path).
# Otherwise: target is code_<count>.java inside section_dir_list[idx - 1].
if idx == 0:
code_save_dir = os.path.join(
chapter_dir, 'code_0.java')
else:
code_save_dir = os.path.join(
section_dir_list[idx - 1],
'code_{}.java'.format(count))
# Only the target path and the extracted text are printed; the file write
# and the line-number clean-up below are commented out.
print(code_save_dir)
print(res_str)
# with open(code_save_dir, 'w', encoding='utf-8') as f:
# f.write(res_str)
# clean_text_list = []
# for line in res_str.split('\n'):
# if line == '':
# continue
# if line[0].isdigit():
# line = re.findall(r'^[0-9]+ {0,2}(.*)',
# line)[0]
# # print(line)
# else:
# if line.startswith('>>'):
# break
# clean_text_list.append(line)
# clean_code = '\n'.join(clean_text_list)
# print(clean_code)