"""Extract code snippets from ebook chapters and save them under ``data/``.

NOTE(review): this file reached review with its whitespace collapsed and
with every HTML-tag-like token stripped out of the source text, which
destroyed the regex literals that matched ``<...>`` tags.  The code below
is a faithful reformat of everything that survived; the spans marked
``TODO(review)``/``FIXME`` are placeholders and MUST be restored from
version control before this module is usable.
"""
import json
import os
import re
import html

from bs4 import BeautifulSoup

from .get_book_chapter_id_list import get_chapter_id_list
from .ebook_get_request import get_chapter_content


def extract_code(book_mapping):
    """Download every chapter of every book in *book_mapping* and write the
    code blocks found in the chapter HTML to per-chapter directories.

    Parameters
    ----------
    book_mapping : dict
        Maps a book's display name (used as the directory name under
        ``data/``) to its remote ``bookId``.

    Side effects
    ------------
    Creates ``data/<book>/<chapter>/`` directories and writes the extracted
    snippets as ``code_<n>.css`` files; prints progress to stdout.
    """
    for book_name in book_mapping:
        book_dir = os.path.join('data', book_name)
        if not os.path.exists(book_dir):
            os.mkdir(book_dir)

        book_id = book_mapping[book_name]
        chapter_id_list = get_chapter_id_list(
            {"bookId": book_id, "is_main": 1})
        print(chapter_id_list)

        for chapter_id in chapter_id_list:
            print('当前章节id: {}'.format(chapter_id))
            chapter_resp = get_chapter_content(
                {'bookId': book_id, 'chapterId': chapter_id})
            chapter_name = chapter_resp['name']
            chapter_content = chapter_resp['content']

            # Zero-pad the chapter number (第1章 -> 第01章) so directory
            # names sort lexicographically.  One specific book numbers its
            # chapters with 课 instead of 章; the logic is otherwise
            # identical, so only the marker character differs.
            marker = '课' if book_name == "零基础学机器学习" else '章'
            pattern = r'第(.*)' + marker
            try:
                # IndexError is raised here when the title carries no
                # chapter number (findall returns an empty list).
                chapter_num = re.findall(pattern, chapter_name)[0]
                chapter_name = re.sub(
                    pattern,
                    '第{}{}'.format(chapter_num.zfill(2), marker),
                    chapter_name)
                print(chapter_name)
            except IndexError:
                # No chapter number in the title; keep the name unchanged.
                print('该章节没有章节序号: {}'.format(chapter_name))

            chapter_dir = os.path.join(book_dir, chapter_name)
            if not os.path.exists(chapter_dir):
                os.mkdir(chapter_dir)

            # Chapter HTML arrives entity-escaped; decode before matching.
            chapter_content = html.unescape(chapter_content)

            # ------------------------------------------------------------------
            # TODO(review): the regex literals below were destroyed when the
            # file's ``<...>`` tokens were stripped.  The surviving fragments
            # were:
            #     section_list = re.findall(r'
            #     (.*?)
            #     ', section_content, re.S)
            # and later code references ``section_dir_list`` and ``idx``, so
            # the lost span almost certainly (a) split the chapter into
            # sections, (b) created one directory per section and collected
            # them in ``section_dir_list``, and (c) iterated the sections with
            # ``enumerate`` binding ``idx`` and ``section_content``.  The
            # placeholders below are syntactically valid stand-ins ONLY;
            # restore the original tag patterns from version control.
            # ------------------------------------------------------------------
            section_list = re.findall(r'', chapter_content, re.S)  # FIXME lost pattern
            section_dir_list = []  # FIXME lost: per-section directory creation
            for idx, section_content in enumerate([chapter_content]):  # FIXME lost loop
                code_list = re.findall(
                    r'(.*?)', section_content, re.S)  # FIXME lost pattern

                # Drop whitespace-only matches.
                res_codelist = [c.strip() for c in code_list if c.strip()]

                count = 0
                for code in res_codelist:
                    # Single-line matches are inline-code noise; skip them.
                    if len(code.split('\n')) < 2:
                        continue
                    if idx == 0:
                        # Content before the first section header goes into
                        # the chapter directory.
                        # NOTE(review): every idx==0 snippet writes to the
                        # same code_0.css, so later snippets overwrite
                        # earlier ones — present in the original; confirm
                        # this is intentional.
                        code_save_path = os.path.join(chapter_dir,
                                                      'code_0.css')
                    else:
                        count += 1
                        code_save_path = os.path.join(
                            section_dir_list[idx - 1],
                            'code_{}.format'.replace('format', '').join([]) or
                            'code_{}.css'.format(count))
                    with open(code_save_path, 'w', encoding='utf-8') as f:
                        f.write(code)