import json
import os
import re
import html

from bs4 import BeautifulSoup

from .get_book_chapter_id_list import get_chapter_id_list
from .ebook_get_request import get_chapter_content


def extract_code(book_mapping):
    # book_mapping_path = "data/book_mapping.json"
    # with open(book_mapping_path, "r") as f:
    #     book_mapping = json.load(f)
    for book_idx, book_name in enumerate(book_mapping.keys()):
        book_dir_name = book_name
        book_dir = os.path.join('data', book_dir_name)
        if not os.path.exists(book_dir):
            os.mkdir(book_dir)
        # print(book_dir_name)

        book_id = book_mapping[book_name]
        request_get_chapter_id_list_params = {"bookId": book_id, "is_main": 1}
        chapter_id_list = get_chapter_id_list(request_get_chapter_id_list_params)
        print(chapter_id_list)

        for chapter_id in chapter_id_list:
            print('Current chapter id: {}'.format(chapter_id))
            request_get_chapter_content_params = {
                'bookId': book_id,
                'chapterId': chapter_id
            }
            chapter_resp = get_chapter_content(request_get_chapter_content_params)
            chapter_name = chapter_resp['name']
            chapter_content = chapter_resp['content']

            # Zero-pad the chapter number so directories sort correctly,
            # e.g. "第1章" -> "第01章"; the book "零基础学机器学习" numbers its
            # chapters with "课" instead of "章".
            try:
                if book_name == "零基础学机器学习":
                    chapter_num = re.findall(r'第(.*)课', chapter_name)[0]
                    chapter_name_modify = re.sub(
                        r'第(.*)课', r'第{}课'.format(chapter_num.zfill(2)),
                        chapter_name)
                else:
                    chapter_num = re.findall(r'第(.*)章', chapter_name)[0]
                    chapter_name_modify = re.sub(
                        r'第(.*)章', r'第{}章'.format(chapter_num.zfill(2)),
                        chapter_name)
                chapter_name = chapter_name_modify
                print(chapter_name)
            except IndexError:
                print('Chapter has no chapter number: {}'.format(chapter_name))

            chapter_dir = os.path.join(book_dir, chapter_name)
            if not os.path.exists(chapter_dir):
                os.mkdir(chapter_dir)
            # print('Created directory: {}'.format(chapter_dir))

            chapter_content = html.unescape(chapter_content)
            # print(chapter_content)

            # The tag patterns below are assumptions: the chapter HTML is
            # taken to mark section titles with <h2> headings and code
            # listings with <code> blocks; adjust them to the actual markup.
            section_list = re.findall(r'<h2.*?>(.*?)</h2>', chapter_content)

            # One sub-directory per section; code that appears before the
            # first section heading is saved directly under the chapter dir.
            section_dir_list = []
            for section_name in section_list:
                section_dir = os.path.join(chapter_dir, section_name)
                if not os.path.exists(section_dir):
                    os.mkdir(section_dir)
                section_dir_list.append(section_dir)

            section_content_list = re.split(r'<h2.*?>.*?</h2>', chapter_content,
                                            flags=re.DOTALL)
            for idx, section_content in enumerate(section_content_list):
                # Raw code blocks in this section (pattern is an assumption,
                # see the note above).
                code_list = re.findall(r'<code.*?>(.*?)</code>',
                                       section_content,
                                       flags=re.DOTALL)
                # print(code_list)
                res_codelist = []
                for code in code_list:
                    if code != '':
                        res_codelist.append(code)
                # print(res_codelist)
                # break

                count = 0
                for code in res_codelist:
                    # if len(code.split('\n')) < 2:
                    #     continue
                    code = html.unescape(code)
                    soup = BeautifulSoup(code, 'html.parser')
                    clean_code = soup.get_text()
                    print(clean_code)
                    print('-------' * 10)

                    # pianduan_name = re.findall(r'(代码片段.*),', clean_code)
                    # if pianduan_name == []:
                    #     pianduan_name_str = ''
                    # else:
                    #     pianduan_name_str = pianduan_name[0]
                    # file_name_list = re.findall(r'文件名: (.*)\n', clean_code)
                    # print(file_name_list)
                    # if file_name_list == []:
                    #     file_name = '.txt'
                    # else:
                    #     file_name = file_name_list[0]
                    # file_name = file_name.replace('/', '-')
                    # save_file_name = pianduan_name_str + '-' + file_name
                    # print(save_file_name)

                    # idx == 0 holds the content before the first section
                    # heading and is written straight to the chapter dir.
                    if idx == 0:
                        code_save_path = os.path.join(chapter_dir, 'code_0.py')
                    else:
                        count += 1
                        code_save_path = os.path.join(
                            section_dir_list[idx - 1],
                            'code_{}.py'.format(count))

                    # res_code_list = []
                    # for line in clean_code.split('\n'):
                    #     if line.find('文件名') != -1 or line.find(
                    #             '代码片段') != -1 or line == '':
                    #         continue
                    #     clean_line = re.findall(r'^\d{1,5}\: *(.*)', line)[0]
                    #     res_code_list.append(clean_line)
                    # res_code = '\n'.join(res_code_list)

                    with open(code_save_path, 'w', encoding='utf-8') as f:
                        f.write(clean_code)

                    # clean_text_list = []
                    # for line in res_str.split('\n'):
                    #     if line == '':
                    #         continue
                    #     if line[0].isdigit():
                    #         line = re.findall(r'^[0-9]+ {0,2}(.*)', line)[0]
                    #         # print(line)
                    #     else:
                    #         if line.startswith('>>'):
                    #             break
                    #     clean_text_list.append(line)
                    # clean_code = '\n'.join(clean_text_list)
                    # print(clean_code)
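

# Usage sketch (an assumption, not part of the original module): book_mapping
# is taken to map a book title to its bookId, mirroring the commented-out
# data/book_mapping.json loading at the top of extract_code. Run it as a
# module, e.g. `python -m <package>.<this_module>`, so the relative imports
# resolve.
if __name__ == "__main__":
    with open("data/book_mapping.json", "r", encoding="utf-8") as f:
        book_mapping = json.load(f)
    extract_code(book_mapping)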