import json
import os
import re
import html
from bs4 import BeautifulSoup
from .get_book_chapter_id_list import get_chapter_id_list
from .ebook_get_request import get_chapter_content


def extract_code(book_mapping):
    """Download every chapter of every book in *book_mapping* and extract
    embedded code snippets to disk.

    book_mapping: dict mapping book display name -> remote book id.
    Side effects: creates ``data/<book>/<chapter>[/<section>]`` directories
    and writes one file per extracted code snippet. Returns nothing.

    NOTE(review): several regex literals below look garbled — the HTML tag
    text they originally matched appears to have been stripped (e.g.
    ``r'(.*?)'`` matches an empty string at every position). The patterns
    are preserved byte-for-byte; confirm against the original source.
    """
    # Previously the mapping was loaded from disk; it is now passed in.
    # book_mapping_path = "data/book_mapping.json"
    # with open(book_mapping_path, "r") as f:
    #     book_mapping = json.load(f)
    for book_idx, book_name in enumerate(book_mapping.keys()):
        # One directory per book under data/ (created lazily).
        book_dir_name = book_name
        book_dir = os.path.join('data', book_dir_name)
        if not os.path.exists(book_dir):
            os.mkdir(book_dir)
        # print(book_dir_name)
        book_id = book_mapping[book_name]
        request_get_chapter_id_list_params = {"bookId": book_id, "is_main": 1}
        chapter_id_list = get_chapter_id_list(
            request_get_chapter_id_list_params)
        # print(chapter_id_list)
        for chapter_id in chapter_id_list:
            request_get_chapter_content_params = {
                'bookId': book_id,
                'chapterId': chapter_id
            }
            chapter_resp = get_chapter_content(
                request_get_chapter_content_params)
            chapter_name = chapter_resp['name']
            chapter_content = chapter_resp['content']
            # Zero-pad the chapter number (第1章 -> 第01章) so directory
            # names sort correctly; one book numbers by 课 (lesson), the
            # rest by 章 (chapter).
            try:
                if book_name == "零基础学机器学习":
                    chapter_num = re.findall(r'第(.*)课', chapter_name)[0]
                    chapter_name_modify = re.sub(
                        r'第(.*)课',
                        r'第{}课'.format(chapter_num.zfill(2)), chapter_name)
                else:
                    chapter_num = re.findall(r'第(.*)章', chapter_name)[0]
                    chapter_name_modify = re.sub(
                        r'第(.*)章',
                        r'第{}章'.format(chapter_num.zfill(2)), chapter_name)
                chapter_name = chapter_name_modify
            except:
                # Chapter title has no number (e.g. a preface); keep the
                # name unchanged. NOTE(review): bare except also hides
                # unrelated errors — consider ``except IndexError``.
                pass
            chapter_dir = os.path.join(book_dir, chapter_name)
            if not os.path.exists(chapter_dir):
                os.mkdir(chapter_dir)
            # print('created directory: {}'.format(chapter_dir))
            # Decode HTML entities before any pattern matching.
            chapter_content = html.unescape(chapter_content)
            # print(chapter_content)
            # NOTE(review): everything below appears to run only for this
            # one book — confirm the intended nesting.
            if book_name == "前端体验设计":
                # Split the chapter into sections by heading.
                # NOTE(review): both patterns below look truncated — the
                # heading-tag text was likely lost during extraction, so
                # as written they match/split on empty strings. TODO:
                # restore the original tag patterns.
                section_list = re.findall(r'(.*?)',
                                          chapter_content,
                                          flags=re.S)
                # print(section_list)
                section_content_list = re.split(r'.*?',
                                                chapter_content,
                                                flags=re.S)
                section_dir_list = []
                for section in section_list:
                    # NOTE(review): the first argument here is likely a
                    # non-breaking space (was probably '&nbsp;' before the
                    # source was unescaped) — verify.
                    section = section.replace(' ', ' ')
                    # '/' is a path separator; strip it from section names.
                    if section.find(r'/') != -1:
                        section = section.replace('/', '')
                    section_dir = os.path.join(chapter_dir, section)
                    # print(section_dir)
                    if not os.path.exists(section_dir):
                        os.mkdir(section_dir)
                    section_dir_list.append(section_dir)
                # section_content_list[0] is the preamble before the first
                # heading; element i (i > 0) belongs to section_dir_list[i-1].
                for idx, section_content in enumerate(section_content_list):
                    # html_save_path is currently only used by the
                    # commented-out write below.
                    if idx == 0:
                        html_save_path = os.path.join(chapter_dir,
                                                      'text.html')
                    else:
                        html_save_path = os.path.join(
                            section_dir_list[idx - 1], 'text.html')
                    # with open(html_save_path, 'w', encoding='utf-8') as f:
                    #     f.write(section_content)
                    # Find numbered code listings ("NN: ..." lines).
                    # NOTE(review): this raw string was broken across
                    # physical lines in the retrieved source — almost
                    # certainly HTML tags were stripped out of it. It is
                    # re-joined here verbatim; restore the original
                    # pattern before running.
                    code_list = re.findall(r'(?:(?:\d{1,5}\:.*? \n).*?)*',
                                           section_content,
                                           flags=re.S)
                    res_codelist = []
                    for code in code_list:
                        if code != '':
                            res_codelist.append(code)
                    # print(res_codelist)
                    # break
                    count = 0
                    for code in res_codelist:
                        code = html.unescape(code)
                        # NOTE(review): no explicit parser is passed to
                        # BeautifulSoup — emits a warning and may vary by
                        # environment; consider BeautifulSoup(code,
                        # 'html.parser').
                        soup = BeautifulSoup(code)
                        clean_code = soup.get_text()
                        print(clean_code)
                        print('-------' * 10)
                        # "代码片段" = "code snippet" label, used as a
                        # filename prefix when present.
                        pianduan_name = re.findall(r'(代码片段.*),',
                                                   clean_code)
                        if pianduan_name == []:
                            pianduan_name_str = ''
                        else:
                            pianduan_name_str = pianduan_name[0]
                        # "文件名" = "file name" declared inside the
                        # listing; fall back to a bare '.txt'.
                        file_name_list = re.findall(r'文件名: (.*)\n',
                                                    clean_code)
                        print(file_name_list)
                        if file_name_list == []:
                            file_name = '.txt'
                        else:
                            file_name = file_name_list[0]
                            # '/' in a declared file name would create
                            # subdirectories; flatten it.
                            file_name = file_name.replace('/', '-')
                        save_file_name = pianduan_name_str + '-' + file_name
                        # print(save_file_name)
                        if idx == 0:
                            code_save_path = os.path.join(
                                chapter_dir, save_file_name)
                        else:
                            # count is incremented but never read —
                            # NOTE(review): dead variable.
                            count += 1
                            code_save_path = os.path.join(
                                section_dir_list[idx - 1], save_file_name)
                        # Strip the "NN: " line-number prefixes and drop
                        # label/blank lines, keeping only the code text.
                        res_code_list = []
                        for line in clean_code.split('\n'):
                            if line.find('文件名') != -1 or line.find(
                                    '代码片段') != -1 or line == '':
                                continue
                            clean_line = re.findall(r'^\d{1,5}\: *(.*)',
                                                    line)[0]
                            res_code_list.append(clean_line)
                        res_code = '\n'.join(res_code_list)
                        with open(code_save_path, 'w',
                                  encoding='utf-8') as f:
                            f.write(res_code)
                        # Earlier cleanup approach, kept for reference:
                        # clean_text_list = []
                        # for line in res_str.split('\n'):
                        #     if line == '':
                        #         continue
                        #     if line[0].isdigit():
                        #         line = re.findall(r'^[0-9]+ {0,2}(.*)',
                        #                           line)[0]
                        #         # print(line)
                        #     else:
                        #         if line.startswith('>>'):
                        #             break
                        #     clean_text_list.append(line)
                        # clean_code = '\n'.join(clean_text_list)
                        # print(clean_code)