import html
import json
import os
import re

from bs4 import BeautifulSoup

from .ebook_get_request import get_chapter_content
from .get_book_chapter_id_list import get_chapter_id_list


def _pad_chapter_number(book_name, chapter_name):
    """Return *chapter_name* with its chapter number zero-padded to 2 chars.

    The number is whatever sits between '第' and the unit character ('课' for
    the book "零基础学机器学习", '章' for every other book).  Names without
    such a marker are returned unchanged.
    """
    unit = '课' if book_name == "零基础学机器学习" else '章'
    pattern = r'第(.*)' + unit
    match = re.search(pattern, chapter_name)
    if match is None:
        # This chapter has no chapter number; keep the name as-is.
        return chapter_name
    padded = match.group(1).zfill(2)
    # A callable replacement keeps any backslash in the captured text from
    # being interpreted as a regex escape sequence.
    return re.sub(pattern, lambda _m: '第{}{}'.format(padded, unit),
                  chapter_name)


def _write_text(path, text):
    """Write *text* to *path* as UTF-8, replacing any existing file."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)


def _save_code_snippets(section_content, target_dir, is_preamble):
    """Write every multi-line code fragment of *section_content* to a file.

    Fragments are unescaped, stripped of markup via BeautifulSoup and stored
    in *target_dir*.  Single-line fragments are skipped.
    """
    # NOTE(review): this pattern has no tag delimiters around the group; it
    # looks as if the surrounding markup was lost from the source -- confirm
    # the intended pattern before relying on the result.
    code_list = re.findall(r'(.*?)', section_content, re.S)
    count = 0
    for fragment in code_list:
        if '\n' not in fragment:
            continue
        count += 1
        # Name the parser explicitly so the output does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html.unescape(fragment), 'html.parser')
        res_str = soup.get_text()
        # NOTE(review): the chapter preamble always uses 'code_0.java', so
        # several preamble snippets overwrite each other -- confirm intent.
        if is_preamble:
            file_name = 'code_0.java'
        else:
            file_name = 'code_{}.java'.format(count)
        print(res_str)
        _write_text(os.path.join(target_dir, file_name), res_str)
        # A disabled post-processing step used to follow here: it stripped
        # leading line numbers from each line and stopped at the first line
        # starting with '>>'.


def _save_sections(chapter_dir, chapter_content):
    """Split a chapter into sections and store each section's HTML and code.

    One sub-directory of *chapter_dir* is created per section title.  The
    text preceding the first section is stored in *chapter_dir* itself.
    """
    # NOTE(review): both patterns below have no tag delimiters; they look as
    # if heading markup was stripped from the source -- confirm the intended
    # patterns.  Section i of the split is paired with title i - 1.
    section_list = re.findall(r'(.*?)', chapter_content)
    section_content_list = re.split(r'.*?', chapter_content, flags=re.S)

    section_dir_list = []
    for section in section_list:
        # NOTE(review): as written this replaces a space with a space; the
        # first argument was presumably a non-breaking space -- confirm.
        section = section.replace(' ', ' ')
        # '/' would be taken as a path separator in the directory name.
        section = section.replace('/', '')
        section_dir = os.path.join(chapter_dir, section)
        print(section_dir)
        os.makedirs(section_dir, exist_ok=True)
        section_dir_list.append(section_dir)

    for idx, section_content in enumerate(section_content_list):
        is_preamble = idx == 0
        target_dir = chapter_dir if is_preamble else section_dir_list[idx - 1]
        _write_text(os.path.join(target_dir, 'text.html'), section_content)
        _save_code_snippets(section_content, target_dir, is_preamble)


def extract_code(book_mapping=None):
    """Download book chapters and save their sections and code snippets.

    For every book a directory ``data/<book name>`` is created, with one
    sub-directory per chapter.  For the book "全程软件测试(第3版)" each chapter
    is further split into sections whose HTML ('text.html') and code
    fragments ('code_<n>.java') are written to disk.

    Args:
        book_mapping: Optional dict mapping book name to book id.  When
            omitted, the built-in single-book mapping is used.  (A mapping
            file at data/book_mapping.json was used by an earlier, now
            disabled, code path.)
    """
    if book_mapping is None:
        book_mapping = {
            "全程软件测试(第3版)": "825acb73c85c4c4bb9632afe858bc097",
        }

    for book_name, book_id in book_mapping.items():
        if book_name == 'Python编程无师自通':
            continue

        book_dir = os.path.join('data', book_name)
        # makedirs also creates the parent 'data' directory when missing.
        os.makedirs(book_dir, exist_ok=True)
        print(book_name)

        chapter_id_list = get_chapter_id_list(
            {"bookId": book_id, "is_main": 1})
        print(chapter_id_list)

        for chapter_id in chapter_id_list:
            chapter_resp = get_chapter_content(
                {'bookId': book_id, 'chapterId': chapter_id})
            chapter_name = _pad_chapter_number(book_name,
                                               chapter_resp['name'])

            chapter_dir = os.path.join(book_dir, chapter_name)
            os.makedirs(chapter_dir, exist_ok=True)

            chapter_content = html.unescape(chapter_resp['content'])
            if book_name == "全程软件测试(第3版)":
                _save_sections(chapter_dir, chapter_content)