import html
import json
import os
import re

import html2text
import nltk
import pandas as pd
from bs4 import BeautifulSoup

from src.extract.get_book_content import get_chapter_content

# "第N章" chapter marker inside a chapter title; group 1 is the text between
# the two characters (greedy, exactly as the original pattern).
_CHAPTER_MARKER_RE = re.compile(r'第(.*)章')

# Leading run of ASCII digits followed by at most two spaces; group 1 is the
# remainder of the line.
_LINE_NUMBER_RE = re.compile(r'^[0-9]+ {0,2}(.*)')


def _clean_snippet_text(text):
    """Return *text* with listing noise removed.

    Empty lines are dropped, a leading ``[0-9]+`` prefix (plus up to two
    spaces) is stripped from each line, and processing stops at the first
    line that starts with ``>>`` (that line and everything after it are
    discarded).
    """
    kept = []
    for line in text.split('\n'):
        if line == '':
            continue
        if line.startswith('>>'):
            break
        # `match` returns None for lines without an ASCII-digit prefix,
        # including lines starting with a non-ASCII digit, which made the
        # previous `findall(...)[0]` raise IndexError.
        numbered = _LINE_NUMBER_RE.match(line)
        kept.append(numbered.group(1) if numbered else line)
    return '\n'.join(kept)


def _save_code_snippets(section_content, target_dir, is_chapter_intro,
                        html_parser):
    """Write every multi-line snippet found in *section_content* to a file.

    Snippets are written into *target_dir* as ``code_<n>.py`` where ``n``
    counts the multi-line snippets of this section starting at 1. When
    *is_chapter_intro* is true the file name is always ``code_0.py``.
    """
    # NOTE(review): as written this pattern matches the empty string at every
    # position; it looks like the surrounding markup delimiters were lost
    # (presumably a code/pre tag pair) -- confirm against the original.
    code_list = re.findall(r'(.*?)', section_content, re.S)

    count = 0
    for raw_snippet in code_list:
        # Single-line matches are skipped.
        if '\n' not in raw_snippet:
            continue
        count += 1

        markup = html.unescape(raw_snippet)
        # An explicit parser keeps the output independent of which optional
        # parsers happen to be installed and avoids bs4's "no parser was
        # explicitly specified" warning.
        plain_text = BeautifulSoup(markup, html_parser).get_text()
        clean_code = _clean_snippet_text(plain_text)

        if is_chapter_intro:
            # NOTE(review): every chapter-level snippet goes to the same
            # file, so later snippets overwrite earlier ones. Kept as-is
            # because consumers may rely on the name -- confirm.
            file_name = 'code_0.py'
        else:
            file_name = 'code_{}.py'.format(count)

        code_save_dir = os.path.join(target_dir, file_name)
        with open(code_save_dir, 'w', encoding='utf-8') as f:
            f.write(clean_code)


def extract_structure(book_path='20211203Python编程无师自通.csv',
                      book_id='c798a5992a654857867ec15660e1c32a',
                      output_root='./test_dir',
                      html_parser='html.parser'):
    """Export the code snippets of a book into a chapter/section directory tree.

    Reads the ``chapterid`` column of the CSV at *book_path*, fetches each
    chapter with ``get_chapter_content`` (called with ``bookId`` and
    ``chapterId``; the result is indexed by ``'name'`` and ``'content'``),
    creates ``<output_root>/<chapter name>/<section>/`` directories and writes
    the cleaned snippets into them.

    Chapters whose name contains no ``第…章`` marker are skipped. The text
    between the two marker characters is left-padded with zeros to two
    characters in the directory name.

    Args:
        book_path: CSV file that has a ``chapterid`` column.
        book_id: Value sent as ``bookId`` to ``get_chapter_content``.
        output_root: Directory under which chapter directories are created;
            it is created if missing.
        html_parser: Parser name handed to ``BeautifulSoup``.
    """
    params = {
        "bookId": book_id,
    }
    book_data = pd.read_csv(book_path)

    for chapter_id in book_data['chapterid']:
        params['chapterId'] = chapter_id
        res = get_chapter_content(params)

        chapter_name = str(res['name'])
        marker = _CHAPTER_MARKER_RE.search(chapter_name)
        if marker is None:
            # Not a numbered chapter: nothing to export. This replaces the
            # former bare `except: continue`, which also hid real errors.
            continue

        chapter_num = marker.group(1).zfill(2)
        print(chapter_num)
        padded_marker = '第{}章'.format(chapter_num)
        # A callable replacement keeps backslashes in the captured text from
        # being interpreted as regex replacement escapes.
        padded_name = _CHAPTER_MARKER_RE.sub(lambda _m: padded_marker,
                                             chapter_name)
        chapter_dir = '{}/{}'.format(output_root, padded_name)
        print(chapter_dir)
        # makedirs also creates a missing output root; exist_ok makes
        # re-runs safe.
        os.makedirs(chapter_dir, exist_ok=True)

        content = html.unescape(res['content'])

        # NOTE(review): both patterns match the empty string as written; they
        # look like the section-heading markup delimiters were lost from the
        # source -- confirm the intended patterns before trusting the output.
        section_list = re.findall(r'(.*?)', content)
        section_content_list = re.split(r'.*?', content, flags=re.S)

        section_dir_list = []
        for section in section_list:
            section_dir = os.path.join(chapter_dir, section)
            os.makedirs(section_dir, exist_ok=True)
            section_dir_list.append(section_dir)

        # Piece 0 is the text before the first section split point and belongs
        # to the chapter directory; piece k (k >= 1) belongs to section k - 1.
        for idx, section_content in enumerate(section_content_list):
            is_chapter_intro = idx == 0
            if is_chapter_intro:
                target_dir = chapter_dir
            else:
                target_dir = section_dir_list[idx - 1]
            _save_code_snippets(section_content, target_dir,
                                is_chapter_intro, html_parser)