# extract_structure(): for every chapter id listed in the book CSV, fetch the
# chapter via get_chapter_content(), create a per-chapter directory under
# ./test_dir/ (the chapter number matched by the 第…章 pattern is zero-padded
# to two characters with zfill(2)), HTML-unescape the chapter content, and
# then save the code snippets found in it as code_*.py files.
#
# NOTE(review): this file appears to have lost its line breaks and
# indentation, and the text between the opening quote of the first regex
# literal and `(.*?)` looks truncated. `section_content`, `code_list`, `idx`
# and `section_dir_list` are used below, but their definitions are not
# visible here -- restore them from the original file before running.
# NOTE(review): the bare `except: continue` skips a chapter when the 第…章
# pattern does not match (the `[0]` lookup fails), but it also hides every
# other error raised inside the `try` body -- consider narrowing it.
# NOTE(review): os.mkdir() creates a single directory level only; presumably
# ./test_dir already exists when this runs -- confirm.
import json import re import html import nltk import html2text import os import pandas as pd from bs4 import BeautifulSoup from src.extract.get_book_content import get_chapter_content def extract_structure(): params = { "bookId": "c798a5992a654857867ec15660e1c32a", } book_path = '20211203Python编程无师自通.csv' book_data = pd.read_csv(book_path) chapterid_list = book_data['chapterid'] for chapter_id in chapterid_list: # chapter_id = 87 params['chapterId'] = chapter_id res = get_chapter_content(params) chapter_name = res['name'] chapter_dir = './test_dir/{}'.format(chapter_name) try: chapter_num = re.findall(r'第(.*)章', chapter_dir)[0] print(chapter_num.zfill(2)) chapter_dir = re.sub(r'第(.*)章', r'第{}章'.format(chapter_num.zfill(2)), chapter_dir) print(chapter_dir) except: continue if not os.path.exists(chapter_dir): os.mkdir(chapter_dir) content = res['content'] content = html.unescape(content) section_list = re.findall(r'
(.*?)
', section_content, re.S)
# NOTE(review): res_code_list is never appended to or read in the visible code.
res_code_list = []
# count numbers the snippets that are kept; it is used in the
# 'code_{}.py' file name below.
count = 0
for i in code_list:
# Skip snippets that contain no newline (fewer than two lines).
if len(i.split('\n')) < 2:
continue
count+=1
# Decode HTML entities, then drop any remaining markup and keep only the text.
i = html.unescape(i)
# NOTE(review): no parser is named here, so bs4 chooses one itself;
# consider passing an explicit parser such as 'html.parser'.
soup = BeautifulSoup(i)
res_str = soup.get_text()
# Snippets seen while idx == 0 are written directly into the chapter directory;
# otherwise they go into the directory stored at section_dir_list[idx-1].
# NOTE(review): the idx == 0 file name is always 'code_0.py', so presumably
# several snippets in that case overwrite one another -- confirm intent.
if idx == 0:
code_save_dir = os.path.join(chapter_dir, 'code_0.py')
else:
code_save_dir = os.path.join(section_dir_list[idx-1], 'code_{}.py'.format(count))
clean_text_list = []
for line in res_str.split('\n'):
# Empty lines are dropped from the saved code.
if line == '':
continue
# A line whose first character is a digit has its leading digits and up to
# two following spaces removed (presumably a listing line number -- note
# this would also strip a code line that genuinely starts with a digit).
if line[0].isdigit():
line = re.findall(r'^[0-9]+ {0,2}(.*)', line)[0]
# print(line)
else:
# Stop collecting at the first line that starts with '>>'
# (presumably the start of an interactive-interpreter transcript).
if line.startswith('>>'):
break
clean_text_list.append(line)
# Write the cleaned lines to the chosen path as UTF-8.
clean_code = '\n'.join(clean_text_list)
with open(code_save_dir, 'w', encoding='utf-8') as f:
f.write(clean_code)