extract_e_book_structure.py 3.1 KB
Newer Older
ToTensor's avatar
ToTensor 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
import json
import re
import html
import nltk
import html2text
import os
import pandas as pd
from bs4 import BeautifulSoup
from src.extract.get_book_content import get_chapter_content

def extract_structure():
    """Download every chapter of one e-book, split each chapter into
    sections by its <h2> headings, and save the multi-line code listings
    found in each section under ./test_dir/<chapter>/<section>/code_N.py.

    Chapter ids are read from a local CSV; chapter HTML is fetched via
    get_chapter_content. Chapters whose name does not match the 第N章
    pattern are skipped.
    """
    params = {
        "bookId": "c798a5992a654857867ec15660e1c32a",
    }

    book_path = '20211203Python编程无师自通.csv'
    book_data = pd.read_csv(book_path)

    for chapter_id in book_data['chapterid']:
        params['chapterId'] = chapter_id
        res = get_chapter_content(params)
        chapter_dir = './test_dir/{}'.format(res['name'])
        try:
            # Zero-pad the chapter number so directories sort naturally
            # (第02章 before 第10章).
            chapter_num = re.findall(r'第(.*)章', chapter_dir)[0]
            chapter_dir = re.sub(
                r'第(.*)章',
                r'第{}章'.format(chapter_num.zfill(2)),
                chapter_dir,
            )
        except IndexError:
            # Chapter name has no 第N章 marker (e.g. preface) — skip it.
            continue
        # makedirs(exist_ok=True) also creates ./test_dir on first run,
        # where the original os.mkdir would raise FileNotFoundError.
        os.makedirs(chapter_dir, exist_ok=True)

        content = html.unescape(res['content'])

        # Section titles come from the <h2> headings; splitting on the
        # same headings yields one content chunk per section, with
        # chunk 0 being the preamble before the first heading.
        section_list = re.findall(r'<h2.*>(.*?)</h2>', content)
        section_content_list = re.split(r'<h2.*?>.*?</h2>', content, flags=re.S)

        section_dir_list = []
        for section in section_list:
            section_dir = os.path.join(chapter_dir, section)
            os.makedirs(section_dir, exist_ok=True)
            section_dir_list.append(section_dir)

        for idx, section_content in enumerate(section_content_list):
            # idx == 0 is the chapter preamble before the first <h2>;
            # its code files go directly into the chapter directory.
            target_dir = chapter_dir if idx == 0 else section_dir_list[idx - 1]
            _save_code_blocks(section_content, target_dir)


def _save_code_blocks(section_content, target_dir):
    """Extract every multi-line <code> block from *section_content* and
    write each one to target_dir/code_N.py (N = 1-based position).

    Single-line <code> spans are inline code, not listings, and are
    ignored. Numbering by position fixes the original behaviour of
    always writing the preamble's blocks to code_0.py, which silently
    overwrote all but the last listing.
    """
    count = 0
    for raw in re.findall(r'<code>(.*?)</code>', section_content, re.S):
        if len(raw.split('\n')) < 2:
            continue
        count += 1
        # Unescape HTML entities, then strip any residual markup.
        # 'html.parser' is pinned so bs4 does not guess a parser
        # (avoids GuessedAtParserWarning and cross-machine drift).
        text = BeautifulSoup(html.unescape(raw), 'html.parser').get_text()
        clean_code = _strip_line_numbers(text)
        code_save_path = os.path.join(target_dir, 'code_{}.py'.format(count))
        with open(code_save_path, 'w', encoding='utf-8') as f:
            f.write(clean_code)


def _strip_line_numbers(text):
    """Drop printed line-number prefixes from a code listing.

    Blank lines are removed; a line starting with a digit has its
    leading number (and up to two following spaces) stripped; the first
    non-numbered line starting with '>>' marks interactive output and
    ends the listing.
    """
    clean_lines = []
    for line in text.split('\n'):
        if line == '':
            continue
        if line[0].isdigit():
            line = re.findall(r'^[0-9]+ {0,2}(.*)', line)[0]
        elif line.startswith('>>'):
            break
        clean_lines.append(line)
    return '\n'.join(clean_lines)