import json
import os
import re
import html
from bs4 import BeautifulSoup
from .get_book_chapter_id_list import get_chapter_id_list
from .ebook_get_request import get_chapter_content


def extract_code():
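    """Extract code listings from an online ebook.

    For every book in ``book_mapping``, fetch each chapter's HTML, split it
    into sections on <h2> headings, pull out multi-line <code> blocks and
    print the target save path and code text (the actual file writes are
    currently commented out).
    """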

    # book_mapping_path = "data/book_mapping.json"
    # with open(book_mapping_path, "r") as f:
    #     book_mapping = json.load(f)
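    # Loading the mapping from data/book_mapping.json is disabled above;
    # the book title -> bookId mapping is hard-coded for a single book.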
    book_mapping = {
        "深入剖析Nginx": "08fd0c7025a4a34a97a29897b067d24",
    }
    for book_name, book_id in book_mapping.items():
        book_dir = os.path.join('data', book_name)
        if not os.path.exists(book_dir):
            os.mkdir(book_dir)
        print(book_name)
        request_get_chapter_id_list_params = {"bookId": book_id, "is_main": 1}
        chapter_id_list = get_chapter_id_list(
            request_get_chapter_id_list_params)
        print(chapter_id_list)
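        # Fetch each chapter's name and HTML content from the ebook service.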
        for chapter_id in chapter_id_list:
            request_get_chapter_content_params = {
                'bookId': book_id,
                'chapterId': chapter_id
            }
            chapter_resp = get_chapter_content(
                request_get_chapter_content_params)
            chapter_name = chapter_resp['name']
            chapter_content = chapter_resp['content']
            # Zero-pad the chapter number (e.g. "第1章" -> "第01章") so the
            # chapter directories sort in natural order.
            try:
                if book_name == "零基础学机器学习":
                    chapter_num = re.findall(r'第(.*)课', chapter_name)[0]
                    chapter_name_modify = re.sub(
                        r'第(.*)课', r'第{}课'.format(chapter_num.zfill(2)),
                        chapter_name)
                else:
                    chapter_num = re.findall(r'第(.*)章', chapter_name)[0]
                    chapter_name_modify = re.sub(
                        r'第(.*)章', r'第{}章'.format(chapter_num.zfill(2)),
                        chapter_name)
                chapter_name = chapter_name_modify
            except IndexError:
                # print('Chapter has no number prefix: {}'.format(chapter_name))
                pass
            chapter_dir = os.path.join(book_dir, chapter_name)
            if not os.path.exists(chapter_dir):
                os.mkdir(chapter_dir)
                # print('Created directory: {}'.format(chapter_dir))

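            # Decode HTML entities (&lt;, &nbsp;, ...) so the regexes below
            # work on the real characters.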
            chapter_content = html.unescape(chapter_content)

            if book_name == "深入剖析Nginx":
                # Split the chapter HTML into sections on its <h2> headings;
                # both patterns use re.S and non-greedy matching so the title
                # list and the content list stay aligned.
                section_list = re.findall(r'<h2.*?>(.*?)</h2>', chapter_content,
                                          re.S)
                section_content_list = re.split(r'<h2.*?>.*?</h2>',
                                                chapter_content,
                                                flags=re.S)
                section_dir_list = []
                for section in section_list:
                    # Normalize non-breaking spaces and drop '/' so the section
                    # title is safe to use as a directory name.
                    section = section.replace('\xa0', ' ')
                    if section.find('/') != -1:
                        section = section.replace('/', '')
                    section_dir = os.path.join(chapter_dir, section)
                    # print(section_dir)
                    if not os.path.exists(section_dir):
                        os.mkdir(section_dir)
                    section_dir_list.append(section_dir)
                for idx, section_content in enumerate(section_content_list):
                    if idx == 0:
                        html_save_path = os.path.join(chapter_dir, 'text.html')
                    else:
                        html_save_path = os.path.join(
                            section_dir_list[idx - 1], 'text.html')
                    # with open(html_save_path, 'w', encoding='utf-8') as f:
                    #     f.write(section_content)

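                    # Collect every <code> block in this section; re.S lets a
                    # block span multiple lines.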
                    code_list = re.findall(r'<code>(.*?)</code>',
                                           section_content, re.S)

                    res_code_list = []
                    count = 0
                    for i in code_list:
                        # Skip one-line snippets; they are usually inline code,
                        # not full listings.
                        if len(i.split('\n')) < 2:
                            continue
                        count += 1
                        i = html.unescape(i)
                        # Strip any remaining HTML tags to recover plain text.
                        soup = BeautifulSoup(i, 'html.parser')
                        res_str = soup.get_text()

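                        # Code found before the first <h2> is saved directly in
                        # the chapter directory; later blocks go into their
                        # section directory.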
                        if idx == 0:
                            code_save_dir = os.path.join(
                                chapter_dir, 'code_0.java')
                        else:
                            code_save_dir = os.path.join(
                                section_dir_list[idx - 1],
                                'code_{}.java'.format(count))

                        print(code_save_dir)
                        print(res_str)
                        # with open(code_save_dir, 'w', encoding='utf-8') as f:
                        #     f.write(res_str)

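                        # The commented-out block below is a further cleanup
                        # pass: strip leading listing line numbers ("12  code")
                        # and stop at interactive '>>' prompt lines.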
                        # clean_text_list = []
                        # for line in res_str.split('\n'):
                        #     if line == '':
                        #         continue
                        #     if line[0].isdigit():
                        #         line = re.findall(r'^[0-9]+ {0,2}(.*)',
                        #                           line)[0]
                        #         # print(line)
                        #     else:
                        #         if line.startswith('>>'):
                        #             break
                        #     clean_text_list.append(line)
                        # clean_code = '\n'.join(clean_text_list)
                        # print(clean_code)