# extract_book_code.py
import json
import os
import re
import html
from bs4 import BeautifulSoup
from .get_book_chapter_id_list import get_chapter_id_list
from .ebook_get_request import get_chapter_content


def _pad_chapter_number(book_name, chapter_name):
    """Return ``chapter_name`` with its ordinal zero-padded to two characters.

    The ordinal is the text between ``第`` and the chapter marker (``课`` for
    the book "零基础学机器学习", ``章`` for every other book).  Returns ``None``
    when the name carries no such ordinal.
    """
    marker = '课' if book_name == "零基础学机器学习" else '章'
    # Non-greedy so a marker character that reappears later in the title
    # (e.g. "第1章 章节概述") is not swallowed into the ordinal.
    match = re.search(r'第(.*?)' + marker, chapter_name)
    if match is None:
        return None
    start, end = match.span(1)
    # Splice by position instead of re.sub so text taken from the title is
    # never interpreted as a replacement template (backslashes, group refs).
    return chapter_name[:start] + match.group(1).zfill(2) + chapter_name[end:]


def _multiline_code_snippets(section_html):
    """Return the stripped bodies of ``<code>`` elements that span 2+ lines."""
    stripped = (raw.strip()
                for raw in re.findall(r'<code>(.*?)</code>', section_html,
                                      re.S))
    # Single-line snippets (and empty ones) are skipped.
    return [snippet for snippet in stripped if '\n' in snippet]


def extract_code(book_mapping):
    """Download every chapter of each book and dump its code snippets to disk.

    For each book a directory ``data/<book name>`` is created, and inside it
    one directory per chapter (named after the chapter, with the chapter
    ordinal zero-padded when one is present).  For the book "前端体验设计" the
    chapter HTML is additionally split on ``<h2>`` headings into sections; each
    section gets a ``<index>.<title>`` sub-directory and every multi-line
    ``<code>`` snippet is written there as ``code_<n>.css``.

    Args:
        book_mapping: mapping of book name -> book id.  The id is passed as
            ``bookId`` to ``get_chapter_id_list`` and ``get_chapter_content``.

    Returns:
        None.  Results are directories and files under ``data/``; progress is
        printed to stdout.
    """
    for book_name, book_id in book_mapping.items():
        book_dir = os.path.join('data', book_name)
        # makedirs also creates a missing 'data' parent and tolerates an
        # already-existing directory.
        os.makedirs(book_dir, exist_ok=True)

        chapter_id_list = get_chapter_id_list({
            "bookId": book_id,
            "is_main": 1
        })
        print(chapter_id_list)

        for chapter_id in chapter_id_list:
            print('当前章节id: {}'.format(chapter_id))
            chapter_resp = get_chapter_content({
                'bookId': book_id,
                'chapterId': chapter_id
            })
            chapter_name = chapter_resp['name']
            chapter_content = chapter_resp['content']

            padded_name = _pad_chapter_number(book_name, chapter_name)
            if padded_name is None:
                # Chapters without an ordinal keep their original name.
                print('该章节没有章节序号: {}'.format(chapter_name))
            else:
                chapter_name = padded_name
                print(chapter_name)

            chapter_dir = os.path.join(book_dir, chapter_name)
            os.makedirs(chapter_dir, exist_ok=True)

            chapter_content = html.unescape(chapter_content)

            # Only this book has its sections and code snippets extracted.
            if book_name != "前端体验设计":
                continue

            section_list = re.findall(r'<h2.*?>(.*?)</h2>',
                                      chapter_content,
                                      flags=re.S)
            print(section_list)
            # Splitting on the same pattern yields len(section_list) + 1
            # pieces: the text before the first heading, then one piece per
            # section.
            section_content_list = re.split(r'<h2.*?>.*?</h2>',
                                            chapter_content,
                                            flags=re.S)

            section_dir_list = []
            for idx, section in enumerate(section_list):
                # NOTE(review): the original literal here was visually
                # "' ' -> ' '"; presumably a non-breaking space (which is what
                # html.unescape produces for &nbsp;) -- confirm.
                section = section.replace('\xa0', ' ')
                # '/' would be treated as a path separator in the directory
                # name.
                section = section.replace('/', '')
                section_dir = os.path.join(
                    chapter_dir, '{}.{}'.format(idx + 1, section))
                print(section_dir)
                os.makedirs(section_dir, exist_ok=True)
                section_dir_list.append(section_dir)

            for idx, section_content in enumerate(section_content_list):
                is_preamble = idx == 0
                if is_preamble:
                    target_dir = chapter_dir
                else:
                    target_dir = section_dir_list[idx - 1]

                snippets = _multiline_code_snippets(section_content)
                for count, code in enumerate(snippets, start=1):
                    # NOTE(review): every snippet found before the first <h2>
                    # is written to the same 'code_0.css', so only the last
                    # one survives.  Kept as-is to preserve existing output
                    # file names -- confirm whether numbering is wanted here.
                    file_no = 0 if is_preamble else count
                    code_save_path = os.path.join(
                        target_dir, 'code_{}.css'.format(file_no))
                    with open(code_save_path, 'w', encoding='utf-8') as f:
                        f.write(code)