# extract_book_code.py
import json
import os
import re
import html
from bs4 import BeautifulSoup
from .get_book_chapter_id_list import get_chapter_id_list
from .ebook_get_request import get_chapter_content


# One book numbers its chapters as "第N课"; every other book uses "第N章".
_LESSON_BOOK_NAME = "零基础学机器学习"

# Capturing split: yields [preamble, heading, body, heading, body, ...] so
# every <h2> heading stays attached to the content that follows it.
_SECTION_SPLIT_RE = re.compile(r'(<h2.*?>.*?</h2>)', flags=re.S)
_SECTION_TITLE_RE = re.compile(r'<h2.*?><a >(.*?)</a></h2>', flags=re.S)
_TAG_RE = re.compile(r'<[^>]*>')
# A run of consecutive "content_105" paragraphs that start with ASCII letters.
# NOTE(review): the blanks in this pattern are plain ASCII spaces as they
# appear in the source; confirm they were not non-breaking spaces originally.
_CODE_RUN_RE = re.compile(
    r'(?:(?: *<p class="content_105">[a-zA-Z]+.*? \n).*?)*',
    flags=re.DOTALL)


def _pad_chapter_number(book_name, chapter_name):
    """Return chapter_name with its chapter number zero-padded to two chars."""
    unit = '课' if book_name == _LESSON_BOOK_NAME else '章'
    pattern = '第(.*){}'.format(unit)
    match = re.search(pattern, chapter_name)
    if match is None:
        # Prefaces, appendices, etc. carry no number: keep the name unchanged.
        print('该章节没有章节序号: {}'.format(chapter_name))
        return chapter_name
    padded = '第{}{}'.format(match.group(1).zfill(2), unit)
    # A callable replacement keeps backslashes in the title from being
    # interpreted as regex escapes.
    padded_name = re.sub(pattern, lambda _match: padded, chapter_name)
    print(padded_name)
    return padded_name


def _section_title(heading_html):
    """Return a directory-safe title for one ``<h2>...</h2>`` heading."""
    match = _SECTION_TITLE_RE.search(heading_html)
    if match is not None:
        title = match.group(1)
    else:
        # Heading without the expected anchor: fall back to its bare text so
        # the section still gets a directory of its own.
        title = _TAG_RE.sub('', heading_html).strip()
    # html.unescape turns &nbsp; into U+00A0; use a plain space instead, and
    # drop '/' because it would be read as a path separator.
    return title.replace('\u00a0', ' ').replace('/', '')


def _split_sections(chapter_content):
    """Split chapter HTML into ``(preamble, [(title, body), ...])``."""
    parts = _SECTION_SPLIT_RE.split(chapter_content)
    sections = [(_section_title(heading), body)
                for heading, body in zip(parts[1::2], parts[2::2])]
    return parts[0], sections


def _save_section(section_content, target_dir, first_code_index):
    """Write text.html and the numbered code_N.py files into target_dir."""
    html_save_path = os.path.join(target_dir, 'text.html')
    with open(html_save_path, 'w', encoding='utf-8') as f:
        f.write(section_content)

    # The pattern can match the empty string, so findall returns many '' hits.
    snippets = [run for run in _CODE_RUN_RE.findall(section_content) if run]
    for number, snippet in enumerate(snippets, start=first_code_index):
        # An explicit parser keeps the output independent of which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(html.unescape(snippet), 'html.parser')
        clean_code = soup.get_text()
        print(clean_code)
        print('-------' * 10)
        code_save_path = os.path.join(target_dir,
                                      'code_{}.py'.format(number))
        with open(code_save_path, 'w', encoding='utf-8') as f:
            f.write(clean_code)


def extract_code(book_mapping):
    """Download every chapter of every book and save its text and code.

    For each book a directory ``data/<book name>`` is created. Each chapter
    gets a sub-directory named after the chapter (with the chapter number
    zero-padded when one is found), and each ``<h2>`` section of the chapter
    gets a ``<n>.<title>`` sub-directory below that. Every directory receives
    a ``text.html`` with the raw HTML fragment plus one ``code_<k>.py`` per
    code run detected in that fragment. Content before the first heading is
    stored in the chapter directory with code files numbered from 0; section
    code files are numbered from 1.

    Args:
        book_mapping: Mapping of book name to the book id that is passed to
            ``get_chapter_id_list`` / ``get_chapter_content``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    for book_name, book_id in book_mapping.items():
        book_dir = os.path.join('data', book_name)
        # makedirs also creates the 'data' parent and tolerates re-runs.
        os.makedirs(book_dir, exist_ok=True)

        chapter_id_list = get_chapter_id_list({
            "bookId": book_id,
            "is_main": 1
        })
        print(chapter_id_list)

        for chapter_id in chapter_id_list:
            print('当前章节id: {}'.format(chapter_id))
            chapter_resp = get_chapter_content({
                'bookId': book_id,
                'chapterId': chapter_id
            })
            chapter_name = _pad_chapter_number(book_name,
                                               chapter_resp['name'])
            chapter_dir = os.path.join(book_dir, chapter_name)
            os.makedirs(chapter_dir, exist_ok=True)

            chapter_content = html.unescape(chapter_resp['content'])
            preamble, sections = _split_sections(chapter_content)
            print([title for title, _body in sections])

            # Content ahead of the first heading belongs to the chapter itself.
            _save_section(preamble, chapter_dir, first_code_index=0)
            for number, (title, body) in enumerate(sections, start=1):
                section_dir = os.path.join(chapter_dir,
                                           '{}.{}'.format(number, title))
                print(section_dir)
                os.makedirs(section_dir, exist_ok=True)
                _save_section(body, section_dir, first_code_index=1)