# extract_book_code.py

import json
import os
import re
import html
from bs4 import BeautifulSoup
from .get_book_chapter_id_list import get_chapter_id_list
from .ebook_get_request import get_chapter_content


def _pad_chapter_number(book_name, chapter_name):
    """Return ``chapter_name`` with its chapter number zero-padded to width 2.

    The number is whatever sits between '第' and '章' ('课' for the book
    "零基础学机器学习").  Names without such a marker are returned unchanged.
    """
    if book_name == "零基础学机器学习":
        pattern, template = r'第(.*)课', '第{}课'
    else:
        pattern, template = r'第(.*)章', '第{}章'
    match = re.search(pattern, chapter_name)
    if match is None:
        # No chapter number in this name: nothing to pad.
        return chapter_name
    padded = template.format(match.group(1).zfill(2))
    # A callable replacement keeps any backslash in the captured text from
    # being interpreted as a regex replacement escape.
    return re.sub(pattern, lambda _match: padded, chapter_name)


def _section_dir_name(section_title):
    """Turn a section title into a name usable as a single directory."""
    # The first argument is the wide space character copied verbatim from the
    # titles (presumably U+3000, the ideographic space); '/' is dropped so the
    # title cannot be split into nested path components.
    return section_title.replace('　', ' ').replace('/', '')


def _unique_path(path, used_paths):
    """Return ``path``, or a numbered variant if ``path`` was already handed out.

    ``used_paths`` is a set of paths returned so far during this run; the
    returned path is added to it.  The number is inserted before the file
    extension (``name_2.ext``, ``name_3.ext``, ...).
    """
    candidate = path
    stem, ext = os.path.splitext(path)
    suffix = 2
    while candidate in used_paths:
        candidate = '{}_{}{}'.format(stem, suffix, ext)
        suffix += 1
    used_paths.add(candidate)
    return candidate


def _strip_line_numbers(clean_code):
    """Remove the leading ``N:`` listing prefix from every line of a snippet.

    Empty lines and lines containing '文件名' or '代码片段' are dropped.
    Lines without a numeric prefix are kept unchanged instead of aborting
    the whole extraction.
    """
    kept_lines = []
    for line in clean_code.split('\n'):
        if not line or '文件名' in line or '代码片段' in line:
            continue
        match = re.match(r'^\d{1,5}\: *(.*)', line)
        kept_lines.append(match.group(1) if match else line)
    return '\n'.join(kept_lines)


def _save_code_snippets(section_content, target_dir, used_paths):
    """Find the numbered code listings in ``section_content`` and save them.

    Each listing is a run of ``<p class="left">N: ...`` paragraphs.  It is
    written to ``target_dir`` as ``<snippet name>-<file name>``, where both
    parts are read from the listing text ('代码片段...' and '文件名: ...');
    the file name defaults to '.txt' and the snippet name to ''.
    """
    code_list = re.findall(
        r'(?:(?:<p class="left">\d{1,5}\:.*? \n).*?)*',
        section_content,
        flags=re.S)
    # The pattern can match the empty string, so findall yields many
    # empty hits between real listings; drop them.
    for code in filter(None, code_list):
        # NOTE(review): the chapter HTML was already unescaped once by the
        # caller; this second pass presumably handles double-escaped
        # payloads -- confirm against the API response.
        code = html.unescape(code)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(code, 'html.parser')
        clean_code = soup.get_text()
        print(clean_code)

        print('-------' * 10)
        pianduan_name = re.findall(r'(代码片段.*)，', clean_code)
        pianduan_name_str = pianduan_name[0] if pianduan_name else ''
        file_name_list = re.findall(r'文件名: (.*)\n', clean_code)
        print(file_name_list)
        file_name = file_name_list[0] if file_name_list else '.txt'
        # Source paths such as "src/core/x.c" must stay a single file name.
        file_name = file_name.replace('/', '-')
        save_file_name = pianduan_name_str + '-' + file_name

        # Several listings may resolve to the same name (e.g. "-.txt" when
        # neither label is present); number the later ones instead of
        # silently overwriting the earlier ones.
        code_save_path = _unique_path(
            os.path.join(target_dir, save_file_name), used_paths)

        with open(code_save_path, 'w', encoding='utf-8') as f:
            f.write(_strip_line_numbers(clean_code))


def extract_code():
    """Download every chapter of the configured books and save their code.

    For each book in the hard-coded mapping this creates
    ``data/<book>/<chapter>/`` directories.  For the book "深入剖析Nginx" each
    chapter is additionally split at its ``<h2>`` headings, one
    sub-directory is created per heading, and the numbered code listings
    found in each part are written there as text files.  Listings that
    appear before the first heading are written to the chapter directory.

    Chapter ids and contents are fetched through ``get_chapter_id_list``
    and ``get_chapter_content``.  Returns ``None``.
    """
    # The mapping could instead be loaded from "data/book_mapping.json".
    book_mapping = {
        "深入剖析Nginx": "608fd0c7025a4a34a97a29897b067d24",
    }
    # Paths written during this run, used to avoid overwriting snippets.
    used_paths = set()

    for book_name, book_id in book_mapping.items():
        book_dir = os.path.join('data', book_name)
        # makedirs also creates the parent "data" directory when missing.
        os.makedirs(book_dir, exist_ok=True)

        chapter_id_list = get_chapter_id_list(
            {"bookId": book_id, "is_main": 1})
        for chapter_id in chapter_id_list:
            chapter_resp = get_chapter_content({
                'bookId': book_id,
                'chapterId': chapter_id
            })
            chapter_name = _pad_chapter_number(book_name,
                                               chapter_resp['name'])
            chapter_content = html.unescape(chapter_resp['content'])

            chapter_dir = os.path.join(book_dir, chapter_name)
            os.makedirs(chapter_dir, exist_ok=True)

            if book_name != "深入剖析Nginx":
                continue

            section_list = re.findall(r'<h2.*?><a>(.*?)</a></h2>',
                                      chapter_content,
                                      flags=re.S)
            section_content_list = re.split(r'<h2.*?>.*?</h2>',
                                            chapter_content,
                                            flags=re.S)

            section_dir_list = []
            for section in section_list:
                section_dir = os.path.join(chapter_dir,
                                           _section_dir_name(section))
                os.makedirs(section_dir, exist_ok=True)
                section_dir_list.append(section_dir)

            for idx, section_content in enumerate(section_content_list):
                # Part 0 precedes the first heading.  The split pattern also
                # matches headings without an <a> child, so there can be
                # more parts than captured titles; those extra parts fall
                # back to the chapter directory instead of raising.
                if 1 <= idx <= len(section_dir_list):
                    target_dir = section_dir_list[idx - 1]
                else:
                    target_dir = chapter_dir
                _save_code_snippets(section_content, target_dir, used_paths)