modify ignore

3b227029 · ToTensor · 3ca29f69 · 3b227029 · 3ca29f69 · 3ca29f69
7 changed file
--- a/.gitignore
+++ b/.gitignore
 __pycache__
\ No newline at end of file
+src
+main.py
+data/前端体验设计——HTML5+CSS3终极修炼.json
\ No newline at end of file
--- a/data/前端体验设计——HTML5+CSS3终极修炼.json
+++ b/data/前端体验设计——HTML5+CSS3终极修炼.json
--- a/main.py
+++ b/main.py
-from src.ebook.extract_book_code import extract_code
-from src.ebook.community import send_topic
-if __name__ == "__main__":
-    book_mapping = {
-        "前端体验设计——HTML5+CSS3终极修炼": "c4eeb42b07f54b42a9fd1568b8ec4b98",
-    }
-    for key in book_mapping.keys():
-        extract_code(book_mapping)
-        web_url = 'https://gitcode.net/csdn/content/book_code_{}/-/tree/master/'.format(
-            book_mapping[key])
-        print('-------' * 20)
-        print('开始向社区发帖')
-        book_dir = 'data/{}/'.format(key)
-        mapping_path = 'data/{}.json'.format(key)
-        send_topic(web_url, book_dir, mapping_path)
\ No newline at end of file
--- a/src/ebook/community.py
+++ b/src/ebook/community.py
-import os
-import json
-import html
-import requests
-import logging
-logger = logging.getLogger(__name__)
-def get_files_path(file_dir, filetype='.txt'):
-    """得到文件夹下的所有.txt文件的路径
-    Args:
-        file_dir: 文件夹路径
-        filetype: 文件后缀
-    Returns:
-        所有filetype类型文件的绝对路径
-    """
-    files_path = []
-    for root, dirs, files in os.walk(file_dir):
-        for file in files:
-            if filetype is None or (os.path.splitext(file)[1] == filetype):
-                files_path.append(os.path.join(root, file))
-    return files_path
-def get_all_files(current_address):
-    files = []
-    for parent, dirnames, filenames in os.walk(current_address):
-        # Case1: traversal the directories
-        # for dirname in dirnames:
-        #     print("Parent folder:", parent)
-        #     print("Dirname:", dirname)
-        # # Case2: traversal the files
-        for filename in filenames:
-            # print("Parent folder:", parent)
-            file_path = os.path.join(parent, filename)
-            files.append(file_path)
-    return files
-def post(url, params, retry=3, headers=None):
-    if headers is None:
-        hdrs = {"Content-Type": "application/json"}
-    else:
-        hdrs = headers
-    fails = 0
-    while fails < retry:
-        try:
-            if headers is None:
-                data = json.dumps(params)
-            else:
-                data = params
-            logger.debug(f"will post {data} to {url}")
-            resp = requests.post(url, data, headers=hdrs, timeout=10)
-            if resp:
-                logger.info(f"resp {resp.content}")
-                return resp.json()
-            else:
-                logger.error(f"resp: [{resp}]")
-                fails += 1
-        except Exception as error:
-            logger.error(f"post {params} to {url} failed {error}")
-            fails += 1
-            if fails > retry:
-                raise error
-def send_topic(web_url, book_dir, mapping_path):
-    data_dir = 'data'
-    # web_url = "https://codechina.csdn.net/csdn/book_code_c798a5992a654857867ec15660e1c32a/-/blob/master/"
-    request_url = 'http://ccloud.internal.csdn.net/v1/internal/community/content/sendTopic'
-    # files = get_files_path('data/全程软件测试（第3版）', '.java')
-    files = get_all_files(book_dir)
-    print(files)
-    if not os.path.exists(mapping_path):
-        chapter_code_mapping = {}
-        save_mapping = json.dumps(chapter_code_mapping,
-                                  ensure_ascii=False,
-                                  indent=2)
-        with open(mapping_path, 'w') as f:
-            f.write(save_mapping)
-    with open(mapping_path, 'r') as f:
-        chapter_code_mapping = json.load(f)
-    for file in files:
-        topic_title = file.replace(book_dir, '')
-        topic_title = topic_title.replace('/', '｜')
-        topic_title = topic_title.replace('　', '.')
-        # topic_title = html.escape(topic_title)
-        topic_content = web_url + file
-        topic_content = "代码：<a href=\"{}\">{}</a>".format(
-            topic_content, topic_title)
-        print(topic_title)
-        send_topic_request_param = {
-            "type": "long_text",
-            "cateId": 20967,
-            "content": topic_content,
-            "topicTitle": topic_title,
-            "mdContent": topic_content,
-            "communityId": 3823,
-            "loginUserName": "BBS_Assistant",
-            "bizNo": "ebook"
-        }
-        if chapter_code_mapping.get(file) is None:
-            resp = post(request_url, send_topic_request_param)
-            topic_link = resp['data']['content']['url']
-            chapter_code_mapping[file] = topic_link
-            print('{}:{}'.format(file, topic_link))
-            save_mapping = json.dumps(chapter_code_mapping,
-                                      ensure_ascii=False,
-                                      indent=2)
-            with open(mapping_path, 'w') as f:
-                f.write(save_mapping)
-        else:
-            send_topic_request_param['id'] = int(
-                chapter_code_mapping[file].split('/')[-1])
-            resp = post(request_url, send_topic_request_param)
-            print('{}:{}'.format(file, chapter_code_mapping.get(file)))
--- a/src/ebook/ebook_get_request.py
+++ b/src/ebook/ebook_get_request.py
-import json
-import requests
-import logging
-logger = logging.getLogger(__name__)
-def get_chapter_content(params):
-    url = 'http://192.168.50.117:9003/v1/chapter/content'
-    headers = {
-        "Cookie":"UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
-    }
-    result = requests.get(url=url, params=params, headers=headers)
-    if result.status_code == 200:
-        ret = json.loads(result.text)
-        logger.info('request success')
-        content = ret['data']
-        return content
-    else:
-        logger.info('request failed！！！！！')
-        return {}
-def get_chapter_list(params):
-    url = 'http://192.168.50.117:9003/inner/v1/chapter/list'
-    headers = {
-        "Cookie":"UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
-    }
-    result = requests.get(url=url, params=params, headers=headers)
-    if result.status_code == 200:
-        ret = json.loads(result.text)
-        logger.info('request success')
-        content = ret['data']
-        return content
-    else:
-        logger.info('request failed！！！！！')
-        return {}
\ No newline at end of file
--- a/src/ebook/extract_book_code.py
+++ b/src/ebook/extract_book_code.py
-import json
-import os
-import re
-import html
-from bs4 import BeautifulSoup
-from .get_book_chapter_id_list import get_chapter_id_list
-from .ebook_get_request import get_chapter_content
-def extract_code(book_mapping):
-    # book_mapping_path = "data/book_mapping.json"
-    # with open(book_mapping_path, "r") as f:
-    #     book_mapping = json.load(f)
-    for book_idx, book_name in enumerate(book_mapping.keys()):
-        book_dir_name = book_name
-        book_dir = os.path.join('data', book_dir_name)
-        if not os.path.exists(book_dir):
-            os.mkdir(book_dir)
-        # print(book_dir_name)
-        book_id = book_mapping[book_name]
-        request_get_chapter_id_list_params = {"bookId": book_id, "is_main": 1}
-        chapter_id_list = get_chapter_id_list(
-            request_get_chapter_id_list_params)
-        print(chapter_id_list)
-        for chapter_id in chapter_id_list:
-            print('当前章节id: {}'.format(chapter_id))
-            request_get_chapter_content_params = {
-                'bookId': book_id,
-                'chapterId': chapter_id
-            }
-            chapter_resp = get_chapter_content(
-                request_get_chapter_content_params)
-            chapter_name = chapter_resp['name']
-            chapter_content = chapter_resp['content']
-            try:
-                if book_name == "零基础学机器学习":
-                    chapter_num = re.findall(r'第(.*)课', chapter_name)[0]
-                    chapter_name_modify = re.sub(
-                        r'第(.*)课', r'第{}课'.format(chapter_num.zfill(2)),
-                        chapter_name)
-                else:
-                    chapter_num = re.findall(r'第(.*)章', chapter_name)[0]
-                    chapter_name_modify = re.sub(
-                        r'第(.*)章', r'第{}章'.format(chapter_num.zfill(2)),
-                        chapter_name)
-                chapter_name = chapter_name_modify
-                print(chapter_name)
-            except:
-                print('该章节没有章节序号： {}'.format(chapter_name))
-                pass
-            chapter_dir = os.path.join(book_dir, chapter_name)
-            if not os.path.exists(chapter_dir):
-                os.mkdir(chapter_dir)
-                # print('创建文件夹： {}'.format(chapter_dir))
-            chapter_content = html.unescape(chapter_content)
-            # print(chapter_content)
-            section_list = re.findall(r'<h2.*?>(.*?)</h2>',
-                                      chapter_content,
-                                      flags=re.S)
-            print(section_list)
-            section_content_list = re.split(r'<h2.*?>.*?</h2>',
-                                            chapter_content,
-                                            flags=re.S)
-            section_dir_list = []
-            for idx, section in enumerate(section_list):
-                section = section.replace('　', ' ')
-                if section.find(r'/') != -1:
-                    section = section.replace('/', '')
-                section_dir = os.path.join(chapter_dir,
-                                           '{}.{}'.format(idx + 1, section))
-                print(section_dir)
-                if not os.path.exists(section_dir):
-                    os.mkdir(section_dir)
-                section_dir_list.append(section_dir)
-            for idx, section_content in enumerate(section_content_list):
-                if idx == 0:
-                    html_save_path = os.path.join(chapter_dir, 'text.html')
-                else:
-                    html_save_path = os.path.join(section_dir_list[idx - 1],
-                                                  'text.html')
-                # with open(html_save_path, 'w', encoding='utf-8') as f:
-                #     f.write(section_content)
-                code_list = re.findall(r'<code>(.*?)</code>', section_content,
-                                       re.S)
-                res_codelist = []
-                for code in code_list:
-                    code = code.strip()
-                    if code != '':
-                        res_codelist.append(code)
-                # print(res_codelist)
-                # break
-                count = 0
-                for code in res_codelist:
-                    if len(code.split('\n')) < 2:
-                        continue
-                    # code = html.unescape(code)
-                    # soup = BeautifulSoup(code)
-                    # clean_code = soup.get_text()
-                    # print(clean_code)
-                    # print('-------' * 10)
-                    # pianduan_name = re.findall(r'(代码片段.*)，', clean_code)
-                    # if pianduan_name == []:
-                    #     pianduan_name_str = ''
-                    # else:
-                    #     pianduan_name_str = pianduan_name[0]
-                    # file_name_list = re.findall(r'文件名: (.*)\n', clean_code)
-                    # print(file_name_list)
-                    # if file_name_list == []:
-                    #     file_name = '.txt'
-                    # else:
-                    #     file_name = file_name_list[0]
-                    # file_name = file_name.replace('/', '-')
-                    # save_file_name = pianduan_name_str + '-' + file_name
-                    # print(save_file_name)
-                    if idx == 0:
-                        code_save_path = os.path.join(chapter_dir,
-                                                      'code_0.css')
-                    else:
-                        count += 1
-                        code_save_path = os.path.join(
-                            section_dir_list[idx - 1],
-                            'code_{}.css'.format(count))
-                    # res_code_list = []
-                    # for line in clean_code.split('\n'):
-                    #     if line.find('文件名') != -1 or line.find(
-                    #             '代码片段') != -1 or line == '':
-                    #         continue
-                    #     clean_line = re.findall(r'^\d{1,5}\: *(.*)',
-                    #                             line)[0]
-                    #     res_code_list.append(clean_line)
-                    # res_code = '\n'.join(res_code_list)
-                    with open(code_save_path, 'w', encoding='utf-8') as f:
-                        f.write(code)
-                # clean_text_list = []
-                # for line in res_str.split('\n'):
-                #     if line == '':
-                #         continue
-                #     if line[0].isdigit():
-                #         line = re.findall(r'^[0-9]+ {0,2}(.*)',
-                #                           line)[0]
-                #         # print(line)
-                #     else:
-                #         if line.startswith('>>'):
-                #             break
-                #     clean_text_list.append(line)
-                # clean_code = '\n'.join(clean_text_list)
-                # print(clean_code)
--- a/src/ebook/get_book_chapter_id_list.py
+++ b/src/ebook/get_book_chapter_id_list.py
-import json
-import re
-import html
-import nltk
-import html2text
-import os
-import pandas as pd
-from bs4 import BeautifulSoup
-from .ebook_get_request import get_chapter_list
-def get_chapter_id_list(param):
-    chapter_list = []
-    ret = get_chapter_list(param)
-    for item in ret:
-        chapterid = item['chapterid']
-        chapter_list.append(chapterid)
-    return chapter_list