From 68974c87cef9ba775242c8a5a2dd2028589d01f7 Mon Sep 17 00:00:00 2001
From: PeasantWorker <2608882093@qq.com>
Date: Fri, 24 Dec 2021 10:34:55 +0800
Subject: [PATCH] init code

---
 .gitignore                            |   1 +
 main.py                               |  12 ++
 src/ebook/community.py                | 128 +++++++++++++++++++++
 src/ebook/ebook_get_request.py        |  44 ++++++++
 src/ebook/extract_book_code.py        | 154 ++++++++++++++++++++++++++
 src/ebook/get_book_chapter_id_list.py |  18 +++
 6 files changed, 357 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 main.py
 create mode 100644 src/ebook/community.py
 create mode 100644 src/ebook/ebook_get_request.py
 create mode 100644 src/ebook/extract_book_code.py
 create mode 100644 src/ebook/get_book_chapter_id_list.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ed8ebf5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..a2d21d6
--- /dev/null
+++ b/main.py
@@ -0,0 +1,12 @@
+from src.ebook.extract_book_code import extract_code
+from src.ebook.community import send_topic
+
+if __name__ == "__main__":
+    book_mapping = {  # book title -> book id passed to extract_code
+        "前端体验设计": "c4eeb42b07f54b42a9fd1568b8ec4b98",
+    }
+    extract_code(book_mapping)
+    web_url = 'https://gitcode.net/csdn/content/book_id_c4eeb42b07f54b42a9fd1568b8ec4b98/-/tree/master/'
+    print('-------' * 20)
+    print('开始向社区发帖')
+    # send_topic(web_url)  -- posting is disabled; web_url is unused until this is re-enabled
\ No newline at end of file
diff --git a/src/ebook/community.py b/src/ebook/community.py
new file mode 100644
index 0000000..e0467f3
--- /dev/null
+++ b/src/ebook/community.py
@@ -0,0 +1,128 @@
+import os
+import json
+import html
+import requests
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def get_files_path(file_dir, filetype='.txt'):
+    """Collect the paths of files under a directory tree, filtered by suffix.
+    Args:
+        file_dir: root directory handed to ``os.walk``
+        filetype: extension to keep (with leading dot); None keeps every file
+    Returns:
+        list of ``os.path.join(root, name)`` paths (relative if file_dir is)
+    """
+    return [
+        os.path.join(parent, name)
+        for parent, _subdirs, names in os.walk(file_dir)
+        for name in names
+        if filetype is None or os.path.splitext(name)[1] == filetype
+    ]
+
+
+def get_all_files(current_address):
+    """Return the path of every file found beneath ``current_address``.
+
+    The tree is traversed with ``os.walk``; each file name is joined to the
+    directory that contains it, so results are relative whenever
+    ``current_address`` is relative. Directories are not listed.
+    """
+    collected = []
+    for folder, _subfolders, names in os.walk(current_address):
+        # Directory entries are ignored; only plain file names are joined
+        # onto the folder currently being visited.
+        collected.extend(os.path.join(folder, name) for name in names)
+    return collected
+
+
+def post(url, params, retry=3, headers=None):
+    """POST ``params`` to ``url``, making at most ``retry`` attempts.
+
+    Without ``headers`` the params are JSON-encoded and sent with a JSON
+    content type; with ``headers`` they are handed to ``requests`` as given.
+    Returns the decoded JSON body of the first truthy (non-error) response,
+    or None if every attempt returned an error status. If the final attempt
+    raises, that exception propagates to the caller.
+    """
+    hdrs = {"Content-Type": "application/json"} if headers is None else headers
+    data = json.dumps(params) if headers is None else params
+    for attempt in range(1, retry + 1):
+        try:
+            logger.debug(f"will post {data} to {url}")
+            resp = requests.post(url, data, headers=hdrs, timeout=10)
+            if resp:
+                logger.info(f"resp {resp.content}")
+                return resp.json()
+            logger.error(f"resp: [{resp}]")
+        except Exception as error:
+            logger.error(f"post {params} to {url} failed {error}")
+            # The old `fails > retry` check was unreachable, hiding the error.
+            if attempt >= retry:
+                raise
+    return None
+
+
+def send_topic(web_url):
+    """Post one community topic per file under the hard-coded book directory.
+
+    The topic URL returned for a new file is saved to a JSON mapping keyed by
+    file path; mapped files are posted again with the id from their stored URL.
+
+    Args:
+        web_url: base URL prepended to each file path to build the link.
+    """
+    book_dir = 'data/深入剖析Nginx/'
+    request_url = 'http://ccloud.internal.csdn.net/v1/internal/community/content/sendTopic'
+    files = get_all_files(book_dir)
+    print(files)
+    mapping_path = 'data/深入剖析Nginx.json'
+
+    # The mapping contains non-ASCII paths (ensure_ascii=False), so read and
+    # write it as UTF-8 instead of the platform default encoding.
+    if not os.path.exists(mapping_path):
+        with open(mapping_path, 'w', encoding='utf-8') as f:
+            f.write(json.dumps({}, ensure_ascii=False, indent=2))
+
+    with open(mapping_path, 'r', encoding='utf-8') as f:
+        chapter_code_mapping = json.load(f)
+
+    for file in files:
+        topic_title = file.replace(book_dir, '')
+        topic_title = topic_title.replace('/', '｜')
+        topic_title = topic_title.replace('　', '.')
+        topic_content = web_url + file
+        topic_content = "代码：<a href=\"{}\">{}</a>".format(
+            topic_content, topic_title)
+
+        print(topic_title)
+
+        send_topic_request_param = {
+            "type": "long_text",
+            "cateId": 20966,
+            "content": topic_content,
+            "topicTitle": topic_title,
+            "mdContent": topic_content,
+            "communityId": 3821,
+            "loginUserName": "BBS_Assistant",
+            "bizNo": "ebook"
+        }
+
+        if chapter_code_mapping.get(file) is None:
+            resp = post(request_url, send_topic_request_param)
+            if resp is None:
+                # post() ran out of retries; skip so a later run can retry.
+                logger.error(f"send topic failed for {file}")
+                continue
+            topic_link = resp['data']['content']['url']
+            chapter_code_mapping[file] = topic_link
+            print('{}:{}'.format(file, topic_link))
+            with open(mapping_path, 'w', encoding='utf-8') as f:
+                json.dump(chapter_code_mapping, f, ensure_ascii=False, indent=2)
+        else:
+            send_topic_request_param['id'] = int(
+                chapter_code_mapping[file].split('/')[-1])
+            resp = post(request_url, send_topic_request_param)
+            print('{}:{}'.format(file, chapter_code_mapping.get(file)))
diff --git a/src/ebook/ebook_get_request.py b/src/ebook/ebook_get_request.py
new file mode 100644
index 0000000..7ae2ce9
--- /dev/null
+++ b/src/ebook/ebook_get_request.py
@@ -0,0 +1,44 @@
+import json
+import requests
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+
+def get_chapter_content(params):
+    """Fetch one chapter from the ebook service and return its ``data`` field.
+    Returns an empty dict when the response status is not 200.
+    """
+    url = 'http://192.168.50.117:9003/v1/chapter/content'
+    # NOTE(review): session credentials are hard-coded; move them to config.
+    headers = {
+        "Cookie":"UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
+    }
+    # Without a timeout requests waits indefinitely on an unresponsive host.
+    result = requests.get(url=url, params=params, headers=headers, timeout=10)
+    if result.status_code != 200:
+        logger.error('request failed！！！！！')
+        return {}
+    ret = json.loads(result.text)
+    logger.info('request success')
+    return ret['data']
+
+
+def get_chapter_list(params):
+    """Fetch a book's chapter list from the service and return its ``data``.
+    Returns an empty dict when the response status is not 200.
+    """
+    url = 'http://192.168.50.117:9003/inner/v1/chapter/list'
+    # NOTE(review): session credentials are hard-coded; move them to config.
+    headers = {
+        "Cookie":"UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
+    }
+    # Without a timeout requests waits indefinitely on an unresponsive host.
+    result = requests.get(url=url, params=params, headers=headers, timeout=10)
+    if result.status_code != 200:
+        logger.error('request failed！！！！！')
+        return {}
+    ret = json.loads(result.text)
+    logger.info('request success')
+    return ret['data']
\ No newline at end of file
diff --git a/src/ebook/extract_book_code.py b/src/ebook/extract_book_code.py
new file mode 100644
index 0000000..533f0d5
--- /dev/null
+++ b/src/ebook/extract_book_code.py
@@ -0,0 +1,154 @@
+import json
+import os
+import re
+import html
+from bs4 import BeautifulSoup
+from .get_book_chapter_id_list import get_chapter_id_list
+from .ebook_get_request import get_chapter_content
+
+
+def extract_code(book_mapping):
+    """For each book in ``book_mapping`` (name -> id), fetch every chapter,
+    create ``data/<book>/<chapter>/`` folders and, for the one book handled
+    below, save each numbered code listing found in the chapter HTML to a file.
+    """
+    for book_idx, book_name in enumerate(book_mapping.keys()):
+        book_dir_name = book_name
+        book_dir = os.path.join('data', book_dir_name)
+        if not os.path.exists(book_dir):
+            os.mkdir(book_dir)
+        # NOTE(review): os.mkdir does not create parents; 'data' must already exist.
+        book_id = book_mapping[book_name]
+        request_get_chapter_id_list_params = {"bookId": book_id, "is_main": 1}
+        chapter_id_list = get_chapter_id_list(
+            request_get_chapter_id_list_params)
+        # One content request is made per chapter id returned above.
+        for chapter_id in chapter_id_list:
+            request_get_chapter_content_params = {
+                'bookId': book_id,
+                'chapterId': chapter_id
+            }
+            chapter_resp = get_chapter_content(
+                request_get_chapter_content_params)
+            chapter_name = chapter_resp['name']
+            chapter_content = chapter_resp['content']
+            try:
+                if book_name == "零基础学机器学习":
+                    chapter_num = re.findall(r'第(.*)课', chapter_name)[0]
+                    chapter_name_modify = re.sub(
+                        r'第(.*)课', r'第{}课'.format(chapter_num.zfill(2)),
+                        chapter_name)
+                else:
+                    chapter_num = re.findall(r'第(.*)章', chapter_name)[0]
+                    chapter_name_modify = re.sub(
+                        r'第(.*)章', r'第{}章'.format(chapter_num.zfill(2)),
+                        chapter_name)
+                chapter_name = chapter_name_modify
+            except:
+                # Title has no chapter/lesson number: keep it (bare except hides other errors too).
+                pass
+            chapter_dir = os.path.join(book_dir, chapter_name)
+            if not os.path.exists(chapter_dir):
+                os.mkdir(chapter_dir)
+                # Folder is created only when it does not exist yet.
+
+            chapter_content = html.unescape(chapter_content)
+            # presumably unescaped so the tag regexes below see literal markup
+
+            if book_name == "前端体验设计":
+                section_list = re.findall(r'<h2.*?><a>(.*?)</a></h2>',
+                                          chapter_content,
+                                          flags=re.S)
+
+                # Split on every <h2>; piece 0 is the text before the first heading.
+                section_content_list = re.split(r'<h2.*?>.*?</h2>',
+                                                chapter_content,
+                                                flags=re.S)
+                section_dir_list = []
+                for section in section_list:
+                    section = section.replace('　', ' ')
+                    if section.find(r'/') != -1:
+                        section = section.replace('/', '')
+                    section_dir = os.path.join(chapter_dir, section)
+                    # One folder per section heading, created if missing.
+                    if not os.path.exists(section_dir):
+                        os.mkdir(section_dir)
+                    section_dir_list.append(section_dir)
+                for idx, section_content in enumerate(section_content_list):
+                    if idx == 0:
+                        html_save_path = os.path.join(chapter_dir, 'text.html')
+                    else:
+                        html_save_path = os.path.join(
+                            section_dir_list[idx - 1], 'text.html')
+                    # html_save_path is not written anywhere (dump disabled).
+                    # Below: runs of '<p class="left">N: ...' lines form one listing.
+
+                    code_list = re.findall(
+                        r'(?:(?:<p class="left">\d{1,5}\:.*? \n).*?)*',
+                        section_content,
+                        flags=re.S)
+
+                    res_codelist = []
+                    for code in code_list:
+                        if code != '':
+                            res_codelist.append(code)
+                    # res_codelist: the non-empty regex matches, one per code listing.
+                    # NOTE(review): count is incremented below but never read.
+                    count = 0
+                    for code in res_codelist:
+                        code = html.unescape(code)
+                        soup = BeautifulSoup(code)
+                        clean_code = soup.get_text()
+                        print(clean_code)
+
+                        print('-------' * 10)
+                        pianduan_name = re.findall(r'(代码片段.*)，', clean_code)
+                        if pianduan_name == []:
+                            pianduan_name_str = ''
+                        else:
+                            pianduan_name_str = pianduan_name[0]
+                        file_name_list = re.findall(r'文件名: (.*)\n', clean_code)
+                        print(file_name_list)
+                        if file_name_list == []:
+                            file_name = '.txt'
+                        else:
+                            file_name = file_name_list[0]
+                        file_name = file_name.replace('/', '-')
+                        save_file_name = pianduan_name_str + '-' + file_name
+                        # Saved as "<snippet label>-<file name>"; the file name defaults to '.txt'.
+
+                        if idx == 0:
+                            code_save_path = os.path.join(
+                                chapter_dir, save_file_name)
+                        else:
+                            count += 1
+                            code_save_path = os.path.join(
+                                section_dir_list[idx - 1], save_file_name)
+
+                        res_code_list = []
+                        for line in clean_code.split('\n'):
+                            if line.find('文件名') != -1 or line.find(
+                                    '代码片段') != -1 or line == '':
+                                continue
+                            clean_line = re.findall(r'^\d{1,5}\: *(.*)',
+                                                    line)[0]
+                            res_code_list.append(clean_line)
+                        res_code = '\n'.join(res_code_list)
+
+                        with open(code_save_path, 'w', encoding='utf-8') as f:
+                            f.write(res_code)
+
+                    # clean_text_list = []
+                    # for line in res_str.split('\n'):
+                    #     if line == '':
+                    #         continue
+                    #     if line[0].isdigit():
+                    #         line = re.findall(r'^[0-9]+ {0,2}(.*)',
+                    #                           line)[0]
+                    #         # print(line)
+                    #     else:
+                    #         if line.startswith('>>'):
+                    #             break
+                    #     clean_text_list.append(line)
+                    # clean_code = '\n'.join(clean_text_list)
+                    # print(clean_code)
diff --git a/src/ebook/get_book_chapter_id_list.py b/src/ebook/get_book_chapter_id_list.py
new file mode 100644
index 0000000..6fd1ad5
--- /dev/null
+++ b/src/ebook/get_book_chapter_id_list.py
@@ -0,0 +1,18 @@
+import json
+import re
+import html
+import nltk
+import html2text
+import os
+import pandas as pd
+from bs4 import BeautifulSoup
+from .ebook_get_request import get_chapter_list
+
+
+def get_chapter_id_list(param):
+    """Return the ``chapterid`` of every entry in the chapter-list response.
+
+    ``param`` is forwarded unchanged to ``get_chapter_list``; the ids keep
+    the order in which the service returned the entries.
+    """
+    return [entry['chapterid'] for entry in get_chapter_list(param)]
-- 
GitLab