if __name__ == "__main__":
    # Maps a book title (also used for the data/<title>/ directory and the
    # data/<title>.json mapping file) to the id used in the repository URL.
    book_mapping = {"零基础学机器学习": "b88b00f6ad14402ea66695d6809614da"}
    for book_name, book_id in book_mapping.items():
        # extract_code() loops over every entry of the mapping it receives,
        # so pass only the current book; handing it the whole mapping on
        # each iteration re-extracts all books once per book.
        extract_code({book_name: book_id})
        web_url = 'https://gitcode.net/csdn/content/book_code_{}/-/tree/master/'.format(
            book_id)
        print('-------' * 20)
        print('开始向社区发帖')
        book_dir = 'data/{}/'.format(book_name)
        mapping_path = 'data/{}.json'.format(book_name)
        send_topic(web_url, book_dir, mapping_path)
def post(url, params, retry=3, headers=None):
    """POST ``params`` to ``url``, retrying up to ``retry`` times.

    When ``headers`` is None the payload is serialised with ``json.dumps``
    and sent with a JSON ``Content-Type`` header; otherwise ``params`` is
    handed to ``requests.post`` unchanged together with the caller's headers.

    Args:
        url: Target URL.
        params: Payload to send.
        retry: Maximum number of attempts. With 0 no request is made.
        headers: Optional request headers; also disables JSON serialisation.

    Returns:
        The decoded JSON body of the first truthy response, or None when
        every attempt returned a falsy (error-status) response or no attempt
        was made.

    Raises:
        Exception: The error raised by the final attempt (connection
            failure, timeout, undecodable body, ...).
    """
    if headers is None:
        hdrs = {"Content-Type": "application/json"}
        # Serialise once: the payload does not change between attempts.
        data = json.dumps(params)
    else:
        hdrs = headers
        data = params

    fails = 0
    while fails < retry:
        try:
            logger.debug(f"will post {data} to {url}")
            resp = requests.post(url, data, headers=hdrs, timeout=10)
            if resp:
                logger.info(f"resp {resp.content}")
                return resp.json()
            logger.error(f"resp: [{resp}]")
            fails += 1
        except Exception as error:
            logger.error(f"post {params} to {url} failed {error}")
            fails += 1
            # The counter can never exceed ``retry`` inside this loop, so the
            # check must be ``>=``; with ``>`` the last error was silently
            # dropped and the caller received None instead.
            if fails >= retry:
                raise
    return None
def get_chapter_content(params):
    """Fetch chapter data from the ``/v1/chapter/content`` endpoint.

    Args:
        params: Query parameters forwarded to ``requests.get``.

    Returns:
        The ``data`` member of the JSON response when the status code is
        200, otherwise an empty dict.
    """
    url = 'http://192.168.50.117:9003/v1/chapter/content'

    # NOTE(review): the session cookie is hard-coded in source; it should be
    # supplied through configuration or the environment instead.
    headers = {
        "Cookie": "UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
    }

    # requests applies no timeout unless one is given, so an unresponsive
    # server would block the whole extraction run; bound the wait.
    result = requests.get(url=url, params=params, headers=headers, timeout=10)

    if result.status_code != 200:
        # A failed request is an error condition, not routine information.
        logger.error('request failed!!!!!')
        return {}

    ret = json.loads(result.text)
    logger.info('request success')
    return ret['data']
"Cookie":"UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy" + } + + result = requests.get(url=url, params=params, headers=headers) + + if result.status_code == 200: + ret = json.loads(result.text) + logger.info('request success') + content = ret['data'] + return content + else: + logger.info('request failed!!!!!') + return {} \ No newline at end of file diff --git a/src/ebook/extract_book_code.py b/src/ebook/extract_book_code.py new file mode 100644 index 0000000..27e92fb --- /dev/null +++ b/src/ebook/extract_book_code.py @@ -0,0 +1,158 @@ +import json +import os +import re +import html +from bs4 import BeautifulSoup +from .get_book_chapter_id_list import get_chapter_id_list +from .ebook_get_request import get_chapter_content + + +def extract_code(book_mapping): + + # book_mapping_path = "data/book_mapping.json" + # with open(book_mapping_path, "r") as f: + # book_mapping = json.load(f) + for book_idx, book_name in enumerate(book_mapping.keys()): + book_dir_name = book_name + book_dir = os.path.join('data', book_dir_name) + if not os.path.exists(book_dir): + os.mkdir(book_dir) + # print(book_dir_name) + book_id = book_mapping[book_name] + request_get_chapter_id_list_params = {"bookId": book_id, "is_main": 1} + chapter_id_list = get_chapter_id_list( + request_get_chapter_id_list_params) + print(chapter_id_list) + for chapter_id in chapter_id_list: + print('当前章节id: {}'.format(chapter_id)) + request_get_chapter_content_params = { + 'bookId': book_id, + 'chapterId': chapter_id + } + chapter_resp = get_chapter_content( + request_get_chapter_content_params) + chapter_name = chapter_resp['name'] + chapter_content = chapter_resp['content'] + try: + if book_name == "零基础学机器学习": + chapter_num = re.findall(r'第(.*)课', chapter_name)[0] + chapter_name_modify = re.sub( + r'第(.*)课', r'第{}课'.format(chapter_num.zfill(2)), + chapter_name) + else: + chapter_num = re.findall(r'第(.*)章', chapter_name)[0] + chapter_name_modify = re.sub( + r'第(.*)章', 
r'第{}章'.format(chapter_num.zfill(2)), + chapter_name) + chapter_name = chapter_name_modify + print(chapter_name) + except: + print('该章节没有章节序号: {}'.format(chapter_name)) + pass + chapter_dir = os.path.join(book_dir, chapter_name) + if not os.path.exists(chapter_dir): + os.mkdir(chapter_dir) + # print('创建文件夹: {}'.format(chapter_dir)) + + chapter_content = html.unescape(chapter_content) + # print(chapter_content) + + section_list = re.findall(r'(.*?)', + chapter_content, + flags=re.S) + + print(section_list) + section_content_list = re.split(r'.*?', + chapter_content, + flags=re.S) + section_dir_list = [] + for idx, section in enumerate(section_list): + section = section.replace(' ', ' ') + if section.find(r'/') != -1: + section = section.replace('/', '') + section_dir = os.path.join(chapter_dir, + '{}.{}'.format(idx + 1, section)) + print(section_dir) + if not os.path.exists(section_dir): + os.mkdir(section_dir) + section_dir_list.append(section_dir) + for idx, section_content in enumerate(section_content_list): + if idx == 0: + html_save_path = os.path.join(chapter_dir, 'text.html') + else: + html_save_path = os.path.join(section_dir_list[idx - 1], + 'text.html') + # with open(html_save_path, 'w', encoding='utf-8') as f: + # f.write(section_content) + + code_list = re.findall(r'(.*?)', section_content, + re.S) + + res_codelist = [] + for code in code_list: + code = code.strip() + if code != '': + res_codelist.append(code) + # print(res_codelist) + # break + count = 0 + for code in res_codelist: + if len(code.split('\n')) < 2: + continue + # code = html.unescape(code) + # soup = BeautifulSoup(code) + # clean_code = soup.get_text() + # print(clean_code) + + # print('-------' * 10) + # pianduan_name = re.findall(r'(代码片段.*),', clean_code) + # if pianduan_name == []: + # pianduan_name_str = '' + # else: + # pianduan_name_str = pianduan_name[0] + # file_name_list = re.findall(r'文件名: (.*)\n', clean_code) + # print(file_name_list) + # if file_name_list == []: + # file_name 
def get_chapter_id_list(param):
    """Collect the ``chapterid`` of every item returned by ``get_chapter_list``.

    Args:
        param: Request parameters passed through to ``get_chapter_list``.

    Returns:
        A list holding one ``chapterid`` value per item, in the order the
        items were received.
    """
    return [entry['chapterid'] for entry in get_chapter_list(param)]