diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..ed8ebf583f771da9150c35db3955987b7d757904
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..06b3f6fdb4c3674025f94174e1a7ee9e4678e93a
--- /dev/null
+++ b/main.py
@@ -0,0 +1,12 @@
+from src.ebook.extract_book_code import extract_code
+from src.ebook.community import send_topic
+
+if __name__ == "__main__":
+    # Step 1: fetch the book chapters and write their code snippets under data/.
+    extract_code()
+    # URL prefix that send_topic() prepends to each snippet path.
+    web_url = 'https://gitcode.net/csdn/content/book_code_08fd0c7025a4a34a97a29897b067d24/-/tree/master/'
+    print('-------' * 20)
+    print('开始向社区发帖')
+    # Step 2: create or update one community topic per extracted snippet.
+    send_topic(web_url)
\ No newline at end of file
diff --git a/src/ebook/community.py b/src/ebook/community.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e5bbe0798a10322857e0ec8c2b3a0f0a928dbea
--- /dev/null
+++ b/src/ebook/community.py
@@ -0,0 +1,149 @@
+import os
+import json
+import html
+import requests
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def get_files_path(file_dir, filetype='.txt'):
+    """Collect the paths of the files found under a directory tree.
+
+    Args:
+        file_dir: Directory to walk recursively.
+        filetype: Extension to keep, including the dot. ``None`` keeps
+            every file.
+
+    Returns:
+        A list of paths built by joining each walked directory with the
+        file name, so they are only absolute when ``file_dir`` is.
+    """
+    return [
+        os.path.join(root, name)
+        for root, _dirs, names in os.walk(file_dir)
+        for name in names
+        if filetype is None or os.path.splitext(name)[1] == filetype
+    ]
+
+
+def post(url, params, retry=3, headers=None):
+    """POST ``params`` to ``url`` and return the decoded JSON body.
+
+    Without ``headers`` the payload is serialized to JSON and sent with a
+    JSON content type; with ``headers`` it is sent unchanged.
+
+    Args:
+        url: Target URL.
+        params: Request payload.
+        retry: Maximum number of attempts.
+        headers: Optional request headers.
+
+    Returns:
+        The parsed JSON of the first successful response, or ``None`` when
+        the last attempt ended with an unsuccessful status.
+
+    Raises:
+        Exception: The error of the last attempt, when it raised.
+    """
+    if headers is None:
+        hdrs = {"Content-Type": "application/json"}
+        data = json.dumps(params)
+    else:
+        hdrs = headers
+        data = params
+
+    for attempt in range(1, retry + 1):
+        try:
+            logger.debug(f"will post {data} to {url}")
+            resp = requests.post(url, data, headers=hdrs, timeout=10)
+            # A Response is truthy only when its status code is below 400.
+            if resp:
+                logger.info(f"resp {resp.content}")
+                return resp.json()
+            logger.error(f"resp: [{resp}]")
+        except Exception as error:
+            logger.error(f"post {params} to {url} failed {error}")
+            # Surface the failure once the attempts are used up instead of
+            # silently returning None.
+            if attempt >= retry:
+                raise
+    return None
+
+
+def _save_mapping(mapping_path, mapping):
+    """Write the file-to-topic-link mapping to ``mapping_path`` as JSON."""
+    # The mapping keys contain non-ASCII paths and are written unescaped,
+    # so pin the encoding rather than rely on the platform default.
+    with open(mapping_path, 'w', encoding='utf-8') as f:
+        f.write(json.dumps(mapping, ensure_ascii=False, indent=2))
+
+
+def send_topic(web_url):
+    """Create or update one community topic per extracted ``.java`` file.
+
+    The link of every created topic is stored in a JSON mapping next to the
+    book directory; files already present in the mapping are updated by
+    topic id instead of being posted again.
+
+    Args:
+        web_url: URL prefix prepended to each file path to build the link
+            placed in the topic body.
+    """
+    book_dir = 'data/全程软件测试(第3版)/'
+    request_url = 'http://ccloud.internal.csdn.net/v1/internal/community/content/sendTopic'
+
+    files = get_files_path('data/全程软件测试(第3版)', '.java')
+    mapping_path = 'data/全程软件测试(第3版).json'
+
+    if os.path.exists(mapping_path):
+        with open(mapping_path, 'r', encoding='utf-8') as f:
+            chapter_code_mapping = json.load(f)
+    else:
+        chapter_code_mapping = {}
+        _save_mapping(mapping_path, chapter_code_mapping)
+
+    for file in files:
+        topic_title = file.replace(book_dir, '')
+        topic_title = topic_title.replace('/', '|')
+        topic_title = topic_title.replace(' ', '.')
+        topic_content = web_url + file
+        # NOTE(review): the template has one placeholder but two arguments
+        # are passed, so topic_title is ignored here; confirm the intended
+        # template.
+        topic_content = "代码:{}".format(
+            topic_content, topic_title)
+
+        print(topic_title)
+
+        send_topic_request_param = {
+            "type": "long_text",
+            "cateId": 20965,
+            "content": topic_content,
+            "topicTitle": topic_title,
+            "mdContent": topic_content,
+            "communityId": 3822,
+            "loginUserName": "BBS_Assistant",
+            "bizNo": "ebook"
+        }
+
+        topic_link = chapter_code_mapping.get(file)
+        if topic_link is not None:
+            # Already posted: the topic id is the last segment of its link.
+            send_topic_request_param['id'] = int(topic_link.split('/')[-1])
+            post(request_url, send_topic_request_param)
+            print('{}:{}'.format(file, topic_link))
+            continue
+
+        resp = post(request_url, send_topic_request_param)
+        if resp is None:
+            # post() gave up on an unsuccessful status; leave the file
+            # unmapped so a later run posts it again.
+            logger.error(f"send topic for {file} failed")
+            continue
+        topic_link = resp['data']['content']['url']
+        chapter_code_mapping[file] = topic_link
+        print('{}:{}'.format(file, topic_link))
+        # Persist right away so the link is kept even if a later post
+        # fails.
+        _save_mapping(mapping_path, chapter_code_mapping)
diff --git a/src/ebook/ebook_get_request.py b/src/ebook/ebook_get_request.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ae2ce9d5ded35cacc1b6c20aa7a2bda04020d69
--- /dev/null
+++ b/src/ebook/ebook_get_request.py
@@ -0,0 +1,46 @@
+import json
+import requests
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+# NOTE(review): the session cookie is hard-coded; it should be loaded from
+# configuration or the environment, and this token rotated.
+_HEADERS = {
+    "Cookie": "UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy"
+}
+# Seconds to wait for the server; requests waits indefinitely otherwise.
+_TIMEOUT = 10
+
+
+def _get_data(url, params):
+    """GET ``url`` with ``params`` and return the ``data`` field of the JSON body.
+
+    Returns an empty dict when the response status is not 200.
+    """
+    result = requests.get(url=url, params=params, headers=_HEADERS,
+                          timeout=_TIMEOUT)
+    if result.status_code != 200:
+        logger.error('request failed!!!!!')
+        return {}
+    ret = json.loads(result.text)
+    logger.info('request success')
+    return ret['data']
+
+
+def get_chapter_content(params):
+    """Fetch one chapter; ``params`` is sent as the query string.
+
+    Returns the ``data`` field of the response, or ``{}`` on a non-200 status.
+    """
+    return _get_data('http://192.168.50.117:9003/v1/chapter/content', params)
+
+
+def get_chapter_list(params):
+    """Fetch a book's chapter list; ``params`` is sent as the query string.
+
+    Returns the ``data`` field of the response, or ``{}`` on a non-200 status.
+    """
+    return _get_data('http://192.168.50.117:9003/inner/v1/chapter/list',
+                     params)
\ No newline at end of file
diff --git a/src/ebook/extract_book_code.py b/src/ebook/extract_book_code.py
new file mode 100644
index 0000000000000000000000000000000000000000..6aa5c4d5a658f08c7df550ff214ab2390f9f66b9
--- /dev/null
+++ b/src/ebook/extract_book_code.py
@@ -0,0 +1,109 @@
+import json
+import os
+import re
+import html
+from bs4 import BeautifulSoup
+from .get_book_chapter_id_list import get_chapter_id_list
+from .ebook_get_request import get_chapter_content
+
+
+def _pad_chapter_number(book_name, chapter_name):
+    """Zero-pad the chapter number in ``chapter_name`` to two characters.
+
+    The number is the text between "第" and "章" ("课" for 零基础学机器学习).
+    Names without that marker are returned unchanged.
+    """
+    unit = '课' if book_name == "零基础学机器学习" else '章'
+    pattern = r'第(.*){}'.format(unit)
+    numbers = re.findall(pattern, chapter_name)
+    if not numbers:
+        return chapter_name
+    return re.sub(pattern, r'第{}{}'.format(numbers[0].zfill(2), unit),
+                  chapter_name)
+
+
+def _make_section_dirs(chapter_dir, section_list):
+    """Create a directory per section title and return the paths in order."""
+    section_dir_list = []
+    for section in section_list:
+        # NOTE(review): as written this swaps a space for a space; the first
+        # argument was probably a different whitespace character - confirm.
+        section = section.replace(' ', ' ')
+        # A '/' in the title would be read as a path separator.
+        section = section.replace('/', '')
+        section_dir = os.path.join(chapter_dir, section)
+        os.makedirs(section_dir, exist_ok=True)
+        section_dir_list.append(section_dir)
+    return section_dir_list
+
+
+def _write_code_snippets(section_content, idx, chapter_dir, section_dir_list):
+    """Save the multi-line code snippets of one section as ``.java`` files."""
+    # NOTE(review): this pattern has no delimiters around the group and looks
+    # like it lost its HTML tags - confirm against the original source.
+    code_list = re.findall(r'(.*?)', section_content, re.S)
+    count = 0
+    for snippet in code_list:
+        if len(snippet.split('\n')) < 2:
+            continue
+        count += 1
+        # Name the parser explicitly: without it bs4 picks whichever parser is
+        # installed and emits a warning.
+        soup = BeautifulSoup(html.unescape(snippet), 'html.parser')
+        res_str = soup.get_text()
+
+        if idx == 0:
+            # NOTE(review): every snippet found before the first section is
+            # written to the same file, so only the last one survives.
+            code_save_dir = os.path.join(chapter_dir, 'code_0.java')
+        else:
+            code_save_dir = os.path.join(section_dir_list[idx - 1],
+                                         'code_{}.java'.format(count))
+
+        print(code_save_dir)
+        print(res_str)
+        with open(code_save_dir, 'w', encoding='utf-8') as f:
+            f.write(res_str)
+
+
+def extract_code():
+    """Fetch every chapter of the configured books and save their code.
+
+    For each book a directory is created under ``data``, with one
+    sub-directory per chapter and, inside it, one per section; the code
+    snippets found in the chapter content are written there as ``.java`` files.
+    """
+    book_mapping = {
+        "深入剖析Nginx": "08fd0c7025a4a34a97a29897b067d24",
+    }
+    for book_name, book_id in book_mapping.items():
+        book_dir = os.path.join('data', book_name)
+        # makedirs also creates ``data`` itself when it does not exist yet.
+        os.makedirs(book_dir, exist_ok=True)
+        print(book_name)
+        chapter_id_list = get_chapter_id_list({"bookId": book_id, "is_main": 1})
+        print(chapter_id_list)
+        for chapter_id in chapter_id_list:
+            chapter_resp = get_chapter_content({
+                'bookId': book_id,
+                'chapterId': chapter_id
+            })
+            chapter_name = _pad_chapter_number(book_name, chapter_resp['name'])
+            chapter_dir = os.path.join(book_dir, chapter_name)
+            os.makedirs(chapter_dir, exist_ok=True)
+
+            # Section and code extraction is only implemented for this book.
+            if book_name != "深入剖析Nginx":
+                continue
+
+            chapter_content = html.unescape(chapter_resp['content'])
+            # NOTE(review): both patterns below look like they lost their HTML
+            # tag delimiters - confirm against the original source.
+            section_list = re.findall(r'(.*?)', chapter_content)
+            section_content_list = re.split(r'.*?',
+                                            chapter_content,
+                                            flags=re.S)
+            section_dir_list = _make_section_dirs(chapter_dir, section_list)
+            for idx, section_content in enumerate(section_content_list):
+                _write_code_snippets(section_content, idx, chapter_dir,
+                                     section_dir_list)
\ No newline at end of file
diff --git a/src/ebook/get_book_chapter_id_list.py b/src/ebook/get_book_chapter_id_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fd1ad532de8862b3e2121fa08d35ef6d5ef3d74
--- /dev/null
+++ b/src/ebook/get_book_chapter_id_list.py
@@ -0,0 +1,21 @@
+import json
+import re
+import html
+import nltk
+import html2text
+import os
+import pandas as pd
+from bs4 import BeautifulSoup
+from .ebook_get_request import get_chapter_list
+
+
+def get_chapter_id_list(param):
+    """Return the ``chapterid`` of every item returned by ``get_chapter_list``.
+
+    Args:
+        param: Query parameters passed through to ``get_chapter_list``.
+
+    Returns:
+        A list of chapter ids, in the order the items were returned.
+    """
+    return [item['chapterid'] for item in get_chapter_list(param)]