From a62f993b5200aff0ac90fbf9e4515d4cadb9fc48 Mon Sep 17 00:00:00 2001 From: qq_44193969 <2608882093@qq.com> Date: Mon, 27 Dec 2021 16:51:34 +0800 Subject: [PATCH] modify ignore --- .gitignore | 5 +- ...347\254\2543\347\211\210\357\274\211.json" | 8 -- main.py | 9 -- src/ebook/community.py | 109 ---------------- src/ebook/ebook_get_request.py | 44 ------- src/ebook/extract_book_code.py | 123 ------------------ src/ebook/get_book_chapter_id_list.py | 18 --- 7 files changed, 4 insertions(+), 312 deletions(-) delete mode 100644 "data/\345\205\250\347\250\213\350\275\257\344\273\266\346\265\213\350\257\225\357\274\210\347\254\2543\347\211\210\357\274\211.json" delete mode 100644 main.py delete mode 100644 src/ebook/community.py delete mode 100644 src/ebook/ebook_get_request.py delete mode 100644 src/ebook/extract_book_code.py delete mode 100644 src/ebook/get_book_chapter_id_list.py diff --git a/.gitignore b/.gitignore index ed8ebf5..5c1411d 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ -__pycache__ \ No newline at end of file +__pycache__ +src +main.py +data/全程软件测试(第3版).json \ No newline at end of file diff --git "a/data/\345\205\250\347\250\213\350\275\257\344\273\266\346\265\213\350\257\225\357\274\210\347\254\2543\347\211\210\357\274\211.json" "b/data/\345\205\250\347\250\213\350\275\257\344\273\266\346\265\213\350\257\225\357\274\210\347\254\2543\347\211\210\357\274\211.json" deleted file mode 100644 index a7e6863..0000000 --- "a/data/\345\205\250\347\250\213\350\275\257\344\273\266\346\265\213\350\257\225\357\274\210\347\254\2543\347\211\210\357\274\211.json" +++ /dev/null @@ -1,8 +0,0 @@ -{ - "data/全程软件测试(第3版)/第02章 全程测试:闪光的思想/2.2 测试驱动开发/code_1.java": "https://bbs.csdn.net/topics/603878157", - "data/全程软件测试(第3版)/第03章 准备:基础设施与TA框架/3.4 自动化测试框架/code_3.java": "https://bbs.csdn.net/topics/603878239", - "data/全程软件测试(第3版)/第03章 准备:基础设施与TA框架/3.4 自动化测试框架/code_2.java": "https://bbs.csdn.net/topics/603878240", - "data/全程软件测试(第3版)/第03章 准备:基础设施与TA框架/3.4 自动化测试框架/code_5.java": "https://bbs.csdn.net/topics/603878090", - "data/全程软件测试(第3版)/第03章 准备:基础设施与TA框架/3.4 自动化测试框架/code_4.java": "https://bbs.csdn.net/topics/603878158", - "data/全程软件测试(第3版)/第03章 准备:基础设施与TA框架/3.4 自动化测试框架/code_1.java": "https://bbs.csdn.net/topics/603878241" -} \ No newline at end of file diff --git a/main.py b/main.py deleted file mode 100644 index 55f6df8..0000000 --- a/main.py +++ /dev/null @@ -1,9 +0,0 @@ -from src.ebook.extract_book_code import extract_code -from src.ebook.community import send_topic - -if __name__ == "__main__": - extract_code() - web_url = 'https://gitcode.net/csdn/content/book_code_825acb73c85c4c4bb9632afe858bc097/-/tree/master/' - print('-------' * 20) - print('开始向社区发帖') - send_topic(web_url) \ No newline at end of file diff --git a/src/ebook/community.py b/src/ebook/community.py deleted file mode 100644 index 9e5bbe0..0000000 --- a/src/ebook/community.py +++ /dev/null @@ -1,109 +0,0 @@ -import os -import json -import html -import requests -import logging - -logger = logging.getLogger(__name__) - - -def get_files_path(file_dir, filetype='.txt'): - """得到文件夹下的所有.txt文件的路径 - Args: - file_dir: 文件夹路径 - filetype: 文件后缀 - Returns: - 所有filetype类型文件的绝对路径 - """ - files_path = [] - for root, dirs, files in os.walk(file_dir): - for file in files: - if filetype is None or (os.path.splitext(file)[1] == filetype): - files_path.append(os.path.join(root, file)) - return files_path - - -def post(url, params, retry=3, headers=None): - if headers is None: - hdrs = {"Content-Type": "application/json"} - else: - hdrs = headers - fails = 0 - while fails < retry: - try: - if headers is None: - data = json.dumps(params) - else: - data = params - logger.debug(f"will post {data} to {url}") - resp = requests.post(url, data, headers=hdrs, timeout=10) - if resp: - logger.info(f"resp {resp.content}") - return resp.json() - else: - logger.error(f"resp: [{resp}]") - fails += 1 - except Exception as error: - logger.error(f"post {params} to {url} failed {error}") - fails += 1 - if fails > retry: - raise error - - -def send_topic(web_url): - data_dir = 'data' - book_dir = 'data/全程软件测试(第3版)/' - # web_url = "https://codechina.csdn.net/csdn/book_code_c798a5992a654857867ec15660e1c32a/-/blob/master/" - request_url = 'http://ccloud.internal.csdn.net/v1/internal/community/content/sendTopic' - - files = get_files_path('data/全程软件测试(第3版)', '.java') - mapping_path = 'data/全程软件测试(第3版).json' - - if not os.path.exists(mapping_path): - chapter_code_mapping = {} - save_mapping = json.dumps(chapter_code_mapping, - ensure_ascii=False, - indent=2) - with open(mapping_path, 'w') as f: - f.write(save_mapping) - - with open(mapping_path, 'r') as f: - chapter_code_mapping = json.load(f) - - for file in files: - topic_title = file.replace(book_dir, '') - topic_title = topic_title.replace('/', '|') - topic_title = topic_title.replace(' ', '.') - # topic_title = html.escape(topic_title) - topic_content = web_url + file - topic_content = "代码:{}".format( - topic_content, topic_title) - - print(topic_title) - - send_topic_request_param = { - "type": "long_text", - "cateId": 20965, - "content": topic_content, - "topicTitle": topic_title, - "mdContent": topic_content, - "communityId": 3822, - "loginUserName": "BBS_Assistant", - "bizNo": "ebook" - } - - if chapter_code_mapping.get(file) is None: - resp = post(request_url, send_topic_request_param) - topic_link = resp['data']['content']['url'] - chapter_code_mapping[file] = topic_link - print('{}:{}'.format(file, topic_link)) - save_mapping = json.dumps(chapter_code_mapping, - ensure_ascii=False, - indent=2) - with open(mapping_path, 'w') as f: - f.write(save_mapping) - else: - send_topic_request_param['id'] = int( - chapter_code_mapping[file].split('/')[-1]) - resp = post(request_url, send_topic_request_param) - print('{}:{}'.format(file, chapter_code_mapping.get(file))) diff --git a/src/ebook/ebook_get_request.py b/src/ebook/ebook_get_request.py deleted file mode 100644 index 7ae2ce9..0000000 --- a/src/ebook/ebook_get_request.py +++ /dev/null @@ -1,44 +0,0 @@ -import json -import requests -import logging - - -logger = logging.getLogger(__name__) - - -def get_chapter_content(params): - url = 'http://192.168.50.117:9003/v1/chapter/content' - - headers = { - "Cookie":"UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy" - } - - result = requests.get(url=url, params=params, headers=headers) - - if result.status_code == 200: - ret = json.loads(result.text) - logger.info('request success') - content = ret['data'] - return content - else: - logger.info('request failed!!!!!') - return {} - - -def get_chapter_list(params): - url = 'http://192.168.50.117:9003/inner/v1/chapter/list' - - headers = { - "Cookie":"UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy" - } - - result = requests.get(url=url, params=params, headers=headers) - - if result.status_code == 200: - ret = json.loads(result.text) - logger.info('request success') - content = ret['data'] - return content - else: - logger.info('request failed!!!!!') - return {} \ No newline at end of file diff --git a/src/ebook/extract_book_code.py b/src/ebook/extract_book_code.py deleted file mode 100644 index 93573df..0000000 --- a/src/ebook/extract_book_code.py +++ /dev/null @@ -1,123 +0,0 @@ -import json -import os -import re -import html -from bs4 import BeautifulSoup -from .get_book_chapter_id_list import get_chapter_id_list -from .ebook_get_request import get_chapter_content - - -def extract_code(): - - # book_mapping_path = "data/book_mapping.json" - # with open(book_mapping_path, "r") as f: - # book_mapping = json.load(f) - book_mapping = { - "全程软件测试(第3版)": "825acb73c85c4c4bb9632afe858bc097", - } - for book_idx, book_name in enumerate(book_mapping.keys()): - book_dir_name = book_name - book_dir = os.path.join('data', book_dir_name) - if not os.path.exists(book_dir): - os.mkdir(book_dir) - print(book_dir_name) - book_id = book_mapping[book_name] - request_get_chapter_id_list_params = {"bookId": book_id, "is_main": 1} - chapter_id_list = get_chapter_id_list( - request_get_chapter_id_list_params) - print(chapter_id_list) - for chapter_id in chapter_id_list: - request_get_chapter_content_params = { - 'bookId': book_id, - 'chapterId': chapter_id - } - chapter_resp = get_chapter_content( - request_get_chapter_content_params) - chapter_name = chapter_resp['name'] - chapter_content = chapter_resp['content'] - try: - if book_name == "零基础学机器学习": - chapter_num = re.findall(r'第(.*)课', chapter_name)[0] - chapter_name_modify = re.sub( - r'第(.*)课', r'第{}课'.format(chapter_num.zfill(2)), - chapter_name) - else: - chapter_num = re.findall(r'第(.*)章', chapter_name)[0] - chapter_name_modify = re.sub( - r'第(.*)章', r'第{}章'.format(chapter_num.zfill(2)), - chapter_name) - chapter_name = chapter_name_modify - except: - # print('该章节没有章节序号: {}'.format(chapter_name)) - pass - chapter_dir = os.path.join(book_dir, chapter_name) - if not os.path.exists(chapter_dir): - os.mkdir(chapter_dir) - # print('创建文件夹: {}'.format(chapter_dir)) - - chapter_content = html.unescape(chapter_content) - - if book_name == "全程软件测试(第3版)": - section_list = re.findall(r'(.*?)', chapter_content) - section_content_list = re.split(r'.*?', - chapter_content, - flags=re.S) - section_dir_list = [] - for section in section_list: - section = section.replace(' ', ' ') - if section.find(r'/') != -1: - section = section.replace('/', '') - section_dir = os.path.join(chapter_dir, section) - # print(section_dir) - if not os.path.exists(section_dir): - os.mkdir(section_dir) - section_dir_list.append(section_dir) - for idx, section_content in enumerate(section_content_list): - if idx == 0: - html_save_path = os.path.join(chapter_dir, 'text.html') - else: - html_save_path = os.path.join( - section_dir_list[idx - 1], 'text.html') - # with open(html_save_path, 'w', encoding='utf-8') as f: - # f.write(section_content) - - code_list = re.findall(r'(.*?)', - section_content, re.S) - - res_code_list = [] - count = 0 - for i in code_list: - if len(i.split('\n')) < 2: - continue - count += 1 - i = html.unescape(i) - soup = BeautifulSoup(i) - res_str = soup.get_text() - - if idx == 0: - code_save_dir = os.path.join( - chapter_dir, 'code_0.java') - else: - code_save_dir = os.path.join( - section_dir_list[idx - 1], - 'code_{}.java'.format(count)) - - print(code_save_dir) - print(res_str) - with open(code_save_dir, 'w', encoding='utf-8') as f: - f.write(res_str) - - # clean_text_list = [] - # for line in res_str.split('\n'): - # if line == '': - # continue - # if line[0].isdigit(): - # line = re.findall(r'^[0-9]+ {0,2}(.*)', - # line)[0] - # # print(line) - # else: - # if line.startswith('>>'): - # break - # clean_text_list.append(line) - # clean_code = '\n'.join(clean_text_list) - # print(clean_code) \ No newline at end of file diff --git a/src/ebook/get_book_chapter_id_list.py b/src/ebook/get_book_chapter_id_list.py deleted file mode 100644 index 6fd1ad5..0000000 --- a/src/ebook/get_book_chapter_id_list.py +++ /dev/null @@ -1,18 +0,0 @@ -import json -import re -import html -import nltk -import html2text -import os -import pandas as pd -from bs4 import BeautifulSoup -from .ebook_get_request import get_chapter_list - - -def get_chapter_id_list(param): - chapter_list = [] - ret = get_chapter_list(param) - for item in ret: - chapterid = item['chapterid'] - chapter_list.append(chapterid) - return chapter_list -- GitLab