From a62f993b5200aff0ac90fbf9e4515d4cadb9fc48 Mon Sep 17 00:00:00 2001 From: qq_44193969 <2608882093@qq.com> Date: Mon, 27 Dec 2021 16:51:34 +0800 Subject: [PATCH] modify ignore --- .gitignore | 5 +- ...347\254\2543\347\211\210\357\274\211.json" | 8 -- main.py | 9 -- src/ebook/community.py | 109 ---------------- src/ebook/ebook_get_request.py | 44 ------- src/ebook/extract_book_code.py | 123 ------------------ src/ebook/get_book_chapter_id_list.py | 18 --- 7 files changed, 4 insertions(+), 312 deletions(-) delete mode 100644 "data/\345\205\250\347\250\213\350\275\257\344\273\266\346\265\213\350\257\225\357\274\210\347\254\2543\347\211\210\357\274\211.json" delete mode 100644 main.py delete mode 100644 src/ebook/community.py delete mode 100644 src/ebook/ebook_get_request.py delete mode 100644 src/ebook/extract_book_code.py delete mode 100644 src/ebook/get_book_chapter_id_list.py diff --git a/.gitignore b/.gitignore index ed8ebf5..5c1411d 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ -__pycache__ \ No newline at end of file +__pycache__ +src +main.py +data/全程软件测试(第3版).json \ No newline at end of file diff --git "a/data/\345\205\250\347\250\213\350\275\257\344\273\266\346\265\213\350\257\225\357\274\210\347\254\2543\347\211\210\357\274\211.json" "b/data/\345\205\250\347\250\213\350\275\257\344\273\266\346\265\213\350\257\225\357\274\210\347\254\2543\347\211\210\357\274\211.json" deleted file mode 100644 index a7e6863..0000000 --- "a/data/\345\205\250\347\250\213\350\275\257\344\273\266\346\265\213\350\257\225\357\274\210\347\254\2543\347\211\210\357\274\211.json" +++ /dev/null @@ -1,8 +0,0 @@ -{ - "data/全程软件测试(第3版)/第02章 全程测试:闪光的思想/2.2 测试驱动开发/code_1.java": "https://bbs.csdn.net/topics/603878157", - "data/全程软件测试(第3版)/第03章 准备:基础设施与TA框架/3.4 自动化测试框架/code_3.java": "https://bbs.csdn.net/topics/603878239", - "data/全程软件测试(第3版)/第03章 准备:基础设施与TA框架/3.4 自动化测试框架/code_2.java": "https://bbs.csdn.net/topics/603878240", - "data/全程软件测试(第3版)/第03章 准备:基础设施与TA框架/3.4 自动化测试框架/code_5.java": "https://bbs.csdn.net/topics/603878090", - "data/全程软件测试(第3版)/第03章 准备:基础设施与TA框架/3.4 自动化测试框架/code_4.java": "https://bbs.csdn.net/topics/603878158", - "data/全程软件测试(第3版)/第03章 准备:基础设施与TA框架/3.4 自动化测试框架/code_1.java": "https://bbs.csdn.net/topics/603878241" -} \ No newline at end of file diff --git a/main.py b/main.py deleted file mode 100644 index 55f6df8..0000000 --- a/main.py +++ /dev/null @@ -1,9 +0,0 @@ -from src.ebook.extract_book_code import extract_code -from src.ebook.community import send_topic - -if __name__ == "__main__": - extract_code() - web_url = 'https://gitcode.net/csdn/content/book_code_825acb73c85c4c4bb9632afe858bc097/-/tree/master/' - print('-------' * 20) - print('开始向社区发帖') - send_topic(web_url) \ No newline at end of file diff --git a/src/ebook/community.py b/src/ebook/community.py deleted file mode 100644 index 9e5bbe0..0000000 --- a/src/ebook/community.py +++ /dev/null @@ -1,109 +0,0 @@ -import os -import json -import html -import requests -import logging - -logger = logging.getLogger(__name__) - - -def get_files_path(file_dir, filetype='.txt'): - """得到文件夹下的所有.txt文件的路径 - Args: - file_dir: 文件夹路径 - filetype: 文件后缀 - Returns: - 所有filetype类型文件的绝对路径 - """ - files_path = [] - for root, dirs, files in os.walk(file_dir): - for file in files: - if filetype is None or (os.path.splitext(file)[1] == filetype): - files_path.append(os.path.join(root, file)) - return files_path - - -def post(url, params, retry=3, headers=None): - if headers is None: - hdrs = {"Content-Type": "application/json"} - else: - hdrs = headers - fails = 0 - while fails < retry: - try: - if headers is None: - data = json.dumps(params) - else: - data = params - logger.debug(f"will post {data} to {url}") - resp = requests.post(url, data, headers=hdrs, timeout=10) - if resp: - logger.info(f"resp {resp.content}") - return resp.json() - else: - logger.error(f"resp: [{resp}]") - fails += 1 - except Exception as error: - logger.error(f"post {params} to {url} failed {error}") - fails += 1 - if fails > retry: - raise error - - -def send_topic(web_url): - data_dir = 'data' - book_dir = 'data/全程软件测试(第3版)/' - # web_url = "https://codechina.csdn.net/csdn/book_code_c798a5992a654857867ec15660e1c32a/-/blob/master/" - request_url = 'http://ccloud.internal.csdn.net/v1/internal/community/content/sendTopic' - - files = get_files_path('data/全程软件测试(第3版)', '.java') - mapping_path = 'data/全程软件测试(第3版).json' - - if not os.path.exists(mapping_path): - chapter_code_mapping = {} - save_mapping = json.dumps(chapter_code_mapping, - ensure_ascii=False, - indent=2) - with open(mapping_path, 'w') as f: - f.write(save_mapping) - - with open(mapping_path, 'r') as f: - chapter_code_mapping = json.load(f) - - for file in files: - topic_title = file.replace(book_dir, '') - topic_title = topic_title.replace('/', '|') - topic_title = topic_title.replace(' ', '.') - # topic_title = html.escape(topic_title) - topic_content = web_url + file - topic_content = "代码:{}".format( - topic_content, topic_title) - - print(topic_title) - - send_topic_request_param = { - "type": "long_text", - "cateId": 20965, - "content": topic_content, - "topicTitle": topic_title, - "mdContent": topic_content, - "communityId": 3822, - "loginUserName": "BBS_Assistant", - "bizNo": "ebook" - } - - if chapter_code_mapping.get(file) is None: - resp = post(request_url, send_topic_request_param) - topic_link = resp['data']['content']['url'] - chapter_code_mapping[file] = topic_link - print('{}:{}'.format(file, topic_link)) - save_mapping = json.dumps(chapter_code_mapping, - ensure_ascii=False, - indent=2) - with open(mapping_path, 'w') as f: - f.write(save_mapping) - else: - send_topic_request_param['id'] = int( - chapter_code_mapping[file].split('/')[-1]) - resp = post(request_url, send_topic_request_param) - print('{}:{}'.format(file, chapter_code_mapping.get(file))) diff --git a/src/ebook/ebook_get_request.py b/src/ebook/ebook_get_request.py deleted file mode 100644 index 7ae2ce9..0000000 --- a/src/ebook/ebook_get_request.py +++ /dev/null @@ -1,44 +0,0 @@ -import json -import requests -import logging - - -logger = logging.getLogger(__name__) - - -def get_chapter_content(params): - url = 'http://192.168.50.117:9003/v1/chapter/content' - - headers = { - "Cookie":"UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy" - } - - result = requests.get(url=url, params=params, headers=headers) - - if result.status_code == 200: - ret = json.loads(result.text) - logger.info('request success') - content = ret['data'] - return content - else: - logger.info('request failed!!!!!') - return {} - - -def get_chapter_list(params): - url = 'http://192.168.50.117:9003/inner/v1/chapter/list' - - headers = { - "Cookie":"UserToken=149ba8a7a8d341bbbe41f904c4c9b176;UserName=xiuxiuyayayy" - } - - result = requests.get(url=url, params=params, headers=headers) - - if result.status_code == 200: - ret = json.loads(result.text) - logger.info('request success') - content = ret['data'] - return content - else: - logger.info('request failed!!!!!') - return {} \ No newline at end of file diff --git a/src/ebook/extract_book_code.py b/src/ebook/extract_book_code.py deleted file mode 100644 index 93573df..0000000 --- a/src/ebook/extract_book_code.py +++ /dev/null @@ -1,123 +0,0 @@ -import json -import os -import re -import html -from bs4 import BeautifulSoup -from .get_book_chapter_id_list import get_chapter_id_list -from .ebook_get_request import get_chapter_content - - -def extract_code(): - - # book_mapping_path = "data/book_mapping.json" - # with open(book_mapping_path, "r") as f: - # book_mapping = json.load(f) - book_mapping = { - "全程软件测试(第3版)": "825acb73c85c4c4bb9632afe858bc097", - } - for book_idx, book_name in enumerate(book_mapping.keys()): - book_dir_name = book_name - book_dir = os.path.join('data', book_dir_name) - if not os.path.exists(book_dir): - os.mkdir(book_dir) - print(book_dir_name) - book_id = book_mapping[book_name] - request_get_chapter_id_list_params = {"bookId": book_id, "is_main": 1} - chapter_id_list = get_chapter_id_list( - request_get_chapter_id_list_params) - print(chapter_id_list) - for chapter_id in chapter_id_list: - request_get_chapter_content_params = { - 'bookId': book_id, - 'chapterId': chapter_id - } - chapter_resp = get_chapter_content( - request_get_chapter_content_params) - chapter_name = chapter_resp['name'] - chapter_content = chapter_resp['content'] - try: - if book_name == "零基础学机器学习": - chapter_num = re.findall(r'第(.*)课', chapter_name)[0] - chapter_name_modify = re.sub( - r'第(.*)课', r'第{}课'.format(chapter_num.zfill(2)), - chapter_name) - else: - chapter_num = re.findall(r'第(.*)章', chapter_name)[0] - chapter_name_modify = re.sub( - r'第(.*)章', r'第{}章'.format(chapter_num.zfill(2)), - chapter_name) - chapter_name = chapter_name_modify - except: - # print('该章节没有章节序号: {}'.format(chapter_name)) - pass - chapter_dir = os.path.join(book_dir, chapter_name) - if not os.path.exists(chapter_dir): - os.mkdir(chapter_dir) - # print('创建文件夹: {}'.format(chapter_dir)) - - chapter_content = html.unescape(chapter_content) - - if book_name == "全程软件测试(第3版)": - section_list = re.findall(r'
(.*?)
',
- section_content, re.S)
-
- res_code_list = []
- count = 0
- for i in code_list:
- if len(i.split('\n')) < 2:
- continue
- count += 1
- i = html.unescape(i)
- soup = BeautifulSoup(i)
- res_str = soup.get_text()
-
- if idx == 0:
- code_save_dir = os.path.join(
- chapter_dir, 'code_0.java')
- else:
- code_save_dir = os.path.join(
- section_dir_list[idx - 1],
- 'code_{}.java'.format(count))
-
- print(code_save_dir)
- print(res_str)
- with open(code_save_dir, 'w', encoding='utf-8') as f:
- f.write(res_str)
-
- # clean_text_list = []
- # for line in res_str.split('\n'):
- # if line == '':
- # continue
- # if line[0].isdigit():
- # line = re.findall(r'^[0-9]+ {0,2}(.*)',
- # line)[0]
- # # print(line)
- # else:
- # if line.startswith('>>'):
- # break
- # clean_text_list.append(line)
- # clean_code = '\n'.join(clean_text_list)
- # print(clean_code)
\ No newline at end of file
diff --git a/src/ebook/get_book_chapter_id_list.py b/src/ebook/get_book_chapter_id_list.py
deleted file mode 100644
index 6fd1ad5..0000000
--- a/src/ebook/get_book_chapter_id_list.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import json
-import re
-import html
-import nltk
-import html2text
-import os
-import pandas as pd
-from bs4 import BeautifulSoup
-from .ebook_get_request import get_chapter_list
-
-
-def get_chapter_id_list(param):
- chapter_list = []
- ret = get_chapter_list(param)
- for item in ret:
- chapterid = item['chapterid']
- chapter_list.append(chapterid)
- return chapter_list
--
GitLab