"""Check the links found in every file under a directory.

Every ``https://`` link found in the UTF-8 text files below the chosen
directory is put into one of three reports, written to the current directory:

* ``404.txt``       -- links whose response status code is not 200.
* ``exception.txt`` -- links whose request raised an exception.
* ``slow.txt``      -- links to installation packages (``.whl`` / ``.gz``);
  these are recorded without being requested.

Usage (from the ``tools/link_detection`` directory)::

    pip install requests
    python link_detection.py

The script then asks for the absolute path of the directory to check. On
Windows the path may be given in Git Bash form, e.g. ``/d/master/docs``.
"""
import os
import re
from concurrent.futures import ThreadPoolExecutor
from contextlib import ExitStack
from threading import Lock

# Matches "https://" followed by an optional run of URL characters. The last
# character class leaves out '.', ',' and ':' so that trailing punctuation of
# the surrounding sentence is not swallowed into the link.
LINK_PATTERN = re.compile(
    r"https://(?:[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?")

# Git Bash spelling of a Windows drive path: "/d/..." stands for "d:/...".
_MSYS_DRIVE_PATH = re.compile(r'^/([A-Za-z])(?:/|$)')

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
REQUEST_TIMEOUT = 5
MAX_WORKERS = 500
# Links with these suffixes are recorded in slow.txt instead of being requested.
SLOW_SUFFIXES = ('.whl', '.gz')
# Report file names, in the order: bad status, request exception, skipped.
REPORT_NAMES = ('404.txt', 'exception.txt', 'slow.txt')

# Links found so far, keyed by file path. get_all_link() adds to it and
# multi_threading() reads it when called without an explicit mapping.
dic = {}


def _to_native_path(path):
    '''
    Turn a Git Bash style drive path ("/d/docs") into "d:/docs" on Windows.

    Any other path, and every path on other systems, is returned unchanged.
    '''
    if os.name == 'nt':
        match = _MSYS_DRIVE_PATH.match(path)
        if match:
            return '%s:/%s' % (match.group(1), path[match.end():])
    return path


def _format_entry(link_path, link_addr, status_code=None):
    '''
    Build the text of one report entry.

    The status line is only included when ``status_code`` is given.
    '''
    lines = ['链接所在路径: %s' % link_path, '链接地址:%s' % link_addr]
    if status_code is not None:
        lines.append('链接的状态码:%s' % status_code)
    return '\n'.join(lines) + '\n\n\n\n\n'


def get_all_file(check_path):
    '''
    get all the files in the directory.

    Args:
        check_path: Directory to scan; a Git Bash style drive path is
            accepted on Windows.

    Returns:
        Sorted list of the paths of all files below ``check_path``, written
        with forward slashes. Empty when the directory does not exist.
    '''
    root = _to_native_path(check_path)
    all_file_list = []
    # os.walk replaces the former shell `find` call: no shell quoting
    # problems with unusual directory names and no external program needed.
    for folder, _, names in os.walk(root):
        for name in names:
            # Forward slashes keep report paths in the "d:/dir/file" form.
            all_file_list.append(
                os.path.join(folder, name).replace(os.sep, '/'))
    all_file_list.sort()
    return all_file_list


def get_all_link(all_file_list):
    '''
    get all the links in all the files.

    Files that cannot be opened or are not valid UTF-8 text are skipped.

    Args:
        all_file_list: Paths of the files to scan.

    Returns:
        Dict mapping each file path that contains links to the list of its
        links in order of appearance (duplicates kept). The same entries are
        also added to the module-level ``dic``.
    '''
    found = {}
    for raw_path in all_file_list:
        path = _to_native_path(raw_path)
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = f.read()
        except (OSError, UnicodeDecodeError):
            # Unreadable or binary file: nothing to check in it.
            continue
        links = LINK_PATTERN.findall(data)
        if links:
            found[path] = links
    dic.update(found)
    return found


def get_status(addr):
    '''
    Request the link and write different results to different files.

    Args:
        addr: Tuple ``(link_path, link_addr, file_404, file_exception,
            mutexA, mutexB)``: the file containing the link, the link
            itself, the open report files for bad status codes and for
            request exceptions, and the locks guarding those two files.
    '''
    # Third-party dependency, imported where it is needed so that the file
    # scanning helpers can be used without it.
    import requests

    link_path, link_addr, file_404, file_exception, mutexA, mutexB = addr
    try:
        # verify=False: the check is about reachability, so certificate
        # problems are not treated as failures.
        # stream=True: only the status line and headers are needed, so the
        # body is not downloaded.
        response = requests.get(link_addr, headers=HEADERS, verify=False,
                                timeout=REQUEST_TIMEOUT, stream=True)
        try:
            status_code = response.status_code
        finally:
            response.close()
    except Exception:
        # Deliberately broad: whatever goes wrong while requesting the link
        # is reported in the exception file.
        print('exception!')
        with mutexB:
            file_exception.write(_format_entry(link_path, link_addr))
        return

    print(link_addr)
    print(status_code)
    if status_code != 200:
        with mutexA:
            file_404.write(_format_entry(link_path, link_addr, status_code))


def multi_threading(link_map=None, output_dir='.', max_workers=MAX_WORKERS):
    '''
    open multithreading to finish tasks concurrently, do not send a request to the download link, write it directly.

    Args:
        link_map: Dict mapping a file path to the links found in it.
            Defaults to the module-level ``dic``.
        output_dir: Directory in which the three report files are created.
        max_workers: Maximum number of request threads.

    Raises:
        Exception: Re-raises an error that escaped a worker, e.g. a failure
            to write to a report file.
    '''
    if link_map is None:
        link_map = dic
    lock_404 = Lock()
    lock_exception = Lock()
    futures = []
    with ExitStack() as stack:
        file_404, file_exception, file_slow = (
            stack.enter_context(
                open(os.path.join(output_dir, name), 'w', encoding='utf-8'))
            for name in REPORT_NAMES)
        # Leaving the executor block waits for every submitted request.
        with ThreadPoolExecutor(max_workers) as pool:
            for path, links in link_map.items():
                # dict.fromkeys drops duplicates but keeps first-seen order.
                for link in dict.fromkeys(links):
                    if link.endswith(SLOW_SUFFIXES):
                        file_slow.write(_format_entry(path, link))
                        continue
                    futures.append(pool.submit(
                        get_status,
                        (path, link, file_404, file_exception,
                         lock_404, lock_exception)))
        # Surface errors raised inside workers instead of dropping them.
        for future in futures:
            future.result()


def main(check_path=None):
    '''
    Scan a directory for links and write the three report files.

    Args:
        check_path: Directory to check. When omitted the user is prompted.
    '''
    if check_path is None:
        check_path = input('请输入您要检测的绝对路径:').strip()
    # Requests are sent with verify=False; silence the warning each one
    # would otherwise print.
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    link_map = get_all_link(get_all_file(check_path))
    multi_threading(link_map)


if __name__ == '__main__':
    main()