"""Download Genshin Impact character artwork and voice samples.

Usage: ``python <this script> <category>`` (for example ``liyue``).

The script requests ``https://ys.mihoyo.com/main/character/<category>?char=0``,
extracts the ``charList`` data embedded in the page with regular expressions
and, for every character found, stores three images (``cover1``, ``cover2``,
``icon``) and the voice clips of two voice actors under
``./download/<title>/``.
"""
# NOTE: the commit that introduced this script also added sample output for
# one character (cover1.png, cover2.png, icon.png and six .mp3 clips) as
# binary files next to it; those payloads are not part of this module.

import os
import random
import re
import sys

DOWNLOAD_DIR = "./download"
REQUEST_TIMEOUT = 5  # seconds, applied to every HTTP request

# Crawler-style user agents; one is picked at random for each request.
USER_AGENTS = (
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
    "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
    "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
    "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
    "Sosospider+(+http://help.soso.com/webspider.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
)

# Patterns are compiled once at import time and written as raw strings so
# that "\[" and friends are regex escapes, not (invalid) string escapes.
CHAR_LIST_PATTERN = re.compile(r'charList:(\[.*?),mod3Index')
TITLE_PATTERN = re.compile(r'title:"(.*?)"')
COVER1_PATTERN = re.compile(r'cover1:"(.*?)"')
COVER2_PATTERN = re.compile(r'cover2:"(.*?)"')
ICON_PATTERN = re.compile(r'icon:"(.*?)"')
# Matches the two-entry ``cv`` array of one character. The quotes around a
# name are optional. ``findall`` returns the four groups positionally as
# (first name, first audio list, second name, second audio list); the group
# names only document which is which.
CV_PATTERN = re.compile(
    r'cv:\[\{name:"?(?P<cn_name>.*?)"?,audio:\[(?P<cn_audios>.*?)\]\}'
    r',\{name:"?(?P<jp_name>.*?)"?,audio:\[(?P<jp_audios>.*?)\]\}\]\}'
)

# Characters that must not appear in a single path component.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def get_headers():
    """Return request headers with a randomly chosen crawler user agent."""
    return {
        "user-agent": random.choice(USER_AGENTS),
        "referer": "https://www.baidu.com",
    }


def unescape_url(raw):
    r"""Decode backslash escapes such as ``\u002F`` contained in *raw*.

    Encoding with ``latin-1`` + ``backslashreplace`` first keeps characters
    outside ASCII intact; a plain UTF-8 encode followed by ``unicode-escape``
    would turn them into mojibake.

    Raises:
        UnicodeDecodeError: if *raw* contains a malformed escape sequence.
    """
    return raw.encode("latin-1", "backslashreplace").decode("unicode-escape")


def safe_name(name):
    """Return *name* reduced to a string usable as one path component.

    Path separators and other reserved characters become ``_`` and leading
    or trailing dots/spaces are removed, so scraped text cannot point
    outside the download directory.
    """
    cleaned = _UNSAFE_CHARS.sub("_", name).strip(". ")
    return cleaned or "_"


def character_dir(title):
    """Return the directory in which files for character *title* are stored."""
    return os.path.join(DOWNLOAD_DIR, safe_name(title))


def fetch(url):
    """GET *url* and return the response, raising on HTTP error status.

    ``requests`` is imported here rather than at module level so that the
    parsing helpers stay importable when the HTTP dependency is missing.
    """
    import requests

    response = requests.get(url, headers=get_headers(), timeout=REQUEST_TIMEOUT)
    # Without this check an error page would be written to disk as media.
    response.raise_for_status()
    return response


def download(raw_url, path):
    """Fetch *raw_url* (escapes and surrounding quotes allowed) into *path*.

    Failures are printed and reported through the return value so that one
    broken file does not stop the remaining downloads.

    Returns:
        True if the file was written, False otherwise.
    """
    try:
        url = unescape_url(raw_url).strip('"')
        response = fetch(url)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as fh:
            fh.write(response.content)
    # requests' exceptions derive from OSError; malformed escapes and
    # malformed URLs surface as ValueError subclasses.
    except (OSError, ValueError) as e:
        print(e)
        return False
    return True


def parse_characters(text):
    """Extract the character records embedded in the page source *text*.

    Returns:
        A list of dicts with the keys ``title``, ``cover1``, ``cover2``,
        ``icon``, ``cn_name``, ``jp_name``, ``cn_audios`` and ``jp_audios``.
        The audio values are lists of the raw, comma-separated entries.

    Raises:
        ValueError: if *text* contains no ``charList`` block.
    """
    match = CHAR_LIST_PATTERN.search(text)
    if match is None:
        raise ValueError("charList block not found")
    block = match.group(1)

    # The fields are collected as parallel lists; zip stops at the shortest
    # one instead of failing with an IndexError half-way through.
    rows = zip(
        TITLE_PATTERN.findall(block),
        COVER1_PATTERN.findall(block),
        COVER2_PATTERN.findall(block),
        ICON_PATTERN.findall(block),
        CV_PATTERN.findall(block),
    )
    characters = []
    for title, cover1, cover2, icon, cv in rows:
        cn_name, cn_audios, jp_name, jp_audios = cv
        characters.append({
            "title": title,
            "cover1": cover1,
            "cover2": cover2,
            "icon": icon,
            "cn_name": cn_name,  # Chinese voice actor name
            "jp_name": jp_name,  # Japanese voice actor name
            "cn_audios": cn_audios.split(","),
            "jp_audios": jp_audios.split(","),
        })
    return characters


# Data extraction
def format_text(text):
    """Parse the page source *text* and save every character found in it.

    Problems are printed rather than raised; a failure while saving one
    character does not prevent the following ones from being saved.
    """
    try:
        characters = parse_characters(text)
    except ValueError as e:
        print("格式化数据异常", e)
        return
    for character in characters:
        try:
            save(character)
        except OSError as e:
            print("格式化数据异常", e)


def save(my_dict):
    """Create the character's folder and download its images and audio.

    Args:
        my_dict: one record as produced by ``parse_characters``.
    """
    title = my_dict["title"]
    os.makedirs(character_dir(title), exist_ok=True)

    for img_name in ("cover1", "cover2", "icon"):
        save_img(my_dict[img_name], title, img_name)
    save_audio(title, my_dict["cn_name"], my_dict["cn_audios"])
    save_audio(title, my_dict["jp_name"], my_dict["jp_audios"])


def save_img(url, title, img_name):
    """Download *url* to ``<download dir>/<title>/<img_name>.png``."""
    download(url, os.path.join(character_dir(title), f"{img_name}.png"))


def save_audio(title, cn_name, cn_audios):
    """Download each clip to ``<download dir>/<title>/<cn_name>_<index>.mp3``.

    Args:
        title: character title, used as the folder name.
        cn_name: voice actor name, used as the file name prefix.
        cn_audios: raw audio URL entries; empty entries are skipped.
    """
    folder = character_dir(title)
    speaker = safe_name(cn_name)
    for index, raw_url in enumerate(cn_audios):
        # An empty audio list splits into [""]; there is nothing to fetch.
        if not raw_url.strip().strip('"'):
            continue
        download(raw_url, os.path.join(folder, f"{speaker}_{index}.mp3"))


def run(url):
    """Fetch the page at *url* and process the characters it contains."""
    try:
        res = fetch(url)
    except (OSError, ValueError) as e:
        print(url)
        print("请求数据发生异常", e)
        return
    format_text(res.text)


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 after a run, 2 if the category is missing.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        print("usage: python <script> <category>", file=sys.stderr)
        return 2
    # The first argument is the category segment of the page URL.
    category = args[0]
    url = "https://ys.mihoyo.com/main/character/{}?char=0".format(category)
    print(url)
    run(url)
    return 0


if __name__ == "__main__":
    sys.exit(main())