"""Scrape the Douban Movie Top 250 list into a CSV file.

For every movie the title, director, release year, rating and number of
ratings are extracted from the list pages and written to ``douban_TOP.csv``.
"""

import csv
import re

import requests

# Names of the regex groups, in the order they are written to each CSV row.
_FIELDS = ("name", "director", "year", "score", "num")

# One match per movie entry. re.S lets "." match newlines, because a single
# entry spans many lines of HTML.
# NOTE(review): the five group names are taken from the group() lookups this
# script performs; the HTML tag anchors between them were reconstructed and
# must be verified against the live page markup before relying on the output.
_MOVIE_PATTERN = re.compile(
    r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<p class="">.*?导演: (?P<director>.*?)&nbsp;.*?'
    r'<br>(?P<year>.*?)&nbsp;.*?'
    r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
    r'<span>(?P<num>.*?)人评价</span>',
    re.S,
)


def _parse_movies(page_source):
    """Yield a (name, director, year, score, num) tuple per movie in the page."""
    for match in _MOVIE_PATTERN.finditer(page_source):
        # Captures may carry surrounding whitespace/newlines (re.S), so every
        # field is stripped before it is stored.
        yield tuple(match.group(field).strip() for field in _FIELDS)


def main():
    """Download the 10 list pages (25 movies each) and save them as CSV.

    Writes ``douban_TOP.csv`` (UTF-8) in the current working directory and
    prints a progress line after each page.

    Raises:
        requests.RequestException: if a page cannot be fetched, times out, or
            returns an HTTP error status. Rows written before the failure
            stay in the file, which is still closed properly.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }

    # newline="" is what the csv module expects from the file object; the
    # writer then emits "\n" row endings and quotes any field containing a
    # comma, quote or newline.
    with open("douban_TOP.csv", mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["电影名称", "导演", "上映年份", "电影评分", "评分人数"])  # header row

        for page in range(1, 11):
            url = f"https://movie.douban.com/top250?start={25 * (page - 1)}"
            # Each response is closed as soon as its text has been read, and
            # the timeout keeps a stalled connection from hanging the script.
            with requests.get(url, headers=headers, timeout=10) as resp:
                resp.raise_for_status()
                resp.encoding = "utf-8"  # force the decoding used for resp.text
                page_source = resp.text

            writer.writerows(_parse_movies(page_source))
            print(f"已经爬起了第{page}页")


if __name__ == '__main__':
    main()