From 8134a3de54fb3e7256b7330bd25b9b5848363a3a Mon Sep 17 00:00:00 2001 From: 6448aef831ba9538b4877d2b <6448aef831ba9538b4877d2b@devide> Date: Wed, 26 Apr 2023 04:58:25 +0000 Subject: [PATCH] Auto commit --- douban_top.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 douban_top.py diff --git a/douban_top.py b/douban_top.py new file mode 100644 index 0000000..d44eaf5 --- /dev/null +++ b/douban_top.py @@ -0,0 +1,44 @@ +import requests +import re + +# 抓取豆瓣电影TOP250的电影名称、导演、上映年份、电影评分以及评分人数等信息并保存到csv文件中 + +def main(): + headers = { + 'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36" + } + + # step1 提取PageSource + f = open("douban_TOP.csv", encoding="utf-8", mode="w") + f.write("电影名称,导演,上映年份,电影评分,评分人数\n") # 写入表头 + for page in range(1, 11): + url = f"https://movie.douban.com/top250?start={25 * (page - 1)}" + resp = requests.get(url, headers=headers) + resp.encoding = "utf-8" # 确定编码格式 + PageSource = resp.text + + # step2 编写正则表达式 + # re.S可以让正则中的.匹配换行符 + obj = re.compile( + r'
.*?导演: (?P
(?P