diff --git a/douban_top.py b/douban_top.py new file mode 100644 index 0000000000000000000000000000000000000000..d44eaf50235e0a363cd86803d4cef53a2e103709 --- /dev/null +++ b/douban_top.py @@ -0,0 +1,44 @@ +import requests +import re + +# 抓取豆瓣电影TOP250的电影名称、导演、上映年份、电影评分以及评分人数等信息并保存到csv文件中 + +def main(): + headers = { + 'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36" + } + + # step1 提取PageSource + f = open("douban_TOP.csv", encoding="utf-8", mode="w") + f.write("电影名称,导演,上映年份,电影评分,评分人数\n") # 写入表头 + for page in range(1, 11): + url = f"https://movie.douban.com/top250?start={25 * (page - 1)}" + resp = requests.get(url, headers=headers) + resp.encoding = "utf-8" # 确定编码格式 + PageSource = resp.text + + # step2 编写正则表达式 + # re.S可以让正则中的.匹配换行符 + obj = re.compile( + r'
.*?导演: (?P
(?P