"""Scrape the Douban Movie Top 250 chart into a CSV file.

For every movie the title, director, release year, rating and number of
ratings are extracted with a regular expression and written to
``douban_TOP.csv``.
"""

import csv
import re

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/112.0.0.0 Safari/537.36"
    )
}

OUTPUT_PATH = "douban_TOP.csv"
CSV_HEADER = ("电影名称", "导演", "上映年份", "电影评分", "评分人数")

PAGE_COUNT = 10  # pages 1..10 are requested
PAGE_SIZE = 25  # the ``start`` query parameter advances by 25 per page
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled request blocks forever

# Named groups, in the order they are written to the CSV.
FIELDS = ("name", "director", "year", "score", "num")

# Compiled once at import time instead of once per page.
# re.S lets "." match newlines, because one movie entry spans many lines.
# NOTE(review): the group names come from the ``group(...)`` lookups in the
# original code, but the HTML tags anchoring each group were lost in the copy
# under review and are reconstructed here from the usual Douban markup --
# confirm against the live page source before relying on this pattern.
MOVIE_PATTERN = re.compile(
    r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>'
    r'.*?<p class="">.*?导演: (?P<director>.*?)&nbsp;'
    r'.*?<br>(?P<year>.*?)&nbsp;'
    r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
    r'.*?<span>(?P<num>.*?)人评价',
    re.S,
)


def parse_movies(page_source: str) -> list:
    """Extract one record per movie from the HTML of a chart page.

    Args:
        page_source: HTML text of a single chart page.

    Returns:
        A list of ``(name, director, year, score, num)`` string tuples in
        page order; an empty list when nothing matches.
    """
    # Every field is stripped: the year capture starts right after ``<br>``
    # and would otherwise carry the newline and indentation that follow it.
    return [
        tuple(match.group(field).strip() for field in FIELDS)
        for match in MOVIE_PATTERN.finditer(page_source)
    ]


def main() -> None:
    """Download every chart page and store the parsed movies as CSV.

    Writes ``douban_TOP.csv`` (UTF-8) in the current working directory and
    prints a progress line after each page.

    Raises:
        requests.RequestException: if a request fails, times out or returns
            an HTTP error status.
    """
    # Imported here rather than at module level so that ``parse_movies`` can
    # be imported and used without the third-party ``requests`` package.
    import requests

    # Context managers close the file and the HTTP session even when a
    # request or a write raises part-way through.
    with open(OUTPUT_PATH, mode="w", encoding="utf-8") as out_file:
        # csv.writer quotes fields containing commas or quotes, which plain
        # string formatting would silently corrupt. "\n" keeps the line
        # endings the same as the previous hand-written output.
        writer = csv.writer(out_file, lineterminator="\n")
        writer.writerow(CSV_HEADER)

        with requests.Session() as session:
            session.headers.update(HEADERS)
            for page in range(1, PAGE_COUNT + 1):
                url = f"https://movie.douban.com/top250?start={PAGE_SIZE * (page - 1)}"
                # Each response is closed as soon as its text has been read.
                with session.get(url, timeout=REQUEST_TIMEOUT) as resp:
                    # Fail loudly instead of reporting an error page as scraped.
                    resp.raise_for_status()
                    resp.encoding = "utf-8"  # decode the body as UTF-8
                    page_source = resp.text

                writer.writerows(parse_movies(page_source))
                print(f"已经爬起了第{page}页")


if __name__ == '__main__':
    main()