You need to sign in or sign up before continuing.

Auto commit

上级 df352508
import csv
import re

import requests
# Seconds to wait on a single HTTP request before giving up, so a stalled
# connection cannot hang the script forever.
_REQUEST_TIMEOUT = 10

# Matches one movie entry of the listing page and captures its fields.
# re.S lets "." match newlines because each entry spans several lines.
# Compiled once at import time instead of once per page.
_MOVIE_PATTERN = re.compile(
    r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>'
    r'.*?<p class="">.*?导演: (?P<director>.*?)&nbsp'
    r'.*?<br>(?P<year>.*?)&nbsp'
    r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
    r'.*?<span>(?P<num>.*?)人评价</span>',
    re.S,
)

# Named groups of _MOVIE_PATTERN, in the order they are written to the CSV.
_FIELD_NAMES = ("name", "director", "year", "score", "num")

# Header row: movie title, director, release year, rating, number of ratings.
_CSV_HEADER = ("电影名称", "导演", "上映年份", "电影评分", "评分人数")


def _extract_movies(page_source):
    """Yield one tuple of stripped field strings per movie found in *page_source*."""
    for match in _MOVIE_PATTERN.finditer(page_source):
        # Because "." also matches newlines, a capture can carry the line
        # break and indentation that surround it in the HTML; strip every
        # field so a record always stays on a single CSV line.
        yield tuple(match.group(field).strip() for field in _FIELD_NAMES)


def main():
    """Scrape the Douban Top 250 movie list into ``douban_TOP.csv``.

    Requests the 10 listing pages (25 entries each, selected through the
    ``start`` query parameter), extracts title, director, release year,
    rating and number of ratings with a regular expression, and writes one
    CSV row per movie below a header row.

    Raises:
        requests.RequestException: if a request times out, fails, or returns
            an HTTP error status.
        OSError: if the output file cannot be opened or written.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }
    # newline="" is what the csv module asks for; the writer then controls
    # line endings itself ("\n", matching the original header line).
    with open("douban_TOP.csv", encoding="utf-8", mode="w", newline="") as f:
        # csv.writer quotes fields containing commas or quotes, which plain
        # string joining would silently turn into extra columns.
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(_CSV_HEADER)
        for page in range(1, 11):
            url = f"https://movie.douban.com/top250?start={25 * (page - 1)}"
            # The with-block closes every response, not just the last one.
            with requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT) as resp:
                # Fail loudly on an error status instead of writing an
                # incomplete CSV without any indication.
                resp.raise_for_status()
                resp.encoding = "utf-8"  # decode the body as UTF-8
                page_source = resp.text
            writer.writerows(_extract_movies(page_source))
            print(f"已经爬起了第{page}页")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册