提交 452b6f14 编写于 作者: 骆昊的技术专栏

更新了爬虫第1天代码

上级 402e0564
from urllib.error import URLError
from urllib.request import urlopen
import re
import pymysql
def get_page_code(start_url, *, retry_times=3, charsets=('utf-8', )):
    """Download the page at *start_url* and decode it to text.

    The raw bytes are fetched once, then each encoding in *charsets* is
    tried in order until one decodes cleanly.  On a network failure
    (URLError) the whole call is retried up to *retry_times* more times.

    :param start_url: URL of the page to download.
    :param retry_times: remaining retry attempts after a URLError.
    :param charsets: candidate encodings to try, in order.
    :return: decoded HTML string, or ``None`` when no charset decodes the
             content or the retries are exhausted.
    """
    html = None  # stays None if charsets is empty or none of them work
    try:
        # Bug fix: the original called urlopen() inside the charset loop,
        # re-downloading the page for every decoding attempt and never
        # closing the response.  Fetch once, close via the context manager.
        with urlopen(start_url) as resp:
            content = resp.read()
        for charset in charsets:
            try:
                html = content.decode(charset)
                break
            except UnicodeDecodeError:
                html = None
    except URLError as ex:
        print('Error:', ex)
        return get_page_code(start_url, retry_times=retry_times - 1,
                             charsets=charsets) if retry_times > 0 else None
    return html
def main():
    """Crawl the Sohu NBA index page and persist article titles to MySQL.

    Pops URLs off a work queue, extracts ``<a href>`` targets from each
    page, and for every unvisited link fetches the article and inserts
    its ``<h1>`` title plus URL into the ``tb_result`` table.
    """
    url_list = ['http://sports.sohu.com/nba_a.shtml']
    visited_list = set()
    while len(url_list) > 0:
        current_url = url_list.pop(0)
        # Bug fix: links are appended to url_list without any visited
        # check, so the same URL could be popped and re-crawled many
        # times (and the crawl rarely terminates).  Skip duplicates.
        if current_url in visited_list:
            continue
        visited_list.add(current_url)
        print(current_url)
        html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
        if html:
            link_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            link_list = re.findall(link_regex, html)
            url_list += link_list
            conn = pymysql.connect(host='localhost', port=3306,
                                   db='crawler', user='root',
                                   passwd='123456', charset='utf8')
            try:
                for link in link_list:
                    if link not in visited_list:
                        visited_list.add(link)
                        print(link)
                        html = get_page_code(link, charsets=('utf-8', 'gbk', 'gb2312'))
                        if html:
                            title_regex = re.compile(r'<h1>(.*)<span', re.IGNORECASE)
                            match_list = title_regex.findall(html)
                            if len(match_list) > 0:
                                title = match_list[0]
                                with conn.cursor() as cursor:
                                    # Parameterized query keeps title/link
                                    # safely escaped (no SQL injection).
                                    cursor.execute('insert into tb_result (rtitle, rurl) values (%s, %s)',
                                                   (title, link))
                                conn.commit()
            finally:
                # Always release the connection, even if an insert fails.
                conn.close()
    print('执行完成!')
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
from bs4 import BeautifulSoup
import re
def main():
    """Demonstrate common BeautifulSoup lookups on an inline HTML document."""
    html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>首页</title>
</head>
<body>
<h1>Hello, world!</h1>
<p>Good!!!</p>
<hr>
<div>
<h2>这是一个例子程序</h2>
<p>静夜思</p>
<p class="foo">床前明月光</p>
<p id="bar">疑似地上霜</p>
<p class="foo">举头望明月</p>
<div><a href="http://www.baidu.com"><p>低头思故乡</p></a></div>
</div>
<a class="foo" href="http://www.qq.com">腾讯网</a>
<img src="./img/pretty-girl.png" alt="美女">
<img src="./img/hellokitty.png" alt="凯蒂猫">
<img src="./static/img/pretty-girl.png" alt="美女">
<goup>Hello, Goup!</goup>
</body>
</html>
"""
    # resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # html = resp.content.decode('gbk')
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title)
    # JavaScript: document.body.h1
    # JavaScript: document.forms[0]
    print(soup.body.h1)
    # Tags whose name ends with 'p' (matches <p> and <goup>).
    print(soup.find_all(re.compile(r'p$')))
    # Bug fix: the original pattern r'\./img/\w+.png' could never match
    # 'pretty-girl.png' (\w excludes '-') and left the '.' before 'png'
    # unescaped; this matches both images under ./img/ as intended.
    print(soup.find_all('img', {'src': re.compile(r'\./img/[-\w]+\.png')}))
    # Elements carrying exactly two attributes.
    print(soup.find_all(lambda x: len(x.attrs) == 2))
    print(soup.find_all('p', {'class': 'foo'}))
    # CSS selector: every anchor that has an href attribute.
    for elem in soup.select('a[href]'):
        print(elem.attrs['href'])
# Run the BeautifulSoup demo only when this file is executed as a script.
if __name__ == '__main__':
    main()
from bs4 import BeautifulSoup
import requests
import re
def main():
    """Fetch the Sohu NBA index and print the <h1> title of each linked page.

    Only anchors carrying a ``test`` attribute are followed.
    """
    # 通过requests第三方库的get方法获取页面
    resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # 对响应的字节串(bytes)进行解码操作(搜狐的部分页面使用了GBK编码)
    html = resp.content.decode('gbk')
    # 创建BeautifulSoup对象来解析页面(相当于JavaScript的DOM)
    bs = BeautifulSoup(html, 'lxml')
    # 通过CSS选择器语法查找元素并通过循环进行处理
    # for elem in bs.find_all(lambda x: 'test' in x.attrs):
    for elem in bs.select('a[test]'):
        # 通过attrs属性(字典)获取元素的属性值
        link_url = elem.attrs['href']
        resp = requests.get(link_url)
        bs_sub = BeautifulSoup(resp.text, 'lxml')
        # Bug fix: find('h1') returns None when a linked page has no <h1>,
        # and the original then crashed on .text — guard against that.
        h1 = bs_sub.find('h1')
        if h1:
            # 使用正则表达式对获取的数据做进一步的处理(去掉换行符)
            print(re.sub(r'[\r\n]', '', h1.text))
# Run the requests-based crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册