From 452b6f14410d992af1e894153d0ae28a86ac0677 Mon Sep 17 00:00:00 2001
From: jackfrued
Date: Mon, 28 May 2018 17:31:32 +0800
Subject: [PATCH] Updated the day-1 crawler code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Day66-75/code/example01.py | 60 ++++++++++++++++++++++++++++++++++++++
 Day66-75/code/example02.py | 50 +++++++++++++++++++++++++++++++
 Day66-75/code/example03.py | 27 +++++++++++++++++
 3 files changed, 137 insertions(+)
 create mode 100644 Day66-75/code/example01.py
 create mode 100644 Day66-75/code/example02.py
 create mode 100644 Day66-75/code/example03.py

diff --git a/Day66-75/code/example01.py b/Day66-75/code/example01.py
new file mode 100644
index 0000000..86db236
--- /dev/null
+++ b/Day66-75/code/example01.py
@@ -0,0 +1,60 @@
+from urllib.error import URLError
+from urllib.request import urlopen
+
+import re
+import pymysql
+
+
+def get_page_code(start_url, *, retry_times=3, charsets=('utf-8', )):
+    try:
+        for charset in charsets:
+            try:
+                html = urlopen(start_url).read().decode(charset)
+                break
+            except UnicodeDecodeError:
+                html = None
+    except URLError as ex:
+        print('Error:', ex)
+        return get_page_code(start_url, retry_times=retry_times - 1, charsets=charsets) if \
+            retry_times > 0 else None
+    return html
+
+
+def main():
+    url_list = ['http://sports.sohu.com/nba_a.shtml']
+    visited_list = set({})
+    while len(url_list) > 0:
+        current_url = url_list.pop(0)
+        visited_list.add(current_url)
+        print(current_url)
+        html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
+        if html:
+            link_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
+            link_list = re.findall(link_regex, html)
+            url_list += link_list
+            conn = pymysql.connect(host='localhost', port=3306,
+                                   db='crawler', user='root',
+                                   passwd='123456', charset='utf8')
+            try:
+                for link in link_list:
+                    if link not in visited_list:
+                        visited_list.add(link)
+                        print(link)
+                        html = get_page_code(link, charsets=('utf-8', 'gbk', 'gb2312'))
+                        if html:
+                            title_regex = re.compile(r'<h1>(.*)<span', re.IGNORECASE)
+                            match_list = title_regex.findall(html)
+                            if len(match_list) > 0:
+                                title = match_list[0]
+                                with conn.cursor() as cursor:
+                                    cursor.execute('insert into tb_result (rtitle, rurl) values (%s, %s)',
+                                                   (title, link))
+                                    conn.commit()
+            finally:
+                conn.close()
+    print('Done!')
+
+
+if __name__ == '__main__':
+    main()
+
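A note on get_page_code in example01.py above: it tries each charset in order and falls through to the next one when decoding raises UnicodeDecodeError, which is why the crawler passes charsets=('utf-8', 'gbk', 'gb2312') for Sohu's mixed-encoding pages. The same fallback pattern in isolation, with the network left out (a minimal sketch, not part of the patch):

def decode_with_fallback(data, charsets=('utf-8', 'gbk', 'gb2312')):
    # Try each candidate encoding in order; return the first successful
    # decode, or None when every candidate fails (mirrors get_page_code).
    for charset in charsets:
        try:
            return data.decode(charset)
        except UnicodeDecodeError:
            continue
    return None


print(decode_with_fallback('爬虫'.encode('gbk')))  # prints: 爬虫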
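example01.py also assumes a MySQL database named crawler with a tb_result table, which the patch itself never creates. Below is a minimal sketch of a compatible schema, inferred from the insert statement above; only the rtitle and rurl columns appear in the code, so the key column, the types, and the sizes are assumptions:

import pymysql

conn = pymysql.connect(host='localhost', port=3306,
                       db='crawler', user='root',
                       passwd='123456', charset='utf8')
try:
    with conn.cursor() as cursor:
        # Assumed schema: only rtitle and rurl are referenced by example01.py.
        cursor.execute('''
            create table if not exists tb_result (
                rid int unsigned primary key auto_increment,
                rtitle varchar(1024) not null,
                rurl varchar(1024) not null
            )
        ''')
    conn.commit()
finally:
    conn.close()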

diff --git a/Day66-75/code/example02.py b/Day66-75/code/example02.py
new file mode 100644
index 0000000..e248f12
--- /dev/null
+++ b/Day66-75/code/example02.py
@@ -0,0 +1,50 @@
+from bs4 import BeautifulSoup
+
+import re
+
+
+def main():
+    html = """
+        <!DOCTYPE html>
+        <html>
+        <head>
+            <title>首页</title>
+        </head>
+        <body>
+            <h1>Hello, world!</h1>
+            <p>Good!!!</p>
+            <hr>
+            <div>
+                <h2>这是一个例子程序</h2>
+                <p>静夜思</p>
+                <p class="foo">床前明月光</p>
+                <p id="bar">疑似地上霜</p>
+                <p class="foo">举头望明月</p>
+                <div><a href="http://www.baidu.com"><p>低头思故乡</p></a></div>
+            </div>
+            <a class="foo" href="http://www.qq.com">腾讯网</a>
+            <img src="./img/pretty-girl.png" alt="美女">
+            <img src="./img/hellokitty.png" alt="凯蒂猫">
+            <img src="/static/img/pretty-girl.png" alt="美女">
+            <goup>Hello, Goup!</goup>
+        </body>
+        </html>
+        """
+    # resp = requests.get('http://sports.sohu.com/nba_a.shtml')
+    # html = resp.content.decode('gbk')
+    soup = BeautifulSoup(html, 'lxml')
+    print(soup.title)
+    # JavaScript: document.body.h1
+    # JavaScript: document.forms[0]
+    print(soup.body.h1)
+    print(soup.find_all(re.compile(r'p$')))
+    print(soup.find_all('img', {'src': re.compile(r'\./img/\w+.png')}))
+    print(soup.find_all(lambda x: len(x.attrs) == 2))
+    print(soup.find_all('p', {'class': 'foo'}))
+    for elem in soup.select('a[href]'):
+        print(elem.attrs['href'])
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/Day66-75/code/example03.py b/Day66-75/code/example03.py
new file mode 100644
index 0000000..42fc04a
--- /dev/null
+++ b/Day66-75/code/example03.py
@@ -0,0 +1,27 @@
+from bs4 import BeautifulSoup
+
+import requests
+
+import re
+
+
+def main():
+    # Fetch the page with the get function of the third-party requests library
+    resp = requests.get('http://sports.sohu.com/nba_a.shtml')
+    # Decode the response bytes (parts of the Sohu site use GBK encoding)
+    html = resp.content.decode('gbk')
+    # Create a BeautifulSoup object to parse the page (comparable to the DOM in JavaScript)
+    bs = BeautifulSoup(html, 'lxml')
+    # Find elements with CSS selector syntax and handle them in a loop
+    # for elem in bs.find_all(lambda x: 'test' in x.attrs):
+    for elem in bs.select('a[test]'):
+        # Read the attribute values through the attrs property (a dict)
+        link_url = elem.attrs['href']
+        resp = requests.get(link_url)
+        bs_sub = BeautifulSoup(resp.text, 'lxml')
+        # Clean up the extracted text with a regular expression
+        print(re.sub(r'[\r\n]', '', bs_sub.find('h1').text))
+
+
+if __name__ == '__main__':
+    main()
--
GitLab
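A note on example02.py above: when find_all() receives a compiled regex, BeautifulSoup matches it against tag names, so re.compile(r'p$') returns every tag whose name ends in p, the <p> elements and the custom <goup> element alike, which appears to be why the sample markup defines one. A self-contained check (the markup here is illustrative, not taken from the patch):

import re

from bs4 import BeautifulSoup

# Tag-name matching with a regex: any tag whose name ends in "p".
soup = BeautifulSoup('<p>one</p><goup>two</goup><div>three</div>', 'lxml')
print([tag.name for tag in soup.find_all(re.compile(r'p$'))])
# prints: ['p', 'goup']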
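A note on example03.py above: the loop assumes that every a[test] anchor carries an absolute href and that every linked page contains an <h1>; both held for this Sohu page, but neither is guaranteed in general. A slightly more defensive sketch of the same loop, where the urljoin call and the None check are additions rather than part of the patch:

import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

base_url = 'http://sports.sohu.com/nba_a.shtml'
resp = requests.get(base_url)
bs = BeautifulSoup(resp.content.decode('gbk'), 'lxml')
for elem in bs.select('a[test]'):
    # Resolve possibly-relative links against the page URL.
    link_url = urljoin(base_url, elem.attrs.get('href', ''))
    sub_resp = requests.get(link_url)
    h1 = BeautifulSoup(sub_resp.text, 'lxml').find('h1')
    # Skip pages without an <h1> instead of raising AttributeError.
    if h1:
        print(re.sub(r'[\r\n]', '', h1.text))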