import logging
import re
import ssl
from urllib.error import URLError
from urllib.request import urlopen

import pymysql
from pymysql import Error


# Decode the page bytes with the given charsets (not every site sets its charset to utf-8)
def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError as error:
            logging.error('Decode: %s', error)
    return page_html
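
# Usage sketch (the byte string below is made up for illustration): the gbk-encoded
# bytes fail the utf-8 attempt, so the loop falls through to the gbk attempt.
#   decode_page('标题'.encode('gbk'), charsets=('utf-8', 'gbk'))  # -> '标题'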


# Fetch the HTML of a page (retry a given number of times, implemented recursively)
def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8',)):
    page_html = None
    try:
        page_html = decode_page(urlopen(seed_url).read(), charsets)
    except URLError as error:
        logging.error('URL: %s', error)
        if retry_times > 0:
            return get_page_html(seed_url, retry_times=retry_times - 1,
                                 charsets=charsets)
    return page_html
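
# Usage sketch (uses the seed URL from main() below; network access assumed):
#   html = get_page_html('http://sports.sohu.com/nba_a.shtml',
#                        retry_times=3, charsets=('utf-8', 'gbk', 'gb2312'))
#   # html is None if every retry raised URLError or no charset could decode the page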


# Extract the needed parts from a page (usually links, specified with a regular expression)
def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
    pattern_regex = re.compile(pattern_str, pattern_ignore_case)
    return pattern_regex.findall(page_html) if page_html else []
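
# Usage sketch with the heading pattern used in start_crawl() below (the HTML
# snippet is made up for illustration):
#   get_matched_parts('<h1>Some headline<span>extra</span></h1>', r'<h1>(.*)<span')
#   # -> ['Some headline']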


# Start the crawler and persist the specified data
def start_crawl(seed_url, match_pattern, *, max_depth=-1):
    conn = pymysql.connect(host='localhost', port=3306,
                           database='crawler', user='root',
                           password='123456', charset='utf8')
    try:
        with conn.cursor() as cursor:
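            # url_list is the queue of URLs to crawl; visited_url_list maps each URL
            # to the depth at which it was first seen and doubles as the "seen" set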
            url_list = [seed_url]
            visited_url_list = {seed_url: 0}
            while url_list:
                current_url = url_list.pop(0)
                depth = visited_url_list[current_url]
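                # max_depth == -1 means no depth limit (depth never equals -1)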
                if depth != max_depth:
                    page_html = get_page_html(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
                    links_list = get_matched_parts(page_html, match_pattern)
                    param_list = []
                    for link in links_list:
                        if link not in visited_url_list:
                            visited_url_list[link] = depth + 1
                            page_html = get_page_html(link, charsets=('utf-8', 'gbk', 'gb2312'))
                            headings = get_matched_parts(page_html, r'<h1>(.*)<span')
                            if headings:
                                param_list.append((headings[0], link))
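                    # Batch-insert the (heading, link) pairs harvested from the current page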
                    cursor.executemany('insert into tb_result values (default, %s, %s)',
                                       param_list)
                    conn.commit()
    except Error as error:
        logging.error('SQL: %s', error)
    finally:
        conn.close()
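
# The INSERT in start_crawl() assumes a pre-existing table along these lines
# (hypothetical DDL; the real schema is not part of this file, so column names
# and sizes are assumptions):
#   create table tb_result (
#       id int auto_increment primary key,  -- filled by the `default` placeholder
#       title varchar(255) not null,
#       url varchar(1024) not null
#   );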


def main():
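    # Globally disable HTTPS certificate verification so that urlopen() does not
    # reject sites with invalid certificates (fine for a demo, not for production)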
    ssl._create_default_https_context = ssl._create_unverified_context
    start_crawl('http://sports.sohu.com/nba_a.shtml',
                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']',
                max_depth=2)


if __name__ == '__main__':
    main()