htmlParser.py 5.1 KB
Newer Older
H
hjdhnx 已提交
1 2 3 4 5
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File  : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date  : 2022/8/25
H
hjdhnx 已提交
6
import json
H
hjdhnx 已提交
7 8 9

from pyquery import PyQuery as pq
from urllib.parse import urljoin
H
hjdhnx 已提交
10
import re
H
hjdhnx 已提交
11
from jsonpath import jsonpath
H
hjdhnx 已提交
12 13 14 15 16

class jsoup:
    """Rule-based extraction helpers for HTML (PyQuery) and JSON (jsonpath).

    HTML rules are strings whose segments are separated by ``&&``. Each
    segment is a selector; segments that do not already contain one of
    ``:eq``, ``:lt`` or ``:gt`` get ``:eq(0)`` appended where the methods
    below say so. JSON rules are jsonpath expressions; a leading ``$.`` is
    added when missing.
    """

    # Pseudo-selectors that mark a segment as already position-qualified.
    _INDEX_PATTERN = ':eq|:lt|:gt'
    # Attribute options that pdfh resolves against MY_URL when add_url is set.
    _URL_ATTRS = ('url', 'src', 'href', 'data-original', 'data-src')

    def __init__(self, MY_URL=''):
        """Store ``MY_URL``, the base used when joining relative URLs."""
        self.MY_URL = MY_URL

    def test(self, text: str, string: str) -> bool:
        """Return True if regex ``text`` matches anywhere in ``string``.

        The search is case-insensitive and multi-line (``re.I | re.M``).
        """
        return re.search(text, string, re.M | re.I) is not None

    def _with_default_index(self, selector: str) -> str:
        """Append ``:eq(0)`` to ``selector`` unless it is already indexed."""
        if self.test(self._INDEX_PATTERN, selector):
            return selector
        return f'{selector}:eq(0)'

    def pdfh(self, html, parse: str, add_url=False):
        """Extract a single value from ``html`` according to rule ``parse``.

        When ``parse`` contains ``&&``, the last segment is the option and
        the preceding segments form the selector (each one defaulted to
        ``:eq(0)`` when not already indexed). The option selects what is
        returned: ``'Text'`` calls ``.text()``, ``'Html'`` calls ``.html()``,
        anything else is passed to ``.attr()``.

        Without an option (no ``&&``, or an empty last segment) the
        selection is converted with ``str()``.

        :param html: markup handed to PyQuery.
        :param parse: extraction rule; an empty rule returns ``''``.
        :param add_url: when true and the option is one of ``_URL_ATTRS``,
            the attribute value is joined onto ``self.MY_URL``.
        :return: the extracted value.
        """
        if not parse:
            return ''
        doc = pq(html)
        option = None
        if '&&' in parse:
            # Split once: everything before the last '&&' is selector
            # segments, the remainder is the option.
            *selectors, option = parse.split('&&')
            parse = ' '.join(self._with_default_index(s) for s in selectors)

        if not option:
            # No option given: return the selection in its str() form.
            return str(doc(parse))

        ret = doc(parse)
        # FIXME: when several elements match, the first one should be
        # picked automatically.
        if option == 'Text':
            return ret.text()
        if option == 'Html':
            return ret.html()
        ret = ret.attr(option)
        if add_url and option in self._URL_ATTRS:
            ret = urljoin(self.MY_URL, ret)
        return ret

    def pdfa(self, html, parse: str) -> list:
        """Return every element of ``html`` matching rule ``parse`` as a string.

        When ``parse`` contains ``&&``, all segments except the last are
        defaulted to ``:eq(0)`` (unless already indexed) and the segments
        are joined with spaces; the last segment is left untouched so that
        it can match multiple elements.

        :param html: markup handed to PyQuery.
        :param parse: extraction rule; an empty rule returns ``[]``.
        :return: list with ``str()`` of each matched item.
        """
        if not parse:
            return []
        if '&&' in parse:
            parts = parse.split('&&')
            last = len(parts) - 1
            parse = ' '.join(
                part if idx >= last else self._with_default_index(part)
                for idx, part in enumerate(parts)
            )
        doc = pq(html)
        return [str(item) for item in doc(parse).items()]

    def pd(self, html, parse: str):
        """Shortcut for ``pdfh(html, parse, add_url=True)``."""
        return self.pdfh(html, parse, True)

    def pq(self, html: str):
        """Return ``html`` wrapped in a PyQuery object."""
        return pq(html)

    def pjfh(self, html, parse: str, add_url=False):
        """Extract a single value from JSON ``html`` with jsonpath rule ``parse``.

        :param html: JSON text (decoded here) or an already-decoded object.
        :param parse: jsonpath expression; ``$.`` is prepended if missing.
            An empty rule returns ``''``.
        :param add_url: when true, the result is joined onto ``self.MY_URL``.
        :return: ``str()`` of the first result (or of the result itself when
            it is not a list); ``''`` when that value is falsy or when the
            JSON text cannot be decoded.
        """
        if not parse:
            return ''
        if isinstance(html, str):
            try:
                html = json.loads(html)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError subclass.
                print('字符串转json失败')
                return ''
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        ret = jsonpath(html, parse)
        if isinstance(ret, list):
            # Guard against an empty list before looking at the first item.
            ret = str(ret[0]) if ret and ret[0] else ''
        else:
            ret = str(ret) if ret else ''
        if add_url:
            ret = urljoin(self.MY_URL, ret)
        return ret

    def pj(self, html, parse: str):
        """Shortcut for ``pjfh(html, parse, add_url=True)``."""
        return self.pjfh(html, parse, True)

    def pjfa(self, html, parse: str):
        """Extract a list from JSON ``html`` with jsonpath rule ``parse``.

        :param html: JSON text (decoded here) or an already-decoded object.
        :param parse: jsonpath expression; ``$.`` is prepended if missing.
            An empty rule returns ``[]``.
        :return: the jsonpath result; a result holding exactly one list is
            unwrapped to that inner list. ``[]`` when the result is falsy or
            when the JSON text cannot be decoded.
        """
        if not parse:
            return []
        if isinstance(html, str):
            try:
                html = json.loads(html)
            except (ValueError, RecursionError):
                # Keep the return type consistent with every other path.
                return []
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        ret = jsonpath(html, parse)
        # Check the length first so ret[0] is never evaluated on an empty list.
        if isinstance(ret, list) and len(ret) == 1 and isinstance(ret[0], list):
            ret = ret[0]  # automatic unwrapping
        return ret or []

H
hjdhnx 已提交
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
if __name__ == '__main__':
    # Manual experiment: re-implement pdfa on top of parsel instead of
    # PyQuery and print what it selects from a live page.
    import requests
    from parsel import Selector

    url = 'http://360yy.cn'
    jsp = jsoup(url)

    def pdfa2(html, parse):
        """parsel-based variant of ``jsoup.pdfa``.

        Segments of ``parse`` are separated by ``&&``. Every segment except
        the last gets ``:nth-child(1)`` appended unless it already contains
        ``:eq``, ``:lt`` or ``:gt``. Returns ``str()`` of each selected item.
        """
        if not parse:
            return []
        if '&&' in parse:
            parts = parse.split('&&')  # rebuild the selector from its segments
            last = len(parts) - 1
            parse = ' '.join(
                part if idx >= last or jsp.test(':eq|:lt|:gt', part)
                else f'{part}:nth-child(1)'
                for idx, part in enumerate(parts)
            )
        selector = Selector(text=html)
        print(parse)
        items = selector.css(parse)
        return [str(item) for item in items]

    # A timeout keeps the demo from hanging on an unresponsive host.
    r = requests.get(url, timeout=10)
    html = r.text
    # parsel is awkward to use here; it is hard to wrap it into pdfa-style
    # helper functions.
    items = pdfa2(html, '.fed-pops-navbar&&ul.fed-part-rows&&a')
    print(items)