htmlParser.py 7.3 KB
Newer Older
H
hjdhnx 已提交
1 2 3 4 5 6
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File  : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date  : 2022/8/25

7
import ujson
H
hjdhnx 已提交
8 9
from pyquery import PyQuery as pq
from urllib.parse import urljoin
H
hjdhnx 已提交
10
import re
H
hjdhnx 已提交
11
from jsonpath import jsonpath
H
hjdhnx 已提交
12

13
PARSE_CACHE = True  # parse cache: reuse the parsed document while the HTML text passed in is unchanged
H
hjdhnx 已提交
14
NOADD_INDEX = ':eq|:lt|:gt|:first|:last|^body$|^#'  # regex: selectors matching it do not get an automatic :eq(0) index
15 16
URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url)$'  # regex: attribute names whose values are urljoin-ed automatically

17

H
hjdhnx 已提交
18
class jsoup:
    """Rule-based extractor for HTML (via pyquery) and JSON (via jsonpath).

    HTML rules use the Hiker-style syntax handled by :meth:`parseHikerToJq`:
    selector steps are joined with ``&&`` and, for :meth:`pdfh`, the last
    ``&&`` segment names what to read (``Text``, ``Html`` or an attribute).
    """

    def __init__(self, MY_URL=''):
        """
        :param MY_URL: base URL used by ``urljoin`` to resolve relative links
        """
        self.MY_URL = MY_URL
        # Last HTML text handed to pdfh() / pdfa(); acts as the cache key.
        self.pdfh_html = ''
        self.pdfa_html = ''

        # Parsed documents that belong to the cached HTML texts above.
        self.pdfh_doc = None
        self.pdfa_doc = None

    def test(self, text: str, string: str):
        """
        Regex containment check, modelled on JavaScript's ``/re/.test()``.
        The search is multi-line and case-insensitive.
        :param text: regular expression pattern
        :param string: text to search in
        :return: True when the pattern matches anywhere in ``string``
        """
        return re.search(text, string, re.M | re.I) is not None

    def _cached_doc(self, html, kind):
        """
        Return the parsed document for ``html``, honouring ``PARSE_CACHE``.
        :param html: HTML source to parse
        :param kind: cache slot prefix, ``'pdfh'`` or ``'pdfa'``
        :return: pq(html) result (possibly reused from the previous call)
        """
        if not PARSE_CACHE:
            return pq(html)
        html_key, doc_key = f'{kind}_html', f'{kind}_doc'
        if getattr(self, html_key) != html or getattr(self, doc_key) is None:
            # Parse before touching the cache so that a parse failure cannot
            # leave the previous document stored under the new HTML key.
            doc = pq(html)
            setattr(self, doc_key, doc)
            setattr(self, html_key, html)
        return getattr(self, doc_key)

    @staticmethod
    def _style_url(style):
        """Return the content of the first ``url(...)`` in ``style``, or ``style`` unchanged if there is none."""
        match = re.search(r'url\((.*?)\)', style, re.M | re.S)
        return match.group(1) if match else style

    @staticmethod
    def _split_eq(nparse):
        """
        Split ``'rule:eq(n)'`` into ``('rule', n)``.
        ``:eq`` is located case-insensitively (like :meth:`test`); a missing or
        non-integer index falls back to 0.
        """
        marker = re.search(':eq', nparse, re.I)
        if marker is None:
            return nparse, 0
        rule, rest = nparse[:marker.start()], nparse[marker.end():]
        inner = re.search(r'\(([^)]*)', rest)
        try:
            index = int(inner.group(1)) if inner else 0
        except ValueError:
            index = 0
        return rule, index

    def pdfh(self, html, parse: str, add_url=False):
        """
        Extract a single value from ``html`` using a Hiker-style rule.
        :param html: HTML source
        :param parse: rule such as ``'selector&&Text'``, ``'selector&&Html'`` or
                      ``'selector&&attr'``; without an option the outer HTML of
                      the selection is returned
        :param add_url: when true, attribute values whose name matches
                        ``URLJOIN_ATTR`` are turned into absolute URLs
        :return: the extracted value; ``''`` when ``html`` or ``parse`` is empty
        """
        if not all([html, parse]):
            return ''
        doc = self._cached_doc(html, 'pdfh')
        if parse in ('body&&Text', 'Text'):
            return doc.text()
        if parse in ('body&&Html', 'Html'):
            return doc.html()

        option = None
        if '&&' in parse:
            pieces = parse.split('&&')
            option = pieces[-1]  # trailing segment says what to read
            parse = '&&'.join(pieces[:-1])
        parse = self.parseHikerToJq(parse, True)
        ret = None
        for nparse in parse.split(' '):
            ret = self.parseOneRule(doc, nparse, ret)

        if not option:
            return ret.outerHtml()
        if option == 'Text':
            return ret.text()
        if option == 'Html':
            return ret.html()

        ret = ret.attr(option) or ''
        if 'style' in option.lower() and 'url(' in ret:
            # e.g. style="background:url(...)" -> keep only the URL part
            ret = self._style_url(ret)
        if ret and add_url and re.search(URLJOIN_ATTR, option, re.M | re.I):
            if 'http' in ret:
                # Drop whatever precedes the absolute URL.
                ret = ret[ret.find('http'):]
            else:
                ret = urljoin(self.MY_URL, ret)
        return ret

    def parseOneRule(self, doc, nparse, ret=None):
        """
        Apply one space-separated step of a converted rule, handling ``:eq``
        indexes explicitly, and return the new selection.
        :param doc: document object produced by ``pq(html)``
        :param nparse: the single selector step to apply
        :param ret: selection produced by the previous step, if any
        :return: the selection after applying ``nparse``
        """
        # NOTE(review): ``not ret`` is also true for an empty selection (not
        # only for None), in which case the step is applied to the whole
        # document again - kept as in the original; confirm this is intended.
        scope = doc if not ret else ret
        if self.test(':eq', nparse):
            nparse_rule, nparse_index = self._split_eq(nparse)
            return scope(nparse_rule).eq(nparse_index)
        return scope(nparse)

    def parseHikerToJq(self, parse, first=False):
        """
        Convert a Hiker-style rule into a space-separated selector chain,
        appending ``:eq(0)`` to steps that do not match ``NOADD_INDEX``.
        :param parse: rule whose steps are joined with ``&&``
        :param first: when true the last step also receives ``:eq(0)``
        :return: the converted selector string
        """
        if '&&' not in parse:
            tail = parse.split(' ')[-1]  # only the last token decides
            if first and not self.test(NOADD_INDEX, tail):
                parse = f'{parse}:eq(0)'
            return parse

        segments = parse.split('&&')
        last = len(segments) - 1
        converted = []
        for position, segment in enumerate(segments):
            tail = segment.split(' ')[-1]  # only the last token decides
            # The final step stays un-indexed unless ``first`` was requested.
            needs_index = (not self.test(NOADD_INDEX, tail)
                           and (first or position < last))
            converted.append(f'{segment}:eq(0)' if needs_index else segment)
        return ' '.join(converted)

    def pdfa(self, html, parse: str):
        """
        Extract a list of elements from ``html`` using a Hiker-style rule.
        See https://pyquery.readthedocs.io/en/latest/api.html for the selector API.
        :param html: HTML source
        :param parse: rule whose steps are joined with ``&&``
        :return: list with the outer HTML of every matched element; ``[]``
                 when ``html`` or ``parse`` is empty
        """
        if not all([html, parse]):
            return []
        parse = self.parseHikerToJq(parse)
        doc = self._cached_doc(html, 'pdfa')
        ret = None
        for nparse in parse.split(' '):
            ret = self.parseOneRule(doc, nparse, ret)
        return [item.outerHtml() for item in ret.items()]

    def pd(self, html, parse: str):
        """Same as :meth:`pdfh` with ``add_url=True``."""
        return self.pdfh(html, parse, True)

    def pq(self, html: str):
        """Return ``html`` parsed by the module-level pyquery ``pq``."""
        return pq(html)

    def pjfh(self, html, parse: str, add_url=False):
        """
        Extract a single value from JSON using a jsonpath expression.
        :param html: JSON text or an already decoded object
        :param parse: jsonpath expression (``$.`` is prepended when missing);
                      alternatives separated by ``||`` are tried in order
                      until one yields a non-empty value
        :param add_url: when true the value is joined with ``MY_URL``
        :return: the value as a string, ``''`` when nothing is found
        """
        if not all([html, parse]):
            return ''
        if isinstance(html, str):
            try:
                html = ujson.loads(html)
            except Exception:  # best effort: undecodable JSON yields ''
                print('字符串转json失败')
                return ''
        ret = ''
        for ps in parse.split('||'):
            if not ps:
                continue
            # Prefix every alternative, not just the first one.
            if not ps.startswith('$.'):
                ps = f'$.{ps}'
            found = jsonpath(html, ps)
            if isinstance(found, list):
                ret = str(found[0]) if found and found[0] else ''
            else:
                ret = str(found) if found else ''
            if add_url and ret:
                ret = urljoin(self.MY_URL, ret)
            if ret:
                break
        return ret

    def pj(self, html, parse: str):
        """Same as :meth:`pjfh` with ``add_url=True``."""
        return self.pjfh(html, parse, True)

    def pjfa(self, html, parse: str):
        """
        Extract a list from JSON using a jsonpath expression.
        :param html: JSON text or an already decoded object
        :param parse: jsonpath expression (``$.`` is prepended when missing)
        :return: the matches; a result consisting of exactly one list is
                 unwrapped; ``[]`` when nothing is found or decoding fails
        """
        if not all([html, parse]):
            return []
        if isinstance(html, str):
            try:
                html = ujson.loads(html)
            except Exception:  # best effort: undecodable JSON yields []
                return []
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        ret = jsonpath(html, parse)
        # Check the length before indexing so an empty list cannot raise.
        if isinstance(ret, list) and len(ret) == 1 and isinstance(ret[0], list):
            ret = ret[0]  # unwrap the single nested list
        return ret or []

H
hjdhnx 已提交
225

226 227
if __name__ == '__main__':
    # No script behaviour is defined; running this file directly does nothing.
    pass