#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File  : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date  : 2022/8/25

import ujson
from pyquery import PyQuery as pq
from urllib.parse import urljoin
import re
from jsonpath import jsonpath

PARSE_CACHE = True  # cache parsed documents between calls
NOADD_INDEX = ':eq|:lt|:gt|^body$|^#'  # selectors that should not get an automatic :eq index appended
URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url)$'  # attributes whose values should be urljoin-ed automatically
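# Illustrative examples (not from the original source): 'href' and 'data-original' match URLJOIN_ATTR,
# while 'onclick' does not; 'body' and '.cls:eq(1)' match NOADD_INDEX, so parseHikerToJq leaves them
# without an extra :eq(0).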


class jsoup:
    def __init__(self, MY_URL=''):
        self.MY_URL = MY_URL
        self.pdfh_html = ''
        self.pdfa_html = ''

        self.pdfh_doc = None
        self.pdfa_doc = None

    def test(self, text: str, string: str):
        """
        正则判断字符串包含,模仿js的 //.test()
        :param text:
        :param string:
        :return:
        """
        searchObj = re.search(rf'{text}', string, re.M | re.I)
        test_ret = True if searchObj else False
        return test_ret

    def pdfh(self, html, parse: str, add_url=False):
        if not all([html, parse]):
            return ''
        if PARSE_CACHE:
            if self.pdfh_html != html:
                self.pdfh_html = html
                self.pdfh_doc = pq(html)
            doc = self.pdfh_doc
        else:
            doc = pq(html)
        if parse == 'body&&Text' or parse == 'Text':
            text = doc.text()
            return text
        elif parse == 'body&&Html' or parse == 'Html':
            return doc.html()

        option = None
        if parse.find('&&') > -1:
            option = parse.split('&&')[-1]
            parse = '&&'.join(parse.split('&&')[:-1])
        parse = self.parseHikerToJq(parse, True)
        # print(f'pdfh:{parse},option:{option}')
        parses = parse.split(' ')
        ret = None
        for nparse in parses:
            ret = self.parseOneRule(doc, nparse, ret)
            # print(nparse,ret)

        if option:
            if option == 'Text':
                ret = ret.text()
            elif option == 'Html':
                ret = ret.html()
            else:
                ret = ret.attr(option) or ''
                if option.lower().find('style') > -1 and ret.find('url(') > -1:
                    try:
                        ret = re.search(r'url\((.*?)\)', ret, re.M | re.S).groups()[0]
                    except:
                        pass

                if ret and add_url:
                    need_add = re.search(URLJOIN_ATTR, option, re.M | re.I)
                    if need_add:
                        if 'http' in ret:
                            ret = ret[ret.find('http'):]
                        else:
                            ret = urljoin(self.MY_URL, ret)
        else:
            ret = ret.outerHtml()
        return ret

    def parseOneRule(self, doc, nparse, ret=None):
        """
        解析空格分割后的原生表达式中的一条记录,正确处理eq的索引,返回处理后的ret
        :param doc: pq(html) load 后的pq对象
        :param nparse: 当前单个解析表达式
        :param ret: pd对象结果
        :return:
        """
        if self.test(':eq', nparse):
            nparse_rule = nparse.split(':eq')[0]
            nparse_index = nparse.split(':eq')[1].split('(')[1].split(')')[0]
            try:
                nparse_index = int(nparse_index)
            except:
                nparse_index = 0
            if not ret:
                ret = doc(nparse_rule).eq(nparse_index)
            else:
                ret = ret(nparse_rule).eq(nparse_index)  # apply the parsed :eq index on chained rules as well
        else:
            if not ret:
                ret = doc(nparse)
            else:
                ret = ret(nparse)
        return ret

    def parseHikerToJq(self, parse, first=False):
        """
         海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0)
        :param parse:
        :param first:
        :return:
        """
        if parse.find('&&') > -1:
            parse = parse.split('&&')  # rebuild expressions that contain &&
            new_parses = []  # list of rebuilt parse segments
            for i in range(len(parse)):
                ps = parse[i].split(' ')[-1]  # if a segment contains spaces, test only its last token
                if not self.test(NOADD_INDEX, ps):
                    if not first and i >= len(parse) - 1:  # without first, the last segment needs no :eq(0)
                        new_parses.append(parse[i])
                    else:
                        new_parses.append(f'{parse[i]}:eq(0)')
                else:
                    new_parses.append(parse[i])
            parse = ' '.join(new_parses)
        return parse

    def pdfa(self, html, parse: str):
        # The official docs were needed to solve this:
        # https://pyquery.readthedocs.io/en/latest/api.html
        if not all([html, parse]):
            return []
        parse = self.parseHikerToJq(parse)
        print(f'pdfa:{parse}')
        if PARSE_CACHE:
            if self.pdfa_html != html:
                self.pdfa_html = html
                self.pdfa_doc = pq(html)
            doc = self.pdfa_doc
        else:
            doc = pq(html)

        parses = parse.split(' ')
        ret = None
        for nparse in parses:
            ret = self.parseOneRule(doc, nparse, ret)
            # print(len(ret),nparse)
        res = [item.outerHtml() for item in ret.items()]
        return res

    def pd(self, html, parse: str):
        return self.pdfh(html, parse, True)

    def pq(self, html: str):
        return pq(html)

    def pjfh(self, html, parse: str, add_url=False):
        if not all([html, parse]):
            return ''
        if isinstance(html, str):
            # print(html)
            try:
                html = ujson.loads(html)
                # html = eval(html)
            except:
                print('failed to parse the string as JSON')
                return ''
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        ret = ''
        for ps in parse.split('||'):
            ret = jsonpath(html, ps)
            if isinstance(ret, list):
                ret = str(ret[0]) if ret[0] else ''
            else:
                ret = str(ret) if ret else ''
            if add_url and ret:
                ret = urljoin(self.MY_URL, ret)
            if ret:
                break
        # print(ret)
        return ret

    def pj(self, html, parse: str):
        return self.pjfh(html, parse, True)

    def pjfa(self, html, parse: str):
        if not all([html, parse]):
            return []
        if isinstance(html, str):
            try:
                html = ujson.loads(html)
            except:
                return []
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        # print(html)
        # print(parse)
        ret = jsonpath(html, parse)
        # print(ret)
        # print(type(ret))
        # print(type(ret[0]))
        # print(len(ret))
        if isinstance(ret, list) and isinstance(ret[0], list) and len(ret) == 1:
            # print('auto unwrap')
            ret = ret[0]  # automatically unwrap the single nested list
        return ret or []


if __name__ == '__main__':
    pass
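    # Minimal usage sketch (illustrative only; the URL, HTML and JSON samples below are made up):
    jsp = jsoup('https://example.com/')
    sample_html = '<div><ul class="list"><li><a href="/play/1">First</a></li><li><a href="/play/2">Second</a></li></ul></div>'
    print(jsp.pdfh(sample_html, '.list&&li&&a&&Text'))  # select with a Hiker-style rule and take the text
    print(jsp.pd(sample_html, '.list&&li&&a&&href'))  # take the href attribute, urljoin-ed against MY_URL
    print(jsp.pdfa(sample_html, '.list&&li'))  # outer html of every matched <li>
    sample_json = '{"data": {"list": [{"url": "/v/1.mp4"}]}}'
    print(jsp.pjfh(sample_json, 'data.list[0].url'))  # jsonpath lookup on a JSON string
    print(jsp.pj(sample_json, 'data.list[0].url'))  # same lookup, urljoin-ed against MY_URL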