htmlParser.py 5.4 KB
Newer Older
H
hjdhnx 已提交
1 2 3 4 5
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File  : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date  : 2022/8/25
H
hjdhnx 已提交
6
import json
H
hjdhnx 已提交
7 8 9

from pyquery import PyQuery as pq
from urllib.parse import urljoin
H
hjdhnx 已提交
10
import re
H
hjdhnx 已提交
11
from jsonpath import jsonpath
H
hjdhnx 已提交
12 13 14 15 16

class jsoup:
    """Rule-driven extraction helpers built on PyQuery and jsonpath.

    HTML rules are strings whose steps are separated by ``&&``; JSON rules
    are jsonpath expressions (the leading ``$.`` is optional).  ``MY_URL`` is
    the base URL used when a relative link has to be made absolute.
    """

    def __init__(self, MY_URL=''):
        # Base URL handed to urljoin() by pdfh()/pjfh() when add_url is set.
        self.MY_URL = MY_URL

    def test(self, text: str, string: str) -> bool:
        """Return True if the regex ``text`` matches anywhere in ``string``.

        The search is case-insensitive and multi-line (``re.I | re.M``).
        """
        # The f-string keeps the original coercion of ``text`` to str.
        return bool(re.search(rf'{text}', string, re.M | re.I))

    def pdfh(self, html, parse: str, add_url=False):
        """Extract one value from ``html`` according to the rule ``parse``.

        Rule format: ``step&&step&&...&&option``.  Every step that does not
        already contain ``:eq``, ``:lt``, ``:gt`` or ``#`` gets ``:eq(0)``
        appended, and the steps are joined with spaces into one selector.
        The trailing ``option`` chooses what is returned:

        * ``Text`` -> ``.text()`` of the selection
        * ``Html`` -> ``.html()`` of the selection
        * anything else -> the attribute of that name ('' when it is missing)

        A rule without ``&&`` (or with an empty option) is used as a plain
        selector and the selection is returned via ``str()``.

        :param html: markup (or anything PyQuery accepts) to search in.
        :param parse: the extraction rule; an empty rule returns ''.
        :param add_url: when true and the option is one of ``url``, ``src``,
            ``href``, ``data-original`` or ``data-src``, the value is turned
            into an absolute URL: text before the first ``http`` is cut off,
            otherwise the value is joined onto ``self.MY_URL``.
        :return: the extracted value.
        """
        if not parse:
            return ''
        doc = pq(html)
        if '&&' not in parse:
            # No option part: serialise whatever the selector matched.
            return str(doc(parse))

        # Split once: everything before the last "&&" is the selector chain,
        # the last piece is the option.
        *steps, option = parse.split('&&')
        selector = ' '.join(
            step if self.test(':eq|:lt|:gt|#', step) else f'{step}:eq(0)'
            for step in steps
        )
        # FIXME: "||" alternatives (as supported by pjfh) are not handled here.
        if not option:
            return str(doc(selector))

        matched = doc(selector)
        # FIXME: when several elements match, the first one should be picked
        # automatically.
        if option == 'Text':
            return matched.text()
        if option == 'Html':
            return matched.html()

        # A missing attribute yields None; normalise it to '' so the URL
        # handling below cannot fail with "'http' in None".
        value = matched.attr(option) or ''
        url_attrs = ('url', 'src', 'href', 'data-original', 'data-src')
        if value and add_url and option in url_attrs:
            if 'http' in value:
                # Drop any prefix in front of the embedded absolute URL.
                value = value[value.find('http'):]
            else:
                value = urljoin(self.MY_URL, value)
        return value

    def pdfa(self, html, parse: str):
        """Return every element matched by the rule ``parse`` as a string.

        Steps are separated by ``&&``.  Each step except the last gets
        ``:eq(0)`` appended unless it already contains ``:eq``, ``:lt`` or
        ``:gt``; the last step is left untouched so it can match many
        elements.  An empty rule returns ``[]``.
        """
        if not parse:
            return []
        if '&&' in parse:
            steps = parse.split('&&')
            last = len(steps) - 1
            parse = ' '.join(
                step if idx >= last or self.test(':eq|:lt|:gt', step)
                else f'{step}:eq(0)'
                for idx, step in enumerate(steps)
            )
        doc = pq(html)
        return [str(item) for item in doc(parse).items()]

    def pd(self, html, parse: str):
        """Shortcut for ``pdfh(html, parse, True)`` (absolute URLs)."""
        return self.pdfh(html, parse, True)

    def pq(self, html: str):
        """Return ``html`` wrapped in a PyQuery document."""
        return pq(html)

    def pjfh(self, html, parse: str, add_url=False):
        """Extract one value from JSON using a jsonpath rule.

        :param html: a JSON string (decoded here) or already-decoded data.
        :param parse: jsonpath expression; ``$.`` is prepended when the rule
            does not start with it.  Alternatives may be separated by
            ``||``; the first one producing a non-empty result wins.
        :param add_url: when true, a non-empty result is joined onto
            ``self.MY_URL``.
        :return: the result as a string, or '' when the rule is empty, the
            JSON cannot be decoded, or nothing (or only a falsy value) is
            found.
        """
        if not parse:
            return ''
        if isinstance(html, str):
            try:
                html = json.loads(html)
            except ValueError:  # json.JSONDecodeError is a ValueError
                print('字符串转json失败')
                return ''
        if not parse.startswith('$.'):
            parse = f'$.{parse}'

        ret = ''
        for ps in parse.split('||'):
            found = jsonpath(html, ps)
            if isinstance(found, list):
                # Use the first match; an empty list counts as "nothing".
                first = found[0] if found else None
                ret = str(first) if first else ''
            else:
                ret = str(found) if found else ''
            if add_url and ret:
                ret = urljoin(self.MY_URL, ret)
            if ret:
                break
        return ret

    def pj(self, html, parse: str):
        """Shortcut for ``pjfh(html, parse, True)`` (absolute URLs)."""
        return self.pjfh(html, parse, True)

    def pjfa(self, html, parse: str):
        """Return the list selected from JSON by a jsonpath rule.

        :param html: a JSON string (decoded here) or already-decoded data.
        :param parse: jsonpath expression; ``$.`` is prepended when missing.
        :return: the jsonpath result; a result consisting of exactly one
            nested list is unwrapped to that list.  ``[]`` is returned when
            the rule is empty, the JSON cannot be decoded, or the result is
            falsy.
        """
        if not parse:
            return []
        if isinstance(html, str):
            try:
                html = json.loads(html)
            except ValueError:  # json.JSONDecodeError is a ValueError
                # Keep the return type consistent with every other path.
                return []
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        ret = jsonpath(html, parse)
        # Check the length before indexing so an empty list cannot raise.
        if isinstance(ret, list) and len(ret) == 1 and isinstance(ret[0], list):
            ret = ret[0]  # unwrap the single nested list
        return ret or []

H
hjdhnx 已提交
130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
if __name__ == '__main__':
    # Manual experiment: rebuild pdfa on top of parsel instead of PyQuery and
    # run it against a live page.
    import requests
    from parsel import Selector

    url = 'http://360yy.cn'
    jsp = jsoup(url)

    def pdfa2(html, parse):
        """parsel-based variant of jsoup.pdfa.

        Steps are separated by ``&&``; every step except the last gets
        ``:nth-child(1)`` appended unless it already contains ``:eq``,
        ``:lt`` or ``:gt``.  The final selector is printed before use.
        """
        if not parse:
            return []
        if '&&' in parse:
            steps = parse.split('&&')  # rebuild the rule from its steps
            tail = len(steps) - 1
            matcher = jsoup()
            rebuilt = []
            for pos, step in enumerate(steps):
                if matcher.test(':eq|:lt|:gt', step) or pos >= tail:
                    rebuilt.append(step)
                else:
                    rebuilt.append(f'{step}:nth-child(1)')
            parse = ' '.join(rebuilt)
        selector = Selector(text=html)
        print(parse)
        nodes = selector.css(parse)
        return [str(node) for node in nodes]

    r = requests.get(url)
    html = r.text
    # parsel is awkward to use here; wrapping helpers such as pdfa around it
    # is hard.
    items = pdfa2(html, '.fed-pops-navbar&&ul.fed-part-rows&&a')
    print(items)