#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File  : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date  : 2022/8/25
import json

from pyquery import PyQuery as pq
from urllib.parse import urljoin
import re
from jsonpath import jsonpath

class jsoup:
    """Rule-string helpers for extracting data from HTML (PyQuery) and JSON (jsonpath).

    HTML rules are CSS selector segments joined by ``&&``.  For ``pdfh``/``pd``
    the last segment is an *option*: ``Text``, ``Html`` or an attribute name.
    JSON rules are jsonpath expressions; the leading ``$.`` may be omitted.
    """

    # Attributes whose values are treated as links when ``add_url`` is set.
    _URL_ATTRS = ('url', 'src', 'href', 'data-original', 'data-src')

    def __init__(self, MY_URL=''):
        """Remember ``MY_URL``, the base URL used to resolve relative links."""
        self.MY_URL = MY_URL

    def test(self, text: str, string: str):
        """Return True if regex ``text`` matches anywhere in ``string``.

        The search is case-insensitive and multi-line.
        """
        return bool(re.search(text, string, re.M | re.I))

    def _pin_first(self, selector: str, markers: str) -> str:
        """Append ``:eq(0)`` to ``selector`` unless it matches the ``markers`` regex."""
        return selector if self.test(markers, selector) else f'{selector}:eq(0)'

    def pdfh(self, html, parse: str, add_url=False):
        """Extract a single string from ``html`` according to rule ``parse``.

        :param html: markup handed to PyQuery.
        :param parse: rule string; with ``&&`` the last segment is the option
            (``Text``, ``Html`` or an attribute name) and the preceding
            segments form the selector.
        :param add_url: when true, link-like attributes are turned into
            absolute URLs using ``self.MY_URL``.
        :return: the extracted string, or ``''`` when nothing is found.
        """
        if not parse:
            return ''

        doc = pq(html)
        option = None
        if '&&' in parse:
            *selectors, option = parse.split('&&')
            # Segments without an explicit position filter or an id selector
            # are pinned to their first match.
            parse = ' '.join(self._pin_first(sel, ':eq|:lt|:gt|#') for sel in selectors)
        # FIXME: "||" alternatives (as supported by the jsonpath helpers) are not supported here.

        if not option:
            # No option: return str() of the selection.
            # NOTE(review): this presumably serializes every matched element,
            # not only the first one -- confirm against PyQuery.
            return str(doc(parse))

        print(f'parse:{parse}=>(option:{option})')
        ret = doc(parse)
        # FIXME: when several elements match, the first one should be picked automatically.
        if option == 'Text':
            return ret.text()
        if option == 'Html':
            # Normalize a missing result to '' so every branch returns a string.
            return ret.html() or ''

        ret = ret.attr(option) or ''
        if ret and add_url and option in self._URL_ATTRS:
            if 'http' in ret:
                # Keep everything from the first "http" onward, dropping any prefix.
                ret = ret[ret.find('http'):]
            else:
                ret = urljoin(self.MY_URL, ret)
        return ret

    def pdfa(self, html, parse: str):
        """Return a list with ``str()`` of every element of ``html`` matching ``parse``.

        With ``&&`` in the rule, every segment except the last is pinned to
        its first match (``:eq(0)``) unless it already carries ``:eq``,
        ``:lt`` or ``:gt``; the last segment is left untouched.

        :return: list of strings; ``[]`` when ``parse`` is empty.
        """
        if not parse:
            return []
        if '&&' in parse:
            parts = parse.split('&&')
            last = len(parts) - 1
            parse = ' '.join(
                part if idx == last else self._pin_first(part, ':eq|:lt|:gt')
                for idx, part in enumerate(parts)
            )
        doc = pq(html)
        return [str(item) for item in doc(parse).items()]

    def pd(self, html, parse: str):
        """Same as ``pdfh`` with ``add_url=True``."""
        return self.pdfh(html, parse, True)

    def pq(self, html: str):
        """Return a PyQuery document built from ``html``."""
        return pq(html)

    def pjfh(self, html, parse: str, add_url=False):
        """Extract a single string from JSON data using a jsonpath rule.

        :param html: a JSON string or already-decoded JSON data.
        :param parse: one or more jsonpath expressions separated by ``||``;
            the first alternative yielding a non-empty value wins.  A missing
            ``$.`` prefix is added to each alternative.
        :param add_url: when true, the value is joined with ``self.MY_URL``.
        :return: the value as a string, or ``''`` when nothing is found or
            ``html`` is not valid JSON.  Falsy values map to ``''``.
        """
        if not parse:
            return ''
        if isinstance(html, str):
            try:
                html = json.loads(html)
            except ValueError:  # includes json.JSONDecodeError
                print('字符串转json失败')
                return ''

        ret = ''
        for ps in parse.split('||'):
            if not ps:
                # An empty alternative (e.g. a trailing "||") cannot match.
                continue
            if not ps.startswith('$.'):
                ps = f'$.{ps}'
            ret = jsonpath(html, ps)
            if isinstance(ret, list):
                first = ret[0] if ret else ''
                ret = str(first) if first else ''
            else:
                ret = str(ret) if ret else ''
            if add_url and ret:
                ret = urljoin(self.MY_URL, ret)
            if ret:
                break
        return ret

    def pj(self, html, parse: str):
        """Same as ``pjfh`` with ``add_url=True``."""
        return self.pjfh(html, parse, True)

    def pjfa(self, html, parse: str):
        """Return the list selected from JSON data by jsonpath rule ``parse``.

        :param html: a JSON string or already-decoded JSON data.
        :param parse: jsonpath expression; a missing ``$.`` prefix is added.
        :return: the jsonpath result; a result consisting of exactly one
            list is unwrapped to that inner list.  ``[]`` when nothing
            matches, ``parse`` is empty, or ``html`` is not valid JSON.
        """
        if not parse:
            return []
        if isinstance(html, str):
            try:
                html = json.loads(html)
            except ValueError:  # includes json.JSONDecodeError
                return []
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        ret = jsonpath(html, parse)
        # Check the length first so an empty list is never indexed.
        if isinstance(ret, list) and len(ret) == 1 and isinstance(ret[0], list):
            ret = ret[0]  # automatic unwrapping
        return ret or []


# ---- manual demo (runs only when this file is executed as a script) ----
if __name__ == '__main__':
    # Manual experiment: a parsel-based variant of jsoup.pdfa, run against a
    # live page.  Requires the third-party packages `requests` and `parsel`.
    import requests
    from parsel import Selector

    url = 'http://360yy.cn'
    jsp = jsoup(url)

    def pdfa2(html, parse):
        """Return str() of every node of ``html`` matching rule ``parse``, using parsel.

        Segments are joined like in ``jsoup.pdfa``, but segments other than
        the last that lack ``:eq``/``:lt``/``:gt`` get ``:nth-child(1)``
        appended instead of ``:eq(0)``.
        """
        if not parse:
            return []
        if '&&' in parse:
            parts = parse.split('&&')  # rebuild the selector from the && segments
            last = len(parts) - 1
            parse = ' '.join(
                part if idx == last or jsp.test(':eq|:lt|:gt', part) else f'{part}:nth-child(1)'
                for idx, part in enumerate(parts)
            )
        selector = Selector(text=html)
        print(parse)
        items = selector.css(parse)
        return [str(item) for item in items]

    # A timeout keeps the demo from hanging forever on an unresponsive host.
    r = requests.get(url, timeout=10)
    html = r.text
    # parsel is awkward to use here; wrapping pdfa-like helpers with it is hard.
    items = pdfa2(html, '.fed-pops-navbar&&ul.fed-part-rows&&a')
    print(items)