提交 44300bfd 编写于 作者: H hjdhnx

修改js0相应的解析库pdfa和pdfh,更加精准了,支持eq负数

修复看视界
上级 22778e60
无法预览此类型文件
3.9.20beta8
\ No newline at end of file
3.8.8
\ No newline at end of file
// Site rule for "看视界" (1080kan.cc): extends the shared mxone5 template
// and overrides only the site-specific fields.
var rule = Object.assign(muban.mxone5,{
title:'看视界', // display name of the source
host:'https://www.1080kan.cc', // base url the template's selectors run against
// 'MOBILE_UA' is a placeholder token — presumably substituted with a real
// mobile user-agent by the framework; verify against the template loader.
headers:{'User-Agent':'MOBILE_UA'},
});
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date : 2022/8/25
import json
from pyquery import PyQuery as pq
from lxml import etree
from urllib.parse import urljoin
import re
from jsonpath import jsonpath
PARSE_CACHE = True # 解析缓存
class jsoup:
    """jQuery/jsoup-like selection helpers used by the js0 parsing layer.

    HTML rules (pdfh/pdfa/pd) are CSS selectors whose segments are joined
    with '&&'; the last segment of a pdfh rule may be an option such as
    'Text', 'Html' or an attribute name.  JSON rules (pjfh/pjfa/pj) are
    jsonpath expressions.
    """

    def __init__(self, MY_URL=''):
        # Base URL used by urljoin() when add_url is requested.
        self.MY_URL = MY_URL
        # Per-instance caches of the last html seen by pdfh/pdfa so the
        # pyquery document is only rebuilt when the html actually changes
        # (active when the module-level PARSE_CACHE flag is set).
        self.pdfh_html = ''
        self.pdfa_html = ''
        self.pdfh_doc = None
        self.pdfa_doc = None

    def test(self, text: str, string: str):
        """Regex "contains" check, imitating JavaScript's /re/.test().

        :param text: regex pattern, searched case-insensitively, multiline
        :param string: string to search
        :return: True when the pattern matches anywhere in string
        """
        return re.search(rf'{text}', string, re.M | re.I) is not None

    def pdfh(self, html, parse: str, add_url=False):
        """Select a single value (text / html / attribute) from html.

        Intermediate '&&' segments without an explicit index get ':eq(0)'
        appended so each step narrows to one node.

        :param html: html text to parse
        :param parse: rule such as '.box&&a&&href'
        :param add_url: when True, urljoin url-like attribute values onto MY_URL
        :return: the selected string, or the first matched node's outer html
        """
        if not parse:
            return ''
        if PARSE_CACHE:
            # Rebuild the pyquery document only when the html changed.
            if self.pdfh_html != html:
                self.pdfh_html = html
                self.pdfh_doc = pq(html)
            doc = self.pdfh_doc
        else:
            doc = pq(html)
        # Whole-document shortcuts.
        if parse == 'body&&Text' or parse == 'Text':
            return doc.text()
        elif parse == 'body&&Html' or parse == 'Html':
            return doc.html()
        option = None
        if parse.find('&&') > -1:
            option = parse.split('&&')[-1]
            parse = parse.split('&&')[:-1]  # with a single '&&' this keeps just segment 0
            if len(parse) > 1:  # one remaining segment is likely the option target; no eq needed
                parse = ' '.join([i if self.test(':eq|:lt|:gt|#', i) else f'{i}:eq(0)' for i in parse])
            else:
                parse = parse[0] if self.test(':eq|:lt|:gt|#', parse[0]) else f'{parse[0]}:eq(0)'
        # FIXME: '||' alternatives (as supported for jsonpath rules) are not handled here yet
        if option:
            if ':eq(-1)' in parse:
                # pyquery cannot select ':eq(-1)' directly; emulate the negative
                # index with .eq(-1) — only a single trailing eq level is supported.
                ret = doc(parse.replace(':eq(-1)', '')).eq(-1)
            else:
                ret = doc(parse)
            # FIXME: when the selector matches several nodes the first should be taken
            if option == 'Text':
                ret = ret.text()
            elif option == 'Html':
                ret = ret.html()
            else:
                ret = ret.attr(option) or ''
                if option.lower().find('style') > -1 and ret.find('url(') > -1:
                    try:
                        # pull the inner url out of e.g. style="background:url(...)"
                        ret = re.search(r'url\((.*?)\)', ret, re.M | re.S).groups()[0]
                    except Exception:
                        # best-effort extraction; keep the raw attribute on failure
                        pass
                if ret and add_url:
                    # only url-like attribute names get joined onto MY_URL
                    need_add = re.search(r'(url|src|href|-original|-src|-play|-url)$', option, re.M | re.I)
                    if need_add:
                        if 'http' in ret:
                            # already absolute somewhere inside — strip any prefix junk
                            ret = ret[ret.find('http'):]
                        else:
                            ret = urljoin(self.MY_URL, ret)
        else:
            # No option: return the first matched element's outer html
            # (pyquery's outerHtml() yields the first item of the selection).
            ret = doc(parse)
            ret = ret.outerHtml()
        return ret

    def pdfa(self, html, parse: str):
        """Select ALL matching nodes from html as outer-html strings.

        :param html: html text to parse
        :param parse: '&&'-joined CSS rule; intermediate segments without an
                      explicit index get ':eq(0)', the final segment matches all
        :return: list of outer-html strings, one per matched node
        """
        # See https://pyquery.readthedocs.io/en/latest/api.html
        if not parse:
            return []
        if parse.find('&&') > -1:
            parse = parse.split('&&')  # re-join the '&&'-separated segments
            parse = ' '.join([
                parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i >= len(parse) - 1
                else f'{parse[i]}:eq(0)'
                for i in range(len(parse))
            ])
        print(f'pdfa:{parse}')
        if PARSE_CACHE:
            if self.pdfa_html != html:
                self.pdfa_html = html
                self.pdfa_doc = pq(html)
            doc = self.pdfa_doc
        else:
            doc = pq(html)
        result = doc(parse)
        # outerHtml() per item is the correct node->string conversion
        # (str(item) and etree.tostring both give wrong results here).
        res = [item.outerHtml() for item in result.items()]
        return res

    def pd(self, html, parse: str):
        """pdfh with add_url=True (urljoin url-like results onto MY_URL)."""
        return self.pdfh(html, parse, True)

    def pq(self, html: str):
        """Expose the underlying pyquery document loader."""
        return pq(html)

    def pjfh(self, html, parse: str, add_url=False):
        """Select the first value matching a jsonpath rule.

        :param html: json text or an already-decoded object
        :param parse: jsonpath rule ('$.' prefix added when missing); '||'
                      separates alternatives tried in order
        :param add_url: when True, urljoin the value onto MY_URL
        :return: matched value as str, or '' when nothing matched
        """
        if not parse:
            return ''
        if isinstance(html, str):
            try:
                html = json.loads(html)
            except Exception:
                print('字符串转json失败')
                return ''
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        ret = ''
        for ps in parse.split('||'):
            # jsonpath returns a list of matches, or False when none matched.
            ret = jsonpath(html, ps)
            if isinstance(ret, list):
                ret = str(ret[0]) if ret[0] else ''
            else:
                ret = str(ret) if ret else ''
            if add_url and ret:
                ret = urljoin(self.MY_URL, ret)
            if ret:
                break  # first alternative that yields a value wins
        return ret

    def pj(self, html, parse: str):
        """pjfh with add_url=True."""
        return self.pjfh(html, parse, True)

    def pjfa(self, html, parse: str):
        """Select ALL values matching a jsonpath rule.

        :param html: json text or an already-decoded object
        :param parse: jsonpath rule ('$.' prefix added when missing)
        :return: list of matched values ([] when nothing matched)
        """
        if not parse:
            return []
        if isinstance(html, str):
            try:
                html = json.loads(html)
            except Exception:
                # fix: return a list (was '') so the declared list contract
                # holds even on undecodable json input
                return []
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        ret = jsonpath(html, parse)
        if isinstance(ret, list) and isinstance(ret[0], list) and len(ret) == 1:
            ret = ret[0]  # auto-unwrap a single nested list
        return ret or []
if __name__ == '__main__':
    # Ad-hoc experiment: re-implement pdfa on top of parsel instead of
    # pyquery (needs the third-party requests and parsel packages, plus
    # live network access to the test site).
    import requests
    from parsel import Selector
    url = 'http://360yy.cn'
    jsp = jsoup(url)
    def pdfa2(html, parse):
        # parsel-based variant of jsoup.pdfa, for comparison.
        if not parse:
            return []
        if parse.find('&&') > -1:
            parse = parse.split('&&')  # re-join the '&&'-separated segments
            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
            # parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
            # ':nth-child(1)' replaces pdfa's ':eq(0)' here — presumably
            # because parsel's CSS engine lacks jQuery's :eq(); verify.
            parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))])
        # print(f'pdfa:{parse}')
        selector = Selector(text=html)
        print(parse)
        items = selector.css(parse)
        return [str(item) for item in items]
    r = requests.get(url)
    html = r.text
    # parsel is awkward here — hard to wrap pdfa-style helpers on top of it
    items = pdfa2(html, '.fed-pops-navbar&&ul.fed-part-rows&&a')
    print(items)
......@@ -3,18 +3,20 @@
# File : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date : 2022/8/25
import json
import ujson
from pyquery import PyQuery as pq
from lxml import etree
from urllib.parse import urljoin
import re
from jsonpath import jsonpath
PARSE_CACHE = True # 解析缓存
NOADD_INDEX = ':eq|:lt|:gt|^body$|^#' # 不自动加eq下标索引
URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url)$' # 需要自动urljoin的属性
class jsoup:
def __init__(self,MY_URL=''):
def __init__(self, MY_URL=''):
self.MY_URL = MY_URL
self.pdfh_html = ''
self.pdfa_html = ''
......@@ -22,13 +24,19 @@ class jsoup:
self.pdfh_doc = None
self.pdfa_doc = None
def test(self, text:str, string:str):
def test(self, text: str, string: str):
"""
正则判断字符串包含,模仿js的 //.test()
:param text:
:param string:
:return:
"""
searchObj = re.search(rf'{text}', string, re.M | re.I)
test_ret = True if searchObj else False
return test_ret
def pdfh(self,html,parse:str,add_url=False):
if not parse:
def pdfh(self, html, parse: str, add_url=False):
if not all([html, parse]):
return ''
if PARSE_CACHE:
if self.pdfh_html != html:
......@@ -42,71 +50,98 @@ class jsoup:
return text
elif parse == 'body&&Html' or parse == 'Html':
return doc.html()
option = None
if parse.find('&&') > -1:
option = parse.split('&&')[-1]
parse = parse.split('&&')[:-1] # 如果只有一个&& 取的就直接是0
if len(parse) > 1: # 如果不大于1可能就是option操作,不需要拼eq
parse = ' '.join([i if self.test(':eq|:lt|:gt|#',i) else f'{i}:eq(0)' for i in parse])
else:
parse = parse[0] if self.test(':eq|:lt|:gt|#',parse[0]) else f'{parse[0]}:eq(0)'
# FIXME 暂时不支持jsonpath那样的|| 分割取或属性
parse = '&&'.join(parse.split('&&')[:-1])
parse = self.parseHikerToJq(parse, True)
# print(f'pdfh:{parse},option:{option}')
parses = parse.split(' ')
ret = None
for nparse in parses:
ret = self.parseOneRule(doc, nparse, ret)
# print(nparse,ret)
if option:
# print(f'parse:{parse}=>(option:{option})')
if ':eq(-1)' in parse:
# 处理 eq(-1)的情况,兼容性差,暂时只支持一层eq
ret = doc(parse.replace(':eq(-1)','')).eq(-1)
else:
ret = doc(parse)
# print(html)
# FIXME 解析出来有多个的情况应该自动取第一个
if option == 'Text':
ret = ret.text()
elif option == 'Html':
ret = ret.html()
else:
ret = ret.attr(option) or ''
if option.lower().find('style')>-1 and ret.find('url(')>-1:
if option.lower().find('style') > -1 and ret.find('url(') > -1:
try:
ret = re.search('url\((.*?)\)',ret,re.M|re.S).groups()[0]
ret = re.search('url\((.*?)\)', ret, re.M | re.S).groups()[0]
except:
pass
if ret and add_url:
# pd_list = 'url|src|href|data-original|data-src|data-play|data-url'.split('|')
# need_add = option in pd_list
need_add = re.search('(url|src|href|-original|-src|-play|-url)$', option, re.M | re.I)
# print(f'option:{option},need_add:{need_add}')
need_add = re.search(URLJOIN_ATTR, option, re.M | re.I)
if need_add:
if 'http' in ret:
ret = ret[ret.find('http'):]
else:
ret = urljoin(self.MY_URL,ret)
# print(ret)
ret = urljoin(self.MY_URL, ret)
else:
# ret = doc(parse+':first')
ret = doc(parse) # 由于是生成器,直接转str就能拿到第一条数据,不需要next
# ret = ret.next() # 取第一条数据
# ret = doc(parse) # 下面注释的写法不对的
# ret = ret.find(':first')
# ret = ret.children(':first')
# print(parse)
# ret = str(ret)
ret = ret.outerHtml()
return ret
def pdfa(self,html,parse:str):
def parseOneRule(self, doc, nparse, ret=None):
"""
解析空格分割后的原生表达式中的一条记录,正确处理eq的索引,返回处理后的ret
:param doc: pq(html) load 后的pq对象
:param nparse: 当前单个解析表达式
:param ret: pd对象结果
:return:
"""
if self.test(':eq', nparse):
nparse_rule = nparse.split(':eq')[0]
nparse_index = nparse.split(':eq')[1].split('(')[1].split(')')[0]
try:
nparse_index = int(nparse_index)
except:
nparse_index = 0
if not ret:
ret = doc(nparse_rule).eq(nparse_index)
else:
ret = ret(nparse_rule)
else:
if not ret:
ret = doc(nparse)
else:
ret = ret(nparse)
return ret
def parseHikerToJq(self, parse, first=False):
"""
海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0)
:param parse:
:param first:
:return:
"""
if parse.find('&&') > -1:
parse = parse.split('&&') # 带&&的重新拼接
new_parses = [] # 构造新的解析表达式列表
for i in range(len(parse)):
ps = parse[i].split(' ')[-1] # 如果分割&&后带空格就取最后一个元素
if not self.test(NOADD_INDEX, ps):
if not first and i >= len(parse) - 1: # 不传first且遇到最后一个,不用补eq(0)
new_parses.append(parse[i])
else:
new_parses.append(f'{parse[i]}:eq(0)')
else:
new_parses.append(parse[i])
parse = ' '.join(new_parses)
return parse
def pdfa(self, html, parse: str):
# 看官方文档才能解决这个问题!!!
# https://pyquery.readthedocs.io/en/latest/api.html
if not parse:
if not all([html, parse]):
return []
if parse.find('&&') > -1:
parse = parse.split('&&') # 带&&的重新拼接
# print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
parse = self.parseHikerToJq(parse)
print(f'pdfa:{parse}')
# print(html)
if PARSE_CACHE:
if self.pdfa_html != html:
self.pdfa_html = html
......@@ -114,32 +149,29 @@ class jsoup:
doc = self.pdfa_doc
else:
doc = pq(html)
result = doc(parse)
# 节点转字符串
# print(str(etree.tostring(result[0], pretty_print=True), 'utf-8'))
# res = [item for item in result.items()]
# print(res)
res = [item.outerHtml() for item in result.items()] # 这个才是对的!!str() item str(etree.tostring 统统错误
# res = [str(item) for item in result.items()]
# res = [str(etree.tostring(item, pretty_print=True), 'utf-8') for item in result]
# print(len(res),res)
# print('pdfa执行结果数:',len(res))
parses = parse.split(' ')
ret = None
for nparse in parses:
ret = self.parseOneRule(doc, nparse, ret)
# print(len(ret),nparse)
res = [item.outerHtml() for item in ret.items()]
return res
def pd(self,html,parse:str):
return self.pdfh(html,parse,True)
def pd(self, html, parse: str):
return self.pdfh(html, parse, True)
def pq(self,html:str):
def pq(self, html: str):
return pq(html)
def pjfh(self,html,parse:str,add_url=False):
if not parse:
def pjfh(self, html, parse: str, add_url=False):
if not all([html, parse]):
return ''
if isinstance(html,str):
if isinstance(html, str):
# print(html)
try:
html = json.loads(html)
# html = eval(html)
html = ujson.loads(html)
# html = eval(html)
except:
print('字符串转json失败')
return ''
......@@ -147,8 +179,8 @@ class jsoup:
parse = f'$.{parse}'
ret = ''
for ps in parse.split('||'):
ret = jsonpath(html,ps)
if isinstance(ret,list):
ret = jsonpath(html, ps)
if isinstance(ret, list):
ret = str(ret[0]) if ret[0] else ''
else:
ret = str(ret) if ret else ''
......@@ -159,52 +191,31 @@ class jsoup:
# print(ret)
return ret
def pj(self, html, parse:str):
def pj(self, html, parse: str):
return self.pjfh(html, parse, True)
def pjfa(self,html,parse:str):
if not parse:
def pjfa(self, html, parse: str):
if not all([html, parse]):
return []
if isinstance(html,str):
if isinstance(html, str):
try:
html = json.loads(html)
html = ujson.loads(html)
except:
return ''
return []
if not parse.startswith('$.'):
parse = f'$.{parse}'
# print(html)
# print(parse)
ret = jsonpath(html,parse)
ret = jsonpath(html, parse)
# print(ret)
# print(type(ret))
# print(type(ret[0]))
# print(len(ret))
if isinstance(ret,list) and isinstance(ret[0],list) and len(ret) == 1:
if isinstance(ret, list) and isinstance(ret[0], list) and len(ret) == 1:
# print('自动解包')
ret = ret[0] # 自动解包
ret = ret[0] # 自动解包
return ret or []
if __name__ == '__main__':
import requests
from parsel import Selector
url = 'http://360yy.cn'
jsp = jsoup(url)
def pdfa2(html,parse):
if not parse:
return []
if parse.find('&&') > -1:
parse = parse.split('&&') # 带&&的重新拼接
# print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
# parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))])
# print(f'pdfa:{parse}')
selector = Selector(text=html)
print(parse)
items = selector.css(parse)
return [str(item) for item in items]
r = requests.get(url)
html = r.text
# parsel 不好用啊,很难实现封装pdfa之类的函数
items = pdfa2(html,'.fed-pops-navbar&&ul.fed-part-rows&&a')
print(items)
if __name__ == '__main__':
pass
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册