Commit 44300bfd, authored by hjdhnx

Reworked the js0 parser helpers pdfa and pdfh: selection is more precise now, and :eq() supports negative indices.

Fixed the 看视界 source.

Parent commit: 22778e60
-3.8.8
\ No newline at end of file
+3.9.20beta8
\ No newline at end of file
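In practice, the headline change lets a rule index matches from the end. A minimal usage sketch (the HTML and base URL are made up; it assumes the jsoup class from the htmlParser.py shown further below is importable):

from htmlParser import jsoup

html = '<ul><li>first</li><li>middle</li><li>last</li></ul>'
jsp = jsoup('https://example.com')  # hypothetical base URL
# after this commit, :eq(-1) counts from the end, like pyquery's .eq(-1)
print(jsp.pdfh(html, 'li:eq(-1)&&Text'))  # -> 'last'
# pdfa returns the outer HTML of every matched node
print(jsp.pdfa(html, 'ul&&li'))  # -> ['<li>first</li>', '<li>middle</li>', '<li>last</li>']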
 var rule = Object.assign(muban.mxone5,{
     title:'看视界',
     host:'https://www.1080kan.cc',
+    headers:{'User-Agent':'MOBILE_UA'},
 });
\ No newline at end of file
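The 看视界 fix itself is just the added headers entry: the site evidently wants a mobile User-Agent. Outside the drpy runtime (which substitutes its own string for the 'MOBILE_UA' placeholder), the equivalent request would look roughly like this; the UA string below is an assumption, not the value drpy uses:

import requests

# hypothetical mobile UA; drpy replaces the 'MOBILE_UA' placeholder with its own value
MOBILE_UA = 'Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Mobile Safari/537.36'
r = requests.get('https://www.1080kan.cc', headers={'User-Agent': MOBILE_UA})
print(r.status_code)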
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date : 2022/8/25
import json
from pyquery import PyQuery as pq
from lxml import etree
from urllib.parse import urljoin
import re
from jsonpath import jsonpath

PARSE_CACHE = True  # parse cache switch


class jsoup:
    def __init__(self,MY_URL=''):
        self.MY_URL = MY_URL
        self.pdfh_html = ''
        self.pdfa_html = ''
        self.pdfh_doc = None
        self.pdfa_doc = None

    def test(self, text:str, string:str):
        searchObj = re.search(rf'{text}', string, re.M | re.I)
        test_ret = True if searchObj else False
        return test_ret
    def pdfh(self,html,parse:str,add_url=False):
        if not parse:
            return ''
        if PARSE_CACHE:
            if self.pdfh_html != html:
                self.pdfh_html = html
                self.pdfh_doc = pq(html)
            doc = self.pdfh_doc
        else:
            doc = pq(html)
        if parse == 'body&&Text' or parse == 'Text':
            text = doc.text()
            return text
        elif parse == 'body&&Html' or parse == 'Html':
            return doc.html()
        option = None
        if parse.find('&&') > -1:
            option = parse.split('&&')[-1]
            parse = parse.split('&&')[:-1]  # with a single '&&' this leaves exactly one element
            if len(parse) > 1:  # a single element is likely just an option lookup; no :eq needed
                parse = ' '.join([i if self.test(':eq|:lt|:gt|#',i) else f'{i}:eq(0)' for i in parse])
            else:
                parse = parse[0] if self.test(':eq|:lt|:gt|#',parse[0]) else f'{parse[0]}:eq(0)'
        # FIXME: '||'-separated alternative attributes (as in jsonpath) are not supported yet
        if option:
            # print(f'parse:{parse}=>(option:{option})')
            if ':eq(-1)' in parse:
                # handle :eq(-1); compatibility is poor, only one :eq level is supported for now
                ret = doc(parse.replace(':eq(-1)','')).eq(-1)
            else:
                ret = doc(parse)
            # print(html)
            # FIXME: when several nodes match, the first should be taken automatically
            if option == 'Text':
                ret = ret.text()
            elif option == 'Html':
                ret = ret.html()
            else:
                ret = ret.attr(option) or ''
                if option.lower().find('style')>-1 and ret.find('url(')>-1:
                    try:
                        ret = re.search('url\((.*?)\)',ret,re.M|re.S).groups()[0]
                    except:
                        pass
                if ret and add_url:
                    # pd_list = 'url|src|href|data-original|data-src|data-play|data-url'.split('|')
                    # need_add = option in pd_list
                    need_add = re.search('(url|src|href|-original|-src|-play|-url)$', option, re.M | re.I)
                    # print(f'option:{option},need_add:{need_add}')
                    if need_add:
                        if 'http' in ret:
                            ret = ret[ret.find('http'):]
                        else:
                            ret = urljoin(self.MY_URL,ret)
            # print(ret)
        else:
            # ret = doc(parse+':first')
            ret = doc(parse)  # it's a generator; converting straight to str yields the first item, no next() needed
            # ret = ret.next()  # take the first item
            # ret = doc(parse)  # the commented variants below are wrong
            # ret = ret.find(':first')
            # ret = ret.children(':first')
            # print(parse)
            # ret = str(ret)
            ret = ret.outerHtml()
        return ret
    def pdfa(self,html,parse:str):
        # reading the official docs was the key to solving this!!!
        # https://pyquery.readthedocs.io/en/latest/api.html
        if not parse:
            return []
        if parse.find('&&') > -1:
            parse = parse.split('&&')  # re-join the '&&'-separated segments
            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
            parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
        print(f'pdfa:{parse}')
        # print(html)
        if PARSE_CACHE:
            if self.pdfa_html != html:
                self.pdfa_html = html
                self.pdfa_doc = pq(html)
            doc = self.pdfa_doc
        else:
            doc = pq(html)
        result = doc(parse)
        # node to string:
        # print(str(etree.tostring(result[0], pretty_print=True), 'utf-8'))
        # res = [item for item in result.items()]
        # print(res)
        res = [item.outerHtml() for item in result.items()]  # this is the correct way!! str(item) and str(etree.tostring(...)) are all wrong
        # res = [str(item) for item in result.items()]
        # res = [str(etree.tostring(item, pretty_print=True), 'utf-8') for item in result]
        # print(len(res),res)
        # print('pdfa result count:', len(res))
        return res
    def pd(self,html,parse:str):
        return self.pdfh(html,parse,True)

    def pq(self,html:str):
        return pq(html)

    def pjfh(self,html,parse:str,add_url=False):
        if not parse:
            return ''
        if isinstance(html,str):
            # print(html)
            try:
                html = json.loads(html)
                # html = eval(html)
            except:
                print('failed to convert string to json')
                return ''
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        ret = ''
        for ps in parse.split('||'):
            ret = jsonpath(html,ps)
            if isinstance(ret,list):
                ret = str(ret[0]) if ret[0] else ''
            else:
                ret = str(ret) if ret else ''
            if add_url and ret:
                ret = urljoin(self.MY_URL, ret)
            if ret:
                break
        # print(ret)
        return ret
    def pj(self, html, parse:str):
        return self.pjfh(html, parse, True)

    def pjfa(self,html,parse:str):
        if not parse:
            return []
        if isinstance(html,str):
            try:
                html = json.loads(html)
            except:
                return ''
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        # print(html)
        # print(parse)
        ret = jsonpath(html,parse)
        # print(ret)
        # print(type(ret))
        # print(type(ret[0]))
        # print(len(ret))
        if isinstance(ret,list) and isinstance(ret[0],list) and len(ret) == 1:
            # print('auto-unwrap')
            ret = ret[0]  # auto-unwrap a single nested list
        return ret or []
if __name__ == '__main__':
    import requests
    from parsel import Selector
    url = 'http://360yy.cn'
    jsp = jsoup(url)

    def pdfa2(html,parse):
        if not parse:
            return []
        if parse.find('&&') > -1:
            parse = parse.split('&&')  # re-join the '&&'-separated segments
            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
            # parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
            parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))])
            # print(f'pdfa:{parse}')
        selector = Selector(text=html)
        print(parse)
        items = selector.css(parse)
        return [str(item) for item in items]

    r = requests.get(url)
    html = r.text
    # parsel is awkward here; it's hard to wrap a pdfa-like helper on top of it
    items = pdfa2(html,'.fed-pops-navbar&&ul.fed-part-rows&&a')
    print(items)
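For the jsonpath side, which this commit leaves mostly untouched, the '||' fallback in pjfh and the urljoin behaviour of pj work roughly like this (a sketch with made-up data; assumes the class above is importable as htmlParser):

from htmlParser import jsoup

data = '{"data":{"list":[{"name":"a","url":"/play/1"}]}}'
jsp = jsoup('https://example.com')  # hypothetical base URL
# '||' tries each jsonpath alternative until one yields a truthy value
print(jsp.pjfh(data, '$.data.list[0].title||$.data.list[0].name'))  # -> 'a'
# pj() additionally urljoins relative results against MY_URL
print(jsp.pj(data, 'data.list[0].url'))  # -> 'https://example.com/play/1'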
The diff for htmlParser.py:

@@ -3,18 +3,20 @@
 # File : htmlParser.py
 # Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
 # Date : 2022/8/25
-import json
+import ujson
 from pyquery import PyQuery as pq
-from lxml import etree
 from urllib.parse import urljoin
 import re
 from jsonpath import jsonpath

 PARSE_CACHE = True  # parse cache switch
+NOADD_INDEX = ':eq|:lt|:gt|^body$|^#'  # selectors that must not get an automatic :eq index
+URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url)$'  # attributes whose values are auto-urljoined

 class jsoup:
-    def __init__(self,MY_URL=''):
+    def __init__(self, MY_URL=''):
         self.MY_URL = MY_URL
         self.pdfh_html = ''
         self.pdfa_html = ''
@@ -22,13 +24,19 @@ class jsoup:
         self.pdfh_doc = None
         self.pdfa_doc = None

-    def test(self, text:str, string:str):
+    def test(self, text: str, string: str):
+        """
+        regex containment test on a string, mimicking js //.test()
+        :param text:
+        :param string:
+        :return:
+        """
         searchObj = re.search(rf'{text}', string, re.M | re.I)
         test_ret = True if searchObj else False
         return test_ret

-    def pdfh(self,html,parse:str,add_url=False):
-        if not parse:
+    def pdfh(self, html, parse: str, add_url=False):
+        if not all([html, parse]):
             return ''
         if PARSE_CACHE:
             if self.pdfh_html != html:
@@ -42,71 +50,98 @@ class jsoup:
             return text
         elif parse == 'body&&Html' or parse == 'Html':
             return doc.html()
         option = None
         if parse.find('&&') > -1:
             option = parse.split('&&')[-1]
-            parse = parse.split('&&')[:-1]  # with a single '&&' this leaves exactly one element
-            if len(parse) > 1:  # a single element is likely just an option lookup; no :eq needed
-                parse = ' '.join([i if self.test(':eq|:lt|:gt|#',i) else f'{i}:eq(0)' for i in parse])
-            else:
-                parse = parse[0] if self.test(':eq|:lt|:gt|#',parse[0]) else f'{parse[0]}:eq(0)'
-        # FIXME: '||'-separated alternative attributes (as in jsonpath) are not supported yet
+            parse = '&&'.join(parse.split('&&')[:-1])
+        parse = self.parseHikerToJq(parse, True)
+        # print(f'pdfh:{parse},option:{option}')
+        parses = parse.split(' ')
+        ret = None
+        for nparse in parses:
+            ret = self.parseOneRule(doc, nparse, ret)
+            # print(nparse,ret)
         if option:
-            # print(f'parse:{parse}=>(option:{option})')
-            if ':eq(-1)' in parse:
-                # handle :eq(-1); compatibility is poor, only one :eq level is supported for now
-                ret = doc(parse.replace(':eq(-1)','')).eq(-1)
-            else:
-                ret = doc(parse)
-            # print(html)
-            # FIXME: when several nodes match, the first should be taken automatically
             if option == 'Text':
                 ret = ret.text()
             elif option == 'Html':
                 ret = ret.html()
             else:
                 ret = ret.attr(option) or ''
-                if option.lower().find('style')>-1 and ret.find('url(')>-1:
+                if option.lower().find('style') > -1 and ret.find('url(') > -1:
                     try:
-                        ret = re.search('url\((.*?)\)',ret,re.M|re.S).groups()[0]
+                        ret = re.search('url\((.*?)\)', ret, re.M | re.S).groups()[0]
                     except:
                         pass
                 if ret and add_url:
-                    # pd_list = 'url|src|href|data-original|data-src|data-play|data-url'.split('|')
-                    # need_add = option in pd_list
-                    need_add = re.search('(url|src|href|-original|-src|-play|-url)$', option, re.M | re.I)
-                    # print(f'option:{option},need_add:{need_add}')
+                    need_add = re.search(URLJOIN_ATTR, option, re.M | re.I)
                     if need_add:
                         if 'http' in ret:
                             ret = ret[ret.find('http'):]
                         else:
-                            ret = urljoin(self.MY_URL,ret)
-            # print(ret)
+                            ret = urljoin(self.MY_URL, ret)
         else:
-            # ret = doc(parse+':first')
-            ret = doc(parse)  # it's a generator; converting straight to str yields the first item, no next() needed
-            # ret = ret.next()  # take the first item
-            # ret = doc(parse)  # the commented variants below are wrong
-            # ret = ret.find(':first')
-            # ret = ret.children(':first')
-            # print(parse)
-            # ret = str(ret)
             ret = ret.outerHtml()
         return ret

-    def pdfa(self,html,parse:str):
+    def parseOneRule(self, doc, nparse, ret=None):
+        """
+        apply one rule of the space-separated native expression, handling the :eq index; returns the updated ret
+        :param doc: the pq object obtained from pq(html)
+        :param nparse: the current single parse expression
+        :param ret: the accumulated pq result
+        :return:
+        """
+        if self.test(':eq', nparse):
+            nparse_rule = nparse.split(':eq')[0]
+            nparse_index = nparse.split(':eq')[1].split('(')[1].split(')')[0]
+            try:
+                nparse_index = int(nparse_index)
+            except:
+                nparse_index = 0
+            if not ret:
+                ret = doc(nparse_rule).eq(nparse_index)
+            else:
+                ret = ret(nparse_rule)
+        else:
+            if not ret:
+                ret = doc(nparse)
+            else:
+                ret = ret(nparse)
+        return ret
+
+    def parseHikerToJq(self, parse, first=False):
+        """
+        convert a Hiker parse expression into a native one, auto-appending :eq; with first=True the last segment also gets :eq(0)
+        :param parse:
+        :param first:
+        :return:
+        """
+        if parse.find('&&') > -1:
+            parse = parse.split('&&')  # re-join the '&&'-separated segments
+            new_parses = []  # build the new list of parse expressions
+            for i in range(len(parse)):
+                ps = parse[i].split(' ')[-1]  # if a segment contains spaces, test only its last element
+                if not self.test(NOADD_INDEX, ps):
+                    if not first and i >= len(parse) - 1:  # without first, the last segment gets no :eq(0)
+                        new_parses.append(parse[i])
+                    else:
+                        new_parses.append(f'{parse[i]}:eq(0)')
+                else:
+                    new_parses.append(parse[i])
+            parse = ' '.join(new_parses)
+        return parse
+
+    def pdfa(self, html, parse: str):
         # reading the official docs was the key to solving this!!!
         # https://pyquery.readthedocs.io/en/latest/api.html
-        if not parse:
+        if not all([html, parse]):
             return []
-        if parse.find('&&') > -1:
-            parse = parse.split('&&')  # re-join the '&&'-separated segments
-            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
-            parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
+        parse = self.parseHikerToJq(parse)
         print(f'pdfa:{parse}')
-        # print(html)
         if PARSE_CACHE:
             if self.pdfa_html != html:
                 self.pdfa_html = html
@@ -114,32 +149,29 @@ class jsoup:
             doc = self.pdfa_doc
         else:
             doc = pq(html)
-        result = doc(parse)
-        # node to string:
-        # print(str(etree.tostring(result[0], pretty_print=True), 'utf-8'))
-        # res = [item for item in result.items()]
-        # print(res)
-        res = [item.outerHtml() for item in result.items()]  # this is the correct way!! str(item) and str(etree.tostring(...)) are all wrong
-        # res = [str(item) for item in result.items()]
-        # res = [str(etree.tostring(item, pretty_print=True), 'utf-8') for item in result]
-        # print(len(res),res)
-        # print('pdfa result count:', len(res))
+        parses = parse.split(' ')
+        ret = None
+        for nparse in parses:
+            ret = self.parseOneRule(doc, nparse, ret)
+            # print(len(ret),nparse)
+        res = [item.outerHtml() for item in ret.items()]
        return res

-    def pd(self,html,parse:str):
-        return self.pdfh(html,parse,True)
+    def pd(self, html, parse: str):
+        return self.pdfh(html, parse, True)

-    def pq(self,html:str):
+    def pq(self, html: str):
         return pq(html)

-    def pjfh(self,html,parse:str,add_url=False):
-        if not parse:
+    def pjfh(self, html, parse: str, add_url=False):
+        if not all([html, parse]):
             return ''
-        if isinstance(html,str):
+        if isinstance(html, str):
             # print(html)
             try:
-                html = json.loads(html)
+                html = ujson.loads(html)
                 # html = eval(html)
             except:
                 print('failed to convert string to json')
                 return ''
@@ -147,8 +179,8 @@ class jsoup:
             parse = f'$.{parse}'
         ret = ''
         for ps in parse.split('||'):
-            ret = jsonpath(html,ps)
-            if isinstance(ret,list):
+            ret = jsonpath(html, ps)
+            if isinstance(ret, list):
                 ret = str(ret[0]) if ret[0] else ''
             else:
                 ret = str(ret) if ret else ''
@@ -159,52 +191,31 @@ class jsoup:
         # print(ret)
         return ret

-    def pj(self, html, parse:str):
+    def pj(self, html, parse: str):
         return self.pjfh(html, parse, True)

-    def pjfa(self,html,parse:str):
-        if not parse:
+    def pjfa(self, html, parse: str):
+        if not all([html, parse]):
             return []
-        if isinstance(html,str):
+        if isinstance(html, str):
             try:
-                html = json.loads(html)
+                html = ujson.loads(html)
             except:
-                return ''
+                return []
         if not parse.startswith('$.'):
             parse = f'$.{parse}'
         # print(html)
         # print(parse)
-        ret = jsonpath(html,parse)
+        ret = jsonpath(html, parse)
         # print(ret)
         # print(type(ret))
         # print(type(ret[0]))
         # print(len(ret))
-        if isinstance(ret,list) and isinstance(ret[0],list) and len(ret) == 1:
+        if isinstance(ret, list) and isinstance(ret[0], list) and len(ret) == 1:
             # print('auto-unwrap')
             ret = ret[0]  # auto-unwrap a single nested list
         return ret or []

-if __name__ == '__main__':
-    import requests
-    from parsel import Selector
-    url = 'http://360yy.cn'
-    jsp = jsoup(url)
-
-    def pdfa2(html,parse):
-        if not parse:
-            return []
-        if parse.find('&&') > -1:
-            parse = parse.split('&&')  # re-join the '&&'-separated segments
-            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
-            # parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
-            parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))])
-            # print(f'pdfa:{parse}')
-        selector = Selector(text=html)
-        print(parse)
-        items = selector.css(parse)
-        return [str(item) for item in items]
-
-    r = requests.get(url)
-    html = r.text
-    # parsel is awkward here; it's hard to wrap a pdfa-like helper on top of it
-    items = pdfa2(html,'.fed-pops-navbar&&ul.fed-part-rows&&a')
-    print(items)
+if __name__ == '__main__':
+    pass
\ No newline at end of file
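The heart of the rewrite is parseHikerToJq plus parseOneRule: the Hiker-style '&&' expression is first rewritten into a space-separated pyquery selector chain (every segment except possibly the last gets :eq(0) unless it already carries :eq/:lt/:gt, is exactly body, or is an #id selector), and the segments are then applied one at a time so each :eq index goes through pyquery's .eq(), which accepts negative values. A small sketch of the rewriting step (assuming the updated class is importable):

from htmlParser import jsoup

jsp = jsoup()
print(jsp.parseHikerToJq('.fed-pops-navbar&&ul.fed-part-rows&&a'))
# -> '.fed-pops-navbar:eq(0) ul.fed-part-rows:eq(0) a'
print(jsp.parseHikerToJq('ul&&li:eq(-1)', first=True))
# -> 'ul:eq(0) li:eq(-1)'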