From 44300bfd16e7b7a44b8d623820e3ea8e6064f175 Mon Sep 17 00:00:00 2001
From: hjdhnx
Date: Mon, 14 Nov 2022 12:28:56 +0800
Subject: [PATCH] Make the pdfa/pdfh parsing helpers behind js0 more precise
 and support negative :eq indexes; fix the 看视界 rule
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 base/rules.db                                |  Bin 155648 -> 155648 bytes
 js/version.txt                               |    2 +-
 "js/\347\234\213\350\247\206\347\225\214.js" |    1 +
 utils/htmlParseerOld.py                      |  210 +++++++++++++++++++
 utils/htmlParser.py                          |  203 +++++++++---------
 5 files changed, 319 insertions(+), 97 deletions(-)
 create mode 100644 utils/htmlParseerOld.py

diff --git a/base/rules.db b/base/rules.db
index 0e258cc744ce58a58e3a36db4711252258b84def..8d21abea992a12d4ec8377e78c15cf25b9bd077f 100644
GIT binary patch
delta 652
[base85 binary delta data omitted]

delta 543
[base85 binary delta data omitted]

diff --git a/js/version.txt b/js/version.txt
index 596757a..5553d8c 100644
--- a/js/version.txt
+++ b/js/version.txt
@@ -1 +1 @@
-3.9.20beta8
\ No newline at end of file
+3.8.8
\ No newline at end of file
diff --git "a/js/\347\234\213\350\247\206\347\225\214.js" "b/js/\347\234\213\350\247\206\347\225\214.js"
index bafa3ad..ce668a5 100644
--- "a/js/\347\234\213\350\247\206\347\225\214.js"
+++ "b/js/\347\234\213\350\247\206\347\225\214.js"
@@ -1,4 +1,5 @@
 var rule = Object.assign(muban.mxone5,{
 title:'看视界',
 host:'https://www.1080kan.cc',
+headers:{'User-Agent':'MOBILE_UA'},
 });
\ No newline at end of file
diff --git a/utils/htmlParseerOld.py b/utils/htmlParseerOld.py
new file mode 100644
index 0000000..6ae0a09
--- /dev/null
+++ b/utils/htmlParseerOld.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# File : htmlParser.py
+# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
+# Date : 2022/8/25
+import json
+
+from pyquery import PyQuery as pq
+from lxml import etree
+from urllib.parse import urljoin
+import re
+from jsonpath import jsonpath
+
+PARSE_CACHE = True # parse cache
+
+class jsoup:
+    def __init__(self,MY_URL=''):
+        self.MY_URL = MY_URL
+        self.pdfh_html = ''
+        self.pdfa_html = ''
+
+        self.pdfh_doc = None
+        self.pdfa_doc = None
+
+    def test(self, text:str, string:str):
+        searchObj = re.search(rf'{text}', string, re.M | re.I)
+        test_ret = True if searchObj else False
+        return test_ret
+
+    def pdfh(self,html,parse:str,add_url=False):
+        if not parse:
+            return ''
+        if PARSE_CACHE:
+            if self.pdfh_html != html:
+                self.pdfh_html = html
+                self.pdfh_doc = pq(html)
+            doc = self.pdfh_doc
+        else:
+            doc = pq(html)
+        if parse == 'body&&Text' or parse == 'Text':
+            text = doc.text()
+            return text
+        elif parse == 'body&&Html' or parse == 'Html':
+            return doc.html()
+        option = None
+        if parse.find('&&') > -1:
+            option = parse.split('&&')[-1]
+            parse = parse.split('&&')[:-1] # with a single '&&', element 0 is taken directly
+            if len(parse) > 1: # length <= 1 probably means an option op; no need to append :eq
+                parse = ' '.join([i if self.test(':eq|:lt|:gt|#',i) else f'{i}:eq(0)' for i in parse])
+            else:
+                parse = parse[0] if self.test(':eq|:lt|:gt|#',parse[0]) else f'{parse[0]}:eq(0)'
+        # FIXME jsonpath-style '||' alternation for attributes is not supported yet
+        if option:
+            # print(f'parse:{parse}=>(option:{option})')
+            if ':eq(-1)' in parse:
+                # handle :eq(-1); poor compatibility, only one eq level is supported for now
+                ret = doc(parse.replace(':eq(-1)','')).eq(-1)
+            else:
+                ret = doc(parse)
+            # print(html)
+            # FIXME when several nodes match, the first one should be taken automatically
+            if option == 'Text':
+                ret = ret.text()
+            elif option == 'Html':
+                ret = ret.html()
+            else:
+                ret = ret.attr(option) or ''
+            if option.lower().find('style')>-1 and ret.find('url(')>-1:
+                try:
+                    ret = re.search('url\((.*?)\)',ret,re.M|re.S).groups()[0]
+                except:
+                    pass
+
+            if ret and add_url:
+                # pd_list = 'url|src|href|data-original|data-src|data-play|data-url'.split('|')
+                # need_add = option in pd_list
+
+                need_add = re.search('(url|src|href|-original|-src|-play|-url)$', option, re.M | re.I)
+                # print(f'option:{option},need_add:{need_add}')
+                if need_add:
+                    if 'http' in ret:
+                        ret = ret[ret.find('http'):]
+                    else:
+                        ret = urljoin(self.MY_URL,ret)
+                # print(ret)
+        else:
+            # ret = doc(parse+':first')
+            ret = doc(parse) # generator-like: str() already yields the first item, no next() needed
+            # ret = ret.next() # take the first item
+            # ret = doc(parse) # the commented-out variants below are wrong
+            # ret = ret.find(':first')
+            # ret = ret.children(':first')
+            # print(parse)
+            # ret = str(ret)
+            ret = ret.outerHtml()
+        return ret
+
+    def pdfa(self,html,parse:str):
+        # reading the official docs was the key to solving this!!!
+        # https://pyquery.readthedocs.io/en/latest/api.html
+        if not parse:
+            return []
+        if parse.find('&&') > -1:
+            parse = parse.split('&&') # split on '&&' and re-join below
+            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
+            parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
+        print(f'pdfa:{parse}')
+        # print(html)
+        if PARSE_CACHE:
+            if self.pdfa_html != html:
+                self.pdfa_html = html
+                self.pdfa_doc = pq(html)
+            doc = self.pdfa_doc
+        else:
+            doc = pq(html)
+        result = doc(parse)
+        # node to string
+        # print(str(etree.tostring(result[0], pretty_print=True), 'utf-8'))
+        # res = [item for item in result.items()]
+        # print(res)
+        res = [item.outerHtml() for item in result.items()] # this is the correct way!! str(item) and str(etree.tostring(...)) are all wrong
+        # res = [str(item) for item in result.items()]
+        # res = [str(etree.tostring(item, pretty_print=True), 'utf-8') for item in result]
+        # print(len(res),res)
+        # print('pdfa result count:',len(res))
+        return res
+
+    def pd(self,html,parse:str):
+        return self.pdfh(html,parse,True)
+
+    def pq(self,html:str):
+        return pq(html)
+
+    def pjfh(self,html,parse:str,add_url=False):
+        if not parse:
+            return ''
+        if isinstance(html,str):
+            # print(html)
+            try:
+                html = json.loads(html)
+                # html = eval(html)
+            except:
+                print('failed to convert the string to json')
+                return ''
+        if not parse.startswith('$.'):
+            parse = f'$.{parse}'
+        ret = ''
+        for ps in parse.split('||'):
+            ret = jsonpath(html,ps)
+            if isinstance(ret,list):
+                ret = str(ret[0]) if ret[0] else ''
+            else:
+                ret = str(ret) if ret else ''
+            if add_url and ret:
+                ret = urljoin(self.MY_URL, ret)
+            if ret:
+                break
+        # print(ret)
+        return ret
+
+    def pj(self, html, parse:str):
+        return self.pjfh(html, parse, True)
+
+    def pjfa(self,html,parse:str):
+        if not parse:
+            return []
+        if isinstance(html,str):
+            try:
+                html = json.loads(html)
+            except:
+                return ''
+        if not parse.startswith('$.'):
+            parse = f'$.{parse}'
+        # print(html)
+        # print(parse)
+        ret = jsonpath(html,parse)
+        # print(ret)
+        # print(type(ret))
+        # print(type(ret[0]))
+        # print(len(ret))
+        if isinstance(ret,list) and isinstance(ret[0],list) and len(ret) == 1:
+            # print('auto-unwrap')
+            ret = ret[0] # auto-unwrap
+        return ret or []
+
+if __name__ == '__main__':
+    import requests
+    from parsel import Selector
+    url = 'http://360yy.cn'
+    jsp = jsoup(url)
+    def pdfa2(html,parse):
+        if not parse:
+            return []
+        if parse.find('&&') > -1:
+            parse = parse.split('&&') # split on '&&' and re-join below
+            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
+            # parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
+            parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))])
+            # print(f'pdfa:{parse}')
+        selector = Selector(text=html)
+        print(parse)
+        items = selector.css(parse)
+        return [str(item) for item in items]
+    r = requests.get(url)
+    html = r.text
+    # parsel is awkward here: hard to wrap a pdfa-style helper around it
+    items = pdfa2(html,'.fed-pops-navbar&&ul.fed-part-rows&&a')
+    print(items)
diff --git a/utils/htmlParser.py b/utils/htmlParser.py
index 6ae0a09..4012e31 100644
--- a/utils/htmlParser.py
+++ b/utils/htmlParser.py
@@ -3,18 +3,20 @@
 # File : htmlParser.py
 # Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
 # Date : 2022/8/25
-import json
+import ujson
 
 from pyquery import PyQuery as pq
-from lxml import etree
 from urllib.parse import urljoin
 import re
 from jsonpath import jsonpath
 
 PARSE_CACHE = True # parse cache
+NOADD_INDEX = ':eq|:lt|:gt|^body$|^#' # selectors that must not get an automatic :eq index
+URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url)$' # attributes that need an automatic urljoin
+
 
 class jsoup:
-    def __init__(self,MY_URL=''):
+    def __init__(self, MY_URL=''):
         self.MY_URL = MY_URL
         self.pdfh_html = ''
         self.pdfa_html = ''
@@ -22,13 +24,19 @@ class jsoup:
         self.pdfh_doc = None
         self.pdfa_doc = None
 
-    def test(self, text:str, string:str):
+    def test(self, text: str, string: str):
+        """
+        Regex containment test on a string, mimicking JS //.test()
+        :param text:
+        :param string:
+        :return:
+        """
         searchObj = re.search(rf'{text}', string, re.M | re.I)
         test_ret = True if searchObj else False
         return test_ret
 
-    def pdfh(self,html,parse:str,add_url=False):
-        if not parse:
+    def pdfh(self, html, parse: str, add_url=False):
+        if not all([html, parse]):
             return ''
         if PARSE_CACHE:
             if self.pdfh_html != html:
@@ -42,71 +50,98 @@ class jsoup:
             return text
         elif parse == 'body&&Html' or parse == 'Html':
             return doc.html()
+
         option = None
         if parse.find('&&') > -1:
             option = parse.split('&&')[-1]
-            parse = parse.split('&&')[:-1] # with a single '&&', element 0 is taken directly
-            if len(parse) > 1: # length <= 1 probably means an option op; no need to append :eq
-                parse = ' '.join([i if self.test(':eq|:lt|:gt|#',i) else f'{i}:eq(0)' for i in parse])
-            else:
-                parse = parse[0] if self.test(':eq|:lt|:gt|#',parse[0]) else f'{parse[0]}:eq(0)'
-        # FIXME jsonpath-style '||' alternation for attributes is not supported yet
+            parse = '&&'.join(parse.split('&&')[:-1])
+        parse = self.parseHikerToJq(parse, True)
+        # print(f'pdfh:{parse},option:{option}')
+        parses = parse.split(' ')
+        ret = None
+        for nparse in parses:
+            ret = self.parseOneRule(doc, nparse, ret)
+            # print(nparse,ret)
+
         if option:
-            # print(f'parse:{parse}=>(option:{option})')
-            if ':eq(-1)' in parse:
-                # handle :eq(-1); poor compatibility, only one eq level is supported for now
-                ret = doc(parse.replace(':eq(-1)','')).eq(-1)
-            else:
-                ret = doc(parse)
-            # print(html)
-            # FIXME when several nodes match, the first one should be taken automatically
             if option == 'Text':
                 ret = ret.text()
             elif option == 'Html':
                 ret = ret.html()
             else:
                 ret = ret.attr(option) or ''
-            if option.lower().find('style')>-1 and ret.find('url(')>-1:
+            if option.lower().find('style') > -1 and ret.find('url(') > -1:
                 try:
-                    ret = re.search('url\((.*?)\)',ret,re.M|re.S).groups()[0]
+                    ret = re.search(r'url\((.*?)\)', ret, re.M | re.S).groups()[0]
                 except:
                     pass
 
             if ret and add_url:
-                # pd_list = 'url|src|href|data-original|data-src|data-play|data-url'.split('|')
-                # need_add = option in pd_list
-
-                need_add = re.search('(url|src|href|-original|-src|-play|-url)$', option, re.M | re.I)
-                # print(f'option:{option},need_add:{need_add}')
+                need_add = re.search(URLJOIN_ATTR, option, re.M | re.I)
                 if need_add:
                     if 'http' in ret:
                         ret = ret[ret.find('http'):]
                     else:
-                        ret = urljoin(self.MY_URL,ret)
-                # print(ret)
+                        ret = urljoin(self.MY_URL, ret)
         else:
-            # ret = doc(parse+':first')
-            ret = doc(parse) # generator-like: str() already yields the first item, no next() needed
-            # ret = ret.next() # take the first item
-            # ret = doc(parse) # the commented-out variants below are wrong
-            # ret = ret.find(':first')
-            # ret = ret.children(':first')
-            # print(parse)
-            # ret = str(ret)
             ret = ret.outerHtml()
         return ret
 
-    def pdfa(self,html,parse:str):
+    def parseOneRule(self, doc, nparse, ret=None):
+        """
+        Parse a single rule of the space-separated native expression, handling the :eq index correctly, and return the updated ret
+        :param doc: pq object loaded via pq(html)
+        :param nparse: the current single parse expression
+        :param ret: the intermediate pq result
+        :return:
+        """
+        if self.test(':eq', nparse):
+            nparse_rule = nparse.split(':eq')[0]
+            nparse_index = nparse.split(':eq')[1].split('(')[1].split(')')[0]
+            try:
+                nparse_index = int(nparse_index)
+            except:
+                nparse_index = 0
+            if not ret:
+                ret = doc(nparse_rule).eq(nparse_index)
+            else:
+                ret = ret(nparse_rule).eq(nparse_index)
+        else:
+            if not ret:
+                ret = doc(nparse)
+            else:
+                ret = ret(nparse)
+        return ret
+
+    def parseHikerToJq(self, parse, first=False):
+        """
+        Convert a Hiker parse expression into a native pyquery one, appending :eq(0) automatically; with first=True the last selector also gets :eq(0)
+        :param parse:
+        :param first:
+        :return:
+        """
+        if parse.find('&&') > -1:
+            parse = parse.split('&&') # split on '&&' and re-join below
+            new_parses = [] # build the new list of parse expressions
+            for i in range(len(parse)):
+                ps = parse[i].split(' ')[-1] # if a '&&' part contains spaces, test only its last token
+                if not self.test(NOADD_INDEX, ps):
+                    if not first and i >= len(parse) - 1: # without first, the last part needs no :eq(0)
+                        new_parses.append(parse[i])
+                    else:
+                        new_parses.append(f'{parse[i]}:eq(0)')
+                else:
+                    new_parses.append(parse[i])
+            parse = ' '.join(new_parses)
+        return parse
+
+    def pdfa(self, html, parse: str):
         # reading the official docs was the key to solving this!!!
         # https://pyquery.readthedocs.io/en/latest/api.html
-        if not parse:
+        if not all([html, parse]):
             return []
-        if parse.find('&&') > -1:
-            parse = parse.split('&&') # split on '&&' and re-join below
-            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
-            parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
+        parse = self.parseHikerToJq(parse)
         print(f'pdfa:{parse}')
-        # print(html)
         if PARSE_CACHE:
             if self.pdfa_html != html:
                 self.pdfa_html = html
@@ -114,32 +149,29 @@ class jsoup:
             doc = self.pdfa_doc
         else:
             doc = pq(html)
-        result = doc(parse)
-        # node to string
-        # print(str(etree.tostring(result[0], pretty_print=True), 'utf-8'))
-        # res = [item for item in result.items()]
-        # print(res)
-        res = [item.outerHtml() for item in result.items()] # this is the correct way!! str(item) and str(etree.tostring(...)) are all wrong
-        # res = [str(item) for item in result.items()]
-        # res = [str(etree.tostring(item, pretty_print=True), 'utf-8') for item in result]
-        # print(len(res),res)
-        # print('pdfa result count:',len(res))
+
+        parses = parse.split(' ')
+        ret = None
+        for nparse in parses:
+            ret = self.parseOneRule(doc, nparse, ret)
+            # print(len(ret),nparse)
+        res = [item.outerHtml() for item in ret.items()]
         return res
 
-    def pd(self,html,parse:str):
-        return self.pdfh(html,parse,True)
+    def pd(self, html, parse: str):
+        return self.pdfh(html, parse, True)
 
-    def pq(self,html:str):
+    def pq(self, html: str):
         return pq(html)
 
-    def pjfh(self,html,parse:str,add_url=False):
-        if not parse:
+    def pjfh(self, html, parse: str, add_url=False):
+        if not all([html, parse]):
             return ''
-        if isinstance(html,str):
+        if isinstance(html, str):
             # print(html)
             try:
-                html = json.loads(html)
-                # html = eval(html)
+                html = ujson.loads(html)
+                # html = eval(html)
             except:
                 print('failed to convert the string to json')
                 return ''
@@ -147,8 +179,8 @@ class jsoup:
             parse = f'$.{parse}'
         ret = ''
         for ps in parse.split('||'):
-            ret = jsonpath(html,ps)
-            if isinstance(ret,list):
+            ret = jsonpath(html, ps)
+            if isinstance(ret, list):
                 ret = str(ret[0]) if ret[0] else ''
             else:
                 ret = str(ret) if ret else ''
@@ -159,52 +191,31 @@ class jsoup:
         # print(ret)
         return ret
 
-    def pj(self, html, parse:str):
+    def pj(self, html, parse: str):
         return self.pjfh(html, parse, True)
 
-    def pjfa(self,html,parse:str):
-        if not parse:
+    def pjfa(self, html, parse: str):
+        if not all([html, parse]):
             return []
-        if isinstance(html,str):
+        if isinstance(html, str):
             try:
-                html = json.loads(html)
+                html = ujson.loads(html)
             except:
-                return ''
+                return []
         if not parse.startswith('$.'):
             parse = f'$.{parse}'
         # print(html)
         # print(parse)
-        ret = jsonpath(html,parse)
+        ret = jsonpath(html, parse)
         # print(ret)
         # print(type(ret))
         # print(type(ret[0]))
         # print(len(ret))
-        if isinstance(ret,list) and isinstance(ret[0],list) and len(ret) == 1:
+        if isinstance(ret, list) and isinstance(ret[0], list) and len(ret) == 1:
             # print('auto-unwrap')
-            ret = ret[0] # auto-unwrap
+            ret = ret[0]  # auto-unwrap
         return ret or []
 
-if __name__ == '__main__':
-    import requests
-    from parsel import Selector
-    url = 'http://360yy.cn'
-    jsp = jsoup(url)
-    def pdfa2(html,parse):
-        if not parse:
-            return []
-        if parse.find('&&') > -1:
-            parse = parse.split('&&') # split on '&&' and re-join below
-            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
-            # parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
-            parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))])
-            # print(f'pdfa:{parse}')
-        selector = Selector(text=html)
-        print(parse)
-        items = selector.css(parse)
-        return [str(item) for item in items]
-    r = requests.get(url)
-    html = r.text
-    # parsel is awkward here: hard to wrap a pdfa-style helper around it
-    items = pdfa2(html,'.fed-pops-navbar&&ul.fed-part-rows&&a')
-    print(items)
+if __name__ == '__main__':
+    pass
\ No newline at end of file
-- 
GitLab
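
Reviewer note: below is a minimal sketch of what the new parseHikerToJq conversion produces, assuming utils.htmlParser is importable from the project root. The selector strings are invented examples, not rules shipped in this patch.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Sketch: how parseHikerToJq expands '&&' expressions (illustrative selectors only).
from utils.htmlParser import jsoup

jsp = jsoup()

# Every '&&' part except the last gets :eq(0) appended...
print(jsp.parseHikerToJq('.fed-list-info&&li&&a'))
# -> '.fed-list-info:eq(0) li:eq(0) a'

# ...and with first=True (as pdfh uses it) the last part gets :eq(0) too.
print(jsp.parseHikerToJq('.fed-list-info&&li&&a', True))
# -> '.fed-list-info:eq(0) li:eq(0) a:eq(0)'

# Parts already carrying :eq/:lt/:gt, a bare 'body', or an #id match NOADD_INDEX
# and stay untouched, which is what lets negative indexes like :eq(-1) through.
print(jsp.parseHikerToJq('body&&#content&&a:eq(-1)', True))
# -> 'body #content a:eq(-1)'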
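
And a small smoke test for the reworked pdfh/pdfa/pd helpers themselves. The HTML fragment is invented for illustration, and the expected outputs assume pyquery's eq() accepts negative indexes, which is what this patch relies on.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Sketch: pdfh/pdfa/pd behaviour on a toy document (all data below is invented).
from utils.htmlParser import jsoup

HTML = '''
<div class="list">
  <a href="/vod/1.html">First</a>
  <a href="/vod/2.html">Second</a>
  <a href="/vod/3.html">Last</a>
</div>
'''

jsp = jsoup('https://www.1080kan.cc')  # base URL, used for urljoin in pd/pj

# pdfa returns the outer HTML of every match; '.list' gets :eq(0), 'a' stays open.
print(jsp.pdfa(HTML, '.list&&a'))  # three <a ...> strings

# pdfh with the new negative-index support: :eq(-1) selects the last <a>.
print(jsp.pdfh(HTML, '.list&&a:eq(-1)&&Text'))  # 'Last'

# pd is pdfh with add_url=True; 'href' matches URLJOIN_ATTR, so it gets joined.
print(jsp.pd(HTML, '.list&&a:eq(-1)&&href'))  # 'https://www.1080kan.cc/vod/3.html'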