diff --git a/base/rules.db b/base/rules.db
index 0e258cc744ce58a58e3a36db4711252258b84def..8d21abea992a12d4ec8377e78c15cf25b9bd077f 100644
Binary files a/base/rules.db and b/base/rules.db differ
diff --git a/js/version.txt b/js/version.txt
index 596757a1ba6f53fe7f5926924d4f1c1bb5936a4a..5553d8c45a922ca3f0d077fd91340ecaa5a1c438 100644
--- a/js/version.txt
+++ b/js/version.txt
@@ -1 +1 @@
-3.9.20beta8
\ No newline at end of file
+3.8.8
\ No newline at end of file
diff --git "a/js/\347\234\213\350\247\206\347\225\214.js" "b/js/\347\234\213\350\247\206\347\225\214.js"
index bafa3ad4f2548f9a9f06f21bba01041f1a7d08d5..ce668a5e339cf31f20663ff6ae5c93455ba5841e 100644
--- "a/js/\347\234\213\350\247\206\347\225\214.js"
+++ "b/js/\347\234\213\350\247\206\347\225\214.js"
@@ -1,4 +1,5 @@
 var rule = Object.assign(muban.mxone5,{
 title:'看视界',
 host:'https://www.1080kan.cc',
+headers:{'User-Agent':'MOBILE_UA'},
 });
\ No newline at end of file
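The 看视界.js hunk above layers a mobile User-Agent onto the shared muban.mxone5 template via Object.assign, a shallow merge in which the override keys win. A minimal Python sketch of that merge semantics follows; the base-template keys are assumptions for illustration, not the real contents of muban.mxone5.

# sketch: Object.assign-style shallow merge (assumed template keys, not part of the patch)
base_template = {'title': '', 'host': '', 'headers': {}}  # stand-in for muban.mxone5
overrides = {
    'title': '看视界',
    'host': 'https://www.1080kan.cc',
    'headers': {'User-Agent': 'MOBILE_UA'},  # the key this commit adds
}
rule = {**base_template, **overrides}  # later keys win, like Object.assign(target, source)
print(rule['headers'])  # {'User-Agent': 'MOBILE_UA'}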
diff --git a/utils/htmlParseerOld.py b/utils/htmlParseerOld.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ae0a09882615ce0894f0f2bbbf486241914069f
--- /dev/null
+++ b/utils/htmlParseerOld.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# File : htmlParser.py
+# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
+# Date : 2022/8/25
+import json
+
+from pyquery import PyQuery as pq
+from lxml import etree
+from urllib.parse import urljoin
+import re
+from jsonpath import jsonpath
+
+PARSE_CACHE = True # parse cache switch
+
+class jsoup:
+    def __init__(self,MY_URL=''):
+        self.MY_URL = MY_URL
+        self.pdfh_html = ''
+        self.pdfa_html = ''
+
+        self.pdfh_doc = None
+        self.pdfa_doc = None
+
+    def test(self, text:str, string:str):
+        searchObj = re.search(rf'{text}', string, re.M | re.I)
+        test_ret = True if searchObj else False
+        return test_ret
+
+    def pdfh(self,html,parse:str,add_url=False):
+        if not parse:
+            return ''
+        if PARSE_CACHE:
+            if self.pdfh_html != html:
+                self.pdfh_html = html
+                self.pdfh_doc = pq(html)
+            doc = self.pdfh_doc
+        else:
+            doc = pq(html)
+        if parse == 'body&&Text' or parse == 'Text':
+            text = doc.text()
+            return text
+        elif parse == 'body&&Html' or parse == 'Html':
+            return doc.html()
+        option = None
+        if parse.find('&&') > -1:
+            option = parse.split('&&')[-1]
+            parse = parse.split('&&')[:-1] # with a single '&&' this simply takes element 0
+            if len(parse) > 1: # more than one element left; otherwise it is just an option op and needs no eq appended
+                parse = ' '.join([i if self.test(':eq|:lt|:gt|#',i) else f'{i}:eq(0)' for i in parse])
+            else:
+                parse = parse[0] if self.test(':eq|:lt|:gt|#',parse[0]) else f'{parse[0]}:eq(0)'
+        # FIXME '||'-separated alternatives for attributes (as in jsonpath) are not supported yet
+        if option:
+            # print(f'parse:{parse}=>(option:{option})')
+            if ':eq(-1)' in parse:
+                # handle the eq(-1) case; poor compatibility, only one eq level supported for now
+                ret = doc(parse.replace(':eq(-1)','')).eq(-1)
+            else:
+                ret = doc(parse)
+            # print(html)
+            # FIXME when the parse matches several elements, the first should be taken automatically
+            if option == 'Text':
+                ret = ret.text()
+            elif option == 'Html':
+                ret = ret.html()
+            else:
+                ret = ret.attr(option) or ''
+                if option.lower().find('style')>-1 and ret.find('url(')>-1:
+                    try:
+                        ret = re.search('url\((.*?)\)',ret,re.M|re.S).groups()[0]
+                    except:
+                        pass
+
+            if ret and add_url:
+                # pd_list = 'url|src|href|data-original|data-src|data-play|data-url'.split('|')
+                # need_add = option in pd_list
+
+                need_add = re.search('(url|src|href|-original|-src|-play|-url)$', option, re.M | re.I)
+                # print(f'option:{option},need_add:{need_add}')
+                if need_add:
+                    if 'http' in ret:
+                        ret = ret[ret.find('http'):]
+                    else:
+                        ret = urljoin(self.MY_URL,ret)
+            # print(ret)
+        else:
+            # ret = doc(parse+':first')
+            ret = doc(parse) # it is a generator, so converting straight to str yields the first record; no next() needed
+            # ret = ret.next() # take the first record
+            # ret = doc(parse) # the commented variants below are wrong
+            # ret = ret.find(':first')
+            # ret = ret.children(':first')
+            # print(parse)
+            # ret = str(ret)
+            ret = ret.outerHtml()
+        return ret
+
+    def pdfa(self,html,parse:str):
+        # reading the official docs was the only way to solve this:
+        # https://pyquery.readthedocs.io/en/latest/api.html
+        if not parse:
+            return []
+        if parse.find('&&') > -1:
+            parse = parse.split('&&') # split on '&&' and re-join below
+            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
+            parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
+        print(f'pdfa:{parse}')
+        # print(html)
+        if PARSE_CACHE:
+            if self.pdfa_html != html:
+                self.pdfa_html = html
+                self.pdfa_doc = pq(html)
+            doc = self.pdfa_doc
+        else:
+            doc = pq(html)
+        result = doc(parse)
+        # node to string
+        # print(str(etree.tostring(result[0], pretty_print=True), 'utf-8'))
+        # res = [item for item in result.items()]
+        # print(res)
+        res = [item.outerHtml() for item in result.items()] # this is the correct way!! str(item) and str(etree.tostring(...)) are all wrong
+        # res = [str(item) for item in result.items()]
+        # res = [str(etree.tostring(item, pretty_print=True), 'utf-8') for item in result]
+        # print(len(res),res)
+        # print('pdfa result count:',len(res))
+        return res
+
+    def pd(self,html,parse:str):
+        return self.pdfh(html,parse,True)
+
+    def pq(self,html:str):
+        return pq(html)
+
+    def pjfh(self,html,parse:str,add_url=False):
+        if not parse:
+            return ''
+        if isinstance(html,str):
+            # print(html)
+            try:
+                html = json.loads(html)
+                # html = eval(html)
+            except:
+                print('failed to convert string to json')
+                return ''
+        if not parse.startswith('$.'):
+            parse = f'$.{parse}'
+        ret = ''
+        for ps in parse.split('||'):
+            ret = jsonpath(html,ps)
+            if isinstance(ret,list):
+                ret = str(ret[0]) if ret[0] else ''
+            else:
+                ret = str(ret) if ret else ''
+            if add_url and ret:
+                ret = urljoin(self.MY_URL, ret)
+            if ret:
+                break
+        # print(ret)
+        return ret
+
+    def pj(self, html, parse:str):
+        return self.pjfh(html, parse, True)
+
+    def pjfa(self,html,parse:str):
+        if not parse:
+            return []
+        if isinstance(html,str):
+            try:
+                html = json.loads(html)
+            except:
+                return ''
+        if not parse.startswith('$.'):
+            parse = f'$.{parse}'
+        # print(html)
+        # print(parse)
+        ret = jsonpath(html,parse)
+        # print(ret)
+        # print(type(ret))
+        # print(type(ret[0]))
+        # print(len(ret))
+        if isinstance(ret,list) and isinstance(ret[0],list) and len(ret) == 1:
+            # print('auto-unwrap')
+            ret = ret[0] # auto-unwrap
+        return ret or []
+
+if __name__ == '__main__':
+    import requests
+    from parsel import Selector
+    url = 'http://360yy.cn'
+    jsp = jsoup(url)
+    def pdfa2(html,parse):
+        if not parse:
+            return []
+        if parse.find('&&') > -1:
+            parse = parse.split('&&') # split on '&&' and re-join below
+            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
+            # parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
+            parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))])
+            # print(f'pdfa:{parse}')
+        selector = Selector(text=html)
+        print(parse)
+        items = selector.css(parse)
+        return [str(item) for item in items]
+    r = requests.get(url)
+    html = r.text
+    # parsel is awkward here; wrapping functions like pdfa on top of it is hard
+    items = pdfa2(html,'.fed-pops-navbar&&ul.fed-part-rows&&a')
+    print(items)
+
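For orientation before the refactor below, a usage sketch of the legacy parser added above. The import path, base url, and markup are assumptions; pyquery and jsonpath must be installed.

# sketch: assumed invocation of the legacy jsoup class (not part of the patch)
from utils.htmlParseerOld import jsoup

html = '''
<ul class="fed-list-info">
  <li><a href="/detail/1.html">First</a></li>
  <li><a href="/detail/2.html">Second</a></li>
</ul>
'''
jsp = jsoup('https://example.com')                 # hypothetical base url for urljoin
print(jsp.pdfa(html, '.fed-list-info&&li'))        # every <li> as an outerHtml string
print(jsp.pdfh(html, '.fed-list-info&&li&&Text'))  # 'First': text of the first match
print(jsp.pd(html, '.fed-list-info&&a&&href'))     # 'https://example.com/detail/1.html'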
diff --git a/utils/htmlParser.py b/utils/htmlParser.py
index 6ae0a09882615ce0894f0f2bbbf486241914069f..4012e31be1bae880d3cc17d821309b7e5303c5ff 100644
--- a/utils/htmlParser.py
+++ b/utils/htmlParser.py
@@ -3,18 +3,20 @@
 # File : htmlParser.py
 # Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
 # Date : 2022/8/25
-import json
+import ujson
 
 from pyquery import PyQuery as pq
-from lxml import etree
 from urllib.parse import urljoin
 import re
 from jsonpath import jsonpath
 
 PARSE_CACHE = True # parse cache switch
+NOADD_INDEX = ':eq|:lt|:gt|^body$|^#' # selectors that should not get an automatic eq index
+URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url)$' # attributes that need an automatic urljoin
+
 
 class jsoup:
-    def __init__(self,MY_URL=''):
+    def __init__(self, MY_URL=''):
         self.MY_URL = MY_URL
         self.pdfh_html = ''
         self.pdfa_html = ''
@@ -22,13 +24,19 @@ class jsoup:
         self.pdfh_doc = None
         self.pdfa_doc = None
 
-    def test(self, text:str, string:str):
+    def test(self, text: str, string: str):
+        """
+        Regex "contains" test on a string, mimicking js /pattern/.test()
+        :param text:
+        :param string:
+        :return:
+        """
         searchObj = re.search(rf'{text}', string, re.M | re.I)
         test_ret = True if searchObj else False
         return test_ret
 
-    def pdfh(self,html,parse:str,add_url=False):
-        if not parse:
+    def pdfh(self, html, parse: str, add_url=False):
+        if not all([html, parse]):
             return ''
         if PARSE_CACHE:
             if self.pdfh_html != html:
@@ -42,71 +50,98 @@ class jsoup:
             return text
         elif parse == 'body&&Html' or parse == 'Html':
             return doc.html()
+
         option = None
         if parse.find('&&') > -1:
             option = parse.split('&&')[-1]
-            parse = parse.split('&&')[:-1] # with a single '&&' this simply takes element 0
-            if len(parse) > 1: # more than one element left; otherwise it is just an option op and needs no eq appended
-                parse = ' '.join([i if self.test(':eq|:lt|:gt|#',i) else f'{i}:eq(0)' for i in parse])
-            else:
-                parse = parse[0] if self.test(':eq|:lt|:gt|#',parse[0]) else f'{parse[0]}:eq(0)'
-        # FIXME '||'-separated alternatives for attributes (as in jsonpath) are not supported yet
+            parse = '&&'.join(parse.split('&&')[:-1])
+        parse = self.parseHikerToJq(parse, True)
+        # print(f'pdfh:{parse},option:{option}')
+        parses = parse.split(' ')
+        ret = None
+        for nparse in parses:
+            ret = self.parseOneRule(doc, nparse, ret)
+            # print(nparse,ret)
+
         if option:
-            # print(f'parse:{parse}=>(option:{option})')
-            if ':eq(-1)' in parse:
-                # handle the eq(-1) case; poor compatibility, only one eq level supported for now
-                ret = doc(parse.replace(':eq(-1)','')).eq(-1)
-            else:
-                ret = doc(parse)
-            # print(html)
-            # FIXME when the parse matches several elements, the first should be taken automatically
             if option == 'Text':
                 ret = ret.text()
             elif option == 'Html':
                 ret = ret.html()
             else:
                 ret = ret.attr(option) or ''
-                if option.lower().find('style')>-1 and ret.find('url(')>-1:
+                if option.lower().find('style') > -1 and ret.find('url(') > -1:
                     try:
-                        ret = re.search('url\((.*?)\)',ret,re.M|re.S).groups()[0]
+                        ret = re.search(r'url\((.*?)\)', ret, re.M | re.S).groups()[0]
                     except:
                         pass
 
             if ret and add_url:
-                # pd_list = 'url|src|href|data-original|data-src|data-play|data-url'.split('|')
-                # need_add = option in pd_list
-
-                need_add = re.search('(url|src|href|-original|-src|-play|-url)$', option, re.M | re.I)
-                # print(f'option:{option},need_add:{need_add}')
+                need_add = re.search(URLJOIN_ATTR, option, re.M | re.I)
                 if need_add:
                     if 'http' in ret:
                         ret = ret[ret.find('http'):]
                     else:
-                        ret = urljoin(self.MY_URL,ret)
-            # print(ret)
+                        ret = urljoin(self.MY_URL, ret)
         else:
-            # ret = doc(parse+':first')
-            ret = doc(parse) # it is a generator, so converting straight to str yields the first record; no next() needed
-            # ret = ret.next() # take the first record
-            # ret = doc(parse) # the commented variants below are wrong
-            # ret = ret.find(':first')
-            # ret = ret.children(':first')
-            # print(parse)
-            # ret = str(ret)
             ret = ret.outerHtml()
         return ret
 
-    def pdfa(self,html,parse:str):
+    def parseOneRule(self, doc, nparse, ret=None):
+        """
+        Parse one segment of the space-split native expression, applying the eq index correctly, and return the updated ret
+        :param doc: pq object obtained from pq(html)
+        :param nparse: the current single parse expression
+        :param ret: the pd result object so far
+        :return:
+        """
+        if self.test(':eq', nparse):
+            nparse_rule = nparse.split(':eq')[0]
+            nparse_index = nparse.split(':eq')[1].split('(')[1].split(')')[0]
+            try:
+                nparse_index = int(nparse_index)
+            except:
+                nparse_index = 0
+            if not ret:
+                ret = doc(nparse_rule).eq(nparse_index)
+            else:
+                ret = ret(nparse_rule).eq(nparse_index)
+        else:
+            if not ret:
+                ret = doc(nparse)
+            else:
+                ret = ret(nparse)
+        return ret
+
+    def parseHikerToJq(self, parse, first=False):
+        """
+        Convert a Hiker (海阔) parse expression into a native one, auto-appending eq; when first is passed, the last segment also gets eq(0)
+        :param parse:
+        :param first:
+        :return:
+        """
+        if parse.find('&&') > -1:
+            parse = parse.split('&&') # split on '&&' and re-join below
+            new_parses = [] # build the new list of parse expressions
+            for i in range(len(parse)):
+                ps = parse[i].split(' ')[-1] # if a '&&' segment contains spaces, judge only its last element
+                if not self.test(NOADD_INDEX, ps):
+                    if not first and i >= len(parse) - 1: # without first, the final segment gets no eq(0)
+                        new_parses.append(parse[i])
+                    else:
+                        new_parses.append(f'{parse[i]}:eq(0)')
+                else:
+                    new_parses.append(parse[i])
+            parse = ' '.join(new_parses)
+        return parse
+
+    def pdfa(self, html, parse: str):
         # reading the official docs was the only way to solve this:
         # https://pyquery.readthedocs.io/en/latest/api.html
-        if not parse:
+        if not all([html, parse]):
             return []
-        if parse.find('&&') > -1:
-            parse = parse.split('&&') # split on '&&' and re-join below
-            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
-            parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
+        parse = self.parseHikerToJq(parse)
         print(f'pdfa:{parse}')
-        # print(html)
         if PARSE_CACHE:
             if self.pdfa_html != html:
                 self.pdfa_html = html
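The heart of the refactor is the pair introduced above: parseHikerToJq normalizes a Hiker '&&' expression into a space-joined selector chain, and parseOneRule then evaluates the chain one segment at a time, so an :eq(n) index can apply at any level rather than only at the end. A standalone sketch of the normalization contract, with assumed inputs and outputs rather than an excerpt of the module:

# sketch: the parseHikerToJq normalization rules (illustration, not part of the patch)
import re

NOADD_INDEX = ':eq|:lt|:gt|^body$|^#'  # same pattern the hunk above introduces

def hiker_to_jq(parse, first=False):
    if '&&' not in parse:
        return parse
    parts = parse.split('&&')
    out = []
    for i, p in enumerate(parts):
        last_token = p.split(' ')[-1]  # judge only the last token of a segment
        keep = re.search(NOADD_INDEX, last_token, re.M | re.I) or (not first and i >= len(parts) - 1)
        out.append(p if keep else f'{p}:eq(0)')  # unindexed non-final segments get :eq(0)
    return ' '.join(out)

print(hiker_to_jq('.fed-list-info&&li'))        # '.fed-list-info:eq(0) li' (pdfa: plural tail)
print(hiker_to_jq('.fed-list-info&&li', True))  # '.fed-list-info:eq(0) li:eq(0)' (pdfh: first hit)
print(hiker_to_jq('body&&li:eq(-1)', True))     # 'body li:eq(-1)' (body and indexed segments skipped)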
@@ -114,32 +149,29 @@ class jsoup:
             doc = self.pdfa_doc
         else:
             doc = pq(html)
-        result = doc(parse)
-        # node to string
-        # print(str(etree.tostring(result[0], pretty_print=True), 'utf-8'))
-        # res = [item for item in result.items()]
-        # print(res)
-        res = [item.outerHtml() for item in result.items()] # this is the correct way!! str(item) and str(etree.tostring(...)) are all wrong
-        # res = [str(item) for item in result.items()]
-        # res = [str(etree.tostring(item, pretty_print=True), 'utf-8') for item in result]
-        # print(len(res),res)
-        # print('pdfa result count:',len(res))
+
+        parses = parse.split(' ')
+        ret = None
+        for nparse in parses:
+            ret = self.parseOneRule(doc, nparse, ret)
+            # print(len(ret),nparse)
+        res = [item.outerHtml() for item in ret.items()]
         return res
 
-    def pd(self,html,parse:str):
-        return self.pdfh(html,parse,True)
+    def pd(self, html, parse: str):
+        return self.pdfh(html, parse, True)
 
-    def pq(self,html:str):
+    def pq(self, html: str):
         return pq(html)
 
-    def pjfh(self,html,parse:str,add_url=False):
-        if not parse:
+    def pjfh(self, html, parse: str, add_url=False):
+        if not all([html, parse]):
             return ''
-        if isinstance(html,str):
+        if isinstance(html, str):
             # print(html)
             try:
-                html = json.loads(html)
-                # html = eval(html)
+                html = ujson.loads(html)
+                # html = eval(html)
             except:
                 print('failed to convert string to json')
                 return ''
@@ -147,8 +179,8 @@ class jsoup:
             parse = f'$.{parse}'
         ret = ''
         for ps in parse.split('||'):
-            ret = jsonpath(html,ps)
-            if isinstance(ret,list):
+            ret = jsonpath(html, ps)
+            if isinstance(ret, list):
                 ret = str(ret[0]) if ret[0] else ''
             else:
                 ret = str(ret) if ret else ''
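pjfh above walks the '||'-separated jsonpath alternatives in order and returns the first non-empty hit; pj is the variant that also urljoins the result onto MY_URL. A call sketch with an assumed import path, base url, and payload:

# sketch: assumed invocation of pjfh/pj (not part of the patch); needs jsonpath and ujson
from utils.htmlParser import jsoup

data = '{"data": {"url": "/play/1.html", "title": "demo"}}'
jsp = jsoup('https://example.com')            # hypothetical base url
# '$.data.url' already matches, so the '||' fallback 'data.link' is never evaluated
print(jsp.pjfh(data, 'data.url||data.link'))  # '/play/1.html'
print(jsp.pj(data, 'data.url||data.link'))    # 'https://example.com/play/1.html'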
@@ -159,52 +191,31 @@ class jsoup:
         # print(ret)
         return ret
 
-    def pj(self, html, parse:str):
+    def pj(self, html, parse: str):
         return self.pjfh(html, parse, True)
 
-    def pjfa(self,html,parse:str):
-        if not parse:
+    def pjfa(self, html, parse: str):
+        if not all([html, parse]):
             return []
-        if isinstance(html,str):
+        if isinstance(html, str):
             try:
-                html = json.loads(html)
+                html = ujson.loads(html)
             except:
-                return ''
+                return []
         if not parse.startswith('$.'):
             parse = f'$.{parse}'
         # print(html)
         # print(parse)
-        ret = jsonpath(html,parse)
+        ret = jsonpath(html, parse)
         # print(ret)
         # print(type(ret))
         # print(type(ret[0]))
         # print(len(ret))
-        if isinstance(ret,list) and isinstance(ret[0],list) and len(ret) == 1:
+        if isinstance(ret, list) and isinstance(ret[0], list) and len(ret) == 1:
             # print('auto-unwrap')
-            ret = ret[0] # auto-unwrap
+            ret = ret[0]  # auto-unwrap
         return ret or []
 
-if __name__ == '__main__':
-    import requests
-    from parsel import Selector
-    url = 'http://360yy.cn'
-    jsp = jsoup(url)
-    def pdfa2(html,parse):
-        if not parse:
-            return []
-        if parse.find('&&') > -1:
-            parse = parse.split('&&') # split on '&&' and re-join below
-            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
-            # parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
-            parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))])
-            # print(f'pdfa:{parse}')
-        selector = Selector(text=html)
-        print(parse)
-        items = selector.css(parse)
-        return [str(item) for item in items]
-    r = requests.get(url)
-    html = r.text
-    # parsel is awkward here; wrapping functions like pdfa on top of it is hard
-    items = pdfa2(html,'.fed-pops-navbar&&ul.fed-part-rows&&a')
-    print(items)
-
+if __name__ == '__main__':
+    pass
\ No newline at end of file
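Taken together, the refactored pipeline is: parseHikerToJq normalizes the expression, then parseOneRule is folded over the space-split segments, and pdfh/pdfa only differ in whether the final segment is narrowed to :eq(0). A hedged end-to-end sketch; the import path, base url, and markup are assumptions:

# sketch: assumed usage of the refactored parser (not part of the patch)
from utils.htmlParser import jsoup

html = '''
<ul class="list">
  <li><a href="/v/1.html">one</a></li>
  <li><a href="/v/2.html">two</a></li>
  <li><a href="/v/3.html">three</a></li>
</ul>
'''
jsp = jsoup('https://example.com')           # hypothetical base url
print(jsp.pdfa(html, '.list&&li'))           # three <li> outerHtml strings
print(jsp.pdfh(html, '.list&&li&&a&&Text'))  # 'one': every segment narrowed to :eq(0)
print(jsp.pd(html, '.list&&li&&a&&href'))    # 'https://example.com/v/1.html'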