From a9d6317a7d67eda5a8a356f6c4aa0e27847da6ad Mon Sep 17 00:00:00 2001 From: hjdhnx Date: Thu, 17 Nov 2022 10:24:01 +0800 Subject: [PATCH] =?UTF-8?q?js0=E8=AF=AD=E6=B3=95=E4=BC=98=E5=8C=96,?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=85=83=E7=B4=A0=E6=8E=92=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- base/rules.db | Bin 155648 -> 155648 bytes "py/\346\265\213\350\257\225pdf.py" | 26 +++- utils/htmlParser.py | 202 +++++++++++++++++----------- 3 files changed, 149 insertions(+), 79 deletions(-) diff --git a/base/rules.db b/base/rules.db index 47af86169616a5dcea3631bbe6b2031d5bcc3545..3b77166fef82c9914c703664b0bfd7cd2f0a2947 100644 GIT binary patch delta 30 mcmZoTz}awsbAmME?1?hYjI%c;Ea+x5XlCr$&e+5F|1towObhw| delta 30 mcmZoTz}awsbAmMEtcfztjI%Z-Ea+x5Y-a4)&e+5F|1towF$?(s diff --git "a/py/\346\265\213\350\257\225pdf.py" "b/py/\346\265\213\350\257\225pdf.py" index 55338d2..9ad060f 100644 --- "a/py/\346\265\213\350\257\225pdf.py" +++ "b/py/\346\265\213\350\257\225pdf.py" @@ -65,7 +65,31 @@ def main2(): a = jsp.pdfa(html, 'h1') print(a) +def main3(): + html = """ +
+

内容1我不获取的内容1我不获取的内容2内容2

+
+ """ + jsp = jsoup('https://www.cnblogs.com/lizhibk/p/8623543.html') + a = jsp.pdfh(html, 'div p:eq(0)--span&&Text') + print(a) + a = jsp.pdfh(html,'div p--span&&Text') + print(a) + a = jsp.pdfh(html, 'div p:eq(0)--#exd1&&Text') + print(a) + a = jsp.pdfh(html, 'div p:eq(0)--#exd2&&Text') + print(a) + a = jsp.pdfh(html, 'div p:eq(0)--#exd2--#exd1&&Text') + print(a) + # a = jsp.pdfh(html, 'div p--#exd1&&Text') + a = jsp.pdfh(html, 'div p--#exd1') + print(a) + a = jsp.pdfh(html, 'div p:first--#exd1') + print(a) + if __name__ == '__main__': # main() # main1() - main2() \ No newline at end of file + # main2() + main3() \ No newline at end of file diff --git a/utils/htmlParser.py b/utils/htmlParser.py index 23ac23e..9d9370f 100644 --- a/utils/htmlParser.py +++ b/utils/htmlParser.py @@ -3,6 +3,7 @@ # File : htmlParser.py # Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------ # Date : 2022/8/25 +# upDate : 2022/11/17 支持 -- 剔除元素 多个剔除 import ujson from pyquery import PyQuery as pq @@ -14,7 +15,6 @@ PARSE_CACHE = True # 解析缓存 NOADD_INDEX = ':eq|:lt|:gt|:first|:last|^body$|^#' # 不自动加eq下标索引 URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url)$' # 需要自动urljoin的属性 - class jsoup: def __init__(self, MY_URL=''): self.MY_URL = MY_URL @@ -35,6 +35,126 @@ class jsoup: test_ret = True if searchObj else False return test_ret + def parseHikerToJq(self, parse, first=False): + """ + 海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0) + :param parse: + :param first: + :return: + """ + if parse.find('&&') > -1: + parse = parse.split('&&') # 带&&的重新拼接 + new_parses = [] # 构造新的解析表达式列表 + for i in range(len(parse)): + ps = parse[i].split(' ')[-1] # 如果分割&&后带空格就取最后一个元素 + if not self.test(NOADD_INDEX, ps): + if not first and i >= len(parse) - 1: # 不传first且遇到最后一个,不用补eq(0) + new_parses.append(parse[i]) + else: + new_parses.append(f'{parse[i]}:eq(0)') + else: + new_parses.append(parse[i]) + parse = ' '.join(new_parses) + else: + ps = parse.split(' ')[-1] # 如果带空格就取最后一个元素 + if not self.test(NOADD_INDEX, ps) and first: + parse = f'{parse}:eq(0)' + + return parse + + def getParseInfo(self,nparse): + """ + 根据传入的单规则获取 parse规则,索引位置,排除列表 -- 可以用于剔除元素,支持多个,按标签剔除,按id剔除等操作 + :param nparse: + :return: + """ + excludes = [] # 定义排除列表默认值为空 + nparse_index = 0 # 定义位置索引默认值为0 + nparse_rule = nparse # 定义规则默认值为本身 + if self.test(':eq', nparse): + nparse_rule = nparse.split(':eq')[0] + nparse_pos = nparse.split(':eq')[1] + # print(nparse_rule) + if self.test('--', nparse_rule): + excludes = nparse_rule.split('--')[1:] + nparse_rule = nparse_rule.split('--')[0] + elif self.test('--', nparse_pos): + excludes = nparse_pos.split('--')[1:] + nparse_pos = nparse_pos.split('--')[0] + try: + nparse_index = nparse_pos.split('(')[1].split(')')[0] + nparse_index = int(nparse_index) + except: + nparse_index = 0 + if nparse_index > 0: + print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes}') + return nparse_rule,nparse_index,excludes + else: + if self.test('--', nparse): + nparse_rule = nparse.split('--')[0] + excludes = nparse.split('--')[1:] + # print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes}') + return nparse_rule, nparse_index, excludes + + def parseOneRule(self, doc, nparse, ret=None): + """ + 解析空格分割后的原生表达式中的一条记录,正确处理eq的索引,返回处理后的ret + :param doc: pq(html) load 后的pq对象 + :param nparse: 当前单个解析表达式 + :param ret: pd对象结果 + :return: + """ + if self.test(':eq', nparse): + nparse_rule, nparse_index, excludes = self.getParseInfo(nparse) + if not ret: + ret = doc(nparse_rule).eq(nparse_index) + # if nparse_index > 4: + # print('1nparse_index',ret,not ret) + else: + ret = ret(nparse_rule).eq(nparse_index) + # if nparse_index > 4: + # print('2nparse_index',ret) + if excludes and ret: + ret = ret.clone() # 克隆一个,免得直接remove会影响doc的缓存 + for exclude in excludes: + ret.remove(exclude) + else: + nparse_rule, nparse_index, excludes = self.getParseInfo(nparse) + if not ret: + ret = doc(nparse_rule) + else: + ret = ret(nparse_rule) + if excludes and ret: + ret = ret.clone() # 克隆一个,免得直接remove会影响doc的缓存 + for exclude in excludes: + ret.remove(exclude) + return ret + + def pdfa(self, html, parse: str): + # 看官方文档才能解决这个问题!!! + # https://pyquery.readthedocs.io/en/latest/api.html + if not all([html, parse]): + return [] + parse = self.parseHikerToJq(parse) + print(f'pdfa:{parse}') + if PARSE_CACHE: + if self.pdfa_html != html: + self.pdfa_html = html + self.pdfa_doc = pq(html) + doc = self.pdfa_doc + else: + doc = pq(html) + + parses = parse.split(' ') + # print(parses) + ret = None + for nparse in parses: + ret = self.parseOneRule(doc, nparse, ret) + if not ret: # 可能循环取值后ret 对应eq取完无值了,pdfa直接返回空列表 + return [] + res = [item.outerHtml() for item in ret.items()] + return res + def pdfh(self, html, parse: str, add_url=False): if not all([html, parse]): return '' @@ -58,10 +178,13 @@ class jsoup: parse = self.parseHikerToJq(parse, True) # print(f'pdfh:{parse},option:{option}') parses = parse.split(' ') + # print(parses) ret = None for nparse in parses: ret = self.parseOneRule(doc, nparse, ret) # print(nparse,ret) + if not ret: # 可能循环取值后ret 对应eq取完无值了,pdfh直接返回空字符串 + return '' if option: if option == 'Text': @@ -87,83 +210,6 @@ class jsoup: ret = ret.outerHtml() return ret - def parseOneRule(self, doc, nparse, ret=None): - """ - 解析空格分割后的原生表达式中的一条记录,正确处理eq的索引,返回处理后的ret - :param doc: pq(html) load 后的pq对象 - :param nparse: 当前单个解析表达式 - :param ret: pd对象结果 - :return: - """ - if self.test(':eq', nparse): - nparse_rule = nparse.split(':eq')[0] - nparse_index = nparse.split(':eq')[1].split('(')[1].split(')')[0] - try: - nparse_index = int(nparse_index) - except: - nparse_index = 0 - # print(nparse_index) - if not ret: - ret = doc(nparse_rule).eq(nparse_index) - else: - ret = ret(nparse_rule).eq(nparse_index) - else: - if not ret: - ret = doc(nparse) - else: - ret = ret(nparse) - return ret - - def parseHikerToJq(self, parse, first=False): - """ - 海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0) - :param parse: - :param first: - :return: - """ - if parse.find('&&') > -1: - parse = parse.split('&&') # 带&&的重新拼接 - new_parses = [] # 构造新的解析表达式列表 - for i in range(len(parse)): - ps = parse[i].split(' ')[-1] # 如果分割&&后带空格就取最后一个元素 - if not self.test(NOADD_INDEX, ps): - if not first and i >= len(parse) - 1: # 不传first且遇到最后一个,不用补eq(0) - new_parses.append(parse[i]) - else: - new_parses.append(f'{parse[i]}:eq(0)') - else: - new_parses.append(parse[i]) - parse = ' '.join(new_parses) - else: - ps = parse.split(' ')[-1] # 如果带空格就取最后一个元素 - if not self.test(NOADD_INDEX, ps) and first: - parse = f'{parse}:eq(0)' - - return parse - - def pdfa(self, html, parse: str): - # 看官方文档才能解决这个问题!!! - # https://pyquery.readthedocs.io/en/latest/api.html - if not all([html, parse]): - return [] - parse = self.parseHikerToJq(parse) - print(f'pdfa:{parse}') - if PARSE_CACHE: - if self.pdfa_html != html: - self.pdfa_html = html - self.pdfa_doc = pq(html) - doc = self.pdfa_doc - else: - doc = pq(html) - - parses = parse.split(' ') - ret = None - for nparse in parses: - ret = self.parseOneRule(doc, nparse, ret) - # print(len(ret),nparse) - res = [item.outerHtml() for item in ret.items()] - return res - def pd(self, html, parse: str): return self.pdfh(html, parse, True) -- GitLab