diff --git a/base/rules.db b/base/rules.db
index 47af86169616a5dcea3631bbe6b2031d5bcc3545..3b77166fef82c9914c703664b0bfd7cd2f0a2947 100644
Binary files a/base/rules.db and b/base/rules.db differ
diff --git "a/py/\346\265\213\350\257\225pdf.py" "b/py/\346\265\213\350\257\225pdf.py"
index 55338d2d6ad48f07e3d19f1334d70c43f8cd0f8e..9ad060f382dab5864b4f8ef5a702ad9f2d0dec22 100644
--- "a/py/\346\265\213\350\257\225pdf.py"
+++ "b/py/\346\265\213\350\257\225pdf.py"
@@ -65,7 +65,31 @@ def main2():
a = jsp.pdfa(html, 'h1')
print(a)
+def main3():  # demo of the '--' exclusion syntax passed to jsoup.pdfh
+ html = """
+
+
内容1我不获取的内容1我不获取的内容2内容2
+
+ """
+ jsp = jsoup('https://www.cnblogs.com/lizhibk/p/8623543.html')
+ a = jsp.pdfh(html, 'div p:eq(0)--span&&Text')  # explicit index, then exclude 'span'
+ print(a)
+ a = jsp.pdfh(html,'div p--span&&Text')  # same exclusion with no explicit index
+ print(a)
+ a = jsp.pdfh(html, 'div p:eq(0)--#exd1&&Text')  # exclude by id
+ print(a)
+ a = jsp.pdfh(html, 'div p:eq(0)--#exd2&&Text')
+ print(a)
+ a = jsp.pdfh(html, 'div p:eq(0)--#exd2--#exd1&&Text')  # two chained exclusions
+ print(a)
+ # a = jsp.pdfh(html, 'div p--#exd1&&Text')
+ a = jsp.pdfh(html, 'div p--#exd1')  # no '&&Text' option
+ print(a)
+ a = jsp.pdfh(html, 'div p:first--#exd1')  # ':first' instead of ':eq(0)'
+ print(a)
+
if __name__ == '__main__':
# main()
# main1()
- main2()
\ No newline at end of file
+ # main2()
+ main3()  # run the '--' exclusion demo
\ No newline at end of file
diff --git a/utils/htmlParser.py b/utils/htmlParser.py
index 23ac23e0f14f6715baff2652a4d55607c998d6fe..9d9370f6e04c9a14de7b160467bc7a75c3efb4b2 100644
--- a/utils/htmlParser.py
+++ b/utils/htmlParser.py
@@ -3,6 +3,7 @@
# File : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date : 2022/8/25
+# upDate : 2022/11/17 support "--" to strip elements out of a match; several exclusions can be chained
import ujson
from pyquery import PyQuery as pq
@@ -14,7 +15,6 @@ PARSE_CACHE = True # 解析缓存
NOADD_INDEX = ':eq|:lt|:gt|:first|:last|^body$|^#' # 不自动加eq下标索引
URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url)$' # 需要自动urljoin的属性
-
class jsoup:
def __init__(self, MY_URL=''):
self.MY_URL = MY_URL
@@ -35,6 +35,126 @@ class jsoup:
test_ret = True if searchObj else False
return test_ret
+ def parseHikerToJq(self, parse, first=False):
+ """
+ Convert a Hiker-style rule ('&&'-joined) into a space-joined native selector, appending ':eq(0)' to segments that carry no index; with first set, the last segment gets ':eq(0)' too.
+ :param parse: rule string; segments may be joined with '&&'
+ :param first: when truthy, ':eq(0)' is also appended to the final segment
+ :return: the rewritten selector string
+ """
+ if parse.find('&&') > -1:
+ parse = parse.split('&&') # contains '&&': split into segments, re-joined with spaces below
+ new_parses = [] # rewritten segments
+ for i in range(len(parse)):
+ ps = parse[i].split(' ')[-1] # if the segment contains spaces, test only its last token
+ if not self.test(NOADD_INDEX, ps):
+ if not first and i >= len(parse) - 1: # last segment and first not set: no ':eq(0)' appended
+ new_parses.append(parse[i])
+ else:
+ new_parses.append(f'{parse[i]}:eq(0)')
+ else:
+ new_parses.append(parse[i])
+ parse = ' '.join(new_parses)
+ else:
+ ps = parse.split(' ')[-1] # if the rule contains spaces, test only its last token
+ if not self.test(NOADD_INDEX, ps) and first:
+ parse = f'{parse}:eq(0)'
+
+ return parse
+
+ def getParseInfo(self,nparse):
+ """
+ Split one rule segment into selector, position index and exclude list; '--' introduces a selector to strip out, and several can be chained (by tag, by id, ...).
+ :param nparse: a single rule segment, e.g. 'p:eq(0)--span' (callers pass pieces of a space-split rule)
+ :return: tuple (nparse_rule, nparse_index, excludes)
+ """
+ excludes = [] # exclude selectors, empty by default
+ nparse_index = 0 # position index, 0 by default
+ nparse_rule = nparse # selector defaults to the segment itself
+ if self.test(':eq', nparse):
+ nparse_rule = nparse.split(':eq')[0]
+ nparse_pos = nparse.split(':eq')[1]  # text between the first and second ':eq', e.g. '(0)--span'
+ # print(nparse_rule)
+ if self.test('--', nparse_rule):
+ excludes = nparse_rule.split('--')[1:]
+ nparse_rule = nparse_rule.split('--')[0]
+ elif self.test('--', nparse_pos):
+ excludes = nparse_pos.split('--')[1:]
+ nparse_pos = nparse_pos.split('--')[0]
+ try:
+ nparse_index = nparse_pos.split('(')[1].split(')')[0]
+ nparse_index = int(nparse_index)
+ except:  # any failure while extracting the index falls back to 0
+ nparse_index = 0
+ if nparse_index > 0:
+ print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes}')
+ return nparse_rule,nparse_index,excludes
+ else:
+ if self.test('--', nparse):
+ nparse_rule = nparse.split('--')[0]
+ excludes = nparse.split('--')[1:]
+ # print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes}')
+ return nparse_rule, nparse_index, excludes
+
+ def parseOneRule(self, doc, nparse, ret=None):
+ """
+ Apply one segment of the space-split native rule, honouring its ':eq' index and its '--' excludes, and return the resulting selection.
+ :param doc: the pq object obtained from pq(html)
+ :param nparse: the single rule segment being applied
+ :param ret: selection produced by the previous segment; when falsy the query runs against doc
+ :return: the selection after this segment (a clone with the excludes removed when excludes are present)
+ """
+ if self.test(':eq', nparse):
+ nparse_rule, nparse_index, excludes = self.getParseInfo(nparse)
+ if not ret:
+ ret = doc(nparse_rule).eq(nparse_index)
+ # if nparse_index > 4:
+ # print('1nparse_index',ret,not ret)
+ else:
+ ret = ret(nparse_rule).eq(nparse_index)
+ # if nparse_index > 4:
+ # print('2nparse_index',ret)
+ if excludes and ret:
+ ret = ret.clone() # work on a clone so remove() does not affect the cached doc
+ for exclude in excludes:
+ ret.remove(exclude)
+ else:
+ nparse_rule, nparse_index, excludes = self.getParseInfo(nparse)
+ if not ret:
+ ret = doc(nparse_rule)
+ else:
+ ret = ret(nparse_rule)
+ if excludes and ret:
+ ret = ret.clone() # work on a clone so remove() does not affect the cached doc
+ for exclude in excludes:
+ ret.remove(exclude)
+ return ret
+
+ def pdfa(self, html, parse: str):
+ # Return the outer HTML of every element selected by `parse` as a list; pyquery API reference:
+ # https://pyquery.readthedocs.io/en/latest/api.html
+ if not all([html, parse]):
+ return []
+ parse = self.parseHikerToJq(parse)
+ print(f'pdfa:{parse}')
+ if PARSE_CACHE:
+ if self.pdfa_html != html:
+ self.pdfa_html = html
+ self.pdfa_doc = pq(html)
+ doc = self.pdfa_doc
+ else:
+ doc = pq(html)
+
+ parses = parse.split(' ')
+ # print(parses)
+ ret = None
+ for nparse in parses:
+ ret = self.parseOneRule(doc, nparse, ret)
+ if not ret: # a segment (e.g. its eq index) selected nothing: pdfa returns an empty list right away
+ return []
+ res = [item.outerHtml() for item in ret.items()]
+ return res
+
def pdfh(self, html, parse: str, add_url=False):
if not all([html, parse]):
return ''
@@ -58,10 +178,13 @@ class jsoup:
parse = self.parseHikerToJq(parse, True)
# print(f'pdfh:{parse},option:{option}')
parses = parse.split(' ')
+ # print(parses)
ret = None
for nparse in parses:
ret = self.parseOneRule(doc, nparse, ret)
# print(nparse,ret)
+ if not ret: # 可能循环取值后ret 对应eq取完无值了,pdfh直接返回空字符串
+ return ''
if option:
if option == 'Text':
@@ -87,83 +210,6 @@ class jsoup:
ret = ret.outerHtml()
return ret
- def parseOneRule(self, doc, nparse, ret=None):
- """
- 解析空格分割后的原生表达式中的一条记录,正确处理eq的索引,返回处理后的ret
- :param doc: pq(html) load 后的pq对象
- :param nparse: 当前单个解析表达式
- :param ret: pd对象结果
- :return:
- """
- if self.test(':eq', nparse):
- nparse_rule = nparse.split(':eq')[0]
- nparse_index = nparse.split(':eq')[1].split('(')[1].split(')')[0]
- try:
- nparse_index = int(nparse_index)
- except:
- nparse_index = 0
- # print(nparse_index)
- if not ret:
- ret = doc(nparse_rule).eq(nparse_index)
- else:
- ret = ret(nparse_rule).eq(nparse_index)
- else:
- if not ret:
- ret = doc(nparse)
- else:
- ret = ret(nparse)
- return ret
-
- def parseHikerToJq(self, parse, first=False):
- """
- 海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0)
- :param parse:
- :param first:
- :return:
- """
- if parse.find('&&') > -1:
- parse = parse.split('&&') # 带&&的重新拼接
- new_parses = [] # 构造新的解析表达式列表
- for i in range(len(parse)):
- ps = parse[i].split(' ')[-1] # 如果分割&&后带空格就取最后一个元素
- if not self.test(NOADD_INDEX, ps):
- if not first and i >= len(parse) - 1: # 不传first且遇到最后一个,不用补eq(0)
- new_parses.append(parse[i])
- else:
- new_parses.append(f'{parse[i]}:eq(0)')
- else:
- new_parses.append(parse[i])
- parse = ' '.join(new_parses)
- else:
- ps = parse.split(' ')[-1] # 如果带空格就取最后一个元素
- if not self.test(NOADD_INDEX, ps) and first:
- parse = f'{parse}:eq(0)'
-
- return parse
-
- def pdfa(self, html, parse: str):
- # 看官方文档才能解决这个问题!!!
- # https://pyquery.readthedocs.io/en/latest/api.html
- if not all([html, parse]):
- return []
- parse = self.parseHikerToJq(parse)
- print(f'pdfa:{parse}')
- if PARSE_CACHE:
- if self.pdfa_html != html:
- self.pdfa_html = html
- self.pdfa_doc = pq(html)
- doc = self.pdfa_doc
- else:
- doc = pq(html)
-
- parses = parse.split(' ')
- ret = None
- for nparse in parses:
- ret = self.parseOneRule(doc, nparse, ret)
- # print(len(ret),nparse)
- res = [item.outerHtml() for item in ret.items()]
- return res
-
def pd(self, html, parse: str):
return self.pdfh(html, parse, True)