From 41972dec8eda1e681b0eef81bc35c7ffd54e80ac Mon Sep 17 00:00:00 2001 From: hjdhnx Date: Wed, 16 Nov 2022 14:30:50 +0800 Subject: [PATCH] =?UTF-8?q?js0=20pdfh=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- base/rules.db | Bin 155648 -> 155648 bytes js/version.txt | 2 +- utils/htmlParser.py | 7 ++++++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/base/rules.db b/base/rules.db index e671074eb342cab903708e871ce0a3f8e487b4b9..47af86169616a5dcea3631bbe6b2031d5bcc3545 100644 GIT binary patch delta 25 hcmZoTz}awsbAmMEtcfztjI$aOS`!$zCNM3C2LO8^2|)k= delta 25 hcmZoTz}awsbAmMEjEOSNj58V&S`!$zCNM3C2LO8O2|WM+ diff --git a/js/version.txt b/js/version.txt index dcd32c1..00e897b 100644 --- a/js/version.txt +++ b/js/version.txt @@ -1 +1 @@ -3.8.3 \ No newline at end of file +3.8.2 \ No newline at end of file diff --git a/utils/htmlParser.py b/utils/htmlParser.py index b1df739..23ac23e 100644 --- a/utils/htmlParser.py +++ b/utils/htmlParser.py @@ -11,7 +11,7 @@ import re from jsonpath import jsonpath PARSE_CACHE = True # 解析缓存 -NOADD_INDEX = ':eq|:lt|:gt|^body$|^#' # 不自动加eq下标索引 +NOADD_INDEX = ':eq|:lt|:gt|:first|:last|^body$|^#' # 不自动加eq下标索引 URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url)$' # 需要自动urljoin的属性 @@ -134,6 +134,11 @@ class jsoup: else: new_parses.append(parse[i]) parse = ' '.join(new_parses) + else: + ps = parse.split(' ')[-1] # 如果带空格就取最后一个元素 + if not self.test(NOADD_INDEX, ps) and first: + parse = f'{parse}:eq(0)' + return parse def pdfa(self, html, parse: str): -- GitLab