From db1394dcc4d8bf7066f01f80db80fe8e046e2e19 Mon Sep 17 00:00:00 2001 From: hjdhnx Date: Thu, 17 Nov 2022 17:24:50 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E6=A1=88=E4=BE=8B=E5=AE=8C?= =?UTF-8?q?=E7=BE=8E=E9=80=9A=E8=BF=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- js/version.txt | 2 +- "py/\346\265\213\350\257\225pdf.py" | 10 ++++++---- utils/htmlParser.py | 10 ++++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/js/version.txt b/js/version.txt index ae94c93..63c0213 100644 --- a/js/version.txt +++ b/js/version.txt @@ -1 +1 @@ -3.7.6 \ No newline at end of file +3.7.5beta10 \ No newline at end of file diff --git "a/py/\346\265\213\350\257\225pdf.py" "b/py/\346\265\213\350\257\225pdf.py" index 9ad060f..04386f9 100644 --- "a/py/\346\265\213\350\257\225pdf.py" +++ "b/py/\346\265\213\350\257\225pdf.py" @@ -29,8 +29,10 @@ def main(): print(a) a = jsp.pdfh(lis[0], 'a:eq(1) li img') print(a) - a = jsp.pd(lis[0], 'a&&li&&img&&src') - print(a) + a = jsp.pd(lis[0], 'a:eq(1)&&li&&img&&src') + print('src:',a) + a = jsp.pd(lis[0], 'a&&href') + print('href:', a) def main1(): url = 'https://www.lanhua.tv/voddetail/7420.html' @@ -89,7 +91,7 @@ def main3(): print(a) if __name__ == '__main__': - # main() + main() # main1() # main2() - main3() \ No newline at end of file + # main3() \ No newline at end of file diff --git a/utils/htmlParser.py b/utils/htmlParser.py index 9bf4fdb..0f028f2 100644 --- a/utils/htmlParser.py +++ b/utils/htmlParser.py @@ -160,7 +160,7 @@ class jsoup: res = [item.outerHtml() for item in ret.items()] return res - def pdfh(self, html, parse: str, add_url=False): + def pdfh(self, html, parse: str, add_url=False, base_url: str = ''): if not all([html, parse]): return '' if PARSE_CACHE: @@ -210,13 +210,15 @@ class jsoup: if 'http' in ret: ret = ret[ret.find('http'):] else: - ret = urljoin(self.MY_URL, ret) + if not base_url: + base_url = self.MY_URL + ret = urljoin(base_url, ret) else: ret = ret.outerHtml() return ret - def pd(self, html, parse: str): - return self.pdfh(html, parse, True) + def pd(self, html, parse: str, base_url: str = ''): + return self.pdfh(html, parse, True, base_url) def pq(self, html: str): return pq(html) -- GitLab