From b633456159a60b06417a5a52d932d542114573ad Mon Sep 17 00:00:00 2001 From: hjdhnx Date: Thu, 1 Sep 2022 15:00:17 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E4=BA=86=E5=8F=A6=E5=A4=96?= =?UTF-8?q?=E7=9A=84parsel=E8=A7=A3=E6=9E=90=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- txt/issue.txt | 3 +++ txt/libs.txt | 3 ++- utils/htmlParser.py | 27 ++++++++++++++++++++++++++- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/txt/issue.txt b/txt/issue.txt index efb3086..ec9d799 100644 --- a/txt/issue.txt +++ b/txt/issue.txt @@ -13,6 +13,9 @@ assert subprocess.call( shell=True, cwd=DIRNAME) == 0, 'Could not link required node_modules' +另外一个很强的css解析库,性能待验证parsel +https://cuiqingcai.com/202232.html + 远程直播地址: "lives":[{"group":"redirect","channels":[{"name":"直播","urls":["proxy://do=live&type=txt&ext=aHR0cHM6Ly9hZ2l0LmFpL2xjeC8xMS9yYXcvYnJhbmNoL21hc3Rlci9saXZl"]}]}], 下面格式原版tv_box才能用? diff --git a/txt/libs.txt b/txt/libs.txt index 6b2dba4..5f34e1e 100644 --- a/txt/libs.txt +++ b/txt/libs.txt @@ -1,3 +1,4 @@ gevent-websocket PyExecJS -ddddocr \ No newline at end of file +ddddocr +parsel # 不知道对比pyquery性能如何,也是css选择器,不好封装成pdfa \ No newline at end of file diff --git a/utils/htmlParser.py b/utils/htmlParser.py index 63d9a41..5088ea7 100644 --- a/utils/htmlParser.py +++ b/utils/htmlParser.py @@ -68,4 +68,29 @@ class jsoup: return self.pdfh(html,parse,True) def pq(self,html): - return pq(html) \ No newline at end of file + return pq(html) + +if __name__ == '__main__': + import requests + from parsel import Selector + url = 'http://360yy.cn' + jsp = jsoup(url) + def pdfa2(html,parse): + if not parse: + return [] + if parse.find('&&') > -1: + parse = parse.split('&&') # 带&&的重新拼接 + # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}") + # parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))]) + parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))]) + # print(f'pdfa:{parse}') + selector = Selector(text=html) + print(parse) + items = selector.css(parse) + return [str(item) for item in items] + r = requests.get(url) + html = r.text + # parsel 不好用啊,很难实现封装pdfa之类的函数 + items = pdfa2(html,'.fed-pops-navbar&&ul.fed-part-rows&&a') + print(items) + -- GitLab