diff --git a/txt/issue.txt b/txt/issue.txt index efb3086256da9c9c51ef5045ebb4f14c591aa2e1..ec9d799ec9503be12e2f51e2ea1a2799ecc12005 100644 --- a/txt/issue.txt +++ b/txt/issue.txt @@ -13,6 +13,9 @@ assert subprocess.call( shell=True, cwd=DIRNAME) == 0, 'Could not link required node_modules' +另外一个很强的css解析库,性能待验证parsel +https://cuiqingcai.com/202232.html + 远程直播地址: "lives":[{"group":"redirect","channels":[{"name":"直播","urls":["proxy://do=live&type=txt&ext=aHR0cHM6Ly9hZ2l0LmFpL2xjeC8xMS9yYXcvYnJhbmNoL21hc3Rlci9saXZl"]}]}], 下面格式原版tv_box才能用? diff --git a/txt/libs.txt b/txt/libs.txt index 6b2dba42566c4f7a60321311140ae960993f4a83..5f34e1e849f21b0c5901d480d79faa843a1ead5c 100644 --- a/txt/libs.txt +++ b/txt/libs.txt @@ -1,3 +1,4 @@ gevent-websocket PyExecJS -ddddocr \ No newline at end of file +ddddocr +parsel # 不知道对比pyquery性能如何,也是css选择器,不好封装成pdfa \ No newline at end of file diff --git a/utils/htmlParser.py b/utils/htmlParser.py index 63d9a41473603c48ba9ca2829a6a7e9df65e6d1b..5088ea7ff03f2ffcc6cccae4b11286a85eb7665a 100644 --- a/utils/htmlParser.py +++ b/utils/htmlParser.py @@ -68,4 +68,29 @@ class jsoup: return self.pdfh(html,parse,True) def pq(self,html): - return pq(html) \ No newline at end of file + return pq(html) + +if __name__ == '__main__': + import requests + from parsel import Selector + url = 'http://360yy.cn' + jsp = jsoup(url) + def pdfa2(html,parse): + if not parse: + return [] + if parse.find('&&') > -1: + parse = parse.split('&&') # 带&&的重新拼接 + # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}") + # parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))]) + parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))]) + # print(f'pdfa:{parse}') + selector = Selector(text=html) + print(parse) + items = selector.css(parse) + return [str(item) for item in items] + r = requests.get(url) + html = r.text + # parsel 不好用啊,很难实现封装pdfa之类的函数 + items = pdfa2(html,'.fed-pops-navbar&&ul.fed-part-rows&&a') + print(items) +