From b633456159a60b06417a5a52d932d542114573ad Mon Sep 17 00:00:00 2001
From: hjdhnx <hjd124579>
Date: Thu, 1 Sep 2022 15:00:17 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E4=BA=86=E5=8F=A6=E5=A4=96?=
 =?UTF-8?q?=E7=9A=84parsel=E8=A7=A3=E6=9E=90=E5=BA=93?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 txt/issue.txt       |  3 +++
 txt/libs.txt        |  3 ++-
 utils/htmlParser.py | 27 ++++++++++++++++++++++++++-
 3 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/txt/issue.txt b/txt/issue.txt
index efb3086..ec9d799 100644
--- a/txt/issue.txt
+++ b/txt/issue.txt
@@ -13,6 +13,9 @@ assert subprocess.call(
         shell=True,
         cwd=DIRNAME) == 0, 'Could not link required node_modules'
 
+另外一个很强的css解析库,性能待验证parsel
+https://cuiqingcai.com/202232.html
+
 远程直播地址:
 "lives":[{"group":"redirect","channels":[{"name":"直播","urls":["proxy://do=live&type=txt&ext=aHR0cHM6Ly9hZ2l0LmFpL2xjeC8xMS9yYXcvYnJhbmNoL21hc3Rlci9saXZl"]}]}],
 下面格式原版tv_box才能用?
diff --git a/txt/libs.txt b/txt/libs.txt
index 6b2dba4..5f34e1e 100644
--- a/txt/libs.txt
+++ b/txt/libs.txt
@@ -1,3 +1,4 @@
 gevent-websocket
 PyExecJS
-ddddocr
\ No newline at end of file
+ddddocr
+parsel # 不知道对比pyquery性能如何,也是css选择器,不好封装成pdfa
\ No newline at end of file
diff --git a/utils/htmlParser.py b/utils/htmlParser.py
index 63d9a41..5088ea7 100644
--- a/utils/htmlParser.py
+++ b/utils/htmlParser.py
@@ -68,4 +68,29 @@ class jsoup:
         return self.pdfh(html,parse,True)
 
     def pq(self,html):
-        return pq(html)
\ No newline at end of file
+        return pq(html)
+
+if __name__ == '__main__':  # ad-hoc parsel experiment; runs only when this file is executed directly
+    import requests
+    from parsel import Selector
+    url = 'http://360yy.cn'
+    jsp = jsoup(url)  # NOTE(review): jsp is not referenced again in this block
+    def pdfa2(html,parse):  # select nodes from html with parsel; parse is a selector whose steps may be joined by '&&'
+        if not parse:  # empty/None selector: nothing to match
+            return []
+        if parse.find('&&') > -1:
+            parse = parse.split('&&')  # re-join the '&&' steps with spaces; a non-final step that fails jsoup().test(...) gets ':nth-child(1)' appended
+            # print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
+            # parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
+            parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))])
+        # print(f'pdfa:{parse}')
+        selector = Selector(text=html)
+        print(parse)  # debug output: the selector string that is handed to parsel
+        items = selector.css(parse)
+        return [str(item) for item in items]  # one str() rendering per matched item
+    r = requests.get(url)  # NOTE(review): no timeout is passed to this request
+    html = r.text
+    # parsel is awkward to use; it is hard to wrap it into helper functions such as pdfa
+    items = pdfa2(html,'.fed-pops-navbar&&ul.fed-part-rows&&a')
+    print(items)
+
-- 
GitLab