增加解析去重

优化js0解析器逻辑代码

增加解析去重
优化js0解析器逻辑代码
262dd438 · hjdhnx · 739adc4f · 262dd438 · 262dd438 · 262dd438
显示空白变更内容
内联并排

Showing 4 changed files with 50 additions and 51 deletions

base/rules.db base/rules.db +0 -0

controllers/home.py controllers/home.py +5 -2

py/测试pdf.py py/测试pdf.py +7 -3

utils/htmlParser.py utils/htmlParser.py +38 -46

未找到文件。
--- a/base/rules.db
+++ b/base/rules.db
--- a/controllers/home.py
+++ b/controllers/home.py
@@ -327,10 +327,12 @@ def sort_parses_by_order(parses,host):
    parse_list = parse.query_all()
    parse_url_list = list(map(lambda x: x['url'], parse_list))
    new_parses = []
+    new_parses_url = []
    for i in range(len(parses)):
        # parses[i]['id'] = i + 1
        # 去重
-        if parses[i]['url'] in new_parses:
+        if parses[i]['url'] in new_parses_url:
+            # print(f"重复的解析:{parses[i]['name']},{parses[i]['url']}")
            continue
        if str(parses[i]['url']).startswith(host):
            parses[i]['url'] = parses[i]['url'].replace(host,'')
@@ -349,6 +351,7 @@ def sort_parses_by_order(parses,host):
        if str(parses[i]['url']).startswith('/'):
            parses[i]['url'] = host + parses[i]['url']
        new_parses.append(parses[i])
+        new_parses_url.append(parses[i]['url'])
    new_parses.sort(key=functools.cmp_to_key(comp), reverse=False)
    # print(sites)
    for par in new_parses:
@@ -356,7 +359,7 @@ def sort_parses_by_order(parses,host):
        del par['order']
        del par['write_date']
    # print(new_parses)
-    logger.info(f'{len(parses)}条解析解析排序耗时:{get_interval(t1)}毫秒')
+    logger.info(f'{len(new_parses)}/{len(parses)}条解析解析排序耗时:{get_interval(t1)}毫秒')
    return new_parses

 @home.route('/configs')

--- a/py/测试pdf.py
+++ b/py/测试pdf.py
@@ -90,8 +90,12 @@ def main3():
    a = jsp.pdfh(html, 'div p:first--#exd1')
    print(a)

+    html = requests.get('https://www.leyupro.com/lyd/139451.html').text
+    a = jsp.pdfa(html,'.yunplay&&.downtitle&&ul li')
+    print(a)
+
 if __name__ == '__main__':
-    main()
+    # main()
    # main1()
    # main2()
-    # main3()
\ No newline at end of file
+    main3()
\ No newline at end of file
--- a/utils/htmlParser.py
+++ b/utils/htmlParser.py
@@ -36,6 +36,10 @@ class jsoup:
        test_ret = True if searchObj else False
        return test_ret

+    def contains(self, text: str, match: str):
+        # return match in text
+        return text.find(match) > -1
+
    def parseHikerToJq(self, parse, first=False):
        """
         海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0)
@@ -43,7 +47,7 @@ class jsoup:
        :param first:
        :return:
        """
-        if parse.find('&&') > -1:
+        if self.contains(parse, '&&'):
            parse = parse.split('&&')  # 带&&的重新拼接
            new_parses = []  # 构造新的解析表达式列表
            for i in range(len(parse)):
@@ -72,28 +76,26 @@ class jsoup:
        excludes = []  # 定义排除列表默认值为空
        nparse_index = 0  # 定义位置索引默认值为0
        nparse_rule = nparse  # 定义规则默认值为本身
-        if self.test(':eq', nparse):
+        if self.contains(nparse, ':eq'):
            nparse_rule = nparse.split(':eq')[0]
            nparse_pos = nparse.split(':eq')[1]
            # print(nparse_rule)
-            if self.test('--', nparse_rule):
+            if self.contains(nparse_rule, '--'):
                excludes = nparse_rule.split('--')[1:]
                nparse_rule = nparse_rule.split('--')[0]
-            elif self.test('--', nparse_pos):
+            elif self.contains(nparse_pos, '--'):
                excludes = nparse_pos.split('--')[1:]
                nparse_pos = nparse_pos.split('--')[0]
            try:
-                nparse_index = nparse_pos.split('(')[1].split(')')[0]
-                nparse_index = int(nparse_index)
+                nparse_index = int(nparse_pos.split('(')[1].split(')')[0])
            except:
-                nparse_index = 0
-            if nparse_index > 0:
-                print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes}')
-            return nparse_rule, nparse_index, excludes
-        else:
-            if self.test('--', nparse):
+                pass
+
+        elif self.contains(nparse, '--'):
            nparse_rule = nparse.split('--')[0]
            excludes = nparse.split('--')[1:]
+
+        # if nparse_index > 0:
        #     print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes}')
        return nparse_rule, nparse_index, excludes

@@ -105,28 +107,17 @@ class jsoup:
        :param ret: pd对象结果
        :return:
        """
-        if self.test(':eq', nparse):
-            nparse_rule, nparse_index, excludes = self.getParseInfo(nparse)
-            if not ret:
-                ret = doc(nparse_rule).eq(nparse_index)
-                # if nparse_index > 4:
-                #     print('1nparse_index',ret,not ret)
-            else:
-                ret = ret(nparse_rule).eq(nparse_index)
-                # if nparse_index > 4:
-                #     print('2nparse_index',ret)
-            if excludes and ret:
-                # print(excludes)
-                ret = ret.clone()  # 克隆一个,免得直接remove会影响doc的缓存
-                for exclude in excludes:
-                    # ret.remove(exclude)
-                    ret(exclude).remove()
-        else:
        nparse_rule, nparse_index, excludes = self.getParseInfo(nparse)
        if not ret:
            ret = doc(nparse_rule)
        else:
            ret = ret(nparse_rule)
+
+        if self.contains(nparse, ':eq'):
+            ret = ret.eq(nparse_index)
+            # if nparse_index > 4:
+            #     print('nparse_index',ret,not ret)
+
        if excludes and ret:
            # print(excludes)
            ret = ret.clone()  # 克隆一个,免得直接remove会影响doc的缓存
@@ -177,7 +168,7 @@ class jsoup:
            return doc.html()

        option = None
-        if parse.find('&&') > -1:
+        if self.contains(parse, '&&'):
            option = parse.split('&&')[-1]
            parse = '&&'.join(parse.split('&&')[:-1])
        parse = self.parseHikerToJq(parse, True)
@@ -198,14 +189,15 @@ class jsoup:
                ret = ret.html()
            else:
                ret = ret.attr(option) or ''
-                if option.lower().find('style') > -1 and ret.find('url(') > -1:
+                if self.contains(option.lower(), 'style') and self.contains(ret, 'url('):
                    try:
                        ret = re.search('url\((.*?)\)', ret, re.M | re.S).groups()[0]
                    except:
                        pass

                if ret and base_url:
-                    need_add = re.search(URLJOIN_ATTR, option, re.M | re.I)
+                    # need_add = re.search(URLJOIN_ATTR, option, re.M | re.I)
+                    need_add = self.test(URLJOIN_ATTR, option)
                    if need_add:
                        if 'http' in ret:
                            ret = ret[ret.find('http'):]