From 262dd4387c7d23571b44d6acb577c2b34baef9ce Mon Sep 17 00:00:00 2001 From: hjdhnx Date: Sun, 20 Nov 2022 13:47:31 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=A7=A3=E6=9E=90=E5=8E=BB?= =?UTF-8?q?=E9=87=8D=20=E4=BC=98=E5=8C=96js0=E8=A7=A3=E6=9E=90=E5=99=A8?= =?UTF-8?q?=E9=80=BB=E8=BE=91=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- base/rules.db | Bin 155648 -> 155648 bytes controllers/home.py | 7 ++- "py/\346\265\213\350\257\225pdf.py" | 8 ++- utils/htmlParser.py | 84 +++++++++++++--------------- 4 files changed, 49 insertions(+), 50 deletions(-) diff --git a/base/rules.db b/base/rules.db index 083de5f3bccb109e2f73c9786edf61b077bf159c..bc51657816e2962ba8d7df24d5425f4810037a86 100644 GIT binary patch delta 30 mcmZoTz}awsbAmMEl8G|Tj7v5qEa+x5XlCr$&e+5F|1toxnG6g7 delta 30 mcmZoTz}awsbAmME;)ycOjEgrWEa+x5Y-a4)&e+5F|1toxehdo$ diff --git a/controllers/home.py b/controllers/home.py index ad7831f..17e7993 100644 --- a/controllers/home.py +++ b/controllers/home.py @@ -327,10 +327,12 @@ def sort_parses_by_order(parses,host): parse_list = parse.query_all() parse_url_list = list(map(lambda x: x['url'], parse_list)) new_parses = [] + new_parses_url = [] for i in range(len(parses)): # parses[i]['id'] = i + 1 # 去重 - if parses[i]['url'] in new_parses: + if parses[i]['url'] in new_parses_url: + # print(f"重复的解析:{parses[i]['name']},{parses[i]['url']}") continue if str(parses[i]['url']).startswith(host): parses[i]['url'] = parses[i]['url'].replace(host,'') @@ -349,6 +351,7 @@ def sort_parses_by_order(parses,host): if str(parses[i]['url']).startswith('/'): parses[i]['url'] = host + parses[i]['url'] new_parses.append(parses[i]) + new_parses_url.append(parses[i]['url']) new_parses.sort(key=functools.cmp_to_key(comp), reverse=False) # print(sites) for par in new_parses: @@ -356,7 +359,7 @@ def sort_parses_by_order(parses,host): del par['order'] del par['write_date'] # print(new_parses) - logger.info(f'{len(parses)}条解析解析排序耗时:{get_interval(t1)}毫秒') + logger.info(f'{len(new_parses)}/{len(parses)}条解析解析排序耗时:{get_interval(t1)}毫秒') return new_parses @home.route('/configs') diff --git "a/py/\346\265\213\350\257\225pdf.py" "b/py/\346\265\213\350\257\225pdf.py" index 04386f9..f8dd6fd 100644 --- "a/py/\346\265\213\350\257\225pdf.py" +++ "b/py/\346\265\213\350\257\225pdf.py" @@ -90,8 +90,12 @@ def main3(): a = jsp.pdfh(html, 'div p:first--#exd1') print(a) + html = requests.get('https://www.leyupro.com/lyd/139451.html').text + a = jsp.pdfa(html,'.yunplay&&.downtitle&&ul li') + print(a) + if __name__ == '__main__': - main() + # main() # main1() # main2() - # main3() \ No newline at end of file + main3() \ No newline at end of file diff --git a/utils/htmlParser.py b/utils/htmlParser.py index 1353b66..75c875f 100644 --- a/utils/htmlParser.py +++ b/utils/htmlParser.py @@ -36,6 +36,10 @@ class jsoup: test_ret = True if searchObj else False return test_ret + def contains(self, text: str, match: str): + # return match in text + return text.find(match) > -1 + def parseHikerToJq(self, parse, first=False): """ 海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0) @@ -43,7 +47,7 @@ class jsoup: :param first: :return: """ - if parse.find('&&') > -1: + if self.contains(parse, '&&'): parse = parse.split('&&') # 带&&的重新拼接 new_parses = [] # 构造新的解析表达式列表 for i in range(len(parse)): @@ -72,30 +76,28 @@ class jsoup: excludes = [] # 定义排除列表默认值为空 nparse_index = 0 # 定义位置索引默认值为0 nparse_rule = nparse # 定义规则默认值为本身 - if self.test(':eq', nparse): + if self.contains(nparse, ':eq'): nparse_rule = nparse.split(':eq')[0] nparse_pos = nparse.split(':eq')[1] # print(nparse_rule) - if self.test('--', nparse_rule): + if self.contains(nparse_rule, '--'): excludes = nparse_rule.split('--')[1:] nparse_rule = nparse_rule.split('--')[0] - elif self.test('--', nparse_pos): + elif self.contains(nparse_pos, '--'): excludes = nparse_pos.split('--')[1:] nparse_pos = nparse_pos.split('--')[0] try: - nparse_index = nparse_pos.split('(')[1].split(')')[0] - nparse_index = int(nparse_index) + nparse_index = int(nparse_pos.split('(')[1].split(')')[0]) except: - nparse_index = 0 - if nparse_index > 0: - print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes}') - return nparse_rule, nparse_index, excludes - else: - if self.test('--', nparse): - nparse_rule = nparse.split('--')[0] - excludes = nparse.split('--')[1:] - # print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes}') - return nparse_rule, nparse_index, excludes + pass + + elif self.contains(nparse, '--'): + nparse_rule = nparse.split('--')[0] + excludes = nparse.split('--')[1:] + + # if nparse_index > 0: + # print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes}') + return nparse_rule, nparse_index, excludes def parseOneRule(self, doc, nparse, ret=None): """ @@ -105,34 +107,23 @@ class jsoup: :param ret: pd对象结果 :return: """ - if self.test(':eq', nparse): - nparse_rule, nparse_index, excludes = self.getParseInfo(nparse) - if not ret: - ret = doc(nparse_rule).eq(nparse_index) - # if nparse_index > 4: - # print('1nparse_index',ret,not ret) - else: - ret = ret(nparse_rule).eq(nparse_index) - # if nparse_index > 4: - # print('2nparse_index',ret) - if excludes and ret: - # print(excludes) - ret = ret.clone() # 克隆一个,免得直接remove会影响doc的缓存 - for exclude in excludes: - # ret.remove(exclude) - ret(exclude).remove() + nparse_rule, nparse_index, excludes = self.getParseInfo(nparse) + if not ret: + ret = doc(nparse_rule) else: - nparse_rule, nparse_index, excludes = self.getParseInfo(nparse) - if not ret: - ret = doc(nparse_rule) - else: - ret = ret(nparse_rule) - if excludes and ret: - # print(excludes) - ret = ret.clone() # 克隆一个,免得直接remove会影响doc的缓存 - for exclude in excludes: - # ret.remove(exclude) - ret(exclude).remove() + ret = ret(nparse_rule) + + if self.contains(nparse, ':eq'): + ret = ret.eq(nparse_index) + # if nparse_index > 4: + # print('nparse_index',ret,not ret) + + if excludes and ret: + # print(excludes) + ret = ret.clone() # 克隆一个,免得直接remove会影响doc的缓存 + for exclude in excludes: + # ret.remove(exclude) + ret(exclude).remove() return ret def pdfa(self, html, parse: str): @@ -177,7 +168,7 @@ class jsoup: return doc.html() option = None - if parse.find('&&') > -1: + if self.contains(parse, '&&'): option = parse.split('&&')[-1] parse = '&&'.join(parse.split('&&')[:-1]) parse = self.parseHikerToJq(parse, True) @@ -198,14 +189,15 @@ class jsoup: ret = ret.html() else: ret = ret.attr(option) or '' - if option.lower().find('style') > -1 and ret.find('url(') > -1: + if self.contains(option.lower(), 'style') and self.contains(ret, 'url('): try: ret = re.search('url\((.*?)\)', ret, re.M | re.S).groups()[0] except: pass if ret and base_url: - need_add = re.search(URLJOIN_ATTR, option, re.M | re.I) + # need_add = re.search(URLJOIN_ATTR, option, re.M | re.I) + need_add = self.test(URLJOIN_ATTR, option) if need_add: if 'http' in ret: ret = ret[ret.find('http'):] -- GitLab