Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
落寞的温暖
dr_py
提交
44300bfd
dr_py
项目概览
落寞的温暖
/
dr_py
与 Fork 源项目一致
Fork自
晚风拂柳颜 / dr_py
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
dr_py
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
44300bfd
编写于
11月 14, 2022
作者:
H
hjdhnx
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
修改js0相应的解析库pdfa和pdfh,更加精准了,支持eq负数
修复看视界
上级
22778e60
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
320 additions
and
97 deletions
+320
-97
base/rules.db
base/rules.db
+0
-0
js/version.txt
js/version.txt
+1
-1
js/看视界.js
js/看视界.js
+2
-0
utils/htmlParseerOld.py
utils/htmlParseerOld.py
+210
-0
utils/htmlParser.py
utils/htmlParser.py
+107
-96
未找到文件。
base/rules.db
浏览文件 @
44300bfd
无法预览此类型文件
js/version.txt
浏览文件 @
44300bfd
3.9.20beta8
\ No newline at end of file
3.8.8
\ No newline at end of file
js/看视界.js
浏览文件 @
44300bfd
var
rule
=
Object
.
assign
(
muban
.
mxone5
,{
title
:
'
看视界
'
,
host
:
'
https://www.1080kan.cc
'
,
headers
:{
'
User-Agent
'
:
'
MOBILE_UA
'
},
});
\ No newline at end of file
utils/htmlParseerOld.py
0 → 100644
浏览文件 @
44300bfd
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date : 2022/8/25
import
json
from
pyquery
import
PyQuery
as
pq
from
lxml
import
etree
from
urllib.parse
import
urljoin
import
re
from
jsonpath
import
jsonpath
PARSE_CACHE
=
True
# 解析缓存
class
jsoup
:
def
__init__
(
self
,
MY_URL
=
''
):
self
.
MY_URL
=
MY_URL
self
.
pdfh_html
=
''
self
.
pdfa_html
=
''
self
.
pdfh_doc
=
None
self
.
pdfa_doc
=
None
def
test
(
self
,
text
:
str
,
string
:
str
):
searchObj
=
re
.
search
(
rf
'
{
text
}
'
,
string
,
re
.
M
|
re
.
I
)
test_ret
=
True
if
searchObj
else
False
return
test_ret
def
pdfh
(
self
,
html
,
parse
:
str
,
add_url
=
False
):
if
not
parse
:
return
''
if
PARSE_CACHE
:
if
self
.
pdfh_html
!=
html
:
self
.
pdfh_html
=
html
self
.
pdfh_doc
=
pq
(
html
)
doc
=
self
.
pdfh_doc
else
:
doc
=
pq
(
html
)
if
parse
==
'body&&Text'
or
parse
==
'Text'
:
text
=
doc
.
text
()
return
text
elif
parse
==
'body&&Html'
or
parse
==
'Html'
:
return
doc
.
html
()
option
=
None
if
parse
.
find
(
'&&'
)
>
-
1
:
option
=
parse
.
split
(
'&&'
)[
-
1
]
parse
=
parse
.
split
(
'&&'
)[:
-
1
]
# 如果只有一个&& 取的就直接是0
if
len
(
parse
)
>
1
:
# 如果不大于1可能就是option操作,不需要拼eq
parse
=
' '
.
join
([
i
if
self
.
test
(
':eq|:lt|:gt|#'
,
i
)
else
f
'
{
i
}
:eq(0)'
for
i
in
parse
])
else
:
parse
=
parse
[
0
]
if
self
.
test
(
':eq|:lt|:gt|#'
,
parse
[
0
])
else
f
'
{
parse
[
0
]
}
:eq(0)'
# FIXME 暂时不支持jsonpath那样的|| 分割取或属性
if
option
:
# print(f'parse:{parse}=>(option:{option})')
if
':eq(-1)'
in
parse
:
# 处理 eq(-1)的情况,兼容性差,暂时只支持一层eq
ret
=
doc
(
parse
.
replace
(
':eq(-1)'
,
''
)).
eq
(
-
1
)
else
:
ret
=
doc
(
parse
)
# print(html)
# FIXME 解析出来有多个的情况应该自动取第一个
if
option
==
'Text'
:
ret
=
ret
.
text
()
elif
option
==
'Html'
:
ret
=
ret
.
html
()
else
:
ret
=
ret
.
attr
(
option
)
or
''
if
option
.
lower
().
find
(
'style'
)
>-
1
and
ret
.
find
(
'url('
)
>-
1
:
try
:
ret
=
re
.
search
(
'url\((.*?)\)'
,
ret
,
re
.
M
|
re
.
S
).
groups
()[
0
]
except
:
pass
if
ret
and
add_url
:
# pd_list = 'url|src|href|data-original|data-src|data-play|data-url'.split('|')
# need_add = option in pd_list
need_add
=
re
.
search
(
'(url|src|href|-original|-src|-play|-url)$'
,
option
,
re
.
M
|
re
.
I
)
# print(f'option:{option},need_add:{need_add}')
if
need_add
:
if
'http'
in
ret
:
ret
=
ret
[
ret
.
find
(
'http'
):]
else
:
ret
=
urljoin
(
self
.
MY_URL
,
ret
)
# print(ret)
else
:
# ret = doc(parse+':first')
ret
=
doc
(
parse
)
# 由于是生成器,直接转str就能拿到第一条数据,不需要next
# ret = ret.next() # 取第一条数据
# ret = doc(parse) # 下面注释的写法不对的
# ret = ret.find(':first')
# ret = ret.children(':first')
# print(parse)
# ret = str(ret)
ret
=
ret
.
outerHtml
()
return
ret
def
pdfa
(
self
,
html
,
parse
:
str
):
# 看官方文档才能解决这个问题!!!
# https://pyquery.readthedocs.io/en/latest/api.html
if
not
parse
:
return
[]
if
parse
.
find
(
'&&'
)
>
-
1
:
parse
=
parse
.
split
(
'&&'
)
# 带&&的重新拼接
# print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
parse
=
' '
.
join
([
parse
[
i
]
if
self
.
test
(
':eq|:lt|:gt'
,
parse
[
i
])
or
i
>=
len
(
parse
)
-
1
else
f
'
{
parse
[
i
]
}
:eq(0)'
for
i
in
range
(
len
(
parse
))])
print
(
f
'pdfa:
{
parse
}
'
)
# print(html)
if
PARSE_CACHE
:
if
self
.
pdfa_html
!=
html
:
self
.
pdfa_html
=
html
self
.
pdfa_doc
=
pq
(
html
)
doc
=
self
.
pdfa_doc
else
:
doc
=
pq
(
html
)
result
=
doc
(
parse
)
# 节点转字符串
# print(str(etree.tostring(result[0], pretty_print=True), 'utf-8'))
# res = [item for item in result.items()]
# print(res)
res
=
[
item
.
outerHtml
()
for
item
in
result
.
items
()]
# 这个才是对的!!str() item str(etree.tostring 统统错误
# res = [str(item) for item in result.items()]
# res = [str(etree.tostring(item, pretty_print=True), 'utf-8') for item in result]
# print(len(res),res)
# print('pdfa执行结果数:',len(res))
return
res
def
pd
(
self
,
html
,
parse
:
str
):
return
self
.
pdfh
(
html
,
parse
,
True
)
def
pq
(
self
,
html
:
str
):
return
pq
(
html
)
def
pjfh
(
self
,
html
,
parse
:
str
,
add_url
=
False
):
if
not
parse
:
return
''
if
isinstance
(
html
,
str
):
# print(html)
try
:
html
=
json
.
loads
(
html
)
# html = eval(html)
except
:
print
(
'字符串转json失败'
)
return
''
if
not
parse
.
startswith
(
'$.'
):
parse
=
f
'$.
{
parse
}
'
ret
=
''
for
ps
in
parse
.
split
(
'||'
):
ret
=
jsonpath
(
html
,
ps
)
if
isinstance
(
ret
,
list
):
ret
=
str
(
ret
[
0
])
if
ret
[
0
]
else
''
else
:
ret
=
str
(
ret
)
if
ret
else
''
if
add_url
and
ret
:
ret
=
urljoin
(
self
.
MY_URL
,
ret
)
if
ret
:
break
# print(ret)
return
ret
def
pj
(
self
,
html
,
parse
:
str
):
return
self
.
pjfh
(
html
,
parse
,
True
)
def
pjfa
(
self
,
html
,
parse
:
str
):
if
not
parse
:
return
[]
if
isinstance
(
html
,
str
):
try
:
html
=
json
.
loads
(
html
)
except
:
return
''
if
not
parse
.
startswith
(
'$.'
):
parse
=
f
'$.
{
parse
}
'
# print(html)
# print(parse)
ret
=
jsonpath
(
html
,
parse
)
# print(ret)
# print(type(ret))
# print(type(ret[0]))
# print(len(ret))
if
isinstance
(
ret
,
list
)
and
isinstance
(
ret
[
0
],
list
)
and
len
(
ret
)
==
1
:
# print('自动解包')
ret
=
ret
[
0
]
# 自动解包
return
ret
or
[]
if
__name__
==
'__main__'
:
import
requests
from
parsel
import
Selector
url
=
'http://360yy.cn'
jsp
=
jsoup
(
url
)
def
pdfa2
(
html
,
parse
):
if
not
parse
:
return
[]
if
parse
.
find
(
'&&'
)
>
-
1
:
parse
=
parse
.
split
(
'&&'
)
# 带&&的重新拼接
# print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
# parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
parse
=
' '
.
join
([
parse
[
i
]
if
jsoup
().
test
(
':eq|:lt|:gt'
,
parse
[
i
])
or
i
>=
len
(
parse
)
-
1
else
f
'
{
parse
[
i
]
}
:nth-child(1)'
for
i
in
range
(
len
(
parse
))])
# print(f'pdfa:{parse}')
selector
=
Selector
(
text
=
html
)
print
(
parse
)
items
=
selector
.
css
(
parse
)
return
[
str
(
item
)
for
item
in
items
]
r
=
requests
.
get
(
url
)
html
=
r
.
text
# parsel 不好用啊,很难实现封装pdfa之类的函数
items
=
pdfa2
(
html
,
'.fed-pops-navbar&&ul.fed-part-rows&&a'
)
print
(
items
)
utils/htmlParser.py
浏览文件 @
44300bfd
...
...
@@ -3,18 +3,20 @@
# File : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date : 2022/8/25
import
json
import
ujson
from
pyquery
import
PyQuery
as
pq
from
lxml
import
etree
from
urllib.parse
import
urljoin
import
re
from
jsonpath
import
jsonpath
PARSE_CACHE
=
True
# 解析缓存
NOADD_INDEX
=
':eq|:lt|:gt|^body$|^#'
# 不自动加eq下标索引
URLJOIN_ATTR
=
'(url|src|href|-original|-src|-play|-url)$'
# 需要自动urljoin的属性
class
jsoup
:
def
__init__
(
self
,
MY_URL
=
''
):
def
__init__
(
self
,
MY_URL
=
''
):
self
.
MY_URL
=
MY_URL
self
.
pdfh_html
=
''
self
.
pdfa_html
=
''
...
...
@@ -22,13 +24,19 @@ class jsoup:
self
.
pdfh_doc
=
None
self
.
pdfa_doc
=
None
def
test
(
self
,
text
:
str
,
string
:
str
):
def
test
(
self
,
text
:
str
,
string
:
str
):
"""
正则判断字符串包含,模仿js的 //.test()
:param text:
:param string:
:return:
"""
searchObj
=
re
.
search
(
rf
'
{
text
}
'
,
string
,
re
.
M
|
re
.
I
)
test_ret
=
True
if
searchObj
else
False
return
test_ret
def
pdfh
(
self
,
html
,
parse
:
str
,
add_url
=
False
):
if
not
parse
:
def
pdfh
(
self
,
html
,
parse
:
str
,
add_url
=
False
):
if
not
all
([
html
,
parse
])
:
return
''
if
PARSE_CACHE
:
if
self
.
pdfh_html
!=
html
:
...
...
@@ -42,71 +50,98 @@ class jsoup:
return
text
elif
parse
==
'body&&Html'
or
parse
==
'Html'
:
return
doc
.
html
()
option
=
None
if
parse
.
find
(
'&&'
)
>
-
1
:
option
=
parse
.
split
(
'&&'
)[
-
1
]
parse
=
parse
.
split
(
'&&'
)[:
-
1
]
# 如果只有一个&& 取的就直接是0
if
len
(
parse
)
>
1
:
# 如果不大于1可能就是option操作,不需要拼eq
parse
=
' '
.
join
([
i
if
self
.
test
(
':eq|:lt|:gt|#'
,
i
)
else
f
'
{
i
}
:eq(0)'
for
i
in
parse
])
else
:
parse
=
parse
[
0
]
if
self
.
test
(
':eq|:lt|:gt|#'
,
parse
[
0
])
else
f
'
{
parse
[
0
]
}
:eq(0)'
# FIXME 暂时不支持jsonpath那样的|| 分割取或属性
parse
=
'&&'
.
join
(
parse
.
split
(
'&&'
)[:
-
1
])
parse
=
self
.
parseHikerToJq
(
parse
,
True
)
# print(f'pdfh:{parse},option:{option}')
parses
=
parse
.
split
(
' '
)
ret
=
None
for
nparse
in
parses
:
ret
=
self
.
parseOneRule
(
doc
,
nparse
,
ret
)
# print(nparse,ret)
if
option
:
# print(f'parse:{parse}=>(option:{option})')
if
':eq(-1)'
in
parse
:
# 处理 eq(-1)的情况,兼容性差,暂时只支持一层eq
ret
=
doc
(
parse
.
replace
(
':eq(-1)'
,
''
)).
eq
(
-
1
)
else
:
ret
=
doc
(
parse
)
# print(html)
# FIXME 解析出来有多个的情况应该自动取第一个
if
option
==
'Text'
:
ret
=
ret
.
text
()
elif
option
==
'Html'
:
ret
=
ret
.
html
()
else
:
ret
=
ret
.
attr
(
option
)
or
''
if
option
.
lower
().
find
(
'style'
)
>-
1
and
ret
.
find
(
'url('
)
>
-
1
:
if
option
.
lower
().
find
(
'style'
)
>
-
1
and
ret
.
find
(
'url('
)
>
-
1
:
try
:
ret
=
re
.
search
(
'url\((.*?)\)'
,
ret
,
re
.
M
|
re
.
S
).
groups
()[
0
]
ret
=
re
.
search
(
'url\((.*?)\)'
,
ret
,
re
.
M
|
re
.
S
).
groups
()[
0
]
except
:
pass
if
ret
and
add_url
:
# pd_list = 'url|src|href|data-original|data-src|data-play|data-url'.split('|')
# need_add = option in pd_list
need_add
=
re
.
search
(
'(url|src|href|-original|-src|-play|-url)$'
,
option
,
re
.
M
|
re
.
I
)
# print(f'option:{option},need_add:{need_add}')
need_add
=
re
.
search
(
URLJOIN_ATTR
,
option
,
re
.
M
|
re
.
I
)
if
need_add
:
if
'http'
in
ret
:
ret
=
ret
[
ret
.
find
(
'http'
):]
else
:
ret
=
urljoin
(
self
.
MY_URL
,
ret
)
# print(ret)
ret
=
urljoin
(
self
.
MY_URL
,
ret
)
else
:
# ret = doc(parse+':first')
ret
=
doc
(
parse
)
# 由于是生成器,直接转str就能拿到第一条数据,不需要next
# ret = ret.next() # 取第一条数据
# ret = doc(parse) # 下面注释的写法不对的
# ret = ret.find(':first')
# ret = ret.children(':first')
# print(parse)
# ret = str(ret)
ret
=
ret
.
outerHtml
()
return
ret
def
pdfa
(
self
,
html
,
parse
:
str
):
def
parseOneRule
(
self
,
doc
,
nparse
,
ret
=
None
):
"""
解析空格分割后的原生表达式中的一条记录,正确处理eq的索引,返回处理后的ret
:param doc: pq(html) load 后的pq对象
:param nparse: 当前单个解析表达式
:param ret: pd对象结果
:return:
"""
if
self
.
test
(
':eq'
,
nparse
):
nparse_rule
=
nparse
.
split
(
':eq'
)[
0
]
nparse_index
=
nparse
.
split
(
':eq'
)[
1
].
split
(
'('
)[
1
].
split
(
')'
)[
0
]
try
:
nparse_index
=
int
(
nparse_index
)
except
:
nparse_index
=
0
if
not
ret
:
ret
=
doc
(
nparse_rule
).
eq
(
nparse_index
)
else
:
ret
=
ret
(
nparse_rule
)
else
:
if
not
ret
:
ret
=
doc
(
nparse
)
else
:
ret
=
ret
(
nparse
)
return
ret
def
parseHikerToJq
(
self
,
parse
,
first
=
False
):
"""
海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0)
:param parse:
:param first:
:return:
"""
if
parse
.
find
(
'&&'
)
>
-
1
:
parse
=
parse
.
split
(
'&&'
)
# 带&&的重新拼接
new_parses
=
[]
# 构造新的解析表达式列表
for
i
in
range
(
len
(
parse
)):
ps
=
parse
[
i
].
split
(
' '
)[
-
1
]
# 如果分割&&后带空格就取最后一个元素
if
not
self
.
test
(
NOADD_INDEX
,
ps
):
if
not
first
and
i
>=
len
(
parse
)
-
1
:
# 不传first且遇到最后一个,不用补eq(0)
new_parses
.
append
(
parse
[
i
])
else
:
new_parses
.
append
(
f
'
{
parse
[
i
]
}
:eq(0)'
)
else
:
new_parses
.
append
(
parse
[
i
])
parse
=
' '
.
join
(
new_parses
)
return
parse
def
pdfa
(
self
,
html
,
parse
:
str
):
# 看官方文档才能解决这个问题!!!
# https://pyquery.readthedocs.io/en/latest/api.html
if
not
parse
:
if
not
all
([
html
,
parse
])
:
return
[]
if
parse
.
find
(
'&&'
)
>
-
1
:
parse
=
parse
.
split
(
'&&'
)
# 带&&的重新拼接
# print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
parse
=
' '
.
join
([
parse
[
i
]
if
self
.
test
(
':eq|:lt|:gt'
,
parse
[
i
])
or
i
>=
len
(
parse
)
-
1
else
f
'
{
parse
[
i
]
}
:eq(0)'
for
i
in
range
(
len
(
parse
))])
parse
=
self
.
parseHikerToJq
(
parse
)
print
(
f
'pdfa:
{
parse
}
'
)
# print(html)
if
PARSE_CACHE
:
if
self
.
pdfa_html
!=
html
:
self
.
pdfa_html
=
html
...
...
@@ -114,32 +149,29 @@ class jsoup:
doc
=
self
.
pdfa_doc
else
:
doc
=
pq
(
html
)
result
=
doc
(
parse
)
# 节点转字符串
# print(str(etree.tostring(result[0], pretty_print=True), 'utf-8'))
# res = [item for item in result.items()]
# print(res)
res
=
[
item
.
outerHtml
()
for
item
in
result
.
items
()]
# 这个才是对的!!str() item str(etree.tostring 统统错误
# res = [str(item) for item in result.items()]
# res = [str(etree.tostring(item, pretty_print=True), 'utf-8') for item in result]
# print(len(res),res)
# print('pdfa执行结果数:',len(res))
parses
=
parse
.
split
(
' '
)
ret
=
None
for
nparse
in
parses
:
ret
=
self
.
parseOneRule
(
doc
,
nparse
,
ret
)
# print(len(ret),nparse)
res
=
[
item
.
outerHtml
()
for
item
in
ret
.
items
()]
return
res
def
pd
(
self
,
html
,
parse
:
str
):
return
self
.
pdfh
(
html
,
parse
,
True
)
def
pd
(
self
,
html
,
parse
:
str
):
return
self
.
pdfh
(
html
,
parse
,
True
)
def
pq
(
self
,
html
:
str
):
def
pq
(
self
,
html
:
str
):
return
pq
(
html
)
def
pjfh
(
self
,
html
,
parse
:
str
,
add_url
=
False
):
if
not
parse
:
def
pjfh
(
self
,
html
,
parse
:
str
,
add_url
=
False
):
if
not
all
([
html
,
parse
])
:
return
''
if
isinstance
(
html
,
str
):
if
isinstance
(
html
,
str
):
# print(html)
try
:
html
=
json
.
loads
(
html
)
# html = eval(html)
html
=
u
json
.
loads
(
html
)
# html = eval(html)
except
:
print
(
'字符串转json失败'
)
return
''
...
...
@@ -147,8 +179,8 @@ class jsoup:
parse
=
f
'$.
{
parse
}
'
ret
=
''
for
ps
in
parse
.
split
(
'||'
):
ret
=
jsonpath
(
html
,
ps
)
if
isinstance
(
ret
,
list
):
ret
=
jsonpath
(
html
,
ps
)
if
isinstance
(
ret
,
list
):
ret
=
str
(
ret
[
0
])
if
ret
[
0
]
else
''
else
:
ret
=
str
(
ret
)
if
ret
else
''
...
...
@@ -159,52 +191,31 @@ class jsoup:
# print(ret)
return
ret
def
pj
(
self
,
html
,
parse
:
str
):
def
pj
(
self
,
html
,
parse
:
str
):
return
self
.
pjfh
(
html
,
parse
,
True
)
def
pjfa
(
self
,
html
,
parse
:
str
):
if
not
parse
:
def
pjfa
(
self
,
html
,
parse
:
str
):
if
not
all
([
html
,
parse
])
:
return
[]
if
isinstance
(
html
,
str
):
if
isinstance
(
html
,
str
):
try
:
html
=
json
.
loads
(
html
)
html
=
u
json
.
loads
(
html
)
except
:
return
''
return
[]
if
not
parse
.
startswith
(
'$.'
):
parse
=
f
'$.
{
parse
}
'
# print(html)
# print(parse)
ret
=
jsonpath
(
html
,
parse
)
ret
=
jsonpath
(
html
,
parse
)
# print(ret)
# print(type(ret))
# print(type(ret[0]))
# print(len(ret))
if
isinstance
(
ret
,
list
)
and
isinstance
(
ret
[
0
],
list
)
and
len
(
ret
)
==
1
:
if
isinstance
(
ret
,
list
)
and
isinstance
(
ret
[
0
],
list
)
and
len
(
ret
)
==
1
:
# print('自动解包')
ret
=
ret
[
0
]
# 自动解包
ret
=
ret
[
0
]
# 自动解包
return
ret
or
[]
if
__name__
==
'__main__'
:
import
requests
from
parsel
import
Selector
url
=
'http://360yy.cn'
jsp
=
jsoup
(
url
)
def
pdfa2
(
html
,
parse
):
if
not
parse
:
return
[]
if
parse
.
find
(
'&&'
)
>
-
1
:
parse
=
parse
.
split
(
'&&'
)
# 带&&的重新拼接
# print(f"{parse[0]},{self.test(':eq|:lt|:gt', parse[0])}")
# parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
parse
=
' '
.
join
([
parse
[
i
]
if
jsoup
().
test
(
':eq|:lt|:gt'
,
parse
[
i
])
or
i
>=
len
(
parse
)
-
1
else
f
'
{
parse
[
i
]
}
:nth-child(1)'
for
i
in
range
(
len
(
parse
))])
# print(f'pdfa:{parse}')
selector
=
Selector
(
text
=
html
)
print
(
parse
)
items
=
selector
.
css
(
parse
)
return
[
str
(
item
)
for
item
in
items
]
r
=
requests
.
get
(
url
)
html
=
r
.
text
# parsel 不好用啊,很难实现封装pdfa之类的函数
items
=
pdfa2
(
html
,
'.fed-pops-navbar&&ul.fed-part-rows&&a'
)
print
(
items
)
if
__name__
==
'__main__'
:
pass
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录