Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
转角邂逅
dr_py
提交
569d64c8
dr_py
项目概览
转角邂逅
/
dr_py
与 Fork 源项目一致
Fork自
晚风拂柳颜 / dr_py
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
dr_py
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
569d64c8
编写于
3月 22, 2023
作者:
H
hjdhnx
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
增加解析器的java文件
上级
f2468861
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
294 addition
and
0 deletion
+294
-0
utils/HtmlParser.java
utils/HtmlParser.java
+294
-0
未找到文件。
utils/HtmlParser.java
0 → 100644
浏览文件 @
569d64c8
package
com.github.tvbox.osc.util.js
;
import
android.text.TextUtils
;
import
com.quickjs.android.JSUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.net.MalformedURLException
;
import
java.net.URL
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.List
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
/**
 * Static helpers that evaluate "Hiker"-style DOM rules against HTML using jsoup.
 *
 * <p>A rule is a sequence of selector segments joined by {@code &&}. A segment may carry an
 * {@code :eq(n)} index (negative values count from the end) and {@code --selector} suffixes that
 * remove matching descendants from the selection. For {@link #parseDomForUrl} the last
 * {@code &&} segment is an option: {@code Text}, {@code Html}, or an attribute name.
 *
 * <p>The most recently parsed document of each entry-point family is cached in static fields.
 * Those fields are read and written without synchronization.
 */
public class HtmlParser {

    /** HTML text whose parsed form is currently held in {@link #pdfh_doc}. */
    private static String pdfh_html = "";
    /** HTML text whose parsed form is currently held in {@link #pdfa_doc}. */
    private static String pdfa_html = "";

    /** Captures the argument of a CSS {@code url(...)} expression. */
    private static final Pattern STYLE_URL =
            Pattern.compile("url\\((.*?)\\)", Pattern.MULTILINE | Pattern.DOTALL);

    /** Segments matching this pattern do not get an automatic {@code :eq(0)} appended. */
    private static final Pattern NOADD_INDEX =
            Pattern.compile(":eq|:lt|:gt|:first|:last|^body$|^#");

    /** Attribute names whose values are resolved against the base url. */
    private static final Pattern URLJOIN_ATTR = Pattern.compile(
            "(url|src|href|-original|-src|-play|-url)$",
            Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);

    private static Document pdfh_doc = null;
    private static Document pdfa_doc = null;

    /**
     * Resolves {@code child} against {@code parent}.
     *
     * @param parent base url; when {@code JSUtils.isEmpty} reports it empty, {@code child} is
     *               returned unchanged
     * @param child  absolute or relative url
     * @return the resolved url, or {@code parent} when the urls cannot be parsed
     */
    public static String joinUrl(String parent, String child) {
        if (JSUtils.isEmpty(parent)) {
            return child;
        }
        try {
            return new URL(new URL(parent), child).toExternalForm();
        } catch (MalformedURLException e) {
            // Best effort: callers have always received the parent url on failure.
            e.printStackTrace();
            return parent;
        }
    }

    /** Result of splitting one rule segment into selector, index and exclusion selectors. */
    public static class Painfo {
        /** CSS selector handed to jsoup. */
        public String nparse_rule;
        /** Index taken from {@code :eq(n)}; 0 when absent or unparsable. */
        public int nparse_index;
        /** Selectors to remove from the selection; {@code null} when the segment had no "--". */
        public List<String> excludes;
    }

    /**
     * Splits a single rule segment into its selector, its {@code :eq} index and its exclusion
     * list. Exclusions can be used to strip elements by tag, id, and so on; several are allowed.
     *
     * <p>The segment is cut at the {@code :eq} marker itself, so selectors that contain other
     * colons (for example {@code li:not(.ad):eq(1)}) keep their full selector and index.
     *
     * @param nparse one space-free segment of a rule
     * @return the parsed parts; never {@code null}
     */
    private static Painfo getParseInfo(String nparse) {
        Painfo painfo = new Painfo();
        int eqAt = nparse.indexOf(":eq");
        if (eqAt < 0) {
            painfo.nparse_rule = takeExcludes(nparse, painfo);
            return painfo;
        }
        painfo.nparse_rule = takeExcludes(nparse.substring(0, eqAt), painfo);
        // Skip the ':' so the remainder looks like "eq(n)" optionally followed by "--selector".
        String position = takeExcludes(nparse.substring(eqAt + 1), painfo);
        try {
            painfo.nparse_index =
                    Integer.parseInt(position.replace("eq(", "").replace(")", ""));
        } catch (NumberFormatException ignored) {
            // An unparsable index falls back to the first element.
            painfo.nparse_index = 0;
        }
        return painfo;
    }

    /**
     * Moves every non-empty {@code --selector} suffix of {@code segment} into
     * {@code painfo.excludes} and returns what precedes the first "--".
     */
    private static String takeExcludes(String segment, Painfo painfo) {
        if (!segment.contains("--")) {
            return segment;
        }
        String[] parts = segment.split("--");
        if (painfo.excludes == null) {
            painfo.excludes = new ArrayList<>();
        }
        for (int i = 1; i < parts.length; i++) {
            // An empty selector would make jsoup's select() reject the query.
            if (!parts[i].isEmpty()) {
                painfo.excludes.add(parts[i]);
            }
        }
        // split() drops trailing empties, so a segment made only of dashes yields no parts.
        return parts.length > 0 ? parts[0] : "";
    }

    /**
     * Substring-based check for an index-like marker in {@code str}. The first marker found
     * decides the result; {@code body} and {@code #} only count at the start of the string.
     */
    public static boolean isIndex(String str) {
        if (JSUtils.isEmpty(str)) {
            return false;
        }
        String[] markers = {":eq", ":lt", ":gt", ":first", ":last", "body", "#"};
        for (String marker : markers) {
            if (!str.contains(marker)) {
                continue;
            }
            boolean prefixOnly = marker.equals("body") || marker.equals("#");
            return !prefixOnly || str.startsWith(marker);
        }
        return false;
    }

    /** Reports whether {@code str} contains one of the url-like attribute name fragments. */
    public static boolean isUrl(String str) {
        if (JSUtils.isEmpty(str)) {
            return false;
        }
        String[] fragments = {"url", "src", "href", "-original", "-play"};
        for (String fragment : fragments) {
            if (str.contains(fragment)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Converts a Hiker rule into a space-separated selector chain, appending {@code :eq(0)} to
     * segments that carry no index marker.
     *
     * @param parse rule, possibly containing {@code &&}
     * @param first when true the final segment also receives {@code :eq(0)}
     * @return the converted rule
     */
    private static String parseHikerToJq(String parse, boolean first) {
        if (!parse.contains("&&")) {
            return (first && needsIndex(parse)) ? parse + ":eq(0)" : parse;
        }
        String[] segments = parse.split("&&");
        List<String> converted = new ArrayList<>(segments.length);
        for (int i = 0; i < segments.length; i++) {
            boolean isLast = i >= segments.length - 1;
            // Without "first" the last segment is left alone so it can match many elements.
            if (needsIndex(segments[i]) && (first || !isLast)) {
                converted.add(segments[i] + ":eq(0)");
            } else {
                converted.add(segments[i]);
            }
        }
        return TextUtils.join(" ", converted);
    }

    /** True when the last space-separated token of {@code segment} has no index marker. */
    private static boolean needsIndex(String segment) {
        String[] tokens = segment.split(" ");
        // split() returns an empty array for a blank segment.
        String last = tokens.length == 0 ? "" : tokens[tokens.length - 1];
        return !NOADD_INDEX.matcher(last).find();
    }

    /**
     * Evaluates {@code rule} against {@code html} and returns a single string.
     *
     * @param html    markup to query; {@code null} is treated as empty
     * @param rule    Hiker rule; its last {@code &&} segment selects text, html or an attribute
     * @param add_url base url used to resolve url-like attribute values, may be empty
     * @return the extracted value, {@code ""} when nothing matches, or the outer html of the
     *         match when the rule has no option segment
     */
    public static String parseDomForUrl(String html, String rule, String add_url) {
        Document doc = pdfhDocument(html);
        if (rule.equals("body&&Text") || rule.equals("Text")) {
            return doc.text();
        }
        if (rule.equals("body&&Html") || rule.equals("Html")) {
            return doc.html();
        }

        String option = "";
        if (rule.contains("&&")) {
            String[] rs = rule.split("&&");
            if (rs.length == 0) {
                // The rule consisted only of separators.
                return "";
            }
            option = rs[rs.length - 1];
            rule = TextUtils.join("&&", Arrays.asList(rs).subList(0, rs.length - 1));
        }

        Elements ret = selectChain(doc, parseHikerToJq(rule, true));
        if (ret.isEmpty()) {
            return "";
        }
        if (!JSUtils.isNotEmpty(option)) {
            return ret.outerHtml();
        }
        if (option.equals("Text")) {
            return ret.text();
        }
        if (option.equals("Html")) {
            return ret.html();
        }

        String result = ret.attr(option);
        if (option.toLowerCase().contains("style") && result.contains("url(")) {
            Matcher m = STYLE_URL.matcher(result);
            if (m.find()) {
                // CSS permits url('x') and url("x"); callers want the bare address.
                result = stripQuotes(m.group(1));
            }
        }
        if (JSUtils.isNotEmpty(result)
                && JSUtils.isNotEmpty(add_url)
                && URLJOIN_ATTR.matcher(option).find()) {
            int httpAt = result.indexOf("http");
            // A value that already embeds an absolute url is cut down to it.
            result = httpAt >= 0 ? result.substring(httpAt) : joinUrl(add_url, result);
        }
        return result;
    }

    /**
     * Evaluates {@code rule} against {@code html} and returns the outer html of every match.
     *
     * @param html markup to query; {@code null} is treated as empty
     * @param rule Hiker rule without an option segment
     * @return one entry per matched element; empty when nothing matches
     */
    public static List<String> parseDomForArray(String html, String rule) {
        Document doc = pdfaDocument(html);
        Elements ret = selectChain(doc, parseHikerToJq(rule, false));
        List<String> eleHtml = new ArrayList<>(ret.size());
        for (Element element : ret) {
            eleHtml.add(element.outerHtml());
        }
        return eleHtml;
    }

    /**
     * Applies one selector segment to {@code ret}, or to {@code doc} when {@code ret} is empty,
     * then applies its {@code :eq} index and its exclusions.
     */
    private static Elements parseOneRule(Document doc, String nparse, Elements ret) {
        Painfo painfo = getParseInfo(nparse);
        ret = ret.isEmpty() ? doc.select(painfo.nparse_rule) : ret.select(painfo.nparse_rule);

        if (nparse.contains(":eq")) {
            int index = painfo.nparse_index;
            if (index < 0) {
                index += ret.size();
            }
            // Still negative: the rule asked for more elements from the end than exist.
            ret = index < 0 ? new Elements() : ret.eq(index);
        }

        if (painfo.excludes != null && !ret.isEmpty()) {
            // Work on a copy so removals do not alter the cached document.
            ret = ret.clone();
            for (String exclude : painfo.excludes) {
                ret.select(exclude).remove();
            }
        }
        return ret;
    }

    /**
     * Builds {@code "text$url"} entries for every element matched by {@code p1}.
     *
     * @param html      markup to query; {@code null} is treated as empty
     * @param p1        Hiker rule selecting the list items
     * @param list_text rule evaluated inside each item for the text part
     * @param list_url  rule evaluated inside each item for the url part
     * @param add_url   base url used to resolve the url part
     * @return one entry per matched element; empty when nothing matches
     */
    public static List<String> parseDomForList(
            String html, String p1, String list_text, String list_url, String add_url) {
        Document doc = pdfaDocument(html);
        Elements ret = selectChain(doc, parseHikerToJq(p1, false));
        List<String> new_vod_list = new ArrayList<>(ret.size());
        for (Element element : ret) {
            String it = element.outerHtml();
            new_vod_list.add(parseDomForUrl(it, list_text, "").trim()
                    + '$'
                    + parseDomForUrl(it, list_url, add_url));
        }
        return new_vod_list;
    }

    /**
     * Runs every space-separated step of {@code jqRule} in order and stops at the first step
     * that matches nothing. Empty steps produced by repeated spaces are skipped.
     */
    private static Elements selectChain(Document doc, String jqRule) {
        Elements ret = new Elements();
        for (String step : jqRule.split(" ")) {
            if (step.isEmpty()) {
                continue;
            }
            ret = parseOneRule(doc, step, ret);
            if (ret.isEmpty()) {
                break;
            }
        }
        return ret;
    }

    /** Returns the cached single-value document for {@code html}, parsing it when it changed. */
    private static Document pdfhDocument(String html) {
        String source = html == null ? "" : html;
        if (pdfh_doc == null || !pdfh_html.equals(source)) {
            // Parse before storing the key so a failed parse cannot leave a mismatched cache.
            Document parsed = Jsoup.parse(source);
            pdfh_doc = parsed;
            pdfh_html = source;
        }
        return pdfh_doc;
    }

    /** Returns the cached list document for {@code html}, parsing it when it changed. */
    private static Document pdfaDocument(String html) {
        String source = html == null ? "" : html;
        if (pdfa_doc == null || !pdfa_html.equals(source)) {
            // Parse before storing the key so a failed parse cannot leave a mismatched cache.
            Document parsed = Jsoup.parse(source);
            pdfa_doc = parsed;
            pdfa_html = source;
        }
        return pdfa_doc;
    }

    /** Trims {@code value} and removes one pair of matching single or double quotes. */
    private static String stripQuotes(String value) {
        String trimmed = value.trim();
        if (trimmed.length() >= 2) {
            char head = trimmed.charAt(0);
            char tail = trimmed.charAt(trimmed.length() - 1);
            if ((head == '\'' || head == '"') && head == tail) {
                return trimmed.substring(1, trimmed.length() - 1);
            }
        }
        return trimmed;
    }
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录