Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
言程序plus
dr_py
提交
69e781d1
dr_py
项目概览
言程序plus
/
dr_py
与 Fork 源项目一致
Fork自
晚风拂柳颜 / dr_py
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
dr_py
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
69e781d1
编写于
9月 30, 2022
作者:
H
hjdhnx
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
调试真不卡
上级
cd9b48db
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
92 addition
and
26 deletion
+92
-26
base/rules.db
base/rules.db
+0
-0
controllers/parse.py
controllers/parse.py
+20
-1
txt/pluto/drpy.js
txt/pluto/drpy.js
+72
-25
未找到文件。
base/rules.db
浏览文件 @
69e781d1
无法预览此类型文件
controllers/parse.py
浏览文件 @
69e781d1
...
...
@@ -7,7 +7,9 @@ from flask import Blueprint, jsonify,redirect
from
utils.web
import
getParmas
,
get_interval
import
os
from
utils.log
import
logger
from
utils.encode
import
OcrApi
from
utils.pyctx
import
py_ctx
,
getPreJs
,
runJScode
,
JsObjectWrapper
,
PyJsString
,
parseText
,
jsoup
,
time
import
base64
parse
=
Blueprint
(
"parse"
,
__name__
)
...
...
@@ -89,4 +91,21 @@ def parse_home(filename):
except
Exception
as
e
:
msg
=
f
'
{
filename
}
解析出错:
{
e
}
'
logger
.
info
(
msg
)
return
R
.
failed
(
msg
,
extra
=
{
'time'
:
f
'
{
get_interval
(
t1
)
}
毫秒'
,
'from'
:
url
})
\ No newline at end of file
return
R
.
failed
(
msg
,
extra
=
{
'time'
:
f
'
{
get_interval
(
t1
)
}
毫秒'
,
'from'
:
url
})
@
parse
.
route
(
'/ocr'
,
methods
=
[
'POST'
])
def
base64_ocr
():
# print('params:',getParmas())
img
=
getParmas
(
'img'
)
# print(img)
img_bytes
=
base64
.
b64decode
(
img
)
# print(img_bytes)
img_path
=
'txt/pluto'
os
.
makedirs
(
img_path
,
exist_ok
=
True
)
with
open
(
f
'
{
img_path
}
/yzm.png'
,
'wb+'
)
as
f
:
f
.
write
(
img_bytes
)
ocr
=
OcrApi
(
'http://dm.mudery.com:10000'
)
code
=
ocr
.
classification
(
img_bytes
)
resp
=
R
.
success
(
'识别成功'
,
code
)
print
(
resp
.
json
)
return
resp
\ No newline at end of file
txt/pluto/drpy.js
浏览文件 @
69e781d1
...
...
@@ -8,13 +8,15 @@ import ch from './cheerio.min.js';
const
key
=
'
drpy_zbk
'
;
function
init_test
(){
console
.
log
(
"
init_test_start
"
);
// clearItem(RULE_CK);
console
.
log
(
JSON
.
stringify
(
rule
));
// console.log(request('https://www.baidu.com',{withHeaders:true}));
console
.
log
(
request
(
'
https://www.baidu.com/favicon.ico
'
,{
toBase64
:
true
}));
console
.
log
(
"
init_test
"
);
require
(
'
http://192.168.10.99:5705/txt/pluto/drT.js
'
);
console
.
log
(
typeof
(
drT
));
console
.
log
(
drT
.
renderText
(
'
{{fl.cate}},hi, {{fl}}哈哈.{{fl}}
'
,{
sort
:
1
,
cate
:
'
movie
'
},
'
fl
'
)
);
//
console.log(request('https://www.baidu.com/favicon.ico',{toBase64:true}));
// require('http://192.168.10.99:5705/txt/pluto/drT.js'
);
// console.log(typeof(drT)
);
// console.log(drT.renderText('{{fl.cate}},hi, {{fl}}哈哈.{{fl}}',{sort: 1,cate:'movie'},'fl'
));
console
.
log
(
"
init_test_end
"
);
}
let
rule
=
{
...
...
@@ -71,8 +73,9 @@ const RULE_CK = 'cookie'; // 源cookie的key值
const
KEY
=
typeof
(
key
)
!==
'
undefined
'
&&
key
?
key
:
'
drpy_
'
+
rule
.
title
;
// 源的唯一标识
const
CATE_EXCLUDE
=
'
首页|留言|APP|下载|资讯|新闻|动态
'
;
const
TAB_EXCLUDE
=
'
猜你|喜欢|APP|下载|剧情|热播
'
;
const
OCR_RETRY
=
3
;
//ocr验证重试次数
const
OCR_API
=
'
http://dm.mudery.com:10000
'
;
//ocr在线识别接口
const
OCR_RETRY
=
1
;
//ocr验证重试次数
// const OCR_API = 'http://dm.mudery.com:10000';//ocr在线识别接口
const
OCR_API
=
'
http://192.168.3.239:5705/parse/ocr
'
;
//ocr在线识别接口
var
MY_URL
;
// 全局注入变量,pd函数需要
/** 处理一下 rule规则关键字段没传递的情况 **/
...
...
@@ -115,7 +118,9 @@ var OcrApi={
classification
:
function
(
img
){
// img是byte类型,这里不方便搞啊
let
code
=
''
;
try
{
code
=
request
(
this
.
api
,{
data
:
img
,
headers
:{
'
user-agent
'
:
PC_UA
},
'
method
'
:
'
POST
'
});
let
html
=
request
(
this
.
api
,{
data
:{
img
:
img
},
headers
:{
'
User-Agent
'
:
PC_UA
},
'
method
'
:
'
POST
'
});
html
=
JSON
.
parse
(
html
);
code
=
html
.
url
||
''
;
}
catch
(
e
)
{}
return
code
}
...
...
@@ -132,17 +137,35 @@ function verifyCode(url){
while
(
cnt
<
OCR_RETRY
){
try
{
// let obj = {headers:headers,timeout:timeout};
let
img
=
request
(
`
${
host
}
/index.php/verify/index.html`
);
let
yzm_url
=
`
${
host
}
/index.php/verify/index.html`
;
console
.
log
(
`验证码链接:
${
yzm_url
}
`
);
let
hhtml
=
request
(
yzm_url
,{
withHeaders
:
true
,
toBase64
:
true
});
let
json
=
JSON
.
parse
(
hhtml
);
if
(
!
cookie
){
cookie
=
json
[
'
set-cookie
'
]?
json
[
'
set-cookie
'
].
split
(
'
;
'
)[
0
]:
''
;
}
// console.log(hhtml);
console
.
log
(
'
cookie:
'
+
cookie
);
let
img
=
json
.
body
;
// console.log(img);
let
code
=
OcrApi
.
classification
(
img
);
console
.
log
(
`第
${
cnt
+
1
}
次验证码识别结果:
${
code
}
`
);
let
html
=
request
(
`
${
host
}
/index.php/ajax/verify_check?type=search&verify=
${
code
}
`
,{
'
method
'
:
'
POST
'
});
let
submit_url
=
`
${
host
}
/index.php/ajax/verify_check?type=search&verify=
${
code
}
`
;
console
.
log
(
submit_url
);
let
html
=
request
(
submit_url
,{
headers
:{
Cookie
:
cookie
,
'
User-Agent
'
:
MOBILE_UA
},
'
method
'
:
'
POST
'
});
console
.
log
(
html
);
html
=
JSON
.
parse
(
html
);
if
(
html
.
msg
===
'
ok
'
){
co
okie
=
''
;
co
nsole
.
log
(
`第
${
cnt
+
1
}
次验证码提交成功`
)
;
return
cookie
// 需要返回cookie
}
else
if
(
html
.
msg
!==
'
ok
'
&&
cnt
+
1
>=
OCR_RETRY
){
cookie
=
''
;
// 需要清空返回cookie
}
}
catch
(
e
)
{
console
.
log
(
`第
${
cnt
+
1
}
次验证码提交失败`
)
console
.
log
(
`第
${
cnt
+
1
}
次验证码提交失败:
${
e
.
message
}
`
);
if
(
cnt
+
1
>=
OCR_RETRY
){
cookie
=
''
;
}
}
cnt
+=
1
}
...
...
@@ -459,13 +482,13 @@ function homeVodParse(homeVodObj){
console
.
log
(
'
double:
'
+
homeVodObj
.
double
);
if
(
homeVodObj
.
double
){
p
[
0
]
=
p
[
0
].
trim
().
startsWith
(
'
json:
'
)?
p
[
0
].
replace
(
'
json:
'
,
''
):
p
[
0
];
console
.
log
(
p
[
0
]);
//
console.log(p[0]);
let
items
=
pdfa
(
html
,
p
[
0
]);
console
.
log
(
items
.
length
);
//
console.log(items.length);
for
(
let
item
of
items
){
console
.
log
(
p
[
1
]);
//
console.log(p[1]);
let
items2
=
pdfa
(
item
,
p
[
1
]);
console
.
log
(
items2
.
length
);
//
console.log(items2.length);
for
(
let
item2
of
items2
){
try
{
let
title
=
pdfh
(
item2
,
p
[
2
]);
...
...
@@ -552,7 +575,32 @@ function categoryParse(cateObj) {
return
'
{}
'
}
let
d
=
[];
let
url
=
cateObj
.
url
.
replaceAll
(
'
fyclass
'
,
cateObj
.
tid
).
replaceAll
(
'
fypage
'
,
cateObj
.
pg
);
// let url = cateObj.url.replaceAll('fyclass', cateObj.tid).replaceAll('fypage', cateObj.pg);
let
url
=
cateObj
.
url
.
replaceAll
(
'
fyclass
'
,
cateObj
.
tid
);
if
(
rule
.
filter_url
){
if
(
!
/fyfilter/
.
test
(
url
)){
if
(
!
url
.
endsWith
(
'
&
'
)
&&!
rule
.
filter_url
.
startsWith
(
'
&
'
)){
url
+=
'
&
'
}
url
+=
rule
.
filter_url
;
}
else
{
url
=
url
.
replace
(
'
fyfilter
'
,
rule
.
filter_url
);
}
url
=
drT
.
renderText
(
url
,
cateObj
.
filter
);
}
if
(
/fypage/
.
test
(
url
)){
if
(
url
.
includes
(
'
(
'
)
&&
url
.
includes
(
'
)
'
)){
let
url_rep
=
url
.
match
(
/.*
?\((
.*
)\)
/
)[
1
];
let
cnt_page
=
url_rep
.
replaceAll
(
'
fypage
'
,
cateObj
.
pg
);
eval
(
`let cnt_pg=
${
cnt_page
}
`
);
url
=
url
.
replaceAll
(
url_rep
,
cnt_pg
).
replaceAll
(
'
(
'
,
''
).
replaceAll
(
'
)
'
,
''
);
}
else
{
url
=
url
.
replaceAll
(
'
fypage
'
,
cateObj
.
pg
);
}
}
if
(
cateObj
.
pg
===
1
&&
url
.
includes
(
'
[
'
)
&&
url
.
includes
(
'
]
'
)){
url
=
url
.
split
(
'
[
'
)[
1
].
split
(
'
]
'
)[
0
];
}
MY_URL
=
url
;
// setItem('MY_URL',MY_URL);
console
.
log
(
MY_URL
);
...
...
@@ -750,20 +798,19 @@ function detailParse(detailObj){
let
p1
=
p
.
lists
.
replaceAll
(
'
#idv
'
,
tab_name
).
replaceAll
(
'
#id
'
,
i
);
tab_ext
=
tab_ext
.
replaceAll
(
'
#idv
'
,
tab_name
).
replaceAll
(
'
#id
'
,
i
);
console
.
log
(
p1
);
console
.
log
(
645
);
console
.
log
(
html
);
// console.log(html);
let
vodList
=
[];
try
{
vodList
=
pdfa
(
html
,
p1
)
vodList
=
pdfa
(
html
,
p1
);
console
.
log
(
'
len(vodList):
'
+
vodList
.
length
);
}
catch
(
e
)
{
console
.
log
(
e
.
message
)
// console.log(e.message);
}
console
.
log
(
647
);
console
.
log
(
'
len(vodList):
'
+
vodList
.
length
);
let
new_vod_list
=
[];
let
tabName
=
tab_ext
?
pdfh
(
html
,
tab_ext
):
tab_name
;
console
.
log
(
tabName
);
vodList
.
forEach
(
it
=>
{
new_vod_list
.
push
(
tabName
+
'
$
'
+
pD
(
it
,
'
a&&href
'
,
MY_URL
));
new_vod_list
.
push
(
pdfh
(
it
,
'
body&&Text
'
)
+
'
$
'
+
pD
(
it
,
'
a&&href
'
,
MY_URL
));
});
let
vlist
=
new_vod_list
.
join
(
'
#
'
);
vod_tab_list
.
push
(
vlist
);
...
...
@@ -871,7 +918,7 @@ function category(tid, pg, filter, extend) {
url
:
urljoin
(
rule
.
host
,
rule
.
url
),
一级
:
rule
.
一级
,
tid
:
tid
,
pg
:
p
g
,
pg
:
p
arseInt
(
pg
)
,
filter
:
filter
,
extend
:
extend
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录