From 69e781d1a5729dccb6cff8984b78ea01babad65e Mon Sep 17 00:00:00 2001 From: hjdhnx Date: Fri, 30 Sep 2022 19:22:49 +0800 Subject: [PATCH] =?UTF-8?q?=E8=B0=83=E8=AF=95=E7=9C=9F=E4=B8=8D=E5=8D=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- base/rules.db | Bin 86016 -> 86016 bytes controllers/parse.py | 21 +++++++++- txt/pluto/drpy.js | 97 ++++++++++++++++++++++++++++++++----------- 3 files changed, 92 insertions(+), 26 deletions(-) diff --git a/base/rules.db b/base/rules.db index 5dc5a6f5c5b512d1210c9bc28a21cc716ee2aa33..71945bf4eae8554f6a5cbec42e176a839fc75d35 100644 GIT binary patch delta 150 zcmZozz}m2Yb%HeG(}^<9j87XAwk9xc6A&^sP%yNxGBL9+@}AP6)2-YYjV7-@!X;&CW?EvBVw90OF!VwjXMM*(06VM8XI diff --git a/controllers/parse.py b/controllers/parse.py index 54632ef..eee5d68 100644 --- a/controllers/parse.py +++ b/controllers/parse.py @@ -7,7 +7,9 @@ from flask import Blueprint, jsonify,redirect from utils.web import getParmas,get_interval import os from utils.log import logger +from utils.encode import OcrApi from utils.pyctx import py_ctx,getPreJs,runJScode,JsObjectWrapper,PyJsString,parseText,jsoup,time +import base64 parse = Blueprint("parse", __name__) @@ -89,4 +91,21 @@ def parse_home(filename): except Exception as e: msg = f'{filename}解析出错:{e}' logger.info(msg) - return R.failed(msg,extra={'time':f'{get_interval(t1)}毫秒','from':url}) \ No newline at end of file + return R.failed(msg,extra={'time':f'{get_interval(t1)}毫秒','from':url}) + +@parse.route('/ocr',methods=['POST']) +def base64_ocr(): + # print('params:',getParmas()) + img = getParmas('img') + # print(img) + img_bytes = base64.b64decode(img) + # print(img_bytes) + img_path = 'txt/pluto' + os.makedirs(img_path,exist_ok=True) + with open(f'{img_path}/yzm.png','wb+') as f: + f.write(img_bytes) + ocr = OcrApi('http://dm.mudery.com:10000') + code = ocr.classification(img_bytes) + resp = R.success('识别成功',code) + print(resp.json) + return resp \ No newline at end of file diff --git a/txt/pluto/drpy.js b/txt/pluto/drpy.js index 366fe3d..78dfc5d 100644 --- a/txt/pluto/drpy.js +++ b/txt/pluto/drpy.js @@ -8,13 +8,15 @@ import ch from './cheerio.min.js'; const key = 'drpy_zbk'; function init_test(){ + console.log("init_test_start"); + // clearItem(RULE_CK); console.log(JSON.stringify(rule)); // console.log(request('https://www.baidu.com',{withHeaders:true})); - console.log(request('https://www.baidu.com/favicon.ico',{toBase64:true})); - console.log("init_test"); - require('http://192.168.10.99:5705/txt/pluto/drT.js'); - console.log(typeof(drT)); - console.log(drT.renderText('{{fl.cate}},hi, {{fl}}哈哈.{{fl}}',{sort: 1,cate:'movie'},'fl')); + // console.log(request('https://www.baidu.com/favicon.ico',{toBase64:true})); + // require('http://192.168.10.99:5705/txt/pluto/drT.js'); + // console.log(typeof(drT)); + // console.log(drT.renderText('{{fl.cate}},hi, {{fl}}哈哈.{{fl}}',{sort: 1,cate:'movie'},'fl')); + console.log("init_test_end"); } let rule = { @@ -71,8 +73,9 @@ const RULE_CK = 'cookie'; // 源cookie的key值 const KEY = typeof(key)!=='undefined'&&key?key:'drpy_'+rule.title; // 源的唯一标识 const CATE_EXCLUDE = '首页|留言|APP|下载|资讯|新闻|动态'; const TAB_EXCLUDE = '猜你|喜欢|APP|下载|剧情|热播'; -const OCR_RETRY = 3;//ocr验证重试次数 -const OCR_API = 'http://dm.mudery.com:10000';//ocr在线识别接口 +const OCR_RETRY = 1;//ocr验证重试次数 +// const OCR_API = 'http://dm.mudery.com:10000';//ocr在线识别接口 +const OCR_API = 'http://192.168.3.239:5705/parse/ocr';//ocr在线识别接口 var MY_URL; // 全局注入变量,pd函数需要 /** 处理一下 rule规则关键字段没传递的情况 **/ @@ -115,7 +118,9 @@ var OcrApi={ classification:function (img){ // img是byte类型,这里不方便搞啊 let code = ''; try { - code = request(this.api,{data:img,headers:{'user-agent':PC_UA},'method':'POST'}); + let html = request(this.api,{data:{img:img},headers:{'User-Agent':PC_UA},'method':'POST'}); + html = JSON.parse(html); + code = html.url||''; }catch (e) {} return code } @@ -132,17 +137,35 @@ function verifyCode(url){ while (cnt < OCR_RETRY){ try{ // let obj = {headers:headers,timeout:timeout}; - let img = request(`${host}/index.php/verify/index.html`); + let yzm_url = `${host}/index.php/verify/index.html`; + console.log(`验证码链接:${yzm_url}`); + let hhtml = request(yzm_url,{withHeaders:true,toBase64:true}); + let json = JSON.parse(hhtml); + if(!cookie){ + cookie = json['set-cookie']?json['set-cookie'].split(';')[0]:''; + } + // console.log(hhtml); + console.log('cookie:'+cookie); + let img = json.body; + // console.log(img); let code = OcrApi.classification(img); console.log(`第${cnt+1}次验证码识别结果:${code}`); - let html = request(`${host}/index.php/ajax/verify_check?type=search&verify=${code}`,{'method':'POST'}); + let submit_url = `${host}/index.php/ajax/verify_check?type=search&verify=${code}`; + console.log(submit_url); + let html = request(submit_url,{headers:{Cookie:cookie,'User-Agent':MOBILE_UA},'method':'POST'}); + console.log(html); html = JSON.parse(html); if(html.msg === 'ok'){ - cookie = ''; + console.log(`第${cnt+1}次验证码提交成功`); return cookie // 需要返回cookie + }else if(html.msg!=='ok'&&cnt+1>=OCR_RETRY){ + cookie = ''; // 需要清空返回cookie } }catch (e) { - console.log(`第${cnt+1}次验证码提交失败`) + console.log(`第${cnt+1}次验证码提交失败:${e.message}`); + if(cnt+1>=OCR_RETRY){ + cookie = ''; + } } cnt+=1 } @@ -459,13 +482,13 @@ function homeVodParse(homeVodObj){ console.log('double:'+homeVodObj.double); if(homeVodObj.double){ p[0] = p[0].trim().startsWith('json:')?p[0].replace('json:',''):p[0]; - console.log(p[0]); + // console.log(p[0]); let items = pdfa(html, p[0]); - console.log(items.length); + // console.log(items.length); for(let item of items){ - console.log(p[1]); + // console.log(p[1]); let items2 = pdfa(item,p[1]); - console.log(items2.length); + // console.log(items2.length); for(let item2 of items2){ try { let title = pdfh(item2, p[2]); @@ -552,7 +575,32 @@ function categoryParse(cateObj) { return '{}' } let d = []; - let url = cateObj.url.replaceAll('fyclass', cateObj.tid).replaceAll('fypage', cateObj.pg); + // let url = cateObj.url.replaceAll('fyclass', cateObj.tid).replaceAll('fypage', cateObj.pg); + let url = cateObj.url.replaceAll('fyclass', cateObj.tid); + if(rule.filter_url){ + if(!/fyfilter/.test(url)){ + if(!url.endsWith('&')&&!rule.filter_url.startsWith('&')){ + url+='&' + } + url+=rule.filter_url; + }else{ + url = url.replace('fyfilter', rule.filter_url); + } + url = drT.renderText(url,cateObj.filter); + } + if(/fypage/.test(url)){ + if(url.includes('(')&&url.includes(')')){ + let url_rep = url.match(/.*?\((.*)\)/)[1]; + let cnt_page = url_rep.replaceAll('fypage', cateObj.pg); + eval(`let cnt_pg=${cnt_page}`); + url = url.replaceAll(url_rep,cnt_pg).replaceAll('(','').replaceAll(')',''); + }else{ + url = url.replaceAll('fypage',cateObj.pg); + } + } + if(cateObj.pg === 1 && url.includes('[')&&url.includes(']')){ + url = url.split('[')[1].split(']')[0]; + } MY_URL = url; // setItem('MY_URL',MY_URL); console.log(MY_URL); @@ -750,20 +798,19 @@ function detailParse(detailObj){ let p1 = p.lists.replaceAll('#idv', tab_name).replaceAll('#id', i); tab_ext = tab_ext.replaceAll('#idv', tab_name).replaceAll('#id', i); console.log(p1); - console.log(645); - console.log(html); + // console.log(html); let vodList = []; try { - vodList = pdfa(html, p1) + vodList = pdfa(html, p1); + console.log('len(vodList):'+vodList.length); }catch (e) { - console.log(e.message) + // console.log(e.message); } - console.log(647); - console.log('len(vodList):'+vodList.length); let new_vod_list = []; let tabName = tab_ext?pdfh(html, tab_ext):tab_name; + console.log(tabName); vodList.forEach(it=>{ - new_vod_list.push(tabName+'$'+pD(it,'a&&href',MY_URL)); + new_vod_list.push(pdfh(it,'body&&Text')+'$'+pD(it,'a&&href',MY_URL)); }); let vlist = new_vod_list.join('#'); vod_tab_list.push(vlist); @@ -871,7 +918,7 @@ function category(tid, pg, filter, extend) { url: urljoin(rule.host, rule.url), 一级: rule.一级, tid: tid, - pg: pg, + pg: parseInt(pg), filter: filter, extend: extend }; -- GitLab