提交 a9b591aa 编写于 作者: H hjdhnx

修复js模式0二级title和desc与js模式1不一致的问题

修复pdfa函数定位的列表数据不对的问题
8号影院js模式0已完美
上级 2502945d
...@@ -30,7 +30,6 @@ ...@@ -30,7 +30,6 @@
{"key": "dr_剧迷", "name": "剧迷(道长)", "type": 1, "api": "{{host}}/vod?rule=剧迷&ext=txt/js/tg/剧迷.js", "searchable": 2, "quickSearch": 0, "filterable": 0}, {"key": "dr_剧迷", "name": "剧迷(道长)", "type": 1, "api": "{{host}}/vod?rule=剧迷&ext=txt/js/tg/剧迷.js", "searchable": 2, "quickSearch": 0, "filterable": 0},
{"key": "dr_大师兄影视", "name": "大师兄影视(道长)", "type": 1, "api": "{{host}}/vod?rule=大师兄影视&ext=txt/js/tg/大师兄影视.js", "searchable": 2, "quickSearch": 0, "filterable": 0}, {"key": "dr_大师兄影视", "name": "大师兄影视(道长)", "type": 1, "api": "{{host}}/vod?rule=大师兄影视&ext=txt/js/tg/大师兄影视.js", "searchable": 2, "quickSearch": 0, "filterable": 0},
{"key": "dr_天空影视", "name": "天空影视(道长)", "type": 1, "api": "{{host}}/vod?rule=天空影视&ext=txt/js/tg/天空影视.js", "searchable": 2, "quickSearch": 0, "filterable": 0}, {"key": "dr_天空影视", "name": "天空影视(道长)", "type": 1, "api": "{{host}}/vod?rule=天空影视&ext=txt/js/tg/天空影视.js", "searchable": 2, "quickSearch": 0, "filterable": 0},
{"key": "dr_完美看看", "name": "完美看看(道长)", "type": 1, "api": "{{host}}/vod?rule=完美看看&ext=txt/js/tg/完美看看.js", "searchable": 2, "quickSearch": 0, "filterable": 0},
{"key": "dr_快云影院", "name": "快云影院(道长)", "type": 1, "api": "{{host}}/vod?rule=快云影院&ext=txt/js/tg/快云影院.js", "searchable": 2, "quickSearch": 0, "filterable": 0}, {"key": "dr_快云影院", "name": "快云影院(道长)", "type": 1, "api": "{{host}}/vod?rule=快云影院&ext=txt/js/tg/快云影院.js", "searchable": 2, "quickSearch": 0, "filterable": 0},
{"key": "dr_爱看影视", "name": "爱看影视(道长)", "type": 1, "api": "{{host}}/vod?rule=爱看影视&ext=txt/js/tg/爱看影视.js", "searchable": 2, "quickSearch": 0, "filterable": 0}, {"key": "dr_爱看影视", "name": "爱看影视(道长)", "type": 1, "api": "{{host}}/vod?rule=爱看影视&ext=txt/js/tg/爱看影视.js", "searchable": 2, "quickSearch": 0, "filterable": 0},
{"key": "dr_爱看电影", "name": "爱看电影(道长)", "type": 1, "api": "{{host}}/vod?rule=爱看电影&ext=txt/js/tg/爱看电影.js", "searchable": 2, "quickSearch": 0, "filterable": 0}, {"key": "dr_爱看电影", "name": "爱看电影(道长)", "type": 1, "api": "{{host}}/vod?rule=爱看电影&ext=txt/js/tg/爱看电影.js", "searchable": 2, "quickSearch": 0, "filterable": 0},
......
...@@ -864,8 +864,7 @@ class CMS: ...@@ -864,8 +864,7 @@ class CMS:
pdfa = jsp.pjfa if is_json else jsp.pdfa pdfa = jsp.pjfa if is_json else jsp.pdfa
pd = jsp.pj if is_json else jsp.pd pd = jsp.pj if is_json else jsp.pd
pq = jsp.pq pq = jsp.pq
obj = {} vod['vod_id'] = detailUrl
vod_name = ''
if not html: # 没传递html参数接下来智能获取 if not html: # 没传递html参数接下来智能获取
r = requests.get(url, headers=self.headers, timeout=self.timeout,verify=False) r = requests.get(url, headers=self.headers, timeout=self.timeout,verify=False)
html = self.checkHtml(r) html = self.checkHtml(r)
...@@ -874,16 +873,16 @@ class CMS: ...@@ -874,16 +873,16 @@ class CMS:
html = json.loads(html) html = json.loads(html)
if p.get('title'): if p.get('title'):
p1 = p['title'].split(';') p1 = p['title'].split(';')
vod_name = pdfh(html, p1[0]).replace('\n', ' ') vod['vod_name'] = pdfh(html, p1[0]).replace('\n', ' ').strip()
# title = '\n'.join([pdfh(html,i).replace('\n',' ') for i in p1]) vod['type_name'] = pdfh(html, p1[1]).replace('\n',' ').strip() if len(p1)>1 else ''
title = '\n'.join([','.join([pdfh(html, pp1).strip() for pp1 in i.split('+')]) for i in p1])
# print(title)
obj['title'] = title
if p.get('desc'): if p.get('desc'):
try: try:
p1 = p['desc'].split(';') p1 = p['desc'].split(';')
desc = '\n'.join([pdfh(html, i).replace('\n', ' ') for i in p1]) vod['vod_remarks'] = pdfh(html, p1[0]).replace('\n', '').strip()
obj['desc'] = desc vod['vod_year'] = pdfh(html, p1[1]).replace('\n', ' ').strip() if len(p1) > 1 else ''
vod['vod_area'] = pdfh(html, p1[2]).replace('\n', ' ').strip() if len(p1) > 2 else ''
vod['vod_actor'] = pdfh(html, p1[3]).replace('\n', ' ').strip() if len(p1) > 3 else ''
vod['vod_director'] = pdfh(html, p1[4]).replace('\n', ' ').strip() if len(p1) > 4 else ''
except: except:
pass pass
...@@ -891,7 +890,7 @@ class CMS: ...@@ -891,7 +890,7 @@ class CMS:
p1 = p['content'].split(';') p1 = p['content'].split(';')
try: try:
content = '\n'.join([pdfh(html, i).replace('\n', ' ') for i in p1]) content = '\n'.join([pdfh(html, i).replace('\n', ' ') for i in p1])
obj['content'] = content vod['vod_content'] = content
except: except:
pass pass
...@@ -899,23 +898,10 @@ class CMS: ...@@ -899,23 +898,10 @@ class CMS:
p1 = p['img'] p1 = p['img']
try: try:
img = pd(html, p1) img = pd(html, p1)
obj['img'] = img vod['vod_pic'] = img
except Exception as e: except Exception as e:
logger.info(f'二级图片定位失败,但不影响使用{e}') logger.info(f'二级图片定位失败,但不影响使用{e}')
vod = {
"vod_id": detailUrl,
"vod_name": vod_name,
"vod_pic": obj.get('img', ''),
"type_name": obj.get('title', ''),
"vod_year": "",
"vod_area": "",
"vod_remarks": obj.get('desc', ''),
"vod_actor": "",
"vod_director": "",
"vod_content": obj.get('content', '')
}
vod_play_from = '$$$' vod_play_from = '$$$'
playFrom = [] playFrom = []
init_flag = {'ctx':False} init_flag = {'ctx':False}
...@@ -966,19 +952,28 @@ class CMS: ...@@ -966,19 +952,28 @@ class CMS:
vHeader = vHeader.to_list() vHeader = vHeader.to_list()
vodHeader = vHeader vodHeader = vHeader
else: else:
# print(p['tabs'].split(';')[0]) tab_parse = p['tabs'].split(';')[0]
vHeader = pdfa(html, p['tabs'].split(';')[0]) # print('tab_parse:',tab_parse)
# print(f'线路列表数:{len((vodHeader))}') vHeader = pdfa(html, tab_parse)
# print(vodHeader) # print(vHeader)
print(f'二级线路定位列表数:{len((vHeader))}')
# print(vHeader[0].outerHtml())
# print(vHeader[0].toString())
# from lxml import etree
# print(str(etree.tostring(vHeader[0], pretty_print=True), 'utf-8'))
from lxml.html import tostring as html2str
# print(html2str(vHeader[0].root).decode('utf-8'))
if not is_json: if not is_json:
for v in vHeader: for v in vHeader:
# 过滤排除掉线路标题 # 过滤排除掉线路标题
v_title = pq(v).text() v_title = pq(v).text()
# print(v_title)
if self.tab_exclude and jsp.test(self.tab_exclude, v_title): if self.tab_exclude and jsp.test(self.tab_exclude, v_title):
continue continue
vodHeader.append(v_title) vodHeader.append(v_title)
else: else:
vodHeader = vHeader vodHeader = vHeader
print(f'过滤后真实线路列表数:{len((vodHeader))} {vodHeader}')
else: else:
vodHeader = ['道长在线'] vodHeader = ['道长在线']
...@@ -1010,9 +1005,11 @@ class CMS: ...@@ -1010,9 +1005,11 @@ class CMS:
else: else:
for i in range(len(vodHeader)): for i in range(len(vodHeader)):
tab_name = str(vodHeader[i]) tab_name = str(vodHeader[i])
# print(tab_name)
tab_ext = p['tabs'].split(';')[1] if len(p['tabs'].split(';')) > 1 else '' tab_ext = p['tabs'].split(';')[1] if len(p['tabs'].split(';')) > 1 else ''
p1 = p['lists'].replace('#idv', tab_name).replace('#id', str(i)) p1 = p['lists'].replace('#idv', tab_name).replace('#id', str(i))
tab_ext = tab_ext.replace('#idv', tab_name).replace('#id', str(i)) tab_ext = tab_ext.replace('#idv', tab_name).replace('#id', str(i))
# print(p1)
vodList = pdfa(html, p1) # 1条线路的选集列表 vodList = pdfa(html, p1) # 1条线路的选集列表
# print(vodList) # print(vodList)
# vodList = [pq(i).text()+'$'+pd(i,'a&&href') for i in vodList] # 拼接成 名称$链接 # vodList = [pq(i).text()+'$'+pd(i,'a&&href') for i in vodList] # 拼接成 名称$链接
...@@ -1028,7 +1025,7 @@ class CMS: ...@@ -1028,7 +1025,7 @@ class CMS:
vod_tab_list.append(vlist) vod_tab_list.append(vlist)
vod_play_url = vod_play_url.join(vod_tab_list) vod_play_url = vod_play_url.join(vod_tab_list)
print(vod_play_url) # print(vod_play_url)
vod['vod_play_from'] = vod_play_from vod['vod_play_from'] = vod_play_from
# print(vod_play_from) # print(vod_play_from)
vod['vod_play_url'] = vod_play_url vod['vod_play_url'] = vod_play_url
......
// Rule object describing how to scrape the site '8号影院'.
// Selector strings are ';'-separated fields; '&&' chains nested selectors.
var rule={
title:'8号影院',
host:'http://www.8hysw.com',
// homeUrl:'/',
url:'/frim/fyclass-fypage.html',
searchUrl:'/search.php?page=fypage&searchword=**&searchtype=',
searchable:2,// whether global search is enabled
quickSearch:0,// whether quick search is enabled
filterable:0,// whether category filtering is enabled
// class_name / class_url are '&'-separated and pair up by position.
class_name:'电影&电视剧&综艺&动漫&日韩剧&国产剧&欧美剧&港台剧',
class_url:'1&2&3&4&16&13&15&14',
play_parse:true,
lazy:'',
limit:6,
// Play-source tabs whose title matches this pattern are skipped by the detail parser.
tab_exclude:'本周热门|最近更新',
推荐:'.stui-pannel_bd;.stui-vodlist li;h4&&Text;.lazyload&&data-original;.text-right&&Text;a&&href',
double:true, // whether the recommendation list uses two-level (nested) selectors
一级:'.stui-vodlist.clearfix&&li;a&&title;.lazyload&&data-original;.text-right&&Text;a&&href',
// Detail page: title = 'name;type', desc = 'remarks;year;area;actor' (a fifth segment, director, is omitted here);
// in 'lists', '#id' is replaced with the index of the current tab.
二级:{"title":"h1&&Text;.stui-content__detail&&p&&Text","img":".lazyload&&data-original","desc":".data:eq(0)&&Text;.data:eq(1)&&Text;.data:eq(2)&&Text;.data:eq(3)&&Text","content":".desc&&Text","tabs":".stui-pannel__head.bottom-line h3","lists":".stui-content__playlist:eq(#id) li"},
// Reuses a search selector from the shared 'muban' template object, which is defined outside this file.
搜索:muban.首图2.搜索2,
}
\ No newline at end of file
// Rule object describing how to scrape the site 'KUBO影视'.
// Selector strings are ';'-separated fields; '&&' chains nested selectors.
var rule = {
title:'KUBO影视',
host:'https://123kubo.tv',
// homeUrl:'/',
url:'/show/fyclass/page/fypage.html',
searchUrl:'/search/page/fypage/wd/**.html',
searchable:2,// whether global search is enabled
quickSearch:0,// whether quick search is enabled
filterable:0,// whether category filtering is enabled
headers:{// request headers for the site; any header is supported, typically UA and cookies
'User-Agent':'MOBILE_UA',
// "Cookie": "searchneed=ok"
},
// class_name / class_url are '&'-separated and pair up by position.
class_name:'电影&电视剧&综艺&动漫',
class_url:'1&2&3&4',
//class_parse:'.myui-panel-box&&ul&&li;a&&Text;a&&href;/v/(.*)/',
play_parse:true,
lazy:'',
limit:6,
推荐:'ul.hl-vod-list;li;a&&title;.hl-item-thumb.hl-lazy&&data-original;.hl-pic-text&&Text;a&&href',
double:true, // whether the recommendation list uses two-level (nested) selectors
一级:'.hl-list-item;a&&title;.hl-item-thumb.hl-lazy&&data-original;.hl-pic-text&&Text;a&&href',
// Detail page: title = 'name;type', desc = 'remarks;year;area' (actor and director segments are omitted here);
// in 'lists', '#id' is replaced with the index of the current tab.
二级:{"title":".hl-item-thumb.hl-lazy&&title;.hl-full-box&&ul li:eq(6)&&Text","img":".hl-item-thumb.hl-lazy&&data-original","desc":".hl-full-box&&ul&&li:eq(1)&&Text;.hl-full-box&&ul&&li:eq(2)&&Text;.hl-full-box&&ul&&li:eq(3)&&Text","content":".hl-col-xs-12.blurb&&Text","tabs":".hl-plays-from:eq(0) a","lists":".hl-plays-list:eq(#id) li"},
搜索:'.hl-item-div;a&&title;.hl-item-thumb&&data-original;.hl-lc-1&&Text;a&&href;.text-muted:eq(-1)&&Text',
}
// Rule object describing how to scrape the site 'TV蜂'.
// Selector strings are ';'-separated fields; '&&' chains nested selectors.
var rule = {
title:'TV蜂',
host:'https://www.tvfeng.net',
// homeUrl:'/',
url:'/tvfenshow/fyclass--------fypage---.html',
searchUrl:'/tvfensearch/**----------fypage---.html',
searchable:2,// whether global search is enabled
quickSearch:0,// whether quick search is enabled
filterable:0,// whether category filtering is enabled
headers:{// request headers for the site; any header is supported, typically UA and cookies
'User-Agent':'MOBILE_UA',
// "Cookie": "searchneed=ok"
},
// class_name / class_url are '&'-separated and pair up by position.
class_name:'电影&电视剧&综艺&动漫',
class_url:'1&2&3&4',
play_parse:true,
lazy:'',
limit:6,
推荐:'.module-list;.module-items&&.module-item;a&&title;img&&data-src;.module-item-text&&Text;a&&href',
double:true, // whether the recommendation list uses two-level (nested) selectors
一级:'.module-items .module-item;a&&title;img&&data-src;.module-item-text&&Text;a&&href',
// Detail page: title = 'name;type', desc = 'remarks;year;area;actor' (a fifth segment, director, is omitted here);
// in 'lists', '#id' is replaced with the index of the current tab.
二级:{"title":"h1&&Text;.tag-link&&Text","img":".module-item-pic&&img&&data-src","desc":".video-info-items:eq(0)&&Text;.video-info-items:eq(3)&&Text;.video-info-items:eq(2)&&Text;.video-info-items:eq(1)&&Text","content":".vod_content&&Text","tabs":".module-tab-item","lists":".module-player-list:eq(#id)&&.scroll-content&&a"},
搜索:'.module-items .module-search-item;h3&&Text;img&&data-src;.video-serial&&Text;a&&href',
}
...@@ -2,9 +2,8 @@ muban.首图2.二级.tabs = '.stui-pannel__head&&h3'; ...@@ -2,9 +2,8 @@ muban.首图2.二级.tabs = '.stui-pannel__head&&h3';
var rule = Object.assign(muban.首图2,{ var rule = Object.assign(muban.首图2,{
title:'完美看看', title:'完美看看',
host:'https://www.wanmeikk.film', host:'https://www.wanmeikk.film',
class_parse:'.dropdown&&li;a&&Text;a&&href;.*/(.*?).html',
cate_exclude:'消息|专题',
url:'/category/fyclass-fypage.html', url:'/category/fyclass-fypage.html',
searchUrl:'/vodsearch/**-------------.html', searchUrl:'/so/-------------.html?wd=**&submit=',
class_name:'电影&美剧&韩剧&日剧&国产剧&动漫',//静态分类名称拼接 });
class_url:'1&2&3&4&5&6',//静态分类标识拼接 \ No newline at end of file
class_parse:'',
});
// Rule object describing how to scrape the site '尘落影视'.
// Selector strings are ';'-separated fields; '&&' chains nested selectors.
var rule={
title:'尘落影视',
host:'http://v.ftixkrv.cn',
url:'/whole/fyclass_______0_addtime_fypage.html',
searchUrl:'/?c=search&wd=**&sort=addtime&order=desc&page=fypage',
searchable:2,// whether global search is enabled
quickSearch:0,// whether quick search is enabled
filterable:0,// whether category filtering is enabled
headers:{// request headers for the site; any header is supported, typically UA and cookies
'User-Agent':'PC_UA',
// "Cookie": "searchneed=ok"
},
// class_name / class_url are '&'-separated and pair up by position.
class_name:'电影&电视剧&综艺&动漫',
class_url:'1&2&4&3',
// NOTE(review): presumably a pattern of category names to drop — confirm against the rule loader.
cate_exclude:'全网资源',
play_parse:true,
lazy:'',
limit:6,
推荐:'.movie-item-in;a&&title;img&&src;em&&Text;a&&href',
一级:'.movie-item-in;a&&title;img&&src;em&&Text;a&&href',
// Detail page: title = 'name;type', desc = 'remarks;year;area;actor;director'.
// The two leading empty desc segments leave remarks and year blank (an empty selector yields '').
// In 'lists', '#id' is replaced with the index of the current tab.
二级:{"title":"h1&&Text;.table-striped tr:eq(2)&&Text","img":".img-thumbnail&&src","desc":";;.table-striped tr:eq(3)&&Text;.table-striped tr:eq(1)&&Text;.table-striped tr:eq(0)&&Text","content":".movie-introduce&&Text","tabs":".nav.nav-tabs li a","lists":".tab-pane.active:eq(#id) div a"},
搜索:'.movie-item-in;a&&title;img&&src;em&&Text;a&&href',
}
\ No newline at end of file
// 道长 drpy仓库 https://gitcode.net/qq_32394351/dr_py
// drpy安卓本地搭建说明 https://gitcode.net/qq_32394351/dr_py/-/blob/master/%E5%AE%89%E5%8D%93%E6%9C%AC%E5%9C%B0%E6%90%AD%E5%BB%BA%E8%AF%B4%E6%98%8E.md
// Pluto Player官方TG https://t.me/PlutoPlayer
// Pluto Player官方TG https://t.me/PlutoPlayerChannel
// Rule object describing how to scrape the sports live-streaming site '抓饭体育'.
// List and search data come from JSON responses (selectors prefixed with 'json:').
var rule = {
title:'抓饭体育',
host:'https://www.zhuafan.tech',
url:'/sports-home/category/fyclass',
// class_name / class_url are '&'-separated and pair up by position.
class_name:'全部&足球&篮球&羽乒&台球&棒球&户外&搏击&综合&棋盘&电竞&网球&排球&聊天&原声',
class_url:'all&Football&Basketball&Badminton&Billiards&Baseball&Outdoors&Wrestling&Others&Boardgame&Popular&Tennis&Volleyball&Chat&Acoustic',
homeUrl:'/sports-home/category/all',// site home page URL, used to fetch categories and recommendations
detailUrl:'https://m.zhuafan.tech/fyid',// URL template for the detail page (used with JSON sources)
searchUrl:'/live-search/search/query/data?keyword=**&page=fypage&num=&searchType=all&uid=null&from=pc',
searchable:2,
quickSearch:0,
headers:{
'User-Agent':'PC_UA'
},
limit:6,
timeout:5000,
play_parse:true,
lazy:'',
double:false,
// NOTE(review): '*' presumably means "reuse the corresponding default/first-level field" — confirm against the rule parser.
推荐:'*',
一级:'json:data;cname;imageUrl;uname;id',
二级:'*',
搜索:'json:cObj.cList;*;*;*;_id',
}
\ No newline at end of file
// 道长 drpy仓库 https://gitcode.net/qq_32394351/dr_py
// drpy安卓本地搭建说明 https://gitcode.net/qq_32394351/dr_py/-/blob/master/%E5%AE%89%E5%8D%93%E6%9C%AC%E5%9C%B0%E6%90%AD%E5%BB%BA%E8%AF%B4%E6%98%8E.md
// Pluto Player官方TG https://t.me/PlutoPlayer
// Pluto Player官方TG https://t.me/PlutoPlayerChannel
// Rule object describing how to scrape the live-streaming site '斗鱼直播'.
// List and search data come from JSON responses (selectors prefixed with 'json:').
var rule = {
title:'斗鱼直播',
host:'https://www.douyu.com',
homeUrl:'/japi/weblist/apinc/recLabelList?',// site home page URL, used to fetch categories and recommendations
url:'/gapi/rkc/directory/mixList/fyclass/fypage',
// class_name / class_url are '&'-separated and pair up by position (74 entries each).
// NOTE(review): the label '正能量' appears twice, paired with different ids — confirm this is intended.
class_name:'一起看&网游竞技&单机热游&手游休闲&娱乐天地&科技文化&语音互动&语音直播&正能量&颜值&音乐&舞蹈&二次元&户外&美食&互动交友&趣生活&数码科技&文化&科普&社会人文&汽车&纪录片&斗鱼购物&交友&电台&一起玩&音乐之声&正能量&英雄联盟&热门游戏&DOTA2&穿越火线&CFHD&DNF&炉石传说&CS:GO&逆战&lol云顶之弈&魔兽争霸&魔兽怀旧服&网易游戏&守望先锋&DOTA&魔兽世界&天涯明月刀&三国杀&主机游戏&永劫无间&生死狙击2&迷失ARK&艾尔登法环&逃离塔科夫&V Rising&海上狼人杀&怀旧游戏&王者荣耀&和平精英&火影忍者&LOL手游&金铲铲之战&重返帝国&COD手游&哈利波特:魔法觉醒&CF手游&欢乐斗地主&原神&天刀手游&棋牌娱乐&欢乐麻将&新游中心&QQ飞车&阴阳师&热门手游',
class_url:'2_208&1_1&1_15&1_9&1_2&1_11&1_20&1_18&1_13&2_201&2_175&2_1008&2_174&2_124&2_194&2_1555&2_1097&2_134&2_195&2_204&2_1162&2_136&2_514&2_1203&2_1221&2_1556&2_1575&2_910&2_250&2_1&2_270&2_3&2_33&2_1997&2_40&2_2&2_6&2_46&2_917&2_55&2_1055&2_3567&2_148&2_217&2_5&2_59&2_14&2_19&2_1227&2_1781&2_3528&2_3406&2_1024&2_3684&2_3556&2_26&2_181&2_350&2_196&2_1920&2_2556&2_2915&2_767&2_1192&2_178&2_416&2_1223&2_911&2_113&2_451&2_229&2_331&2_240&2_30',
detailUrl:'/fyid',// URL template for the detail page (used with JSON sources)
searchUrl:'/japi/search/api/searchShow?kw=**&page=fypage&pageSize=20',
searchable:2,
quickSearch:0,
headers:{
'User-Agent':'PC_UA'
},
timeout:5000,
limit:8,
play_parse:true,
lazy:'',
double:true,
// NOTE(review): '*' presumably means "reuse the corresponding default/first-level field" — confirm against the rule parser.
推荐:'json:data.list;room;*;cover;*;*',
一级:'json:data.rl;rn;rs16;nn;rid',
二级:'*',
搜索:'json:data.relateShow;roomName;roomSrc;nickName;*',
}
\ No newline at end of file
// Rule object describing how to scrape the site '爱迪影视'.
// Selector strings are ';'-separated fields; '&&' chains nested selectors.
var rule={
title:'爱迪影视',
host:'https://aidi.tv',
url:'/show/fyclass--------fypage---.html',
searchUrl:'/vsearch/-------------.html?wd=**&submit=',
searchable:2,
quickSearch:0,
filterable:0,
headers:{ 'User-Agent':'MOBILE_UA', },
// class_name / class_url are '&'-separated and pair up by position.
class_name:'电影&电视剧&综艺&动漫',
class_url:'dianying&lianxuju&zongyi&dongman',
// Play-source tabs whose title matches this pattern are skipped by the detail parser.
tab_exclude:'app专用|VIP线路',
play_parse:true,
double:true,
推荐:'body .vodlist.vodlist_wi;li;a&&title;.vodlist_thumb.lazyload&&data-original;.pic-text&&Text;a&&href',
一级:'.vodlist.vodlist_wi&&li;a&&title;.lazyload&&data-original;.pic-text&&Text;a&&href',
// Detail page: title = 'name;type', desc = 'remarks;year;area;actor;director'.
// Empty desc segments leave that field blank (an empty selector yields '').
// NOTE(review): desc has six segments, but the Python detail parser reads only the first five — confirm whether the last one is used elsewhere.
// In 'lists', '#id' is replaced with the index of the current tab.
二级:{"title":"h2&&Text;.data:eq(1)&&Text","img":".lazyload&&data-original","desc":";.content_min li:eq(1)&&Text;;.content_min li:eq(2)&&Text;.content_min li:eq(3)&&Text;.data:eq(4)&&Text","content":".context.clearfix&&Text","tabs":".play_source_tab&&a","lists":".content_playlist:eq(#id) li"},
搜索:'.searchlist_img;a&&title;.vodlist_thumb.lazyload&&data-original;.pic-text&&Text;a&&href',
}
\ No newline at end of file
...@@ -3,7 +3,7 @@ var rule={ ...@@ -3,7 +3,7 @@ var rule={
host:'https://www.dandanzan10.top', host:'https://www.dandanzan10.top',
// homeUrl:'/', // homeUrl:'/',
url:'/fyclass/index_fypage.html[/fyclass/index.html]', url:'/fyclass/index_fypage.html[/fyclass/index.html]',
//searchUrl:'/search/**/', searchUrl:'/so/**-**--.html',
searchable:2,//是否启用全局搜索, searchable:2,//是否启用全局搜索,
quickSearch:0,//是否启用快速搜索, quickSearch:0,//是否启用快速搜索,
filterable:0,//是否启用分类筛选, filterable:0,//是否启用分类筛选,
...@@ -20,5 +20,5 @@ var rule={ ...@@ -20,5 +20,5 @@ var rule={
double:true, // 推荐内容是否双层定位 double:true, // 推荐内容是否双层定位
二级:{"title":"h1&&Text;.product-excerpt:eq(2)&&Text","img":".thumb&&src","desc":";;.product-excerpt:eq(3)&&Text;.product-excerpt:eq(1)&&Text;.product-excerpt:eq(0)&&Text","content":".product-excerpt:eq(5)&&Text","tabs":".playlists dl dt","lists":".play-div-oa:eq(#id) li"}, 二级:{"title":"h1&&Text;.product-excerpt:eq(2)&&Text","img":".thumb&&src","desc":";;.product-excerpt:eq(3)&&Text;.product-excerpt:eq(1)&&Text;.product-excerpt:eq(0)&&Text","content":".product-excerpt:eq(5)&&Text","tabs":".playlists dl dt","lists":".play-div-oa:eq(#id) li"},
搜索:'ul.img-list.clearfix&&li;a&&title;.lazyload&&data-original;.pic-text&&Text;a&&href', 搜索:'.lists-content&&ul&&li;*;*;*;*',
} }
\ No newline at end of file
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
import json import json
from pyquery import PyQuery as pq from pyquery import PyQuery as pq
from lxml import etree
from urllib.parse import urljoin from urllib.parse import urljoin
import re import re
from jsonpath import jsonpath from jsonpath import jsonpath
...@@ -22,7 +23,6 @@ class jsoup: ...@@ -22,7 +23,6 @@ class jsoup:
def pdfh(self,html,parse:str,add_url=False): def pdfh(self,html,parse:str,add_url=False):
if not parse: if not parse:
return '' return ''
doc = pq(html) doc = pq(html)
option = None option = None
if parse.find('&&') > -1: if parse.find('&&') > -1:
...@@ -66,10 +66,14 @@ class jsoup: ...@@ -66,10 +66,14 @@ class jsoup:
# ret = doc(parse) # 下面注释的写法不对的 # ret = doc(parse) # 下面注释的写法不对的
# ret = ret.find(':first') # ret = ret.find(':first')
# ret = ret.children(':first') # ret = ret.children(':first')
ret = str(ret) # print(parse)
# ret = str(ret)
ret = ret.outerHtml()
return ret return ret
def pdfa(self,html,parse:str): def pdfa(self,html,parse:str):
# 看官方文档才能解决这个问题!!!
# https://pyquery.readthedocs.io/en/latest/api.html
if not parse: if not parse:
return [] return []
if parse.find('&&') > -1: if parse.find('&&') > -1:
...@@ -78,8 +82,15 @@ class jsoup: ...@@ -78,8 +82,15 @@ class jsoup:
parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))]) parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i>=len(parse)-1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
# print(f'pdfa:{parse}') # print(f'pdfa:{parse}')
doc = pq(html) doc = pq(html)
res = [str(item) for item in doc(parse).items()] result = doc(parse)
# 节点转字符串
# print(str(etree.tostring(result[0], pretty_print=True), 'utf-8'))
# res = [item for item in result.items()]
res = [item.outerHtml() for item in result.items()] # 这个才是对的!!str() item str(etree.tostring 统统错误
# res = [str(item) for item in result.items()]
# res = [str(etree.tostring(item, pretty_print=True), 'utf-8') for item in result]
# print(len(res),res) # print(len(res),res)
# print('pdfa执行结果数:',len(res))
return res return res
def pd(self,html,parse:str): def pd(self,html,parse:str):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册