提交 0bc16f96 编写于 作者: H hjdhnx

尝试增加玄天源,看看怎么才能处理电影页的数据逻辑

上级 1506c165
无法预览此类型文件
...@@ -191,6 +191,7 @@ class CMS: ...@@ -191,6 +191,7 @@ class CMS:
'getParse':self.getParse, 'getParse':self.getParse,
'saveParse':self.saveParse, 'saveParse':self.saveParse,
'oheaders':self.oheaders, 'oheaders':self.oheaders,
'headers':self.headers, # 通用免嗅需要
'encoding':self.encoding, 'encoding':self.encoding,
'name':self.title, 'name':self.title,
'timeout':self.timeout, 'timeout':self.timeout,
...@@ -763,9 +764,171 @@ class CMS: ...@@ -763,9 +764,171 @@ class CMS:
return result return result
def 二级渲染(self,parse_str:'str|dict',**kwargs):
# *args是不定长参数 列表
# ** args是不定长参数字典
p = parse_str # 二级传递解析表达式 js的obj json对象
detailUrl = kwargs.get('detailUrl','') # 不定长字典传递的二级详情页vod_id原始数据
url = kwargs.get('url','') # 不定长字典传递的二级详情页链接智能拼接数据
vod = kwargs.get('vod',self.blank_vod()) # 最终要返回的二级详情页数据 默认空
html = kwargs.get('html','') # 不定长字典传递的源码(如果不传才会在下面程序中去获取)
show_name = kwargs.get('show_name','') # 是否显示来源(用于drpy区分)
jsp = kwargs.get('jsp','') # jsp = jsoup(self.url) 传递的jsp解析
fyclass = kwargs.get('fyclass','') # 二级传递的分类名称,可以得知进去的类别
if p == '*': # 解析表达式为*默认一级直接变播放
vod['vod_play_from'] = '道长在线'
vod['vod_remarks'] = detailUrl
vod['vod_actor'] = '没有二级,只有一级链接直接嗅探播放'
# vod['vod_content'] = url if not show_name else f'({self.id}) {url}'
vod['vod_content'] = url
vod['vod_play_url'] = '嗅探播放$' + self.play_url + url
elif not p or (not isinstance(p, dict) and not isinstance(p, str)) or (isinstance(p, str) and not str(p).startswith('js:')):
pass
else:
is_json = p.get('is_json', False) if isinstance(p, dict) else False # 二级里加is_json参数
pdfh = jsp.pjfh if is_json else jsp.pdfh
pdfa = jsp.pjfa if is_json else jsp.pdfa
pd = jsp.pj if is_json else jsp.pd
pq = jsp.pq
obj = {}
vod_name = ''
if not html: # 没传递html参数接下来智能获取
r = requests.get(url, headers=self.headers, timeout=self.timeout)
html = self.checkHtml(r)
if is_json:
html = self.dealJson(html)
html = json.loads(html)
if p.get('title'):
p1 = p['title'].split(';')
vod_name = pdfh(html, p1[0]).replace('\n', ' ')
# title = '\n'.join([pdfh(html,i).replace('\n',' ') for i in p1])
title = '\n'.join([','.join([pdfh(html, pp1).strip() for pp1 in i.split('+')]) for i in p1])
# print(title)
obj['title'] = title
if p.get('desc'):
try:
p1 = p['desc'].split(';')
desc = '\n'.join([pdfh(html, i).replace('\n', ' ') for i in p1])
obj['desc'] = desc
except:
pass
if p.get('content'):
p1 = p['content'].split(';')
try:
content = '\n'.join([pdfh(html, i).replace('\n', ' ') for i in p1])
obj['content'] = content
except:
pass
if p.get('img'):
p1 = p['img']
try:
img = pd(html, p1)
obj['img'] = img
except Exception as e:
logger.info(f'二级图片定位失败,但不影响使用{e}')
vod = {
"vod_id": detailUrl,
"vod_name": vod_name,
"vod_pic": obj.get('img', ''),
"type_name": obj.get('title', ''),
"vod_year": "",
"vod_area": "",
"vod_remarks": obj.get('desc', ''),
"vod_actor": "",
"vod_director": "",
"vod_content": obj.get('content', '')
}
vod_play_from = '$$$'
playFrom = []
if p.get('重定向') and str(p['重定向']).startswith('js:'):
headers['Referer'] = getHome(url)
py_ctx.update({
'input': url,
'html': html,
'TYPE': 'detail', # 海阔js环境标志
'cateID': fyclass, # 当前分类
'oheaders': self.d.oheaders,
'fetch_params': {'headers': self.headers, 'timeout': self.d.timeout, 'encoding': self.d.encoding},
'd': self.d,
'getParse': self.d.getParse,
'saveParse': self.d.saveParse,
'jsp': jsp, 'setDetail': setDetail,
})
ctx = py_ctx
# print(ctx)
rcode = p['重定向'].replace('js:', '', 1)
jscode = getPreJs() + rcode
# print(jscode)
loader, _ = runJScode(jscode, ctx=ctx)
# print(loader.toString())
logger.info(f'开始执行二级重定向代码:{rcode}')
html = loader.eval('html')
if isinstance(vod, JsObjectWrapper):
html = str(html)
if p.get('tabs'):
vodHeader = []
# print(p['tabs'].split(';')[0])
vHeader = pdfa(html, p['tabs'].split(';')[0])
# print(f'线路列表数:{len((vodHeader))}')
# print(vodHeader)
if not is_json:
for v in vHeader:
# 过滤排除掉线路标题
v_title = pq(v).text()
if self.tab_exclude and jsp.test(self.tab_exclude, v_title):
continue
vodHeader.append(v_title)
else:
vodHeader = vHeader
else:
vodHeader = ['道长在线']
# print(vodHeader)
for v in vodHeader:
playFrom.append(v)
vod_play_from = vod_play_from.join(playFrom)
vod_play_url = '$$$'
vod_tab_list = []
if p.get('lists'):
for i in range(len(vodHeader)):
tab_name = str(vodHeader[i])
tab_ext = p['tabs'].split(';')[1] if len(p['tabs'].split(';')) > 1 else ''
p1 = p['lists'].replace('#idv', tab_name).replace('#id', str(i))
tab_ext = tab_ext.replace('#idv', tab_name).replace('#id', str(i))
vodList = pdfa(html, p1) # 1条线路的选集列表
# print(vodList)
# vodList = [pq(i).text()+'$'+pd(i,'a&&href') for i in vodList] # 拼接成 名称$链接
if self.play_parse: # 自动base64编码
vodList = [(pdfh(html, tab_ext) if tab_ext else tab_name) + '$' + self.play_url + encodeUrl(i) for i
in vodList] if is_json else \
[pq(i).text() + '$' + self.play_url + encodeUrl(pd(i, 'a&&href')) for i in vodList] # 拼接成 名称$链接
else:
vodList = [(pdfh(html, tab_ext) if tab_ext else tab_name) + '$' + self.play_url + i for i in
vodList] if is_json else \
[pq(i).text() + '$' + self.play_url + pd(i, 'a&&href') for i in vodList] # 拼接成 名称$链接
vlist = '#'.join(vodList) # 拼多个选集
vod_tab_list.append(vlist)
vod_play_url = vod_play_url.join(vod_tab_list)
# print(vod_play_url)
vod['vod_play_from'] = vod_play_from
# print(vod_play_from)
vod['vod_play_url'] = vod_play_url
if show_name:
vod['vod_content'] = f'({self.id}){vod.get("vod_content", "")}'
return vod
def detailOneVod(self,id,fyclass='',show_name=False): def detailOneVod(self,id,fyclass='',show_name=False):
vod = self.blank_vod()
detailUrl = str(id) detailUrl = str(id)
vod = {}
if not detailUrl.startswith('http') and not '/' in detailUrl: if not detailUrl.startswith('http') and not '/' in detailUrl:
url = self.detailUrl.replace('fyid', detailUrl).replace('fyclass',fyclass) url = self.detailUrl.replace('fyid', detailUrl).replace('fyclass',fyclass)
elif '/' in detailUrl: elif '/' in detailUrl:
...@@ -775,32 +938,14 @@ class CMS: ...@@ -775,32 +938,14 @@ class CMS:
logger.info(f'进入详情页: {url}') logger.info(f'进入详情页: {url}')
try: try:
p = self.二级 # 解析 p = self.二级 # 解析
if p == '*':
vod = self.blank_vod()
vod['vod_play_from'] = '道长在线'
vod['vod_remarks'] = detailUrl
vod['vod_actor'] = '没有二级,只有一级链接直接嗅探播放'
vod['vod_content'] = url if not show_name else f'({self.id}) {url}'
vod['vod_play_url'] = '嗅探播放$'+self.play_url+url
print(vod)
return vod
if not p:
return vod
if not isinstance(p,dict) and not isinstance(p,str):
return vod
if isinstance(p,str) and not str(p).startswith('js:'):
return vod
jsp = jsoup(self.url) jsp = jsoup(self.url)
is_json = p.get('is_json',False) if isinstance(p,dict) else False # 二级里加is_json参数
is_js = isinstance(p,str) and str(p).startswith('js:') # 是js is_js = isinstance(p,str) and str(p).startswith('js:') # 是js
if is_js: if is_js:
headers['Referer'] = getHome(url) headers['Referer'] = getHome(url)
py_ctx.update({ py_ctx.update({
'input': url, 'input': url,
'TYPE': 'detail', # 海阔js环境标志 'TYPE': 'detail', # 海阔js环境标志
'二级': self.二级渲染, # 二级解析函数,可以解析dict
'cateID': fyclass, # 当前分类 'cateID': fyclass, # 当前分类
'oheaders': self.d.oheaders, 'oheaders': self.d.oheaders,
'fetch_params': {'headers': self.headers, 'timeout': self.d.timeout, 'encoding': self.d.encoding}, 'fetch_params': {'headers': self.headers, 'timeout': self.d.timeout, 'encoding': self.d.encoding},
...@@ -818,148 +963,15 @@ class CMS: ...@@ -818,148 +963,15 @@ class CMS:
vod = loader.eval('vod') vod = loader.eval('vod')
if isinstance(vod,JsObjectWrapper): if isinstance(vod,JsObjectWrapper):
vod = vod.to_dict() vod = vod.to_dict()
if show_name:
vod['vod_content'] = f'({self.id}){vod.get("vod_content", "")}'
else: else:
vod = {} vod = self.blank_vod()
# print(type(vod))
# print(vod)
else: else:
pdfh = jsp.pjfh if is_json else jsp.pdfh vod = self.二级渲染(p,detailUrl=detailUrl,url=url,vod=vod,show_name=show_name,jsp=jsp,fyclass=fyclass)
pdfa = jsp.pjfa if is_json else jsp.pdfa
pd = jsp.pj if is_json else jsp.pd
pq = jsp.pq
obj = {}
vod_name = ''
r = requests.get(url, headers=self.headers, timeout=self.timeout)
html = self.checkHtml(r)
if is_json:
html = self.dealJson(html)
html = json.loads(html)
if p.get('title'):
p1 = p['title'].split(';')
vod_name = pdfh(html,p1[0]).replace('\n',' ')
# title = '\n'.join([pdfh(html,i).replace('\n',' ') for i in p1])
title = '\n'.join([','.join([pdfh(html, pp1).strip() for pp1 in i.split('+')]) for i in p1])
# print(title)
obj['title'] = title
if p.get('desc'):
try:
p1 = p['desc'].split(';')
desc = '\n'.join([pdfh(html,i).replace('\n',' ') for i in p1])
obj['desc'] = desc
except:
pass
if p.get('content'):
p1 = p['content'].split(';')
try:
content = '\n'.join([pdfh(html,i).replace('\n',' ') for i in p1])
obj['content'] = content
except:
pass
if p.get('img'):
p1 = p['img']
try:
img = pd(html,p1)
obj['img'] = img
except Exception as e:
logger.info(f'二级图片定位失败,但不影响使用{e}')
vod = {
"vod_id": detailUrl,
"vod_name": vod_name,
"vod_pic": obj.get('img',''),
"type_name": obj.get('title',''),
"vod_year": "",
"vod_area": "",
"vod_remarks": obj.get('desc',''),
"vod_actor": "",
"vod_director": "",
"vod_content": obj.get('content','')
}
vod_play_from = '$$$'
playFrom = []
if p.get('重定向') and str(p['重定向']).startswith('js:'):
headers['Referer'] = getHome(url)
py_ctx.update({
'input': url,
'html': html,
'TYPE': 'detail', # 海阔js环境标志
'cateID': fyclass, # 当前分类
'oheaders': self.d.oheaders,
'fetch_params': {'headers': self.headers, 'timeout': self.d.timeout, 'encoding': self.d.encoding},
'd': self.d,
'getParse': self.d.getParse,
'saveParse': self.d.saveParse,
'jsp': jsp, 'setDetail': setDetail,
})
ctx = py_ctx
# print(ctx)
rcode = p['重定向'].replace('js:', '', 1)
jscode = getPreJs() + rcode
# print(jscode)
loader, _ = runJScode(jscode, ctx=ctx)
# print(loader.toString())
logger.info(f'开始执行二级重定向代码:{rcode}')
html = loader.eval('html')
if isinstance(vod, JsObjectWrapper):
html = str(html)
if p.get('tabs'):
vodHeader = []
# print(p['tabs'].split(';')[0])
vHeader = pdfa(html,p['tabs'].split(';')[0])
# print(f'线路列表数:{len((vodHeader))}')
# print(vodHeader)
if not is_json:
for v in vHeader:
# 过滤排除掉线路标题
v_title = pq(v).text()
if self.tab_exclude and jsp.test(self.tab_exclude, v_title):
continue
vodHeader.append(v_title)
else:
vodHeader = vHeader
else:
vodHeader = ['道长在线']
# print(vodHeader)
for v in vodHeader:
playFrom.append(v)
vod_play_from = vod_play_from.join(playFrom)
vod_play_url = '$$$'
vod_tab_list = []
if p.get('lists'):
for i in range(len(vodHeader)):
tab_name = str(vodHeader[i])
tab_ext = p['tabs'].split(';')[1] if len(p['tabs'].split(';')) > 1 else ''
p1 = p['lists'].replace('#idv',tab_name).replace('#id',str(i))
tab_ext = tab_ext.replace('#idv',tab_name).replace('#id',str(i))
vodList = pdfa(html,p1) # 1条线路的选集列表
# print(vodList)
# vodList = [pq(i).text()+'$'+pd(i,'a&&href') for i in vodList] # 拼接成 名称$链接
if self.play_parse: # 自动base64编码
vodList = [(pdfh(html,tab_ext) if tab_ext else tab_name)+'$'+self.play_url+encodeUrl(i) for i in vodList] if is_json else\
[pq(i).text()+'$'+self.play_url+encodeUrl(pd(i,'a&&href')) for i in vodList] # 拼接成 名称$链接
else:
vodList = [(pdfh(html, tab_ext) if tab_ext else tab_name) + '$' + self.play_url + i for i in
vodList] if is_json else \
[pq(i).text() + '$' + self.play_url + pd(i, 'a&&href') for i in vodList] # 拼接成 名称$链接
vlist = '#'.join(vodList) # 拼多个选集
vod_tab_list.append(vlist)
vod_play_url = vod_play_url.join(vod_tab_list)
# print(vod_play_url)
vod['vod_play_from'] = vod_play_from
# print(vod_play_from)
vod['vod_play_url'] = vod_play_url
# print(vod_play_url)
except Exception as e: except Exception as e:
logger.info(f'{self.getName()}获取单个详情页{detailUrl}出错{e}') logger.info(f'{self.getName()}获取单个详情页{detailUrl}出错{e}')
if show_name:
vod['vod_content'] = f'({self.id}){vod.get("vod_content","")}'
return vod return vod
def detailContent(self, fypage, array,show_name=False): def detailContent(self, fypage, array,show_name=False):
......
...@@ -68,7 +68,8 @@ var mubanDict = { // 模板字典 ...@@ -68,7 +68,8 @@ var mubanDict = { // 模板字典
'User-Agent':'UC_UA', 'User-Agent':'UC_UA',
// "Cookie": "" // "Cookie": ""
}, },
class_parse:'.stui-header__menu li:gt(0):lt(7);a&&Text;a&&href;/(\\d+).html', // class_parse:'.stui-header__menu li:gt(0):lt(7);a&&Text;a&&href;/(\\d+).html',
class_parse:'.stui-header__menu li:gt(0):lt(7);a&&Text;a&&href;.*/(.*?).html',
play_parse:true, play_parse:true,
lazy:'', lazy:'',
limit:6, limit:6,
...@@ -76,7 +77,9 @@ var mubanDict = { // 模板字典 ...@@ -76,7 +77,9 @@ var mubanDict = { // 模板字典
double:true, // 推荐内容是否双层定位 double:true, // 推荐内容是否双层定位
一级:'.stui-vodlist li;a&&title;a&&data-original;.pic-text&&Text;a&&href', 一级:'.stui-vodlist li;a&&title;a&&data-original;.pic-text&&Text;a&&href',
二级:{"title":".stui-content__detail .title&&Text;.stui-content__detail p:eq(-2)&&Text","img":".stui-content__thumb .lazyload&&data-original","desc":".stui-content__detail p:eq(0)&&Text;.stui-content__detail p:eq(1)&&Text;.stui-content__detail p:eq(2)&&Text","content":".detail&&Text","tabs":".stui-vodlist__head h3","lists":".stui-content__playlist:eq(#id) li"}, 二级:{"title":".stui-content__detail .title&&Text;.stui-content__detail p:eq(-2)&&Text","img":".stui-content__thumb .lazyload&&data-original","desc":".stui-content__detail p:eq(0)&&Text;.stui-content__detail p:eq(1)&&Text;.stui-content__detail p:eq(2)&&Text","content":".detail&&Text","tabs":".stui-vodlist__head h3","lists":".stui-content__playlist:eq(#id) li"},
搜索:'#searchList li;a&&title;.lazyload&&data-original;.text-muted&&Text;a&&href;.text-muted:eq(-1)&&Text', // 搜索:'#searchList li;a&&title;.lazyload&&data-original;.text-muted&&Text;a&&href;.text-muted:eq(-1)&&Text',
搜索:'ul.stui-vodlist&&li;a&&title;.lazyload&&data-original;.text-muted&&Text;a&&href;.text-muted:eq(-1)&&Text',
// 搜索:'ul.stui-vodlist__media&&li;a&&title;.lazyload&&data-original;.text-muted&&Text;a&&href;.text-muted:eq(-1)&&Text',
}, },
vfed:{ vfed:{
title:'', title:'',
......
muban.首图2.二级.tabs = '.dropdown-menu li';
var rule = Object.assign(muban.首图2,{
title:'玄天',
host:'https://m.7caa.com',
url:'/list/fyclass-fypage.html',
searchUrl:'/search/**----------fypage---.html',
lazy:'通用免嗅'
});
\ No newline at end of file
...@@ -7,4 +7,5 @@ searchUrl:'/vodsearch**/page/fypage.html', ...@@ -7,4 +7,5 @@ searchUrl:'/vodsearch**/page/fypage.html',
class_name:'电视剧&电影&综艺&动漫&纪录片',//静态分类名称拼接 class_name:'电视剧&电影&综艺&动漫&纪录片',//静态分类名称拼接
class_url:'dianshiju&dianying&zongyi&dongman&jilupian',//静态分类标识拼接 class_url:'dianshiju&dianying&zongyi&dongman&jilupian',//静态分类标识拼接
class_parse:' ', class_parse:' ',
搜索:'ul.stui-vodlist__media&&li;a&&title;.lazyload&&data-original;.text-muted&&Text;a&&href;.text-muted:eq(-1)&&Text',
}); });
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册