diff --git "a/js/\351\270\255\345\245\210\351\243\236.js" "b/js/\351\270\255\345\245\210\351\243\236.js" index 13315da0c55669b1d353b11a5cf1470edf3b92fa..388673022288faa4db96d7fab1d47ab3db5de328 100644 --- "a/js/\351\270\255\345\245\210\351\243\236.js" +++ "b/js/\351\270\255\345\245\210\351\243\236.js" @@ -1,12 +1,13 @@ var rule = { title:'鸭奈飞', url:'https://yanetflix.com/vodshow/fyclass--------fypage---.html', + detailUrl:'https://yanetflix.com/voddetail/fyid.html', // url:'https://yanetflix.com/vodshow/', searchUrl:'/vodsearch/**----------fypage---.html', ua:'MOBILE_UA', class_name:'电影&连续剧&综艺&动漫', class_url:'dianying&lianxuju&zongyi&dongman', 一级:'body a.module-poster-item.module-item;a&&title;.lazyload&&data-original;.module-item-note&&Text;a&&href', - 二级:'', + 二级:{"title":"h1&&Text;.module-info-tag&&Text","img":".lazyload&&data-original","desc":".module-info-item:eq(1)&&Text;.module-info-item:eq(2)&&Text;.module-info-item:eq(3)&&Text","content":".module-info-introduction&&Text","tabs":".module-tab-item","lists":".module-play-list:eq(#id) a"}, 搜索:'', } \ No newline at end of file diff --git a/models/cms.py b/models/cms.py index be17985cf917588c1e43b44255e72d17e6a1de43..b175f7717f8dfdd8b8aca5872cc655e895c6b9e3 100644 --- a/models/cms.py +++ b/models/cms.py @@ -12,6 +12,7 @@ from utils.htmlParser import jsoup class CMS: def __init__(self,rule): self.url = rule.get('url','').rstrip('/') + self.detailUrl = rule.get('detailUrl','').rstrip('/') self.searchUrl = rule.get('searchUrl','') ua = rule.get('ua','') if ua == 'MOBILE_UA': @@ -101,8 +102,9 @@ class CMS: pdfh = jsp.pdfh pdfa = jsp.pdfa pd = jsp.pd - print(pdfh(r.text,p[0])) - + # print(pdfh(r.text,'body a.module-poster-item.module-item:eq(1)&&Text')) + # print(pdfh(r.text,'body a.module-poster-item.module-item:eq(0)')) + # print(pdfh(r.text,'body a.module-poster-item.module-item:first')) items = pdfa(r.text, p[0]) videos = [] for item in items: @@ -127,6 +129,95 @@ class CMS: result['total'] = 999999 return result + def detailContent(self, array): + """ + cms二级数据 + :param array: + :return: + """ + # video-info-header + fyid = array[0] + url = self.detailUrl.replace('fyid', fyid) + print(url) + headers = {'user-agent': self.ua} + r = requests.get(url, headers=headers) + html = r.text + # print(html) + p = self.二级 # 解析 + jsp = jsoup(self.url) + pdfh = jsp.pdfh + pdfa = jsp.pdfa + pd = jsp.pd + pq = jsp.pq + obj = {} + vod_name = '' + if p.get('title'): + p1 = p['title'].split(';') + vod_name = pdfh(html,p1[0]).replace('\n',' ') + title = '\n'.join([pdfh(html,i).replace('\n',' ') for i in p1]) + # print(title) + obj['title'] = title + if p.get('desc'): + p1 = p['desc'].split(';') + desc = '\n'.join([pdfh(html,i).replace('\n',' ') for i in p1]) + obj['desc'] = desc + + if p.get('content'): + p1 = p['content'].split(';') + content = '\n'.join([pdfh(html,i).replace('\n',' ') for i in p1]) + obj['content'] = content + + if p.get('img'): + p1 = p['img'].split(';') + img = '\n'.join([pdfh(html,i).replace('\n',' ') for i in p1]) + obj['img'] = img + + vod = { + "vod_id": fyid, + "vod_name": vod_name, + "vod_pic": obj.get('img',''), + "type_name": obj.get('title',''), + "vod_year": "", + "vod_area": "", + "vod_remarks": obj.get('desc',''), + "vod_actor": "", + "vod_director": "", + "vod_content": obj.get('content','') + } + + vod_play_from = '$$$' + playFrom = [] + if p.get('tabs'): + vodHeader = pdfa(html,p['tabs']) + vodHeader = [pq(v).text() for v in vodHeader] + else: + vodHeader = ['道长在线'] + + for v in vodHeader: + playFrom.append(v) + vod_play_from = vod_play_from.join(playFrom) + + vod_play_url = '$$$' + vod_tab_list = [] + if p.get('lists'): + for i in range(len(vodHeader)): + p1 = p['lists'].replace('#id',str(i)) + vodList = pdfa(html,p1) # 1条线路的选集列表 + vodList = [pq(i).text()+'$'+pd(i,'a&&href') for i in vodList] # 拼接成 名称$链接 + vlist = '#'.join(vodList) # 拼多个选集 + vod_tab_list.append(vlist) + vod_play_url = vod_play_url.join(vod_tab_list) + # print(vod_play_url) + vod['vod_play_from'] = vod_play_from + vod['vod_play_url'] = vod_play_url + + result = { + 'list': [ + vod + ] + } + return result + if __name__ == '__main__': from utils import parser js_path = f'js/鸭奈飞.js' @@ -134,5 +225,6 @@ if __name__ == '__main__': rule = ctx.eval('rule') cms = CMS(rule) print(cms.title) - print(cms.homeContent()) - cms.categoryContent('dianying',1) \ No newline at end of file + # print(cms.homeContent()) + # cms.categoryContent('dianying',1) + print(cms.detailContent(['67391'])) \ No newline at end of file diff --git a/utils/htmlParser.py b/utils/htmlParser.py index e2cd308770375c16ab50ddb5df82ea295638cccd..85739d66ba158a1a67fee38d9ff92f5da62b63ce 100644 --- a/utils/htmlParser.py +++ b/utils/htmlParser.py @@ -18,8 +18,8 @@ class jsoup: option = parse.split('&&')[1] parse = parse.split('&&')[0] - ret = doc(parse) if option: + ret = doc(parse) if option == 'Text': ret = ret.text() elif option == 'Html': @@ -29,16 +29,22 @@ class jsoup: if pd and option in ['url','src','href','data-original']: ret = urljoin(self.MY_URL,ret) else: - ret = ret.next() - print(ret) - ret = str(ret('fisrt')) + # ret = doc(parse+':first') + ret = doc(parse) # 由于是生成器,直接转str就能拿到第一条数据,不需要next + # ret = ret.next() # 取第一条数据 + # ret = doc(parse) # 下面注释的写法不对的 + # ret = ret.find(':first') + # ret = ret.children(':first') + ret = str(ret) return ret def pdfa(self,html,parse): doc = pq(html) - # print(doc(parse)[0]) # return [item.html() for item in doc(parse).items()] return [str(item) for item in doc(parse).items()] def pd(self,html,parse): - return self.pdfh(html,parse,True) \ No newline at end of file + return self.pdfh(html,parse,True) + + def pq(self,html): + return pq(html) \ No newline at end of file