提交 467baa82 编写于 作者: H hjdhnx

drpy2

上级 1897b024
无法预览此类型文件
3.9.20beta9 3.9.20beta10
\ No newline at end of file \ No newline at end of file
...@@ -575,27 +575,21 @@ const parseTags = { ...@@ -575,27 +575,21 @@ const parseTags = {
return '' return ''
} }
parse = parse.trim(); parse = parse.trim();
let option = null; let option = '';
print('pdfh parse前:'+parse);
if (parse.startsWith('body&&')) { if (parse.startsWith('body&&')) {
parse = parse.substr(6); parse = parse.substr(6);
} }
print('pdfh parse前:'+parse); if (parse.includes('&&')) {
if (parse.indexOf('&&') > -1) {
let sp = parse.split('&&'); let sp = parse.split('&&');
option = sp[sp.length - 1]; option = sp[sp.length - 1];
sp.splice(sp.length - 1); sp.splice(sp.length - 1);
sp.forEach((it,idex)=>{ sp.forEach((it,idex)=>{
if(/:eq\((.*?)\)/.test(it)){ if (!SELECT_REGEX.test(it)) {
let pos = parseInt(it.match(/:eq\((.*?)\)/)[1]); sp[idex] = it+':eq(0)';
if(pos >= 0 ){ // jsoup的eq 正整数从1开始
it = it.replace(/:eq\((.*?)\)/,`:eq(${pos+1})`);
sp[idex] = it;
}
}else if (!SELECT_REGEX.test(it) && it!=='body') {
sp[idex] = it+':eq(1)'; // jsoup的eq从1开始
} }
}); });
parse = sp.join(' '); parse = sp.join(' ').trim();
} }
if(parse === 'Text'){ if(parse === 'Text'){
parse = 'body'; parse = 'body';
...@@ -605,11 +599,18 @@ const parseTags = { ...@@ -605,11 +599,18 @@ const parseTags = {
option = 'Html'; option = 'Html';
} }
print('pdfh parse后:'+parse+',option:'+option); print('pdfh parse后:'+parse+',option:'+option);
let result = defaultParser.pdfh(html,parse,option); let result = defaultParser.pdfh(html,parse + " " + option);
print(result); // let result='';
// try {
// result = defaultParser.pdfh(html,parse + " " + option);
// }catch (e) {
// print('xxxxxxxxxxx');
// print('pdfh发生了错误');
// }
if(option&&/style/.test(option.toLowerCase())&&/url\(/.test(result)){ if(option&&/style/.test(option.toLowerCase())&&/url\(/.test(result)){
try { try {
result = result.match(/url\((.*?)\)/)[1]; result = result.match(/url\((.*?)\)/)[1];
// print(result);
}catch (e) {} }catch (e) {}
} }
if (result && base_url && option && DOM_CHECK_ATTR.test(option)) { if (result && base_url && option && DOM_CHECK_ATTR.test(option)) {
...@@ -618,6 +619,7 @@ const parseTags = { ...@@ -618,6 +619,7 @@ const parseTags = {
} else { } else {
result = urljoin(base_url, result) result = urljoin(base_url, result)
} }
// print(result);
} }
return result; return result;
}, },
...@@ -627,23 +629,23 @@ const parseTags = { ...@@ -627,23 +629,23 @@ const parseTags = {
return []; return [];
} }
parse = parse.trim(); parse = parse.trim();
print('pdfa parse前:'+parse); print('pdfa=>parse前:'+parse);
if (parse.indexOf('&&') > -1) { if (parse.startsWith('body&&')) {
parse = parse.substr(6);
}
if (parse.includes('&&')) {
let sp = parse.split('&&'); let sp = parse.split('&&');
sp.forEach((it,idex)=>{ sp.forEach((it,idex)=>{
if(/:eq\((.*?)\)/.test(it) && idex < sp.length - 1){ if (!SELECT_REGEX_A.test(it) && idex < sp.length - 1) {
let pos = parseInt(it.match(/:eq\((.*?)\)/)[1]); sp[idex] = it+':eq(0)';
if(pos >= 0 ){ // jsoup的eq 正整数从1开始
it = it.replace(/:eq\((.*?)\)/,`:eq(${pos+1})`);
sp[idex] = it;
}
}else if (!SELECT_REGEX_A.test(it) && idex < sp.length - 1 && it!=='body') {
sp[idex] = it+':eq(1)'; // jsoup的eq从1开始
} }
}); });
parse = sp.join(' '); parse = sp.join(' ').trim();
} }
print('pdfa parse后:'+parse); // if(!/&&| /.test(parse)){ // 自动补body就是jsoup的无稽之谈
// parse = 'body '+parse;
// }
print('pdfa=>parse后:'+parse);
let result = defaultParser.pdfa(html,parse); let result = defaultParser.pdfa(html,parse);
// print(result); // print(result);
print(result.length); print(result.length);
...@@ -1197,7 +1199,7 @@ function homeVodParse(homeVodObj){ ...@@ -1197,7 +1199,7 @@ function homeVodParse(homeVodObj){
// print(vod); // print(vod);
d.push(vod); d.push(vod);
} catch (e) { } catch (e) {
console.log('首页列表处理发生错误:'+e.message); console.log('首页列表双层定位处理发生错误:'+e.message);
} }
} }
...@@ -1249,7 +1251,7 @@ function homeVodParse(homeVodObj){ ...@@ -1249,7 +1251,7 @@ function homeVodParse(homeVodObj){
d.push(vod); d.push(vod);
} catch (e) { } catch (e) {
console.log('首页列表单层定位处理发生错误:'+e.message);
} }
} }
......
此差异已折叠。
...@@ -7,62 +7,50 @@ const parseTags = { ...@@ -7,62 +7,50 @@ const parseTags = {
if (!parse || !parse.trim()) { if (!parse || !parse.trim()) {
return '' return ''
} }
let eleFind = typeof html === 'object'; parse = parse.trim();
let option = undefined; let option = null;
if (eleFind && parse.startsWith('body&&')) { if (parse.startsWith('body&&')) {
parse = parse.substr(6); parse = parse.substr(6);
if (parse.indexOf('&&') < 0) {
option = parse.trim();
parse = '*=*';
}
} }
print('pdfh parse前:'+parse);
if (parse.indexOf('&&') > -1) { if (parse.indexOf('&&') > -1) {
let sp = parse.split('&&'); let sp = parse.split('&&');
option = sp[sp.length - 1]; option = sp[sp.length - 1];
sp.splice(sp.length - 1); sp.splice(sp.length - 1);
if (sp.length > 1) { sp.forEach((it,idex)=>{
for (let i in sp) { if(/:eq\((.*?)\)/.test(it)){
//Javascript自定义Array.prototype干扰for-in循环 let pos = parseInt(it.match(/:eq\((.*?)\)/)[1]);
if(sp.hasOwnProperty(i)){ if(pos >= 0 ){ // jsoup的eq 正整数从1开始
if (!SELECT_REGEX.test(sp[i])) { it = it.replace(/:eq\((.*?)\)/,`:eq(${pos+1})`);
sp[i] = sp[i] + ':eq(0)'; sp[idex] = it;
}
} }
}else if (!SELECT_REGEX.test(it) && it!=='body') {
sp[idex] = it+':eq(1)'; // jsoup的eq从1开始
} }
} else { });
if (!SELECT_REGEX.test(sp[0])) {
sp[0] = sp[0] + ':eq(0)';
}
}
parse = sp.join(' '); parse = sp.join(' ');
} }
let result = ''; if(parse === 'Text'){
const $ = eleFind ? html.rr : cheerio.load(html); parse = 'body';
let ret = eleFind ? ((parse === '*=*' || $(html.ele).is(parse)) ? html.ele : $(html.ele).find(parse)) : $(parse); option = 'Text';
if (option) { }else if(parse === 'Html'){
if (option === 'Text') { parse = 'body';
result = $(ret).text(); option = 'Html';
} }
else if (option === 'Html') { print('pdfh parse后:'+parse+',option:'+option);
result = $(ret).html(); let result = defaultParser.pdfh(html,parse + " " + option);
} print(result);
else { if(option&&/style/.test(option.toLowerCase())&&/url\(/.test(result)){
result = $(ret).attr(option); try {
if(/style/.test(option.toLowerCase())&&/url\(/.test(result)){ result = result.match(/url\((.*?)\)/)[1];
try { }catch (e) {}
result = result.match(/url\((.*?)\)/)[1]; }
}catch (e) {} if (result && base_url && option && DOM_CHECK_ATTR.test(option)) {
} if (/http/.test(result)) {
} result = result.substr(result.indexOf('http'));
if (result && base_url && DOM_CHECK_ATTR.test(option)) { } else {
if (/http/.test(result)) { result = urljoin(base_url, result)
result = result.substr(result.indexOf('http'));
} else {
result = urljoin(base_url, result)
}
} }
} else {
result = $(ret).toString();
} }
return result; return result;
}, },
...@@ -71,34 +59,27 @@ const parseTags = { ...@@ -71,34 +59,27 @@ const parseTags = {
print('!parse'); print('!parse');
return []; return [];
} }
let eleFind = typeof html === 'object'; parse = parse.trim();
// print('parse前:'+parse); print('pdfa parse前:'+parse);
if (parse.indexOf('&&') > -1) { if (parse.indexOf('&&') > -1) {
let sp = parse.split('&&'); let sp = parse.split('&&');
for (let i in sp) { sp.forEach((it,idex)=>{
if(sp.hasOwnProperty(i)){ if(/:eq\((.*?)\)/.test(it) && idex < sp.length - 1){
if (!SELECT_REGEX_A.test(sp[i]) && i < sp.length - 1) { let pos = parseInt(it.match(/:eq\((.*?)\)/)[1]);
if(sp[i]!=='body'){ if(pos >= 0 ){ // jsoup的eq 正整数从1开始
// sp[i] = sp[i] + ':eq(0)'; it = it.replace(/:eq\((.*?)\)/,`:eq(${pos+1})`);
sp[i] = sp[i] + ':first'; sp[idex] = it;
}
} }
}else if (!SELECT_REGEX_A.test(it) && idex < sp.length - 1 && it!=='body') {
sp[idex] = it+':eq(1)'; // jsoup的eq从1开始
} }
}
parse = sp.join(' ');
}
// print('parse后:'+parse);
const $ = eleFind ? html.rr : cheerio.load(html);
let ret = eleFind ? ($(html.ele).is(parse) ? html.ele : $(html.ele).find(parse)) : $(parse);
let result = [];
// print('outerHTML:');
// print($(ret[0]).prop("outerHTML"));
if (ret) {
ret.each(function (idx, ele) {
result.push({ rr: $, ele: ele });
// result.push({ rr: $, ele: $(ele).prop("outerHTML")}); // 性能贼差
}); });
parse = sp.join(' ');
} }
print('pdfa parse后:'+parse);
let result = defaultParser.pdfa(html,parse);
// print(result);
print(result.length);
return result; return result;
}, },
pd(html,parse,uri){ pd(html,parse,uri){
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
"key":"dr_{{ rule.name }}", "key":"dr_{{ rule.name }}",
"name":"{{ rule.name }}(drpy)", "name":"{{ rule.name }}(drpy)",
"type":3, "type":3,
#"api":{% if ISTVB and xr_mode %}"{{ host }}/libs/drpy2.min.js"{% else %}"{{ host }}/libs/drpy.min.js"{% endif %}, "api":{% if ISTVB and xr_mode %}"{{ host }}/libs/drpy2.min.js"{% else %}"{{ host }}/libs/drpy.min.js"{% endif %},
"api":{% if ISTVB and xr_mode %}"{{ host }}/libs/drpy.min.js"{% else %}"{{ host }}/libs/drpy.min.js"{% endif %}, #"api":{% if ISTVB and xr_mode %}"{{ host }}/libs/drpy.min.js"{% else %}"{{ host }}/libs/drpy.min.js"{% endif %},
#"api":"{{ host }}/libs/drpy.js", #"api":"{{ host }}/libs/drpy.js",
"searchable": {{ rule.searchable }}, "searchable": {{ rule.searchable }},
"quickSearch": {{ rule.quickSearch }}, "quickSearch": {{ rule.quickSearch }},
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册