提交 467baa82 编写于 作者: H hjdhnx

drpy2

上级 1897b024
无法预览此类型文件
3.9.20beta9
\ No newline at end of file
3.9.20beta10
\ No newline at end of file
......@@ -575,27 +575,21 @@ const parseTags = {
return ''
}
parse = parse.trim();
let option = null;
let option = '';
print('pdfh parse前:'+parse);
if (parse.startsWith('body&&')) {
parse = parse.substr(6);
}
print('pdfh parse前:'+parse);
if (parse.indexOf('&&') > -1) {
if (parse.includes('&&')) {
let sp = parse.split('&&');
option = sp[sp.length - 1];
sp.splice(sp.length - 1);
sp.forEach((it,idex)=>{
if(/:eq\((.*?)\)/.test(it)){
let pos = parseInt(it.match(/:eq\((.*?)\)/)[1]);
if(pos >= 0 ){ // jsoup的eq 正整数从1开始
it = it.replace(/:eq\((.*?)\)/,`:eq(${pos+1})`);
sp[idex] = it;
}
}else if (!SELECT_REGEX.test(it) && it!=='body') {
sp[idex] = it+':eq(1)'; // jsoup的eq从1开始
if (!SELECT_REGEX.test(it)) {
sp[idex] = it+':eq(0)';
}
});
parse = sp.join(' ');
parse = sp.join(' ').trim();
}
if(parse === 'Text'){
parse = 'body';
......@@ -605,11 +599,18 @@ const parseTags = {
option = 'Html';
}
print('pdfh parse后:'+parse+',option:'+option);
let result = defaultParser.pdfh(html,parse,option);
print(result);
let result = defaultParser.pdfh(html,parse + " " + option);
// let result='';
// try {
// result = defaultParser.pdfh(html,parse + " " + option);
// }catch (e) {
// print('xxxxxxxxxxx');
// print('pdfh发生了错误');
// }
if(option&&/style/.test(option.toLowerCase())&&/url\(/.test(result)){
try {
result = result.match(/url\((.*?)\)/)[1];
// print(result);
}catch (e) {}
}
if (result && base_url && option && DOM_CHECK_ATTR.test(option)) {
......@@ -618,6 +619,7 @@ const parseTags = {
} else {
result = urljoin(base_url, result)
}
// print(result);
}
return result;
},
......@@ -627,23 +629,23 @@ const parseTags = {
return [];
}
parse = parse.trim();
print('pdfa parse前:'+parse);
if (parse.indexOf('&&') > -1) {
print('pdfa=>parse前:'+parse);
if (parse.startsWith('body&&')) {
parse = parse.substr(6);
}
if (parse.includes('&&')) {
let sp = parse.split('&&');
sp.forEach((it,idex)=>{
if(/:eq\((.*?)\)/.test(it) && idex < sp.length - 1){
let pos = parseInt(it.match(/:eq\((.*?)\)/)[1]);
if(pos >= 0 ){ // jsoup的eq 正整数从1开始
it = it.replace(/:eq\((.*?)\)/,`:eq(${pos+1})`);
sp[idex] = it;
}
}else if (!SELECT_REGEX_A.test(it) && idex < sp.length - 1 && it!=='body') {
sp[idex] = it+':eq(1)'; // jsoup的eq从1开始
if (!SELECT_REGEX_A.test(it) && idex < sp.length - 1) {
sp[idex] = it+':eq(0)';
}
});
parse = sp.join(' ');
parse = sp.join(' ').trim();
}
print('pdfa parse后:'+parse);
// if(!/&&| /.test(parse)){ // 自动补body就是jsoup的无稽之谈
// parse = 'body '+parse;
// }
print('pdfa=>parse后:'+parse);
let result = defaultParser.pdfa(html,parse);
// print(result);
print(result.length);
......@@ -1197,7 +1199,7 @@ function homeVodParse(homeVodObj){
// print(vod);
d.push(vod);
} catch (e) {
console.log('首页列表处理发生错误:'+e.message);
console.log('首页列表双层定位处理发生错误:'+e.message);
}
}
......@@ -1249,7 +1251,7 @@ function homeVodParse(homeVodObj){
d.push(vod);
} catch (e) {
console.log('首页列表单层定位处理发生错误:'+e.message);
}
}
......
此差异已折叠。
......@@ -7,62 +7,50 @@ const parseTags = {
if (!parse || !parse.trim()) {
return ''
}
let eleFind = typeof html === 'object';
let option = undefined;
if (eleFind && parse.startsWith('body&&')) {
parse = parse.trim();
let option = null;
if (parse.startsWith('body&&')) {
parse = parse.substr(6);
if (parse.indexOf('&&') < 0) {
option = parse.trim();
parse = '*=*';
}
}
print('pdfh parse前:'+parse);
if (parse.indexOf('&&') > -1) {
let sp = parse.split('&&');
option = sp[sp.length - 1];
sp.splice(sp.length - 1);
if (sp.length > 1) {
for (let i in sp) {
//Javascript自定义Array.prototype干扰for-in循环
if(sp.hasOwnProperty(i)){
if (!SELECT_REGEX.test(sp[i])) {
sp[i] = sp[i] + ':eq(0)';
}
sp.forEach((it,idex)=>{
if(/:eq\((.*?)\)/.test(it)){
let pos = parseInt(it.match(/:eq\((.*?)\)/)[1]);
if(pos >= 0 ){ // jsoup的eq 正整数从1开始
it = it.replace(/:eq\((.*?)\)/,`:eq(${pos+1})`);
sp[idex] = it;
}
}else if (!SELECT_REGEX.test(it) && it!=='body') {
sp[idex] = it+':eq(1)'; // jsoup的eq从1开始
}
} else {
if (!SELECT_REGEX.test(sp[0])) {
sp[0] = sp[0] + ':eq(0)';
}
}
});
parse = sp.join(' ');
}
let result = '';
const $ = eleFind ? html.rr : cheerio.load(html);
let ret = eleFind ? ((parse === '*=*' || $(html.ele).is(parse)) ? html.ele : $(html.ele).find(parse)) : $(parse);
if (option) {
if (option === 'Text') {
result = $(ret).text();
}
else if (option === 'Html') {
result = $(ret).html();
}
else {
result = $(ret).attr(option);
if(/style/.test(option.toLowerCase())&&/url\(/.test(result)){
try {
result = result.match(/url\((.*?)\)/)[1];
}catch (e) {}
}
}
if (result && base_url && DOM_CHECK_ATTR.test(option)) {
if (/http/.test(result)) {
result = result.substr(result.indexOf('http'));
} else {
result = urljoin(base_url, result)
}
if(parse === 'Text'){
parse = 'body';
option = 'Text';
}else if(parse === 'Html'){
parse = 'body';
option = 'Html';
}
print('pdfh parse后:'+parse+',option:'+option);
let result = defaultParser.pdfh(html,parse + " " + option);
print(result);
if(option&&/style/.test(option.toLowerCase())&&/url\(/.test(result)){
try {
result = result.match(/url\((.*?)\)/)[1];
}catch (e) {}
}
if (result && base_url && option && DOM_CHECK_ATTR.test(option)) {
if (/http/.test(result)) {
result = result.substr(result.indexOf('http'));
} else {
result = urljoin(base_url, result)
}
} else {
result = $(ret).toString();
}
return result;
},
......@@ -71,34 +59,27 @@ const parseTags = {
print('!parse');
return [];
}
let eleFind = typeof html === 'object';
// print('parse前:'+parse);
parse = parse.trim();
print('pdfa parse前:'+parse);
if (parse.indexOf('&&') > -1) {
let sp = parse.split('&&');
for (let i in sp) {
if(sp.hasOwnProperty(i)){
if (!SELECT_REGEX_A.test(sp[i]) && i < sp.length - 1) {
if(sp[i]!=='body'){
// sp[i] = sp[i] + ':eq(0)';
sp[i] = sp[i] + ':first';
}
sp.forEach((it,idex)=>{
if(/:eq\((.*?)\)/.test(it) && idex < sp.length - 1){
let pos = parseInt(it.match(/:eq\((.*?)\)/)[1]);
if(pos >= 0 ){ // jsoup的eq 正整数从1开始
it = it.replace(/:eq\((.*?)\)/,`:eq(${pos+1})`);
sp[idex] = it;
}
}else if (!SELECT_REGEX_A.test(it) && idex < sp.length - 1 && it!=='body') {
sp[idex] = it+':eq(1)'; // jsoup的eq从1开始
}
}
parse = sp.join(' ');
}
// print('parse后:'+parse);
const $ = eleFind ? html.rr : cheerio.load(html);
let ret = eleFind ? ($(html.ele).is(parse) ? html.ele : $(html.ele).find(parse)) : $(parse);
let result = [];
// print('outerHTML:');
// print($(ret[0]).prop("outerHTML"));
if (ret) {
ret.each(function (idx, ele) {
result.push({ rr: $, ele: ele });
// result.push({ rr: $, ele: $(ele).prop("outerHTML")}); // 性能贼差
});
parse = sp.join(' ');
}
print('pdfa parse后:'+parse);
let result = defaultParser.pdfa(html,parse);
// print(result);
print(result.length);
return result;
},
pd(html,parse,uri){
......
......@@ -19,8 +19,8 @@
"key":"dr_{{ rule.name }}",
"name":"{{ rule.name }}(drpy)",
"type":3,
#"api":{% if ISTVB and xr_mode %}"{{ host }}/libs/drpy2.min.js"{% else %}"{{ host }}/libs/drpy.min.js"{% endif %},
"api":{% if ISTVB and xr_mode %}"{{ host }}/libs/drpy.min.js"{% else %}"{{ host }}/libs/drpy.min.js"{% endif %},
"api":{% if ISTVB and xr_mode %}"{{ host }}/libs/drpy2.min.js"{% else %}"{{ host }}/libs/drpy.min.js"{% endif %},
#"api":{% if ISTVB and xr_mode %}"{{ host }}/libs/drpy.min.js"{% else %}"{{ host }}/libs/drpy.min.js"{% endif %},
#"api":"{{ host }}/libs/drpy.js",
"searchable": {{ rule.searchable }},
"quickSearch": {{ rule.quickSearch }},
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册