提交 38417287 编写于 作者: H hjdhnx

优化了pd系列函数

上级 db1394dc
3.7.5beta10
\ No newline at end of file
3.7.5beta9
\ No newline at end of file
此差异已折叠。
......@@ -425,47 +425,7 @@ var urljoin2 = urljoin;
const defaultParser = {
pdfh:pdfh,
pdfa:pdfa,
parseHikerToJq(parse,first){
// 海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0)
first = first||false;
if(parse.includes('&&')){
parse = parse.split('&&'); //带&&的重新拼接
let new_parses = []; // 构造新的解析表达式列表
parse.forEach((it,i)=>{
let ps = it.split(' ').slice(-1)[0]; // 如果分割&&后带空格就取最后一个元素
if(!NOADD_INDEX.test(ps)){
if(!first&&i>=parse.length-1){
new_parses.push(it);
}else{
new_parses.push(`${it}:eq(0)`);
}
}else{
new_parses.push(it);
}
});
parse = new_parses.join(' ');
}else{
let ps = parse.split(' ').slice(-1)[0]; // 如果带空格就取最后一个元素
if(!NOADD_INDEX.test(ps) && first){
parse = `${parse}:eq(0)`;
}
}
return parse;
},
pd(html,parse,uri){
let ret = this.pdfh(html,parse);
if(typeof(uri)==='undefined'||!uri){
uri = '';
}
if(DOM_CHECK_ATTR.test(parse)){
if(/http/.test(ret)){
ret = ret.substr(ret.indexOf('http'));
}else{
ret = urljoin(MY_URL,ret)
}
}
return ret
},
pd:pd,
};
......@@ -599,36 +559,13 @@ const parseTags = {
},
},
jq:{
// jq-tag pdfh: returns '' for empty html or a blank parse expression, otherwise
// delegates extraction to defaultParser.pdfh and returns its result.
// NOTE(review): two method headers (with and without base_url) and two
// `let result` declarations appear in this span; this looks like the old and new
// versions overlaid by a diff view - confirm which lines are current.
pdfh(html, parse, base_url) {
pdfh(html, parse) {
if (!html||!parse || !parse.trim()) {
return ''
}
parse = parse.trim();
let reparse = ['body&&Text','Text','body&&Html','Html'];
// NOTE(review): this tests the array against itself, which is always false, so
// the shortcut below never runs; presumably reparse.includes(parse) was intended.
if(reparse.includes(reparse)){
return defaultParser.pdfh(html,parse)
}
let option = '';
// Split the last '&&' segment off as the option; the rest stays the selector.
if(parse.includes('&&')){
option = parse.split('&&').slice(-1)[0];
parse = parse.split('&&').slice(0,-1).join('&&');
}
parse = defaultParser.parseHikerToJq(parse, true);
let result = defaultParser.pdfh(html,parse,option);
// For a style option whose value contains url(...), keep only the text inside the parentheses.
if(option&&/style/.test(option.toLowerCase())&&/url\(/.test(result)){
try {
result = result.match(/url\((.*?)\)/)[1];
// print(result);
}catch (e) {}
}
// With a base_url and an option matching DOM_CHECK_ATTR, make the result absolute:
// cut to the first 'http' if present, otherwise join it onto base_url.
if (result && base_url && option && DOM_CHECK_ATTR.test(option)) {
if (/http/.test(result)) {
result = result.substr(result.indexOf('http'));
} else {
result = urljoin(base_url, result)
}
// print(result);
}
let result = defaultParser.pdfh(html,parse);
// print(`pdfh解析${parse}=>${result}`);
return result;
},
pdfa(html, parse) {
......@@ -636,14 +573,18 @@ const parseTags = {
return [];
}
parse = parse.trim();
parse = defaultParser.parseHikerToJq(parse)
let result = defaultParser.pdfa(html,parse);
// print(result);
print(`pdfa解析${parse}=>${result.length}`);
return result;
},
// jq-tag pd.
// NOTE(review): two versions appear in sequence here - the `uri` form (which
// forwards to parseTags.jq.pdfh with MY_URL) has no closing brace before the
// `base_url` form starts; this looks like a diff overlay - confirm which is current.
pd(html,parse,uri){
return parseTags.jq.pdfh(html, parse, MY_URL);
pd(html,parse,base_url){
// Empty html or a blank parse expression yields ''.
if (!html||!parse || !parse.trim()) {
return ''
}
parse = parse.trim();
// Fall back to MY_URL when the caller supplies no base_url.
base_url = base_url||MY_URL;
return defaultParser.pd(html, parse, base_url);
},
},
getParse(p0){//非js开头的情况自动获取解析标签
......
此差异已折叠。
......@@ -160,7 +160,7 @@ class jsoup:
res = [item.outerHtml() for item in ret.items()]
return res
def pdfh(self, html, parse: str, add_url=False, base_url: str = ''):
def pdfh(self, html, parse: str, base_url: str = ''):
if not all([html, parse]):
return ''
if PARSE_CACHE:
......@@ -204,21 +204,21 @@ class jsoup:
except:
pass
if ret and add_url:
if ret and base_url:
need_add = re.search(URLJOIN_ATTR, option, re.M | re.I)
if need_add:
if 'http' in ret:
ret = ret[ret.find('http'):]
else:
if not base_url:
base_url = self.MY_URL
ret = urljoin(base_url, ret)
else:
ret = ret.outerHtml()
return ret
def pd(self, html, parse: str, base_url: str = ''):
# Thin wrapper around self.pdfh that forwards html, parse and a base URL.
# NOTE(review): two return paths appear below (a four-argument pdfh call, then a
# three-argument one after defaulting base_url); this looks like the old and new
# versions overlaid by a diff view - confirm which is current.
return self.pdfh(html, parse, True, base_url)
# Default the base to self.MY_URL when the caller passes none.
if not base_url:
base_url = self.MY_URL
return self.pdfh(html, parse, base_url)
def pq(self, html: str):
return pq(html)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册