diff --git a/utils/HtmlParser.java b/utils/HtmlParser.java new file mode 100644 index 0000000000000000000000000000000000000000..b8477908a9a74d61186b4ee62bf592afc6c064f3 --- /dev/null +++ b/utils/HtmlParser.java @@ -0,0 +1,294 @@ +package com.github.tvbox.osc.util.js; + +import android.text.TextUtils; + +import com.quickjs.android.JSUtils; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class HtmlParser { + private static String pdfh_html = ""; + private static String pdfa_html = ""; + private static final Pattern p = Pattern.compile("url\\((.*?)\\)", Pattern.MULTILINE | Pattern.DOTALL); + private static final Pattern NOADD_INDEX = Pattern.compile(":eq|:lt|:gt|:first|:last|^body$|^#"); // 不自动加eq下标索引 + private static final Pattern URLJOIN_ATTR = Pattern.compile("(url|src|href|-original|-src|-play|-url)$", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); // 需要自动urljoin的属性 + private static Document pdfh_doc = null; + private static Document pdfa_doc = null; + + public static String joinUrl(String parent, String child) { + if (JSUtils.isEmpty(parent)) { + return child; + } + + URL url; + String q = parent; + try { + url = new URL(new URL(parent), child); + q = url.toExternalForm(); + } catch (MalformedURLException e) { + e.printStackTrace(); + } +// if (q.contains("#")) { +// q = q.replaceAll("^(.+?)#.*?$", "$1"); +// } + return q; + } + + public static class Painfo { + public String nparse_rule; + public int nparse_index; + public List excludes; + } + + private static Painfo getParseInfo(String nparse) { + /* + 根据传入的单规则获取 parse规则,索引位置,排除列表 -- 可以用于剔除元素,支持多个,按标签剔除,按id剔除等操作 + :param nparse: + :return:*/ + Painfo painfo = new Painfo(); + //List excludes = new ArrayList<>(); //定义排除列表默认值为空 + //int nparse_index; //定义位置索引默认值为0 + painfo.nparse_rule = nparse; //定义规则默认值为本身 + if (nparse.contains(":eq")) { + painfo.nparse_rule = nparse.split(":")[0]; + String nparse_pos = nparse.split(":")[1]; + + if (painfo.nparse_rule.contains("--")) { + String[] rules = painfo.nparse_rule.split("--"); + painfo.excludes = new ArrayList<>(Arrays.asList(rules)); + painfo.excludes.remove(0); + painfo.nparse_rule = rules[0]; + } else if (nparse_pos.contains("--")) { + String[] rules = nparse_pos.split("--"); + painfo.excludes = new ArrayList<>(Arrays.asList(rules)); + painfo.excludes.remove(0); + nparse_pos = rules[0]; + } + + try { + painfo.nparse_index = Integer.parseInt(nparse_pos.replace("eq(", "").replace(")", "")); + } catch (Exception e1) { + painfo.nparse_index = 0; + } + } else { + if (nparse.contains("--")) { + String[] rules = painfo.nparse_rule.split("--"); + painfo.excludes = new ArrayList<>(Arrays.asList(rules)); + painfo.excludes.remove(0); + painfo.nparse_rule = rules[0]; + } + } + return painfo; + } + + public static boolean isIndex(String str) { + if (JSUtils.isEmpty(str)) { + return false; + } + for (String str2 : new String[]{":eq", ":lt", ":gt", ":first", ":last", "body", "#"}) { + if (str.contains(str2)) { + if (str2.equals("body") || str2.equals("#")) { + return str.startsWith(str2); + } + return true; + } + } + return false; + } + + public static boolean isUrl(String str) { + if (JSUtils.isEmpty(str)) { + return false; + } + for (String str2 : new String[]{"url", "src", "href", "-original", "-play"}) { + if (str.contains(str2)) { + return true; + } + } + return false; + } + + private static String parseHikerToJq(String parse, boolean first) { + /* + 海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0) + :param parse: + :param first: + :return: + */ + // 不自动加eq下标索引 + if (parse.contains("&&")) { + String[] parses = parse.split("&&"); //带&&的重新拼接 + List new_parses = new ArrayList<>(); //构造新的解析表达式列表 + for (int i = 0; i < parses.length; i++) { + String[] pss = parses[i].split(" "); + String ps = pss[pss.length - 1]; //如果分割&&后带空格就取最后一个元素 + Matcher m = NOADD_INDEX.matcher(ps); + //if (!isIndex(ps)) { + if (!m.find()) { + if (!first && i >= parses.length - 1) { //不传first且遇到最后一个,不用补eq(0) + new_parses.add(parses[i]); + } else { + new_parses.add(parses[i] + ":eq(0)"); + } + } else { + new_parses.add(parses[i]); + } + } + parse = TextUtils.join(" ", new_parses); + } else { + String[] pss = parse.split(" "); + String ps = pss[pss.length - 1]; //如果分割&&后带空格就取最后一个元素 + Matcher m = NOADD_INDEX.matcher(ps); + //if (!isIndex(ps) && first) { + if (!m.find() && first) { + parse = parse + ":eq(0)"; + } + } + return parse; + } + + public static String parseDomForUrl(String html, String rule, String add_url) { + if (!pdfh_html.equals(html)) { + pdfh_html = html; + pdfh_doc = Jsoup.parse(html); + } + Document doc = pdfh_doc; + if (rule.equals("body&&Text") || rule.equals("Text")) { + return doc.text(); + } else if (rule.equals("body&&Html") || rule.equals("Html")) { + return doc.html(); + } + String option = ""; + if (rule.contains("&&")) { + String[] rs = rule.split("&&"); + option = rs[rs.length - 1]; + List excludes = new ArrayList<>(Arrays.asList(rs)); + excludes.remove(rs.length - 1); + rule = TextUtils.join("&&", excludes); + } + rule = parseHikerToJq(rule, true); + String[] parses = rule.split(" "); + Elements ret = new Elements(); + for (String nparse : parses) { + ret = parseOneRule(doc, nparse, ret); + if (ret.isEmpty()) { + return ""; + } + } + String result; + if (JSUtils.isNotEmpty(option)) { + if (option.equals("Text")) { + result = ret.text(); + } else if (option.equals("Html")) { + result = ret.html(); + } else { + result = ret.attr(option); + if (option.toLowerCase().contains("style") && result.contains("url(")) { + Matcher m = p.matcher(result); + if (m.find()) { + result = m.group(1); + } + } + if (JSUtils.isNotEmpty(result) && JSUtils.isNotEmpty(add_url)) { + // 需要自动urljoin的属性 + Matcher m = URLJOIN_ATTR.matcher(option); + //if (isUrl(option)) { + if (m.find()) { + if (result.contains("http")) { + result = result.substring(result.indexOf("http")); + } else { + result = joinUrl(add_url, result); + } + } + } + } + } else { + result = ret.outerHtml(); + } + return result; + + } + + public static List parseDomForArray(String html, String rule) { + if (!pdfa_html.equals(html)) { + pdfa_html = html; + pdfa_doc = Jsoup.parse(html); + } + Document doc = pdfa_doc; + rule = parseHikerToJq(rule, false); + String[] parses = rule.split(" "); + Elements ret = new Elements(); + for (String pars : parses) { + ret = parseOneRule(doc, pars, ret); + if (ret.isEmpty()) { + return new ArrayList<>(); + } + } + + List eleHtml = new ArrayList<>(); + for (int i = 0; i < ret.size(); i++) { + Element element1 = ret.get(i); + eleHtml.add(element1.outerHtml()); + } + return eleHtml; + } + + private static Elements parseOneRule(Document doc, String nparse, Elements ret) { + Painfo painfo = getParseInfo(nparse); + if (ret.isEmpty()) { + ret = doc.select(painfo.nparse_rule); + } else { + ret = ret.select(painfo.nparse_rule); + } + + if (nparse.contains(":eq")) { + if(painfo.nparse_index < 0){ + ret = ret.eq(ret.size() + painfo.nparse_index); + } else { + ret = ret.eq(painfo.nparse_index); + } + } + + if (painfo.excludes != null && !ret.isEmpty()) { + ret = ret.clone(); //克隆一个, 免得直接remove会影响doc的缓存 + for (int i = 0; i < painfo.excludes.size(); i++) { + ret.select(painfo.excludes.get(i)).remove(); + } + } + return ret; + } + + public static List parseDomForList(String html, String p1, String list_text, String list_url, String add_url) { + if (!pdfa_html.equals(html)) { + pdfa_html = html; + pdfa_doc = Jsoup.parse(html); + } + Document doc = pdfa_doc; + p1 = parseHikerToJq(p1, false); + String[] parses = p1.split(" "); + Elements ret = new Elements(); + for (String pars : parses) { + ret = parseOneRule(doc, pars, ret); + if (ret.isEmpty()) { + return new ArrayList<>(); + } + } + List new_vod_list = new ArrayList<>(); + for(int i = 0; i < ret.size(); i++){ + String it = ret.get(i).outerHtml(); + new_vod_list.add(parseDomForUrl(it, list_text, "").trim() + '$' + parseDomForUrl(it, list_url, add_url)); + } + return new_vod_list; + } +}