Main.java 6.8 KB
Newer Older
1 2
package com.github.weiranyi;

亦蔚然's avatar
亦蔚然 已提交
3
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
4 5 6 7 8 9
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
亦蔚然's avatar
亦蔚然 已提交
10 11 12
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
13 14

import java.io.IOException;
15
import java.sql.*;
亦蔚然's avatar
亦蔚然 已提交
16
import java.util.ArrayList;
亦蔚然's avatar
亦蔚然 已提交
17

亦蔚然's avatar
亦蔚然 已提交
18
import java.util.List;
亦蔚然's avatar
亦蔚然 已提交
19

20 21

public class Main {
亦蔚然's avatar
亦蔚然 已提交
22 23 24 25
    // Database credentials handed to DriverManager.getConnection in main().
    // NOTE(review): these are hard-coded in source; main() suppresses the SpotBugs
    // DMI_CONSTANT_DB_PASSWORD warning that this triggers.
    private static final String USER_NAME = "root";
    private static final String USER_PASSWORD = "123456";

    @SuppressFBWarnings("DMI_CONSTANT_DB_PASSWORD")
26 27
    public static void main(String[] args) throws IOException, SQLException {
        // 创建一个数据库链接
亦蔚然's avatar
亦蔚然 已提交
28
        Connection connection = DriverManager.getConnection("jdbc:h2:file:/Users/yiweiran/Documents/workPlace/java/JavaProject-Crawler-Elasticsearch/news", USER_NAME, USER_PASSWORD);
29

亦蔚然's avatar
亦蔚然 已提交
30
        while (true) {
亦蔚然's avatar
亦蔚然 已提交
31 32
            // 【待处理】存放待处理的链接的池子
            List<String> linkPool = loadUrlsFromDatabase(connection, "select link from LINKS_TO_BE_PROCESSED;");
亦蔚然's avatar
亦蔚然 已提交
33 34 35 36 37 38
            // 链接池是空的就退出循环
            if (linkPool.isEmpty()) {
                break;
            }
            // 获取并移除最后一个链接,对于ArrayList来说更有效率
            String link = linkPool.remove(linkPool.size() - 1);
亦蔚然's avatar
亦蔚然 已提交
39
            insertIntoDatabase(connection, link, "delete FROM LINKS_TO_BE_PROCESSED where LINK=?");
亦蔚然's avatar
亦蔚然 已提交
40 41

            // 若链接已经处理过了就跳到下一次循环
亦蔚然's avatar
亦蔚然 已提交
42
            if (isLinkProcessed(connection, link)) {
亦蔚然's avatar
亦蔚然 已提交
43 44 45
                continue;
            }
            // 判断是否是感兴趣滴内容【新浪站内的网页】
亦蔚然's avatar
亦蔚然 已提交
46 47
            if (isInterestingLink(link)) {
                Document doc = httpGetAndParseHtml(link);
亦蔚然's avatar
亦蔚然 已提交
48 49
                // 分析页面url将它们放到即将处理的url池子中去
                parseUrlsFromAndStoreIntoDatabase(connection, doc);
亦蔚然's avatar
亦蔚然 已提交
50
                storeIntoDatabaseIfItIsNewsPage(doc);
亦蔚然's avatar
亦蔚然 已提交
51
                insertIntoDatabase(connection, link, "insert into LINKS_ALREADY_PROCESSED(link) values (?)");
亦蔚然's avatar
亦蔚然 已提交
52 53 54
            } else {
                // 不感兴趣
                continue;
55 56
            }
        }
57 58 59

    }

亦蔚然's avatar
亦蔚然 已提交
60 61 62 63 64 65 66 67
    /**
     * Queues the {@code href} value of every {@code <a>} element of the document by inserting it
     * into {@code LINKS_TO_BE_PROCESSED}.
     *
     * @param connection open database connection used for the inserts
     * @param doc        parsed page whose anchors are scanned
     * @throws SQLException if an insert fails
     */
    private static void parseUrlsFromAndStoreIntoDatabase(Connection connection, Document doc) throws SQLException {
        final String enqueueSql = "insert into LINKS_TO_BE_PROCESSED(link) values (?)";
        List<Element> anchors = doc.select("a");
        for (int index = 0; index < anchors.size(); index++) {
            String target = anchors.get(index).attr("href");
            insertIntoDatabase(connection, target, enqueueSql);
        }
    }


68 69 70 71 72 73
    /*
     * 3、重构对数据库操作部分的代码
     *
     */
    private static List<String> loadUrlsFromDatabase(Connection connection, String sql) throws SQLException {
        List<String> results = new ArrayList<>();
亦蔚然's avatar
亦蔚然 已提交
74
        ResultSet resultSet = null;
75 76
        try (PreparedStatement statement = connection.prepareStatement(sql)) {
            // 从数据库加载即将处理的代码
亦蔚然's avatar
亦蔚然 已提交
77
            resultSet = statement.executeQuery();
78 79 80
            while (resultSet.next()) {
                results.add(resultSet.getString(1));
            }
亦蔚然's avatar
亦蔚然 已提交
81 82 83 84
        } finally {
            if (resultSet != null) {
                resultSet.close();
            }
85 86
        }
        return results;
87
    }
亦蔚然's avatar
亦蔚然 已提交
88

亦蔚然's avatar
亦蔚然 已提交
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
    /**
     * Executes a one-parameter update statement with {@code link} bound to its placeholder.
     *
     * <p>Despite the name, the statement does not have to be an INSERT: callers in this file also
     * pass a DELETE.
     *
     * @param connection open database connection
     * @param link       value bound to the first (and only) placeholder
     * @param sql        statement text containing one {@code ?} placeholder
     * @throws SQLException if preparing or executing the statement fails
     */
    private static void insertIntoDatabase(Connection connection, String link, String sql) throws SQLException {
        try (PreparedStatement update = connection.prepareStatement(sql)) {
            final int linkPlaceholder = 1;
            update.setString(linkPlaceholder, link);
            update.executeUpdate();
        }
    }

    /**
     * Checks whether a link is already recorded in {@code LINKS_ALREADY_PROCESSED}.
     *
     * @param connection open database connection
     * @param link       link to look up
     * @return {@code true} when the lookup query returns at least one row
     * @throws SQLException if preparing or executing the query fails
     */
    private static boolean isLinkProcessed(Connection connection, String link) throws SQLException {
        try (PreparedStatement statement = connection.prepareStatement("select link from LINKS_ALREADY_PROCESSED where LINK=?;")) {
            statement.setString(1, link);
            try (ResultSet resultSet = statement.executeQuery()) {
                // One matching row settles the question; there is nothing to iterate over.
                return resultSet.next();
            }
        }
    }
//    private static List<String> deleteFromDatabase(Connection connection, String sql) throws SQLException {
//        List<String> results = new ArrayList<>();
//        try (PreparedStatement statement = connection.prepareStatement("delete FROM LINKS_TO_BE_PROCESSED where LINK=?")) {
//            statement.setString(1,link);
//            // 从数据库加载即将处理的代码
//            statement.executeUpdate();
//        }
//        return results;
//    }

亦蔚然's avatar
亦蔚然 已提交
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181

    /*
     * 2、将表达不同逻辑的代码抽象为短方法
     * 优点:
     * a.便于人脑理解
     * b.越短越容易复用
     * c.对于Java来说可以方便的对方法进行覆盖
     */
    /**
     * Downloads a page with an HTTP GET request and parses the response body as HTML.
     *
     * <p>Links that start with {@code //} are requested over https. The response status line and
     * the requested URL are printed to standard output.
     *
     * @param link URL to fetch
     * @return the parsed document; an empty document when the response carries no body
     * @throws IOException if the request fails or the body cannot be read
     */
    private static Document httpGetAndParseHtml(String link) throws IOException {
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            if (link.startsWith("//")) {
                link = "https:" + link;
            }
            HttpGet httpGet = new HttpGet(link);
            httpGet.addHeader("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36");
            try (CloseableHttpResponse response1 = httpclient.execute(httpGet)) {
                System.out.println(response1.getStatusLine());
                System.out.println(link);
                HttpEntity entity1 = response1.getEntity();
                // A response without a body has no entity; EntityUtils would reject null.
                if (entity1 == null) {
                    return Jsoup.parse("");
                }
                // Decode as UTF-8 when the Content-Type header names no charset; the one-argument
                // overload would fall back to ISO-8859-1 and garble non-Latin text.
                String html = EntityUtils.toString(entity1, "UTF-8");
                return Jsoup.parse(html);
            }
        }
    }

    /**
     * Prints the title of every {@code <article>} element in the document, where the title is the
     * text of the article's first child element.
     *
     * <p>Despite the name, nothing is written to the database here; titles only go to standard
     * output. Documents without an {@code <article>} element produce no output.
     *
     * @param doc parsed page to inspect
     */
    private static void storeIntoDatabaseIfItIsNewsPage(Document doc) {
        for (Element articleTag : doc.select("article")) {
            // An article without child elements has no title element to read.
            if (articleTag.children().isEmpty()) {
                continue;
            }
            // Read from the article being visited, not always from the first one on the page.
            String title = articleTag.child(0).text();
            System.out.println(title);
        }
    }

    /*
     * 1、将长的判断条件抽取为不同的方法
     */
    /**
     * Decides whether a link is worth crawling: it must be a news page or the site index page,
     * and it must not be a login page.
     *
     * @param link link to classify
     * @return {@code true} when the link should be downloaded and parsed
     */
    private static boolean isInterestingLink(String link) {
        // The grouping matters: && binds tighter than ||, so without these parentheses the
        // login-page exclusion would apply to the index-page branch only.
        return (isNewsPage(link) || isIndexPage(link)) && isNotLoginPage(link);
    }

    /**
     * Index page check.
     *
     * @param link link to classify
     * @return {@code true} when the link is exactly {@code https://sina.cn}
     */
    private static boolean isIndexPage(String link) {
        return "https://sina.cn".equals(link);
    }

    /**
     * News page check.
     *
     * @param link link to classify
     * @return {@code true} when the link contains {@code news.sina.cn}
     */
    private static boolean isNewsPage(String link) {
        return link.contains("news.sina.cn");
    }

    /**
     * Login page check.
     *
     * @param link link to classify
     * @return {@code true} when the link does not contain {@code passport.sina.cn}
     */
    private static boolean isNotLoginPage(String link) {
        return !link.contains("passport.sina.cn");
    }

182
}