提交 024c6eff 编写于 作者: 亦蔚然's avatar 亦蔚然

第二次重构

上级 76f28cd4
...@@ -70,6 +70,11 @@ ...@@ -70,6 +70,11 @@
<version>1.4.199</version> <version>1.4.199</version>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency>
<groupId>com.google.code.findbugs</groupId>
<artifactId>annotations</artifactId>
<version>3.0.1</version>
</dependency>
<dependency> <dependency>
<groupId>org.junit.jupiter</groupId> <groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId> <artifactId>junit-jupiter-api</artifactId>
......
package com.github.weiranyi; package com.github.weiranyi;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import org.apache.http.HttpEntity; import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpGet;
...@@ -13,42 +14,41 @@ import org.jsoup.nodes.Element; ...@@ -13,42 +14,41 @@ import org.jsoup.nodes.Element;
import java.io.IOException; import java.io.IOException;
import java.sql.*; import java.sql.*;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set;
public class Main { public class Main {
private static final String USER_NAME = "root";
private static final String USER_PASSWORD = "123456";
@SuppressFBWarnings("DMI_CONSTANT_DB_PASSWORD")
public static void main(String[] args) throws IOException, SQLException { public static void main(String[] args) throws IOException, SQLException {
// 创建一个数据库链接 // 创建一个数据库链接
Connection connection = connection = DriverManager.getConnection("jdbc:h2:file:/Users/yiweiran/Documents/workPlace/java/JavaProject-Crawler-Elasticsearch/news", "root", "123456"); Connection connection = DriverManager.getConnection("jdbc:h2:file:/Users/yiweiran/Documents/workPlace/java/JavaProject-Crawler-Elasticsearch/news", USER_NAME, USER_PASSWORD);
// 【待处理】存放待处理的链接的池子
List<String> linkPool = loadUrlsFromDatabase(connection, "select link from LINKS_TO_BE_PROCESSED;");
// 【已处理】存放已经处理的链接
Set<String> processedLinks = new HashSet<>(loadUrlsFromDatabase(connection, "select link from LINKS_ALREADY_PROCESSED;"));
while (true) { while (true) {
// 【待处理】存放待处理的链接的池子
List<String> linkPool = loadUrlsFromDatabase(connection, "select link from LINKS_TO_BE_PROCESSED;");
// 链接池是空的就退出循环 // 链接池是空的就退出循环
if (linkPool.isEmpty()) { if (linkPool.isEmpty()) {
break; break;
} }
// 获取并移除最后一个链接,对于ArrayList来说更有效率 // 获取并移除最后一个链接,对于ArrayList来说更有效率
String link = linkPool.remove(linkPool.size() - 1); String link = linkPool.remove(linkPool.size() - 1);
insertIntoDatabase(connection, link, "delete FROM LINKS_TO_BE_PROCESSED where LINK=?");
// 若链接已经处理过了就跳到下一次循环 // 若链接已经处理过了就跳到下一次循环
if (processedLinks.contains(link)) { if (isLinkProcessed(connection, link)) {
continue; continue;
} }
// 判断是否是感兴趣滴内容【新浪站内的网页】 // 判断是否是感兴趣滴内容【新浪站内的网页】
if (isInterestingLink(link)) { if (isInterestingLink(link)) {
Document doc = httpGetAndParseHtml(link); Document doc = httpGetAndParseHtml(link);
// 使用CSS选择器,html中去获取 // 分析页面url将它们放到即将处理的url池子中去
ArrayList<Element> links = doc.select("a"); parseUrlsFromAndStoreIntoDatabase(connection, doc);
// 用Java8引入的特性对代码进行简化,过程式语言变成描述式语言
links.stream().map(aTag -> aTag.attr("href")).forEach(linkPool::add);
// 假设这是一个新闻的详情页,就存入数据库,否则,就什么都不做
storeIntoDatabaseIfItIsNewsPage(doc); storeIntoDatabaseIfItIsNewsPage(doc);
processedLinks.add(link); insertIntoDatabase(connection, link, "insert into LINKS_ALREADY_PROCESSED(link) values (?)");
} else { } else {
// 不感兴趣 // 不感兴趣
continue; continue;
...@@ -57,22 +57,68 @@ public class Main { ...@@ -57,22 +57,68 @@ public class Main {
} }
/**
 * Queues every anchor target found in a page for later crawling.
 *
 * <p>The {@code href} attribute of each {@code <a>} element in {@code doc} is inserted,
 * unfiltered, as one row into {@code LINKS_TO_BE_PROCESSED}. No de-duplication or
 * validation happens here.
 *
 * <p>The insert statement is prepared once and reused for all anchors instead of
 * being re-prepared and closed for every single link.
 *
 * @param connection open JDBC connection used for the inserts
 * @param doc        parsed HTML page whose anchors are harvested
 * @throws SQLException if preparing the statement or executing an insert fails
 */
private static void parseUrlsFromAndStoreIntoDatabase(Connection connection, Document doc) throws SQLException {
    try (PreparedStatement insert = connection.prepareStatement("insert into LINKS_TO_BE_PROCESSED(link) values (?)")) {
        for (Element aTag : doc.select("a")) {
            // Rebinding parameter 1 replaces the previous link, so the statement is safely reusable.
            insert.setString(1, aTag.attr("href"));
            insert.executeUpdate();
        }
    }
}
/* /*
* 3、重构对数据库操作部分的代码 * 3、重构对数据库操作部分的代码
* *
*/ */
private static List<String> loadUrlsFromDatabase(Connection connection, String sql) throws SQLException { private static List<String> loadUrlsFromDatabase(Connection connection, String sql) throws SQLException {
List<String> results = new ArrayList<>(); List<String> results = new ArrayList<>();
ResultSet resultSet = null;
try (PreparedStatement statement = connection.prepareStatement(sql)) { try (PreparedStatement statement = connection.prepareStatement(sql)) {
// 从数据库加载即将处理的代码 // 从数据库加载即将处理的代码
ResultSet resultSet = statement.executeQuery(); resultSet = statement.executeQuery();
while (resultSet.next()) { while (resultSet.next()) {
results.add(resultSet.getString(1)); results.add(resultSet.getString(1));
} }
} finally {
if (resultSet != null) {
resultSet.close();
}
} }
return results; return results;
} }
/**
 * Executes a single-placeholder update statement with {@code link} bound to it.
 *
 * <p>Despite the name, this is not limited to inserts: the visible caller passes both
 * INSERT and DELETE statements.
 *
 * @param connection open JDBC connection to run the statement on
 * @param link       value bound to the statement's first (and only expected) placeholder
 * @param sql        update statement containing one {@code ?} placeholder
 * @throws SQLException if preparing or executing the statement fails
 */
private static void insertIntoDatabase(Connection connection, String link, String sql) throws SQLException {
    final int linkParameterIndex = 1;
    try (PreparedStatement update = connection.prepareStatement(sql)) {
        update.setString(linkParameterIndex, link);
        update.executeUpdate();
    }
}
/**
 * Reports whether a link is already recorded in {@code LINKS_ALREADY_PROCESSED}.
 *
 * <p>The result set is opened in its own try-with-resources block after the parameter
 * is bound, so it is closed before the statement and a close failure cannot replace
 * an exception thrown by the query.
 *
 * @param connection open JDBC connection to query
 * @param link       link to look up
 * @return {@code true} if at least one row matches {@code link}, {@code false} otherwise
 * @throws SQLException if preparing or executing the lookup fails
 */
private static boolean isLinkProcessed(Connection connection, String link) throws SQLException {
    try (PreparedStatement statement = connection.prepareStatement("select link from LINKS_ALREADY_PROCESSED where LINK=?;")) {
        statement.setString(1, link);
        try (ResultSet resultSet = statement.executeQuery()) {
            // Only existence matters: a first row means the link was processed.
            return resultSet.next();
        }
    }
}
// private static List<String> deleteFromDatabase(Connection connection, String sql) throws SQLException {
// List<String> results = new ArrayList<>();
// try (PreparedStatement statement = connection.prepareStatement("delete FROM LINKS_TO_BE_PROCESSED where LINK=?")) {
// statement.setString(1,link);
// // 从数据库加载即将处理的代码
// statement.executeUpdate();
// }
// return results;
// }
/* /*
* 2、将表达不同逻辑的代码抽象为短方法 * 2、将表达不同逻辑的代码抽象为短方法
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册