diff --git a/images/V1/V1.png b/images/V1/V1.png
new file mode 100644
index 0000000000000000000000000000000000000000..c2873f859f7f3d7d5438dbd65909bfa48b8a9b69
Binary files /dev/null and b/images/V1/V1.png differ
diff --git a/images/V1/v1_1.png b/images/V1/v1_1.png
deleted file mode 100644
index 0de81f4cbef4efdc96486e1701136c2314610b80..0000000000000000000000000000000000000000
Binary files a/images/V1/v1_1.png and /dev/null differ
diff --git a/src/main/java/com/github/weiranyi/Crawler.java b/src/main/java/com/github/weiranyi/Crawler.java
new file mode 100644
index 0000000000000000000000000000000000000000..33b5610f39adc16b9f37b93bf8f93b7fc9e16fa4
--- /dev/null
+++ b/src/main/java/com/github/weiranyi/Crawler.java
@@ -0,0 +1,118 @@
+package com.github.weiranyi;
+
+import org.apache.http.HttpEntity;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.util.EntityUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.sql.SQLException;
+import java.util.Locale;
+import java.util.stream.Collectors;
+
+/**
+ * Crawls sina.cn: takes links from the to-be-processed pool, downloads each
+ * interesting page, queues the links found on it and stores news articles.
+ */
+public class Crawler {
+    CrawlerDao dao = new JdbcCrawlerDao();
+
+    /**
+     * Processes links until the to-be-processed pool is empty.
+     *
+     * @throws SQLException if a database operation fails
+     * @throws IOException  if a page cannot be downloaded
+     */
+    public void run() throws SQLException, IOException {
+        String link;
+        // Load (and remove) the next link from the database; stop when none is left.
+        while ((link = dao.getNextLinkThenDelete()) != null) {
+            // Skip links that were already processed or that we do not care about.
+            if (dao.isLinkProcessed(link) || !isInterestingLink(link)) {
+                continue;
+            }
+            Document doc = httpGetAndParseHtml(link);
+            // Put every URL found on the page into the to-be-processed pool.
+            parseUrlsFromAndStoreIntoDatabase(doc);
+            storeIntoDatabaseIfItIsNewsPage(doc, link);
+            dao.updataDatabase(link, "insert into LINKS_ALREADY_PROCESSED(link) values (?)");
+        }
+    }
+
+    public static void main(String[] args) throws IOException, SQLException {
+        new Crawler().run();
+    }
+
+    /** Queues the target of every anchor in {@code doc}, skipping empty and javascript hrefs. */
+    private void parseUrlsFromAndStoreIntoDatabase(Document doc) throws SQLException {
+        for (Element aTag : doc.select("a")) {
+            String href = aTag.attr("href");
+            // attr() yields "" for anchors without an href; there is nothing to queue.
+            if (href.isEmpty()) {
+                continue;
+            }
+            if (href.startsWith("//")) {
+                href = "https:" + href;
+            }
+            // Locale.ROOT keeps the check stable under locales with special casing rules.
+            if (href.toLowerCase(Locale.ROOT).startsWith("javascript")) {
+                continue;
+            }
+            dao.updataDatabase(href, "insert into LINKS_TO_BE_PROCESSED(link) values (?)");
+        }
+    }
+
+    /** Downloads {@code link} with a browser user agent and parses the body as HTML. */
+    private static Document httpGetAndParseHtml(String link) throws IOException {
+        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
+            HttpGet httpGet = new HttpGet(link);
+            httpGet.addHeader("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36");
+            try (CloseableHttpResponse response1 = httpclient.execute(httpGet)) {
+                System.out.println(response1.getStatusLine());
+                System.out.println(link);
+                HttpEntity entity1 = response1.getEntity();
+                // UTF-8 is only the fallback for responses that declare no charset.
+                String html = EntityUtils.toString(entity1, StandardCharsets.UTF_8);
+                return Jsoup.parse(html);
+            }
+        }
+    }
+
+    /** Stores each article element of the page as a news row; pages without one store nothing. */
+    private void storeIntoDatabaseIfItIsNewsPage(Document doc, String link) throws SQLException {
+        for (Element articleTag : doc.select("article")) {
+            // The title is the article's first child element; skip articles that have none.
+            if (articleTag.children().isEmpty()) {
+                continue;
+            }
+            String title = articleTag.child(0).text();
+            // One paragraph per line.
+            String content = articleTag.select("p").stream().map(Element::text).collect(Collectors.joining("\n"));
+            System.out.println(title);
+            dao.insertNewsIntoDataBase(link, title, content);
+        }
+    }
+
+    /** A link is interesting when it is a news or index page and not a login page. */
+    private static boolean isInterestingLink(String link) {
+        return (isNewsPage(link) || isIndexPage(link)) && isNotLoginPage(link);
+    }
+
+    private static boolean isIndexPage(String link) {
+        return "https://sina.cn".equals(link);
+    }
+
+    private static boolean isNewsPage(String link) {
+        return link.contains("news.sina.cn");
+    }
+
+    private static boolean isNotLoginPage(String link) {
+        return !link.contains("passport.sina.cn");
+    }
+}
diff --git a/src/main/java/com/github/weiranyi/CrawlerDao.java b/src/main/java/com/github/weiranyi/CrawlerDao.java
new file mode 100644
index 0000000000000000000000000000000000000000..ffe77e5811c38992f4a0a55e4ba3890f8bbecc37
--- /dev/null
+++ b/src/main/java/com/github/weiranyi/CrawlerDao.java
@@ -0,0 +1,21 @@
+package com.github.weiranyi;
+
+import java.sql.SQLException;
+
+/** Persistence operations the crawler needs: the two link pools and the stored news. */
+public interface CrawlerDao {
+    /** Runs {@code sql} and returns the first column of the first row, or {@code null} if there is no row. */
+    String getNextLink(String sql) throws SQLException;
+
+    /** Takes one link out of the to-be-processed pool; returns {@code null} when the pool is empty. */
+    String getNextLinkThenDelete() throws SQLException;
+
+    /** Executes the update {@code sql} with {@code link} bound to its only parameter (name kept as-is for callers). */
+    void updataDatabase(String link, String sql) throws SQLException;
+
+    /** Stores one news article. */
+    void insertNewsIntoDataBase(String url, String title, String content) throws SQLException;
+
+    /** Returns whether {@code link} is recorded as already processed. */
+    boolean isLinkProcessed(String link) throws SQLException;
+}
diff --git a/src/main/java/com/github/weiranyi/JdbcCrawlerDao.java b/src/main/java/com/github/weiranyi/JdbcCrawlerDao.java
new file mode 100644
index 0000000000000000000000000000000000000000..586b899b8a35384748c6c93a9947d42770d8fc27
--- /dev/null
+++ b/src/main/java/com/github/weiranyi/JdbcCrawlerDao.java
@@ -0,0 +1,78 @@
+package com.github.weiranyi;
+
+import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
+
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+/** {@link CrawlerDao} backed by a single JDBC connection to an H2 file database. */
+public class JdbcCrawlerDao implements CrawlerDao {
+    private static final String JDBC_URL = "jdbc:h2:file:/Users/yiweiran/Documents/workPlace/java/JavaProject-Crawler-Elasticsearch/news";
+    private static final String USER_NAME = "root";
+    private static final String USER_PASSWORD = "123456";
+    private final Connection connection;
+
+    /**
+     * Opens the database connection.
+     *
+     * @throws RuntimeException wrapping the {@link SQLException} if the connection cannot be opened
+     */
+    @SuppressFBWarnings("DMI_CONSTANT_DB_PASSWORD")
+    public JdbcCrawlerDao() {
+        try {
+            this.connection = DriverManager.getConnection(JDBC_URL, USER_NAME, USER_PASSWORD);
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Override
+    public String getNextLinkThenDelete() throws SQLException {
+        String link = getNextLink("select link from LINKS_TO_BE_PROCESSED limit 1;");
+        if (link != null) {
+            updataDatabase(link, "delete FROM LINKS_TO_BE_PROCESSED where LINK=?");
+        }
+        return link;
+    }
+
+    @Override
+    public String getNextLink(String sql) throws SQLException {
+        // try-with-resources closes the result set before the statement, also on exceptions.
+        try (PreparedStatement statement = connection.prepareStatement(sql);
+             ResultSet resultSet = statement.executeQuery()) {
+            return resultSet.next() ? resultSet.getString(1) : null;
+        }
+    }
+
+    @Override
+    public void updataDatabase(String link, String sql) throws SQLException {
+        try (PreparedStatement statement = connection.prepareStatement(sql)) {
+            statement.setString(1, link);
+            statement.executeUpdate();
+        }
+    }
+
+    @Override
+    public void insertNewsIntoDataBase(String url, String title, String content) throws SQLException {
+        try (PreparedStatement statement = connection.prepareStatement("insert into news (url, title, content, created_at,MODIFIED_AT)values(?,?,?,now(),now())")) {
+            statement.setString(1, url);
+            statement.setString(2, title);
+            statement.setString(3, content);
+            statement.executeUpdate();
+        }
+    }
+
+    @Override
+    public boolean isLinkProcessed(String link) throws SQLException {
+        try (PreparedStatement statement = connection.prepareStatement("select link from LINKS_ALREADY_PROCESSED where LINK=?;")) {
+            statement.setString(1, link);
+            try (ResultSet resultSet = statement.executeQuery()) {
+                // Any matching row means the link was handled before.
+                return resultSet.next();
+            }
+        }
+    }
+}
diff --git a/src/main/java/com/github/weiranyi/Main.java b/src/main/java/com/github/weiranyi/Main.java
deleted file mode 100644
index 66d951577142bfbb8a8abc596fd8f1ca2ea0492b..0000000000000000000000000000000000000000
--- a/src/main/java/com/github/weiranyi/Main.java
+++ /dev/null
@@ -1,182 +0,0 @@
-package com.github.weiranyi;
-
-import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
-import org.apache.http.HttpEntity;
-import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.HttpGet;
-import org.apache.http.impl.client.CloseableHttpClient;
-import org.apache.http.impl.client.HttpClients;
-import org.apache.http.util.EntityUtils;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-
-import java.io.IOException;
-import java.sql.*;
-import java.util.ArrayList;
-import java.util.stream.Collectors;
-
-
-public class Main {
-    private static final String USER_NAME = "root";
-    private static final String USER_PASSWORD = "123456";
-
-
-    @SuppressFBWarnings("DMI_CONSTANT_DB_PASSWORD")
-    public static void main(String[] args) throws IOException, SQLException {
-        Connection connection = DriverManager.getConnection("jdbc:h2:file:/Users/yiweiran/Documents/workPlace/java/JavaProject-Crawler-Elasticsearch/news", USER_NAME, USER_PASSWORD);
-        String link = null;
-        // 从数据库中加载下一个链接,若能加载到则进行下一个循环
-        while ((link = getNextLinkThenDelete(connection)) != null) {
-            // 若链接已经处理过了就跳到下一次循环
-            if (isLinkProcessed(connection, link)) {
-                continue;
-            }
-            // 判断是否是感兴趣滴内容【新浪站内的网页】
-            if (isInterestingLink(link)) {
-                Document doc = httpGetAndParseHtml(link);
-                // 分析页面url将它们放到即将处理的url池子中去
-                parseUrlsFromAndStoreIntoDatabase(connection, doc);
-                storeIntoDatabaseIfItIsNewsPage(connection, doc, link);
-                updataDatabase(connection, link, "insert into LINKS_ALREADY_PROCESSED(link) values (?)");
-            } else {
-                // 不感兴趣
-                continue;
-            }
-        }
-
-    }
-
-    /*
-     * 4、优化主干逻辑,进一步重构
-     */
-    private static String getNextLinkThenDelete(Connection connection) throws SQLException {
-        String link = getNextLink(connection, "select link from LINKS_TO_BE_PROCESSED limit 1;");
-        if (link != null) {
-            updataDatabase(connection, link, "delete FROM LINKS_TO_BE_PROCESSED where LINK=?");
-        }
-        return link;
-    }
-
-    private static void parseUrlsFromAndStoreIntoDatabase(Connection connection, Document doc) throws SQLException {
-        for (Element aTag : doc.select("a")) {
-            String href = aTag.attr("href");
-            if (href.startsWith("//")) {
-                href = "https:" + href;
-            }
-            if (href.toLowerCase().startsWith("javascript")) {
-                continue;
-            }
-            updataDatabase(connection, href, "insert into LINKS_TO_BE_PROCESSED(link) values (?)");
-        }
-    }
-
-
-    /*
-     * 3、重构对数据库操作部分的代码
-     */
-    private static String getNextLink(Connection connection, String sql) throws SQLException {
-        ResultSet resultSet = null;
-        try (PreparedStatement statement = connection.prepareStatement(sql)) {
-            resultSet = statement.executeQuery();
-            while (resultSet.next()) {
-                return resultSet.getString(1);
-            }
-        } finally {
-            if (resultSet != null) {
-                resultSet.close();
-            }
-        }
-        return null;
-    }
-
-    private static void updataDatabase(Connection connection, String link, String sql) throws SQLException {
-        try (PreparedStatement statement = connection.prepareStatement(sql)) {
-            statement.setString(1, link);
-            statement.executeUpdate();
-        }
-    }
-
-    private static boolean isLinkProcessed(Connection connection, String link) throws SQLException {
-        ResultSet resultSet = null;
-        try (PreparedStatement statement = connection.prepareStatement("select link from LINKS_ALREADY_PROCESSED where LINK=?;")) {
-            statement.setString(1, link);
-            // 从数据库加载即将处理的代码
-            resultSet = statement.executeQuery();
-            while (resultSet.next()) {
-                return true;
-            }
-        } finally {
-            if (resultSet != null) {
-                resultSet.close();
-            }
-        }
-        return false;
-    }
-
-
-    /*
-     * 2、将表达不同逻辑的代码抽象为短方法
-     * 优点:
-     * a.便于人脑理解
-     * b.越短越容易复用
-     * c.对于Java来说可以方便的对方法进行覆盖
-     */
-    // 通过http请求拿到HTML文档
-    private static Document httpGetAndParseHtml(String link) throws IOException {
-        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
-            HttpGet httpGet = new HttpGet(link);
-            httpGet.addHeader("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36");
-            try (CloseableHttpResponse response1 = httpclient.execute(httpGet)) {
-                System.out.println(response1.getStatusLine());
-                System.out.println(link);
-                HttpEntity entity1 = response1.getEntity();
-                String html = EntityUtils.toString(entity1);
-                return Jsoup.parse(html);
-            }
-        }
-    }
-
-    // 若是新闻页面就存到数据库中
-    private static void storeIntoDatabaseIfItIsNewsPage(Connection connection, Document doc, String link) throws SQLException {
-        ArrayList articleTags = doc.select("article");
-        if (!articleTags.isEmpty()) {
-            for (Element articleTag : articleTags) {
-                String title = articleTags.get(0).child(0).text();
-                // Collectors.joining("\n")得到的字符串用换行符分隔
-                String content = articleTag.select("p").stream().map(Element::text).collect(Collectors.joining("\n"));
-                System.out.println(title);
-                try (PreparedStatement statement = connection.prepareStatement("insert into news(url,title,content,created_at,MODIFIED_AT)VALUES ( ?,?,?,now(),now() )")) {
-                    statement.setString(1, link);
-                    statement.setString(2, title);
-                    statement.setString(3, content);
-                    statement.executeUpdate();
-                }
-            }
-        }
-    }
-
-    /*
-     * 1、将长的判断条件抽取为不同的方法
-     */
-    // 感兴趣的链接
-    private static boolean isInterestingLink(String link) {
-        return (isNewsPage(link) || isIndexPage(link) && isNotLoginPage(link));
-    }
-
-    // 首页
-    private static boolean isIndexPage(String link) {
-        return "https://sina.cn".equals(link);
-    }
-
-    // 新闻页
-    private static boolean isNewsPage(String link) {
-        return link.contains("news.sina.cn");
-    }
-
-    // 登录页
-    private static boolean isNotLoginPage(String link) {
-        return !link.contains("passport.sina.cn");
-    }
-
-}
diff --git a/src/main/java/com/github/weiranyi/News.java b/src/main/java/com/github/weiranyi/News.java
new file mode 100644
index 0000000000000000000000000000000000000000..de6c8cf5a2d9375c047208f519510a2140b7feb2
--- /dev/null
+++ b/src/main/java/com/github/weiranyi/News.java
@@ -0,0 +1,57 @@
+package com.github.weiranyi;
+
+/**
+ * A news article.
+ *
+ * <p>Created 2021/5/22 9:00 PM.
+ *
+ * @author https://github.com/weiranyi
+ * @version 1.0
+ */
+public class News {
+    private Integer id;
+    private String url;
+    private String content;
+    private String title;
+
+    public News() {
+    }
+
+    public News(String url, String content, String title) {
+        this.url = url;
+        this.content = content;
+        this.title = title;
+    }
+
+    public Integer getId() {
+        return id;
+    }
+
+    public void setId(Integer id) {
+        this.id = id;
+    }
+
+    public String getUrl() {
+        return url;
+    }
+
+    public void setUrl(String url) {
+        this.url = url;
+    }
+
+    public String getContent() {
+        return content;
+    }
+
+    public void setContent(String content) {
+        this.content = content;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public void setTitle(String title) {
+        this.title = title;
+    }
+}