Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
亦蔚然
Project Crawler Elasticsearch
提交
1523aeb9
P
Project Crawler Elasticsearch
项目概览
亦蔚然
/
Project Crawler Elasticsearch
通知
4
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Project Crawler Elasticsearch
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
1523aeb9
编写于
5月 24, 2021
作者:
亦蔚然
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
使用了MyBatis
上级
8c3c3ad7
变更
6
隐藏空白更改
内联
并排
Showing
6 changed files
with
118 additions
and
9 deletions
+118
-9
pom.xml
pom.xml
+16
-0
src/main/java/com/github/weiranyi/Crawler.java
src/main/java/com/github/weiranyi/Crawler.java
+5
-3
src/main/java/com/github/weiranyi/CrawlerDao.java
src/main/java/com/github/weiranyi/CrawlerDao.java
+3
-4
src/main/java/com/github/weiranyi/JdbcCrawlerDao.java
src/main/java/com/github/weiranyi/JdbcCrawlerDao.java
+7
-1
src/main/java/com/github/weiranyi/MyBatisCrawlerDao.java
src/main/java/com/github/weiranyi/MyBatisCrawlerDao.java
+86
-0
src/main/java/com/github/weiranyi/entity/News.java
src/main/java/com/github/weiranyi/entity/News.java
+1
-1
未找到文件。
pom.xml
浏览文件 @
1523aeb9
...
...
@@ -75,6 +75,12 @@
<artifactId>
annotations
</artifactId>
<version>
3.0.1
</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.mybatis/mybatis -->
<dependency>
<groupId>
org.mybatis
</groupId>
<artifactId>
mybatis
</artifactId>
<version>
3.5.6
</version>
</dependency>
<dependency>
<groupId>
org.junit.jupiter
</groupId>
<artifactId>
junit-jupiter-api
</artifactId>
...
...
@@ -90,6 +96,16 @@
</dependencies>
<build>
<resources>
<resource>
<directory>
src/main/resources
</directory>
<includes>
<include>
**/*.properties
</include>
<include>
**/*.xml
</include>
</includes>
<filtering>
true
</filtering>
</resource>
</resources>
<plugins>
<plugin>
<artifactId>
maven-surefire-plugin
</artifactId>
...
...
src/main/java/com/github/weiranyi/Crawler.java
浏览文件 @
1523aeb9
...
...
@@ -17,7 +17,7 @@ import java.util.stream.Collectors;
public
class
Crawler
{
CrawlerDao
dao
=
new
Jdbc
CrawlerDao
();
CrawlerDao
dao
=
new
MyBatis
CrawlerDao
();
public
void
run
()
throws
SQLException
,
IOException
{
String
link
=
null
;
...
...
@@ -33,7 +33,8 @@ public class Crawler {
// 分析页面url将它们放到即将处理的url池子中去
parseUrlsFromAndStoreIntoDatabase
(
doc
);
storeIntoDatabaseIfItIsNewsPage
(
doc
,
link
);
dao
.
updataDatabase
(
link
,
"insert into LINKS_ALREADY_PROCESSED(link) values (?)"
);
dao
.
insertProcessedLinked
(
link
);
// dao.updataDatabase(link, "insert into LINKS_ALREADY_PROCESSED(link) values (?)");
}
else
{
// 不感兴趣
continue
;
...
...
@@ -54,7 +55,8 @@ public class Crawler {
if
(
href
.
toLowerCase
().
startsWith
(
"javascript"
))
{
continue
;
}
dao
.
updataDatabase
(
href
,
"insert into LINKS_TO_BE_PROCESSED(link) values (?)"
);
dao
.
insertLinkToBeProcessed
(
href
);
// dao.updataDatabase(href, "insert into LINKS_TO_BE_PROCESSED(link) values (?)");
}
}
...
...
src/main/java/com/github/weiranyi/CrawlerDao.java
浏览文件 @
1523aeb9
...
...
@@ -3,14 +3,13 @@ package com.github.weiranyi;
import
java.sql.SQLException
;
public
interface
CrawlerDao
{
String
getNextLink
(
String
sql
)
throws
SQLException
;
String
getNextLinkThenDelete
()
throws
SQLException
;
void
updataDatabase
(
String
link
,
String
sql
)
throws
SQLException
;
void
insertNewsIntoDataBase
(
String
url
,
String
title
,
String
content
)
throws
SQLException
;
boolean
isLinkProcessed
(
String
link
)
throws
SQLException
;
void
insertProcessedLinked
(
String
link
);
void
insertLinkToBeProcessed
(
String
href
);
}
src/main/java/com/github/weiranyi/JdbcCrawlerDao.java
浏览文件 @
1523aeb9
...
...
@@ -28,7 +28,7 @@ public class JdbcCrawlerDao implements CrawlerDao {
return
link
;
}
p
ublic
String
getNextLink
(
String
sql
)
throws
SQLException
{
p
rivate
String
getNextLink
(
String
sql
)
throws
SQLException
{
ResultSet
resultSet
=
null
;
try
(
PreparedStatement
statement
=
connection
.
prepareStatement
(
sql
))
{
resultSet
=
statement
.
executeQuery
();
...
...
@@ -75,4 +75,10 @@ public class JdbcCrawlerDao implements CrawlerDao {
}
return
false
;
}
// jdbc
@Override
public
void
insertProcessedLinked
(
String
link
)
{}
@Override
public
void
insertLinkToBeProcessed
(
String
href
)
{}
}
src/main/java/com/github/weiranyi/MyBatisCrawlerDao.java
0 → 100644
浏览文件 @
1523aeb9
package
com.github.weiranyi
;
import
com.github.weiranyi.entity.News
;
import
org.apache.ibatis.io.Resources
;
import
org.apache.ibatis.session.SqlSession
;
import
org.apache.ibatis.session.SqlSessionFactory
;
import
org.apache.ibatis.session.SqlSessionFactoryBuilder
;
import
java.io.IOException
;
import
java.io.InputStream
;
import
java.sql.SQLException
;
import
java.util.HashMap
;
import
java.util.Map
;
/**
 * {@link CrawlerDao} implementation that talks to the database through MyBatis.
 *
 * <p>The MyBatis configuration is loaded once, in the constructor, from the classpath
 * resource {@code db/mybatis/config.xml}. Every operation opens its own short-lived
 * {@link SqlSession} in auto-commit mode, so each statement takes effect immediately
 * without an explicit {@code commit()}.
 */
public class MyBatisCrawlerDao implements CrawlerDao {
    /** Classpath location of the MyBatis configuration file. */
    private static final String CONFIG_RESOURCE = "db/mybatis/config.xml";

    /** Fully qualified ids of the mapped statements used by this DAO. */
    private static final String SELECT_NEXT_AVAILABLE_LINK =
            "com.github.weiranyi.MyMapper.selectNextAvailableLink";
    private static final String DELETE_LINK = "com.github.weiranyi.MyMapper.deleteLink";
    private static final String INSERT_NEWS = "com.github.weiranyi.MyMapper.insertNews";
    private static final String COUNT_LINK = "com.github.weiranyi.MyMapper.countLink";
    private static final String INSERT_LINK = "com.github.weiranyi.MyMapper.insertLink";

    /** Table names handed to the shared {@code insertLink} statement. */
    private static final String TABLE_LINKS_ALREADY_PROCESSED = "links_already_processed";
    private static final String TABLE_LINKS_TO_BE_PROCESSED = "links_to_be_processed";

    private final SqlSessionFactory sqlSessionFactory;

    /**
     * Builds the session factory from {@code db/mybatis/config.xml}.
     *
     * @throws RuntimeException if the configuration resource cannot be read
     */
    public MyBatisCrawlerDao() {
        try (InputStream inputStream = Resources.getResourceAsStream(CONFIG_RESOURCE)) {
            sqlSessionFactory = new SqlSessionFactoryBuilder().build(inputStream);
        } catch (IOException e) {
            throw new RuntimeException("Cannot load MyBatis configuration: " + CONFIG_RESOURCE, e);
        }
    }

    /**
     * Fetches the next link via {@code selectNextAvailableLink} and, if one was found,
     * removes it via {@code deleteLink}.
     *
     * @return the selected link, or {@code null} when the select statement returned nothing
     * @throws SQLException declared for compatibility with {@link CrawlerDao}
     */
    @Override
    public String getNextLinkThenDelete() throws SQLException {
        // openSession(true) enables auto-commit; otherwise the delete would need an explicit commit.
        try (SqlSession session = sqlSessionFactory.openSession(true)) {
            String url = session.selectOne(SELECT_NEXT_AVAILABLE_LINK);
            if (url != null) {
                session.delete(DELETE_LINK, url);
            }
            return url;
        }
    }

    /**
     * Stores one news item by passing a {@link News} built from the arguments to the
     * {@code insertNews} statement.
     *
     * @param url     address of the news page
     * @param title   news title
     * @param content news body
     * @throws SQLException declared for compatibility with {@link CrawlerDao}
     */
    @Override
    public void insertNewsIntoDataBase(String url, String title, String content) throws SQLException {
        try (SqlSession session = sqlSessionFactory.openSession(true)) {
            session.insert(INSERT_NEWS, new News(url, title, content));
        }
    }

    /**
     * Reports whether the {@code countLink} statement finds at least one row for the link.
     *
     * @param link the link to look up
     * @return {@code true} when the count is non-zero; {@code false} when it is zero or
     *         the statement yields no result
     * @throws SQLException declared for compatibility with {@link CrawlerDao}
     */
    @Override
    public boolean isLinkProcessed(String link) throws SQLException {
        try (SqlSession session = sqlSessionFactory.openSession(true)) {
            // Keep the boxed type so a missing result is treated as "not processed"
            // instead of failing with a NullPointerException on unboxing.
            Integer count = session.selectOne(COUNT_LINK, link);
            return count != null && count != 0;
        }
    }

    /**
     * Records a link in the {@code links_already_processed} table.
     *
     * @param link the link that has been handled
     */
    @Override
    public void insertProcessedLinked(String link) {
        insertLink(TABLE_LINKS_ALREADY_PROCESSED, link);
    }

    /**
     * Records a link in the {@code links_to_be_processed} table.
     *
     * @param href the link to queue
     */
    @Override
    public void insertLinkToBeProcessed(String href) {
        insertLink(TABLE_LINKS_TO_BE_PROCESSED, href);
    }

    /**
     * Runs the shared {@code insertLink} statement with a parameter map holding the
     * target table under {@code "tableName"} and the link under {@code "link"}.
     */
    private void insertLink(String tableName, String link) {
        // NOTE(review): the mapper presumably splices tableName into the SQL text, so it
        // must only ever receive the constants declared above - confirm against the mapper XML.
        Map<String, Object> param = new HashMap<>();
        param.put("tableName", tableName);
        param.put("link", link);
        // Auto-commit session: the insert is effective as soon as the statement returns.
        try (SqlSession session = sqlSessionFactory.openSession(true)) {
            session.insert(INSERT_LINK, param);
        }
    }
}
src/main/java/com/github/weiranyi/News.java
→
src/main/java/com/github/weiranyi/
entity/
News.java
浏览文件 @
1523aeb9
package
com.github.weiranyi
;
package
com.github.weiranyi
.entity
;
/**
* @author: https://github.com/weiranyi
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录