提交 434356ad 编写于 作者: 智布道's avatar 智布道 👁

“文章搬运工”支持博客园的文章迁移

上级 78d2180a
......@@ -81,7 +81,7 @@ public class RestApiController {
Map<String, Object> resultMap = new HashMap<>(3);
resultMap.put("success", 1);
resultMap.put("message", "上传成功");
resultMap.put("filename", config.getQiuniuBasePath() + filePath + "-pw");
resultMap.put("filename", config.getQiuniuBasePath() + filePath);
return resultMap;
}
......
......@@ -16,7 +16,7 @@ server:
# SPRING PROFILES
spring:
profiles:
active: @profileActive@
active: '@profileActive@'
application:
name: blog-admin
freemarker:
......@@ -81,7 +81,7 @@ spring:
# 连接池中的最小空闲连接
min-idle: 0
# 连接超时时间(毫秒)
timeout: 0ms
timeout: 5000ms
# 默认的数据过期时间,主要用于shiro权限管理
expire: 2592000
banner:
......
......@@ -20,17 +20,23 @@
<div class="item form-group">
<label class="control-label col-md-3 col-sm-3 col-xs-3" for="platform">选择博文平台 <span class="required">*</span></label>
<div class="col-md-9 col-sm-9 col-xs-9">
<select name="platform" id="platform" class="form-control" required="required">
<option value="">请选择</option>
<option value="imooc">慕课网</option>
<option value="csdn">CSDN</option>
<option value="iteye">ITeye</option>
<option value="">待续...</option>
</select>
<select name="platform" id="platform" class="form-control" required="required"></select>
</div>
</div>
<div class="item form-group">
<label class="control-label col-md-3 col-sm-3 col-xs-3" for="platform">文章分类 <span class="required">*</span></label>
<label class="control-label col-md-3 col-sm-3 col-xs-3" for="checkbox">自动转存图片 </label>
<div class="col-md-9 col-sm-9 col-xs-9" style="line-height: 40px;">
<input type="checkbox" class="square" name="convertImg">
</div>
</div>
<div class="item form-group">
<label class="control-label col-md-3 col-sm-3 col-xs-3"></label>
<div class="col-md-9 col-sm-9 col-xs-9">
<i class="fa fa-exclamation-circle"></i> 勾选时默认将文章中的图片转存到七牛云中(需提前配置七牛云)
</div>
</div>
<div class="item form-group">
<label class="control-label col-md-3 col-sm-3 col-xs-3" for="typeId">文章分类 <span class="required">*</span></label>
<div class="col-md-9 col-sm-9 col-xs-9">
<select name="typeId" id="typeId" class="form-control" required="required"></select>
</div>
......@@ -44,16 +50,17 @@
<div class="item form-group">
<label class="control-label col-md-3 col-sm-3 col-xs-3"></label>
<div class="col-md-9 col-sm-9 col-xs-9">
<i class="fa fa-exclamation-circle"></i> 各平台“用户ID”获取方式:
<i class="fa fa-exclamation-circle"></i> 获取方式:
<a href="javascript:;" data-toggle="modal" data-target="#helpModal" data-img="/assets/images/spider/uid/imooc.png" data-title="慕课网“用户ID”获取方式">慕课网</a> |
<a href="javascript:;" data-toggle="modal" data-target="#helpModal" data-img="/assets/images/spider/uid/csdn.png" data-title="CSDN“用户ID”获取方式">CSDN</a> |
<a href="javascript:;" data-toggle="modal" data-target="#helpModal" data-img="/assets/images/spider/uid/iteye.png" data-title="ITeye“用户ID”获取方式">ITeye</a>
<a href="javascript:;" data-toggle="modal" data-target="#helpModal" data-img="/assets/images/spider/uid/iteye.png" data-title="ITeye“用户ID”获取方式">ITeye</a> |
<a href="javascript:;" data-toggle="modal" data-target="#helpModal" data-img="/assets/images/spider/uid/cnblogs.png" data-title="博客园“用户ID”获取方式">博客园</a>
</div>
</div>
<div class="item form-group">
<label class="control-label col-md-3 col-sm-3 col-xs-3" for="totalPage">文章总页数 <span class="required">*</span></label>
<div class="col-md-9 col-sm-9 col-xs-9">
<input type="number" name="totalPage" id="totalPage" class="form-control" required="required">
<input type="number" name="totalPage" id="totalPage" class="form-control" required="required" min="1" value="1">
</div>
</div>
<div class="item form-group">
......@@ -71,7 +78,7 @@
<div class="item form-group">
<label class="control-label col-md-3 col-sm-3 col-xs-3"></label>
<div class="col-md-9 col-sm-9 col-xs-9">
<i class="fa fa-exclamation-circle"></i> 只在需要登陆时才需要设置。Cookie获取方式: <a href="javascript:HandlerInterceptor;" data-toggle="modal" data-target="#helpModal" data-img="/assets/images/spider/cookie/cookie.png" data-title="“Cookie”获取方式(通用)">以CSDN为例</a>
<i class="fa fa-exclamation-circle"></i> 需要登陆时设置。Cookie获取方式: <a href="javascript:;" data-toggle="modal" data-target="#helpModal" data-img="/assets/images/spider/cookie/cookie.png" data-title="“Cookie”获取方式(通用)">以CSDN为例</a>
</div>
</div>
<div class="item form-group">
......@@ -167,7 +174,7 @@
<div class="col-md-12">
<button type="button" class="btn btn-success" data-toggle="modal" data-target="#declareModal"><i class="fa fa-truck"> GO!</i></button>
<button type="reset" class="btn btn-default" id="resetBtn"><i class="fa fa-refresh"> 清除</i></button>
<button type="button" class="btn btn-info" id="showResultModal"><i class="fa fa-eye"> 显示日志</i></button>
<button type="button" class="btn btn-info" id="showResultModal" style="display: none;"><i class="fa fa-eye"> 显示日志</i></button>
</div>
</form>
</div>
......@@ -222,7 +229,7 @@
</div>
<div class="modal-body">
<div class="pageFormContent" id="pageFormContent" style="max-height: 300px;height: 300px;overflow-y: auto;">
<code id="message" style="display: block;"></code>
<div id="message" style="display: block;" class="profile_title"></div>
</div>
</div>
<div class="modal-footer">
......@@ -234,6 +241,17 @@
<iframe src="" id="spiderFrame" name="spiderFrame" style="display: none"></iframe>
<@footer>
<script>
(function () {
    // Supported source platforms, each entry being a single { value: label } pair.
    var platforms = [{"imooc": "慕课网"}, {"csdn": "CSDN"}, {"iteye": "ITeye"}, {"csblogs": "博客园"}];
    // Collect the <option> markup piece by piece and join it once at the end.
    var options = ['<option value="">请选择</option>'];
    for (var idx = 0; idx < platforms.length; idx++) {
        var entry = platforms[idx];
        for (var code in entry) {
            options.push('<option value="' + code + '">' + entry[code] + '</option>');
        }
    }
    // Trailing placeholder option ("more to come"); it carries an empty value like the first one.
    options.push('<option value="">待续...</option>');
    $("#platform").html(options.join(''));
} ());
var spiderConfig = {
imooc: {
domain: "www.imooc.com",
......@@ -276,6 +294,20 @@
"Referer=http://{uid}.iteye.com/"
],
entryUrls: 'http://{uid}.iteye.com/?page={curPage}'
},
csblogs: {
domain: "www.cnblogs.com",
titleRegex: "//a[@id=cb_post_title_url]/html()",
authorRegex: "//div[@class=postDesc]/a[1]/html()",
releaseDateRegex: "//span[@id=post-date]/html()",
contentRegex: "//div[@id=cnblogs_post_body]/html()",
targetLinksRegex: ".*www\\.cnblogs\\.com/{uid}/p/[\\w\\d]+\\.html",
tagRegex: "//div[@id=EntryTag]/a/html()",
header: [
"Host=www.cnblogs.com",
"Referer=https://www.cnblogs.com/"
],
entryUrls: 'https://www.cnblogs.com/{uid}/default.html?page={curPage}'
}
};
// 博文平台
......@@ -284,6 +316,7 @@
var $uid = $("#uid");
// 文章总页数
var $totalPage = $("#totalPage");
// 分割字符串的正则
var reg = new RegExp('{\\w+}'), br = "\r\n";
$("#platform, #uid, #totalPage").change(function () {
......@@ -328,9 +361,11 @@
$("#submitBtn").click(function () {
var $form = $("form#removerForm");
if (validator.checkAll($form)) {
$("#declareModal").modal('hide');
$(this).button('loading');
$("#resetBtn").button('loading');
$("#resultModal").modal('show');
$("#showResultModal").show();
$form.submit();
$("#message").html("<p> 程序正在初始化...</p>");
}
......
......@@ -43,9 +43,6 @@ public class BlogAdminApplicationTests {
.setContentRegex("//div[@class=detail-content]/html()")
.setTargetLinksRegex("/article/[0-9]{1,10}")
.setTagRegex("//div[@class=cat-box]/div[@class=cat-wrap]/a[@class=cat]/html()")
.setCookie("IMCDNS=0; imooc_uuid=e7a46d50-1d50-4b67-8a4f-20d56001de3c; imooc_isnew=1; imooc_isnew_ct=1533286936; loginstate=1; apsid=IxZDQ4ZDI2YzQ3YTdmODFjZmI5N2U1YjY1YjhhNDUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMTE3NTI0OAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAByeHh6eWQxMTIzQDE2My5jb20AAAAAAAAAAAAAAAAAAGEyZjc3YmZiYjhhNzQ4NmI4MGMyZDkyZDk2ZGQ4ZmEzJhpkWyYaZFs%3DYm; last_login_username=rxxzyd1123%40163.com; PHPSESSID=a7316sajatt1ud76v2ld5g2db2; cvde=5b641a18ad45b-47")
.setHeader("Host", "www.imooc.com")
.setHeader("Referer", "https://www.imooc.com"));
spider.run();
......@@ -56,16 +53,12 @@ public class BlogAdminApplicationTests {
ZydSpider<Article> spider = new ArticleSpiderProcessor(new CsdnModel().setUid("u011197448")
.setTotalPage(1)
.setDomain("blog.csdn.net")
.setTitleRegex("//h1[@class=title-article]/html()")
.setAuthorRegex("//div[@class=profile-intro]/div[@class=user-info]/p[@class=name]/a[@class=text-truncate]/html()")
.setReleaseDateRegex("//div[@class='article-bar-top']/span[@class='time']/text()")
.setContentRegex("//div[@class=article_content]/html()")
.setTargetLinksRegex(".*blog\\.csdn\\.net/u011197448/article/details/[0-9a-zA-Z]{1,15}")
.setTagRegex("//span[@class=artic-tag-box]/a[@class=tag-link]/html()")
.setCookie("uuid_tt_dd=10_18752534250-1532653661936-548523; __yadk_uid=Azckmtol9B3Q1677fAFIbpA9VhKbK5Ge; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=1788*1*PC_VC; smidV2=2018073110202411619e299fa87e58a6d2fa513eb0560100370be57076aab70; UN=u011197448; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1533203845,1533264901,1533273617,1533281367; dc_session_id=10_1533284174899.975225; UserName=u011197448; UserInfo=Ht4eplKngPM%2BlqynD7AUN60KU8guuVQBsxDzDuvws4EYrbUMrVqt11By0pOsylwQDSz%2FSHACu5aKgWpIqB23EEdftuzXlq3O4vBqE4BXrKs2LsQwkx5jpcpry0Ju%2BUNG; UserNick=%E4%B8%83%E5%BD%A9%E7%8B%BC; AU=9FA; BT=1533284260805; UserToken=Ht4eplKngPM%2BlqynD7AUN60KU8guuVQBsxDzDuvws4EYrbUMrVqt11By0pOsylwQDSz%2FSHACu5aKgWpIqB23EEdftuzXlq3O4vBqE4BXrKurrkHcSkadZ3mIAfmpjySmT5zXMhwfoVAQ3iLltdps42y6%2BBczwI00vyyjJhe6p8p543v2LHcx4Be6xzB43XhR; TY_SESSION_ID=f2813db8-9622-4d89-9242-45e76f4fcbd7; dc_tos=pcvn4x")
.setHeader("Host", "blog.csdn.net")
.setHeader("Referer", "https://blog.csdn.net/u011197448/article/list/1"));
spider.run();
......@@ -76,20 +69,34 @@ public class BlogAdminApplicationTests {
ZydSpider<Article> spider = new ArticleSpiderProcessor(new IteyeModel().setUid("843977358")
.setTotalPage(1)
.setDomain("843977358.iteye.com")
.setTitleRegex("//div[@class=blog_title]/h3/a/html()")
.setAuthorRegex("//div[@id=blog_owner_name]/html()")
.setReleaseDateRegex("//div[@class=blog_bottom]/ul/li/html()")
.setContentRegex("//div[@class=blog_content]/html()")
.setTargetLinksRegex(".*843977358\\.iteye\\.com/blog/[0-9]+")
.setTagRegex("//div[@class=news_tag]/a/html()")
.setCookie("_javaeye_cookie_id_=1533347307845341; dc_session_id=1533711983437_0.44695905064454156; dc_tos=pd4rx2; _javaeye_cookie_id_=1533347307845341; _javaeye3_session_=BAh7CDoMdXNlcl9pZGkDxvkXOhBfY3NyZl90b2tlbiIxV09DNDZKS3hPbjVwK2RJRkxmRURkM09CU2hRYld2UFY4MTc1bFNCZmlwbz06D3Nlc3Npb25faWQiJTZlYWQ0MTJjYTEzNjNiMWE0YjUxNDQ3ZmEzY2ZmMGRi--cd2bd92e73d22ce4c75e48e2d211f72c59b469d1")
.setHeader("Host", "843977358.iteye.com")
.setHeader("Referer", "http://843977358.iteye.com/"));
spider.run();
}
@Test
public void cnblogSpiderTest() {
ZydSpider<Article> spider = new ArticleSpiderProcessor(new CnblogModel().setUid("zhangyadong")
.setTotalPage(1)
.setDomain("www.cnblogs.com")
.setTitleRegex("//a[@id=cb_post_title_url]/html()")
.setAuthorRegex("//div[@class=postDesc]/a[1]/html()")
.setReleaseDateRegex("//span[@id=post-date]/html()")
.setContentRegex("//div[@id=cnblogs_post_body]/html()")
.setTagRegex("//div[@id=EntryTag]/a/html()")
.setTargetLinksRegex(".*www\\.cnblogs\\.com/zhangyadong/p/[\\w\\d]+\\.html")
.setHeader("Host", "www.cnblogs.com")
.setHeader("Referer", "https://www.cnblogs.com/"));
spider.run();
}
@Test
public void spring4AllSpiderTest() {
/*ZydSpider<Article> spider = new ArticleSpiderProcessor(new BaseModel()
......
......@@ -3,10 +3,9 @@ package com.zyd.blog.business.service.impl;
import com.zyd.blog.business.entity.Tags;
import com.zyd.blog.business.entity.User;
import com.zyd.blog.business.enums.ArticleStatusEnum;
import com.zyd.blog.business.service.BizArticleService;
import com.zyd.blog.business.service.BizArticleTagsService;
import com.zyd.blog.business.service.BizTagsService;
import com.zyd.blog.business.service.RemoverService;
import com.zyd.blog.business.service.*;
import com.zyd.blog.business.util.ImageDownloadUtil;
import com.zyd.blog.persistence.beans.SysConfig;
import com.zyd.blog.spider.model.Article;
import com.zyd.blog.spider.model.BaseModel;
import com.zyd.blog.spider.processor.ArticleSpiderProcessor;
......@@ -18,11 +17,12 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.CollectionUtils;
import org.springframework.util.StringUtils;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
......@@ -36,12 +36,16 @@ import java.util.stream.Collectors;
@Service
public class RemoverServiceImpl implements RemoverService {
private static final Pattern PATTERN = Pattern.compile("<img[^>]+src\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");
@Autowired
private BizArticleService articleService;
@Autowired
private BizTagsService tagsService;
@Autowired
private BizArticleTagsService articleTagsService;
@Autowired
private SysConfigService sysConfigService;
@Transactional(rollbackFor = Exception.class)
@Override
......@@ -73,9 +77,10 @@ public class RemoverServiceImpl implements RemoverService {
List<Long> tagIds = null;
Tags newTag = null;
User user = SessionUtil.getUser();
String qiniuBasePath = sysConfigService.get().getQiuniuBasePath();
for (Article spiderArticle : list) {
article = new com.zyd.blog.business.entity.Article();
article.setContent(spiderArticle.getContent());
article.setContent(model.isConvertImg() ? parseImgForHtml(spiderArticle.getContent(), qiniuBasePath, writer) : spiderArticle.getContent());
article.setTitle(spiderArticle.getTitle());
article.setTypeId(typeId);
article.setUserId(user.getId());
......@@ -114,4 +119,25 @@ public class RemoverServiceImpl implements RemoverService {
WriterUtil.writer2Html(writer, "全部跑完了~!!!...", String.format("共耗时 %s ms.", (System.currentTimeMillis() - start)));
WriterUtil.shutdown(writer);
}
/**
 * Re-hosts the images referenced by {@code <img src="...">} tags in the given HTML
 * through {@link ImageDownloadUtil#convertToQiniu(String)} and rewrites the HTML so
 * that it points at the re-hosted copies.
 * <p>
 * Each distinct image url is transferred once. An image whose transfer fails
 * (the helper returns nothing) keeps its original url instead of being rewritten
 * to a broken link.
 *
 * @param html          article content to scan; may be {@code null} or empty
 * @param qiniuBasePath prefix prepended to the path returned by the upload helper
 * @param writer        progress output shown to the user while the job runs
 * @return the rewritten HTML, or {@code html} itself when there is nothing to rewrite
 */
private String parseImgForHtml(String html, String qiniuBasePath, PrintWriter writer) {
    if (StringUtils.isEmpty(html)) {
        // Nothing to scan: hand the content back as-is rather than turning "" into null.
        return html;
    }
    Matcher m = PATTERN.matcher(html);
    Set<String> imgUrlSet = new HashSet<>();
    while (m.find()) {
        imgUrlSet.add(m.group(1));
    }
    if (CollectionUtils.isEmpty(imgUrlSet)) {
        return html;
    }
    WriterUtil.writer2Html(writer, " > 开始转存图片到七牛云...");
    for (String imgUrl : imgUrlSet) {
        String qiniuImgPath = ImageDownloadUtil.convertToQiniu(imgUrl);
        if (StringUtils.isEmpty(qiniuImgPath)) {
            // Transfer failed: leave the original url in place instead of writing "<base>null".
            WriterUtil.writer2Html(writer, String.format(" >> <a href=\"%s\" target=\"_blank\">原图片</a> convert failed, keep the original url...", imgUrl));
            continue;
        }
        String qiniuImgUrl = qiniuBasePath + qiniuImgPath;
        // Literal replacement: urls contain regex metacharacters ('.', '?', '+', '$'),
        // so String#replaceAll would mis-match or throw.
        html = html.replace(imgUrl, qiniuImgUrl);
        WriterUtil.writer2Html(writer, String.format(" >> <a href=\"%s\" target=\"_blank\">原图片</a> convert to <a href=\"%s\" target=\"_blank\">七牛云</a>...", imgUrl, qiniuImgUrl));
    }
    return html;
}
}
package com.zyd.blog.business.util;
import com.zyd.blog.business.enums.QiniuUploadType;
import com.zyd.blog.plugin.QiniuApi;
import com.zyd.blog.util.FileUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import java.io.*;
import java.util.UUID;
@Slf4j
public class ImageDownloadUtil {
/**
* 将网络图片转存到七牛云
*
* @param imgUrl 网络图片地址
*/
public static String convertToQiniu(String imgUrl) {
log.debug("download img >> %s", imgUrl);
String qiniuImgPath = null;
try (InputStream is = getInputStreamByUrl(checkUrl(imgUrl));
ByteArrayOutputStream outStream = new ByteArrayOutputStream();) {
byte[] buffer = new byte[1024];
int len = 0;
while ((len = is.read(buffer)) != -1) {
outStream.write(buffer, 0, len);
}
qiniuImgPath = QiniuApi.getInstance()
.withFileName("temp." + getSuffixByUrl(imgUrl), QiniuUploadType.SIMPLE)
.upload(outStream.toByteArray());
} catch (IOException e) {
log.error("Error.", e);
}
return qiniuImgPath;
}
private static String getSuffixByUrl(String imgUrl) {
String defaultSuffix = "png";
if (StringUtils.isEmpty(imgUrl)) {
return defaultSuffix;
}
String temStr = imgUrl.substring(imgUrl.lastIndexOf("/"));
int index = temStr.lastIndexOf(".");
return -1 == index ? defaultSuffix : temStr.substring(index + 1);
}
/**
* @param imgUrl 网络图片地址
* @param localPath 待保存的本地地址
*/
public static String download(String imgUrl, String localPath) {
log.debug("download img >> %s", imgUrl);
String fileName = localPath + File.separator + UUID.randomUUID().toString() + "." + getSuffixByUrl(imgUrl);
try (InputStream is = getInputStreamByUrl(checkUrl(imgUrl));
FileOutputStream fos = new FileOutputStream(fileName)) {
if (null == is) {
return null;
}
File file = new File(localPath);
if (!file.exists()) {
file.mkdirs();
}
int bytesWritten = 0, byteCount = 0;
byte[] b = new byte[1024];
while ((byteCount = is.read(b)) != -1) {
fos.write(b, bytesWritten, byteCount);
}
} catch (IOException e) {
log.error("Error.", e);
return null;
}
return fileName;
}
private static InputStream getInputStreamByUrl(String url) {
HttpGet httpGet = new HttpGet(url);
httpGet.setHeader("user-agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36");
CloseableHttpClient httpclient = HttpClients.createDefault();
CloseableHttpResponse response = null;
InputStream in = null;
try {
response = httpclient.execute(httpGet);
in = response.getEntity().getContent();
if (response.getStatusLine().getStatusCode() == 200) {
return in;
} else {
log.error("Error. %s", parseInputStream(in));
return null;
}
} catch (IOException e) {
e.printStackTrace();
log.error("Error.", e);
}
return in;
}
private static String parseInputStream(InputStream in) throws IOException {
String result = "";
StringBuffer content = null;
if (null != in) {
BufferedReader r = new BufferedReader(new InputStreamReader(in));
content = new StringBuffer();
String line = "";
while ((line = r.readLine()) != null) {
content.append(line);
}
result = content.toString();
}
return result;
}
/**
* 校验Url,并返回完整的url
*
* @param url 待校验的url
*/
private static String checkUrl(String url) {
if (!StringUtils.isEmpty(url)) {
if (url.startsWith("http://") || url.startsWith("https://")) {
return url;
}
return url.startsWith("//") ? "https:" + url : "http://" + url;
}
return null;
}
}
package com.zyd.blog.util;
import com.zyd.blog.business.util.ImageDownloadUtil;
import org.junit.Test;
import org.springframework.util.CollectionUtils;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author yadong.zhang (yadong.zhang0415(a)gmail.com)
* @version 1.0
* @website https://www.zhyd.me
* @date 2018/9/6 11:31
* @since 1.8
*/
public class ImageDownloadUtilTest {

    /** Captures the {@code src} value of every {@code <img>} tag; compiled once and reused. */
    private static final Pattern IMG_SRC_PATTERN = Pattern.compile("<img[^>]+src\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");

    /** Downloads images referenced with protocol-relative urls (imooc sample content). */
    @Test
    public void imoocTest() {
        String html = "<p>目前,该功能已内置了三个平台(imooc、csdn和iteye),根据不同的平台,程序已默认了一套抓取规则,如下图系列<br><img title=\"DBlog开源博客新增博客迁移功能(支持多个站点)_\"图片2=\"\"src=\"//img.mukewang.com/5b7fd07c000125ed18090932.png\"alt=\"图片描述\"style=\"cursor: pointer;\"><br><img title=\"DBlog开源博客新增博客迁移功能(支持多个站点)_\"图片3=\"\"src=\"//img.mukewang.com/5b7fd0870001dce917490934.png\"alt=\"图片描述\"style=\"cursor: pointer;\"><br><img title=\"DBlog开源博客新增博客迁移功能(支持多个站点)_\"图片4=\"\"src=\"//img.mukewang.com/5b7fd08d000190f617610917.png\"alt=\"图片描述\"style=\"cursor: pointer;\"><br><img title=\"DBlog开源博客新增博客迁移功能(支持多个站点)_\"图片5=\"\"src=\"//img.mukewang.com/5b7fd0940001b6c317440936.png\"alt=\"图片描述\"style=\"cursor: pointer;\"></p>";
        parseImgForHtml(html);
    }

    /** Downloads an image referenced with an absolute http url (iteye sample content). */
    @Test
    public void iteyeTest() {
        String html = "<p>&nbsp;&nbsp;&nbsp;前段时间在项目中用到了上传头像,并且获取剪切后的头像功能,单一的上传头像很好处理,直接把改文件上传就可以,但是剪切后的头像,它的src却是一个base64字符串,如图:<br><img alt=\"\"src=\"http://dl2.iteye.com/upload/attachment/0109/3648/857431ce-8d0a-35b7-bdee-e3facc7bd0b6.png\"title=\"点击查看原始大小图片\"class=\"magplus\"width=\"699\"height=\"650\"><br>&nbsp;,直接将这个地址当做文件路径上传到后台肯定不行,因为java无法编译改地址,不能识别为一个图片路径。那么,这就用到了对base64位字符串进行解码处理,将其解析为一个可被正确识别的文件。</p>";
        parseImgForHtml(html);
    }

    /**
     * Extracts every image url from the HTML, downloads each image into the system
     * temp directory and prints the HTML before and after the urls are replaced by
     * the local file paths.
     *
     * @param html HTML fragment containing {@code <img>} tags
     */
    private void parseImgForHtml(String html) {
        Matcher m = IMG_SRC_PATTERN.matcher(html);
        Set<String> imgUrlSet = new HashSet<>();
        while (m.find()) {
            imgUrlSet.add(m.group(1));
        }
        System.out.println(html);
        if (!CollectionUtils.isEmpty(imgUrlSet)) {
            // Platform-neutral target instead of a hard-coded Windows drive path.
            String filePath = System.getProperty("java.io.tmpdir");
            for (String imgUrl : imgUrlSet) {
                String localPath = ImageDownloadUtil.download(imgUrl, filePath);
                if (null == localPath) {
                    // Download failed: keep the original url.
                    continue;
                }
                // Literal replacement: the url holds regex metacharacters and the local
                // path may hold backslashes, both of which break String#replaceAll.
                html = html.replace(imgUrl, localPath);
            }
        }
        System.out.println(html);
    }
}
......@@ -2,7 +2,10 @@ package com.zyd.blog.spider.model;
import lombok.Data;
import lombok.EqualsAndHashCode;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import java.util.Collections;
import java.util.Date;
import java.util.List;
......@@ -30,9 +33,15 @@ public class Article {
this.content = content;
this.author = author;
this.source = source;
this.tags = tags;
this.description = description;
this.keywords = keywords;
if (CollectionUtils.isEmpty(this.tags = tags)) {
this.tags = Collections.singletonList("OneBlog");
}
if (StringUtils.isEmpty(this.description = description) || "null".equalsIgnoreCase(this.description)) {
this.description = title;
}
if (StringUtils.isEmpty(this.keywords = keywords) || "null".equalsIgnoreCase(this.keywords)) {
this.keywords = title;
}
}
public Article() {
......
......@@ -82,6 +82,8 @@ public class BaseModel {
/* 保留字段,针对ajax渲染的页面 */
private Boolean ajaxRequest = false;
/* 是否转存图片 */
private boolean convertImg = false;
public String getUid() {
return uid;
......
package com.zyd.blog.spider.model;
import lombok.Data;
import lombok.EqualsAndHashCode;
import javax.validation.constraints.NotEmpty;
import java.util.LinkedList;
import java.util.List;
/**
 * Used for running tests locally.
 * <p>
 * Spider model for cnblogs.com: builds the paged article-list urls of one user.
 *
 * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
 * @version 1.0
 * @date 2018/7/23 15:58
 * @since 1.0
 */
@Data
@EqualsAndHashCode(callSuper = false)
public class CnblogModel extends BaseModel {

    /**
     * Builds one list-page url per page, for pages 1 through {@code getTotalPage()},
     * using the configured user id.
     *
     * @return the entry urls in page order; empty when the total page count is below 1
     */
    @NotEmpty(message = "必须指定待抓取的网址")
    @Override
    public String[] getEntryUrls() {
        final String template = "https://www.cnblogs.com/%s/default.html?page=%s";
        List<String> entryUrls = new LinkedList<>();
        int page = 1;
        while (page <= getTotalPage()) {
            entryUrls.add(String.format(template, getUid(), page));
            page++;
        }
        return entryUrls.toArray(new String[0]);
    }
}
......@@ -19,17 +19,21 @@
*/
package com.zyd.blog;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.ApplicationArguments;
import org.springframework.boot.ApplicationRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.web.servlet.ServletComponentScan;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.EnableTransactionManagement;
/**
* 程序启动类
*
* @author yadong.zhang (yadong.zhang0415(a)gmail.com)
* @website https://www.zhyd.me
* @version 1.0
* @website https://www.zhyd.me
* @date 2018/4/18 11:48
* @since 1.0
*/
......@@ -38,8 +42,39 @@ import org.springframework.transaction.annotation.EnableTransactionManagement;
@EnableTransactionManagement
public class BlogWebApplication {
public static void main(String[] args) {
SpringApplication.run(BlogWebApplication.class, args);
}
@Value("${spring.redis.password}")
private String password;
@Value("${spring.datasource.username}")
private String username;
@Value("${spring.datasource.password}")
private String sqlPassword;
@Value("${spring.mail.host}")
private String host;
@Value("${spring.mail.username}")
private String mailUsername;
@Value("${spring.mail.password}")
private String mailPassword;
@Value("${app.enableKaptcha}")
private boolean enableKaptcha;
public static void main(String[] args) {
SpringApplication.run(BlogWebApplication.class, args);
}
/**
 * Startup hook that prints a few resolved configuration values so the active
 * profile can be checked from the console.
 * <p>
 * Passwords are never printed in clear text: only whether a value is set is shown.
 */
@Component
class Runner implements ApplicationRunner {

    /**
     * Prints the resolved configuration values to standard output.
     *
     * @param args application arguments (unused)
     */
    @Override
    public void run(ApplicationArguments args) throws Exception {
        System.out.println("${spring.redis.password} = " + mask(password));
        System.out.println("${spring.datasource.username} = " + username);
        System.out.println("${spring.datasource.password} = " + mask(sqlPassword));
        System.out.println("${spring.mail.host} = " + host);
        System.out.println("${spring.mail.username} = " + mailUsername);
        System.out.println("${spring.mail.password} = " + mask(mailPassword));
        System.out.println("${app.enableKaptcha} = " + enableKaptcha);
    }

    /**
     * Hides a secret value while still showing whether it is configured.
     *
     * @param secret the secret to hide; may be {@code null}
     * @return {@code "<not set>"} for a {@code null} or empty secret, a fixed mask otherwise
     */
    private String mask(String secret) {
        return (secret == null || secret.isEmpty()) ? "<not set>" : "******";
    }
}
}
......@@ -16,7 +16,7 @@ server:
# SPRING PROFILES
spring:
profiles:
active: @profileActive@
active: '@profileActive@'
application:
name: blog-web
freemarker:
......@@ -81,7 +81,7 @@ spring:
# 连接池中的最小空闲连接
min-idle: 0
# 连接超时时间(毫秒)
timeout: 0ms
timeout: 5000ms
# 默认的数据过期时间,主要用于shiro权限管理
expire: 2592000
banner:
......
......@@ -2,6 +2,15 @@
----
### 2018-09-11
**新增**
- “文章搬运工”支持博客园的文章迁移
*注:转存图片功能尚不能转存csdn的图片,下一版会更新*
----
### 2018-08-29
**新增**
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册