diff --git a/pom.xml b/pom.xml
index d8e528ce863da5b212255a8fdf26a002ea132999..8fc355c23bd489f3fd81739afa5651267f084f9e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -64,3 +64,13 @@
             <artifactId>commons-lang3</artifactId>
             <version>3.1</version>
         </dependency>
+        <dependency>
+            <groupId>com.alibaba</groupId>
+            <artifactId>fastjson</artifactId>
+            <version>1.2.83</version>
+        </dependency>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.15.3</version>
+        </dependency>
diff --git a/src/main/java/com/kwan/shuyu/controller/ContentController.java b/src/main/java/com/kwan/shuyu/controller/ContentController.java
new file mode 100644
index 0000000000000000000000000000000000000000..accb6d4d65e041be72b542c3ec5a16c78f445ca5
--- /dev/null
+++ b/src/main/java/com/kwan/shuyu/controller/ContentController.java
@@ -0,0 +1,76 @@
+package com.kwan.shuyu.controller;
+
+import com.kwan.shuyu.service.ContentService;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Controller;
+import org.springframework.web.bind.annotation.GetMapping;
+import org.springframework.web.bind.annotation.PathVariable;
+import org.springframework.web.bind.annotation.ResponseBody;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * HTTP endpoints for crawling JD search results into Elasticsearch and querying them back.
+ */
+@Controller
+public class ContentController {
+
+    /** Page number used when the caller passes a value below 1. */
+    private static final int DEFAULT_PAGE_NO = 1;
+    /** Page size used when the caller passes a value below 1. */
+    private static final int DEFAULT_PAGE_SIZE = 5;
+
+    @Autowired
+    private ContentService contentService;
+
+    /**
+     * Crawls JD for the keyword and stores the parsed items (title, price, image) in Elasticsearch.
+     *
+     * @param keyword search keyword taken from the URL path
+     * @return the result of {@link ContentService#parseContent(String)}
+     * @throws Exception if crawling or indexing fails
+     */
+    @GetMapping("/parse/{keyword}")
+    @ResponseBody
+    public Boolean parse(@PathVariable String keyword) throws Exception {
+        return contentService.parseContent(keyword);
+    }
+
+    /**
+     * Searches the stored items in Elasticsearch, one page at a time.
+     *
+     * @param keyword  search keyword
+     * @param pageNo   1-based page number; values below 1 fall back to the first page
+     * @param pageSize hits per page; values below 1 fall back to {@value #DEFAULT_PAGE_SIZE}
+     * @return the source map of every hit on the requested page
+     * @throws IOException if the Elasticsearch request fails
+     */
+    @GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
+    @ResponseBody
+    public List<Map<String, Object>> search(@PathVariable("keyword") String keyword,
+                                            @PathVariable("pageNo") int pageNo,
+                                            @PathVariable("pageSize") int pageSize) throws IOException {
+        // Negative values are as meaningless as 0, so they get the same defaults.
+        if (pageNo < 1) {
+            pageNo = DEFAULT_PAGE_NO;
+        }
+        if (pageSize < 1) {
+            pageSize = DEFAULT_PAGE_SIZE;
+        }
+        return contentService.searchPage(keyword, pageNo, pageSize);
+    }
+
+    /**
+     * Convenience entry point that takes the keyword as a request parameter and redirects to
+     * {@code /parse/{keyword}}.
+     *
+     * @param keyword search keyword bound from the request
+     * @return the redirect view name
+     */
+    @GetMapping("/parse/add2es")
+    public String test(String keyword) {
+        return "redirect:/parse/" + keyword;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/com/kwan/shuyu/controller/EsCourseController.java b/src/main/java/com/kwan/shuyu/controller/EsCourseController.java
index ddb75efb3a00adec537021e7fbc7d145079a8db9..f4d2bf5f8f8dde8b67319a24ab401d8d29ac7c43 100644
--- a/src/main/java/com/kwan/shuyu/controller/EsCourseController.java
+++ b/src/main/java/com/kwan/shuyu/controller/EsCourseController.java
@@ -3,7 +3,7 @@ package com.kwan.shuyu.controller;
 import com.kwan.shuyu.domain.CoursePub;
 import com.kwan.shuyu.domain.CourseSearchParam;
 import com.kwan.shuyu.domain.QueryResponseResult;
-import com.kwan.shuyu.service.EsCourseServiceImpl;
+import com.kwan.shuyu.service.impl.EsCourseServiceImpl;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.web.bind.annotation.GetMapping;
 import org.springframework.web.bind.annotation.PathVariable;
diff --git a/src/main/java/com/kwan/shuyu/domain/Content.java b/src/main/java/com/kwan/shuyu/domain/Content.java
new file mode 100644
index 0000000000000000000000000000000000000000..ec66dcd849db2c627ba334d992a5cc693bcf4f91
--- /dev/null
+++ b/src/main/java/com/kwan/shuyu/domain/Content.java
@@ -0,0 +1,16 @@
+package com.kwan.shuyu.domain;
+
+import lombok.Data;
+
+/**
+ * One product scraped from a JD search result page.
+ */
+@Data
+public class Content {
+    /** Product title. */
+    private String title;
+    /** Product image address. */
+    private String img;
+    /** Product price, kept as the scraped text. */
+    private String price;
+}
diff --git a/src/main/java/com/kwan/shuyu/service/ContentService.java b/src/main/java/com/kwan/shuyu/service/ContentService.java
new file mode 100644
index 0000000000000000000000000000000000000000..7046bc92a1555ffcb45383008140bc99c58e8443
--- /dev/null
+++ b/src/main/java/com/kwan/shuyu/service/ContentService.java
@@ -0,0 +1,30 @@
+package com.kwan.shuyu.service;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Crawls product data into Elasticsearch and searches it.
+ */
+public interface ContentService {
+    /**
+     * Crawls the items matching the keyword and indexes them into Elasticsearch.
+     *
+     * @param keywords search keyword
+     * @return {@code true} when the items were indexed without failure
+     * @throws Exception if crawling or indexing fails
+     */
+    Boolean parseContent(String keywords) throws Exception;
+
+    /**
+     * Searches the indexed items, one page at a time.
+     *
+     * @param keyword  search keyword
+     * @param pageNo   1-based page number
+     * @param pageSize number of hits per page
+     * @return the source map of every hit on the requested page
+     * @throws IOException if the Elasticsearch request fails
+     */
+    List<Map<String, Object>> searchPage(String keyword, int pageNo, int pageSize) throws IOException;
+}
\ No newline at end of file
diff --git a/src/main/java/com/kwan/shuyu/service/impl/EsCourseService.java b/src/main/java/com/kwan/shuyu/service/EsCourseService.java
similarity index 91%
rename from src/main/java/com/kwan/shuyu/service/impl/EsCourseService.java
rename to src/main/java/com/kwan/shuyu/service/EsCourseService.java
index a4816e6f4094e44e2804a3d6e5b2a0d40148768c..1e42c3ac1785c8291d929c1fdf6453565b8d306e 100644
--- a/src/main/java/com/kwan/shuyu/service/impl/EsCourseService.java
+++ b/src/main/java/com/kwan/shuyu/service/EsCourseService.java
@@ -1,4 +1,4 @@
-package com.kwan.shuyu.service.impl;
+package com.kwan.shuyu.service;
 
 import com.kwan.shuyu.domain.CoursePub;
 import com.kwan.shuyu.domain.CourseSearchParam;
diff --git a/src/main/java/com/kwan/shuyu/service/impl/ContentServiceImpl.java b/src/main/java/com/kwan/shuyu/service/impl/ContentServiceImpl.java
new file mode 100644
index 0000000000000000000000000000000000000000..3e8104fa5575d05dd534509369ad35821b491faa
--- /dev/null
+++ b/src/main/java/com/kwan/shuyu/service/impl/ContentServiceImpl.java
@@ -0,0 +1,113 @@
+package com.kwan.shuyu.service.impl;
+
+import com.alibaba.fastjson.JSON;
+import com.kwan.shuyu.domain.Content;
+import com.kwan.shuyu.service.ContentService;
+import com.kwan.shuyu.util.HtmlParseUtil;
+import org.elasticsearch.action.bulk.BulkRequest;
+import org.elasticsearch.action.bulk.BulkResponse;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.action.search.SearchRequest;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.client.RequestOptions;
+import org.elasticsearch.client.RestHighLevelClient;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.common.xcontent.XContentType;
+import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.index.query.TermQueryBuilder;
+import org.elasticsearch.search.SearchHit;
+import org.elasticsearch.search.builder.SearchSourceBuilder;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * {@link ContentService} backed by the JD crawler and the Elasticsearch high-level REST client.
+ */
+@Service
+public class ContentServiceImpl implements ContentService {
+
+    /**
+     * Index shared by indexing and searching. The two used to name different indices
+     * ({@code jd_goods_2} vs {@code jd_goods}), so freshly parsed items were never searchable.
+     * NOTE(review): confirm which of the two names is the intended one.
+     */
+    private static final String JD_GOODS_INDEX = "jd_goods_2";
+
+    @Autowired
+    private RestHighLevelClient restHighLevelClient;
+    @Autowired
+    private HtmlParseUtil htmlParseUtil;
+
+    /**
+     * Crawls the items matching the keyword and bulk-indexes them into {@link #JD_GOODS_INDEX}.
+     *
+     * @param keywords search keyword; also used as the document type, as before
+     * @return {@code true} when at least one item was sent and the bulk response reports no
+     *         failures, {@code false} otherwise
+     * @throws Exception if crawling or the bulk request fails
+     */
+    @Override
+    public Boolean parseContent(String keywords) throws Exception {
+        List<Content> contents = htmlParseUtil.parseJD(keywords);
+        // A bulk request without any action is rejected by the client's request validation,
+        // so report "nothing stored" instead of sending it.
+        if (contents == null || contents.isEmpty()) {
+            return false;
+        }
+
+        BulkRequest bulkRequest = new BulkRequest();
+        bulkRequest.timeout("2m");
+        // NOTE(review): the keyword is passed as the mapping type; typed requests are deprecated
+        // in recent Elasticsearch versions - confirm against the cluster version in use.
+        String type = String.valueOf(keywords);
+        for (Content content : contents) {
+            bulkRequest.add(
+                    new IndexRequest(JD_GOODS_INDEX, type)
+                            .source(JSON.toJSONString(content), XContentType.JSON));
+        }
+        BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
+        return !bulk.hasFailures();
+    }
+
+    /**
+     * Runs a term query on the {@code title} field and returns one page of hits.
+     *
+     * @param keyword  term to look for in the {@code title} field
+     * @param pageNo   1-based page number; values below 1 are treated as the first page
+     * @param pageSize number of hits per page
+     * @return the source map of every hit on the requested page
+     * @throws IOException if the Elasticsearch request fails
+     */
+    @Override
+    public List<Map<String, Object>> searchPage(String keyword, int pageNo, int pageSize) throws IOException {
+        if (pageNo < 1) {
+            pageNo = 1;
+        }
+        SearchRequest searchRequest = new SearchRequest(JD_GOODS_INDEX);
+        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
+
+        // "from" is a hit offset, not a page number: page 1 starts at hit 0.
+        sourceBuilder.from((pageNo - 1) * pageSize);
+        sourceBuilder.size(pageSize);
+
+        // Term query: the keyword is matched as-is, without analysis.
+        TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
+        sourceBuilder.query(termQueryBuilder);
+        sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
+
+        searchRequest.source(sourceBuilder);
+        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
+
+        List<Map<String, Object>> list = new ArrayList<>();
+        for (SearchHit hit : searchResponse.getHits().getHits()) {
+            list.add(hit.getSourceAsMap());
+        }
+        return list;
+    }
+}
diff --git a/src/main/java/com/kwan/shuyu/service/EsCourseServiceImpl.java b/src/main/java/com/kwan/shuyu/service/impl/EsCourseServiceImpl.java
similarity index 98%
rename from src/main/java/com/kwan/shuyu/service/EsCourseServiceImpl.java
rename to src/main/java/com/kwan/shuyu/service/impl/EsCourseServiceImpl.java
index f6df4256627cf2f00814209efc185843afb82694..fb35a7bb6b677501fb35415086e3b72381b83fd0 100644
--- a/src/main/java/com/kwan/shuyu/service/EsCourseServiceImpl.java
+++ b/src/main/java/com/kwan/shuyu/service/impl/EsCourseServiceImpl.java
@@ -1,7 +1,7 @@
-package com.kwan.shuyu.service;
+package com.kwan.shuyu.service.impl;
 
 import com.kwan.shuyu.domain.*;
-import com.kwan.shuyu.service.impl.EsCourseService;
+import com.kwan.shuyu.service.EsCourseService;
 import org.apache.commons.lang3.StringUtils;
 import org.elasticsearch.action.search.SearchRequest;
 import org.elasticsearch.action.search.SearchResponse;
diff --git a/src/main/java/com/kwan/shuyu/util/HtmlParseUtil.java b/src/main/java/com/kwan/shuyu/util/HtmlParseUtil.java
new file mode 100644
index 0000000000000000000000000000000000000000..fa673668f33669e1721e6546344571c15daf8f84
--- /dev/null
+++ b/src/main/java/com/kwan/shuyu/util/HtmlParseUtil.java
@@ -0,0 +1,59 @@
+package com.kwan.shuyu.util;
+
+import com.kwan.shuyu.domain.Content;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.springframework.stereotype.Component;
+
+import java.net.URL;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawls JD search result pages and parses them into {@link Content} objects.
+ *
+ * @author : qinyingjie
+ * @version : 2.2.0
+ * @date : 2023/5/14 01:33
+ */
+@Component
+public class HtmlParseUtil {
+
+    /** Base of the JD search URL; the encoded keyword is appended to it. */
+    private static final String SEARCH_URL = "https://search.jd.com/Search?keyword=";
+    /** Timeout passed to Jsoup when fetching the page, in milliseconds. */
+    private static final int TIMEOUT_MILLIS = 30000;
+
+    /**
+     * Fetches the JD search page for the keyword and extracts image, price and title of each item.
+     *
+     * @param keywords search keyword; it is URL-encoded before being put into the query string
+     * @return the parsed items, or an empty list when the page has no {@code J_goodsList} element
+     * @throws Exception if the page cannot be fetched or parsed
+     */
+    public List<Content> parseJD(String keywords) throws Exception {
+        // Encode the keyword so non-ASCII text, spaces and '&' cannot break the query string.
+        String url = SEARCH_URL + URLEncoder.encode(String.valueOf(keywords), "UTF-8");
+        Document document = Jsoup.parse(new URL(url), TIMEOUT_MILLIS);
+
+        List<Content> list = new ArrayList<>();
+        Element goodsList = document.getElementById("J_goodsList");
+        if (goodsList == null) {
+            // No result container on the page: nothing to parse.
+            return list;
+        }
+        // Each li element is one product.
+        Elements items = goodsList.getElementsByTag("li");
+        for (Element item : items) {
+            Content content = new Content();
+            content.setTitle(item.getElementsByClass("p-name").eq(0).text());
+            content.setImg(item.getElementsByTag("img").eq(0).attr("src"));
+            content.setPrice(item.getElementsByClass("p-price").eq(0).text());
+            list.add(content);
+        }
+        return list;
+    }
+}
\ No newline at end of file
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index 22f010da5d93d4d9122672454d2c2c1ed9cfa470..54701633e169f86754fbd583dd70403dd97c1c70 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -6,6 +6,6 @@ spring:
     name: search-service
 kwan:
   elasticsearch:
-    hostlist: 47.119.160.231:9200 #多个节点用逗号分隔
+    hostlist: 47.119.162.180:9200 # separate multiple nodes with commas
   course: #课程字段
     source_field: id,name,grade,mt,st,charge,valid,pic,qq,price,price_old,status,studymodel,teachmode,expires,pub_time,start_time,end_time
\ No newline at end of file