/**
 * Crawls a site for links, following each newly discovered link on a thread pool and
 * appending the discovered URLs to {@code urlsN.txt} files.
 *
 * <p>A link is followed only when its absolute URL starts with the URL of the page it was
 * found on and contains no {@code #}. Each distinct link is processed once, tracked through
 * the shared {@code links} set.
 */
public class BlogLinkCrawler {
    /**
     * Running count of distinct links discovered so far. Used to number console output and,
     * when a spawned task starts, to choose which {@code urlsN.txt} file it appends to.
     */
    private static final AtomicInteger atomic = new AtomicInteger(0);

    /**
     * Number of crawl tasks handed to an executor that have not finished yet. {@link #main}
     * waits on this object's monitor until the count drops to zero.
     */
    private static final AtomicInteger pendingTasks = new AtomicInteger(0);

    /**
     * Crawls the configured start URL, waits for every spawned task to finish, then shuts the
     * thread pool down so the JVM can exit.
     *
     * @param args unused
     * @throws IOException if the root output file cannot be opened or closed
     * @throws InterruptedException if the main thread is interrupted while waiting for the crawl
     * @throws ExecutionException never thrown by the current implementation; kept so the
     *         signature stays unchanged
     */
    public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {
        String url = "https://bugstack.cn"; // replace with the URL of the target blog site
        Set<String> links = Collections.synchronizedSet(new HashSet<>());
        ExecutorService executorService = Executors.newFixedThreadPool(50); // fixed pool of 50 worker threads
        try {
            // The root writer is only used synchronously by the root crawl, so it can be
            // closed as soon as that call returns.
            try (FileWriter writer = new FileWriter("urls0.txt", true)) {
                new BlogLinkCrawler().crawlLinks(url, links, writer, executorService);
            }
            // Tasks are only spawned by the root crawl above or by tasks that are themselves
            // still counted, so a count of zero means the crawl is complete.
            synchronized (pendingTasks) {
                while (pendingTasks.get() > 0) {
                    pendingTasks.wait();
                }
            }
        } finally {
            // Without this the pool's non-daemon threads keep the JVM alive forever.
            executorService.shutdown();
        }
    }

    /**
     * Fetches {@code url}, and for every not-yet-seen link on the page that starts with
     * {@code url} and contains no {@code #}: records it in {@code links}, prints it, schedules
     * a crawl of it on {@code executorService}, and appends it to {@code writer}.
     *
     * <p>The writer is flushed before returning but not closed; closing it is the caller's
     * responsibility. An {@link IOException} while fetching or writing stops processing of
     * this page and is reported on standard error.
     *
     * @param url page to fetch; also the prefix a link must have to be followed
     * @param links shared set of links already discovered
     * @param writer destination for the links discovered on this page
     * @param executorService pool on which crawls of the discovered links are scheduled
     */
    public void crawlLinks(String url, Set<String> links, final FileWriter writer, ExecutorService executorService) {
        try {
            Document document = Jsoup.connect(url).get();
            Elements linkElements = document.select("a[href]");
            for (Element linkElement : linkElements) {
                String link = linkElement.attr("abs:href");
                if (!link.startsWith(url) || link.contains("#")) {
                    continue;
                }
                // Hold the lock only for the check-and-add; doing console/file I/O and task
                // submission under it would serialize every worker thread.
                boolean firstSeen;
                synchronized (links) {
                    firstSeen = links.add(link);
                }
                if (!firstSeen) {
                    continue;
                }
                System.out.println(atomic.incrementAndGet() + ": " + link);
                submitCrawl(link, links, executorService);
                // Percent-encode the link, then restore the scheme separator, slashes and '#'.
                writer.write(URLEncoder.encode(link, "UTF-8").replace("%3A%2F%2F", "://").replace("%2F", "/").replace("%23", "#") + "\n");
            }
        } catch (IOException e) {
            // Best effort: a page that cannot be fetched or written is skipped, but say which one.
            System.err.println("Failed to crawl " + url + ": " + e);
        } finally {
            try {
                writer.flush();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** Schedules a crawl of {@code link} that appends to its own writer and closes it when done. */
    private void submitCrawl(String link, Set<String> links, ExecutorService executorService) {
        pendingTasks.incrementAndGet();
        try {
            executorService.execute(() -> {
                try (FileWriter childWriter = new FileWriter("urls" + (1 + atomic.get() / 1000) + ".txt", true)) {
                    crawlLinks(link, links, childWriter, executorService);
                } catch (IOException e) {
                    System.err.println("Could not write links found on " + link + ": " + e);
                } finally {
                    taskFinished();
                }
            });
        } catch (RuntimeException e) {
            // The task was never accepted (e.g. the executor rejected it), so it will never
            // report completion itself.
            taskFinished();
            throw e;
        }
    }

    /** Marks one scheduled crawl as done and wakes any waiter once none remain. */
    private static void taskFinished() {
        if (pendingTasks.decrementAndGet() == 0) {
            synchronized (pendingTasks) {
                pendingTasks.notifyAll();
            }
        }
    }
}