Commit dc86914d authored by J jwilson

Retrofit web crawler example.
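
This adds a jsoup dependency and a Crawler sample: a Retrofit service whose
custom Converter uses jsoup to decode HTML response bodies into page titles
and links, which the crawler then follows concurrently. Run it with a seed
URL as its only argument; the URL here is just an illustration:

  java com.example.retrofit.Crawler https://example.com/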

Parent cf2cdc4d
@@ -63,6 +63,9 @@
     <simplexml.version>2.7.1</simplexml.version>
     <moshi.version>1.1.0</moshi.version>
 
+    <!-- Sample Dependencies -->
+    <jsoup.version>1.7.3</jsoup.version>
+
     <!-- Test Dependencies -->
     <junit.version>4.12</junit.version>
     <assertj.version>1.7.0</assertj.version>
@@ -42,6 +42,11 @@
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.jsoup</groupId>
+      <artifactId>jsoup</artifactId>
+      <version>${jsoup.version}</version>
+    </dependency>
   </dependencies>
 
   <build>
/*
* Copyright (C) 2016 Square, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.example.retrofit;

import java.io.IOException;
import java.lang.annotation.Annotation;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.HttpUrl;
import okhttp3.OkHttpClient;
import okhttp3.ResponseBody;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import retrofit2.Call;
import retrofit2.Callback;
import retrofit2.Converter;
import retrofit2.Response;
import retrofit2.Retrofit;
import retrofit2.http.GET;
import retrofit2.http.Url;

/** A simple web crawler that uses a Retrofit service to turn URLs into webpages. */
public final class Crawler {
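  // Every URL we have already enqueued; guarantees each page is fetched at most once.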
private final Set<HttpUrl> fetchedUrls = Collections.synchronizedSet(
new LinkedHashSet<HttpUrl>());
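  // Count of pages fetched per hostname, used below to cap how much we take from one host.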
private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();
private final PageService pageService;
public Crawler(PageService pageService) {
this.pageService = pageService;
}
public void crawlPage(HttpUrl url) {
// Skip hosts that we've visited many times.
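    // putIfAbsent returns the existing counter if another thread created one first.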
AtomicInteger hostnameCount = new AtomicInteger();
AtomicInteger previous = hostnames.putIfAbsent(url.host(), hostnameCount);
if (previous != null) hostnameCount = previous;
if (hostnameCount.incrementAndGet() > 100) return;
// Asynchronously visit URL.
pageService.get(url).enqueue(new Callback<Page>() {
@Override public void onResponse(Call<Page> call, Response<Page> response) {
if (!response.isSuccessful()) {
System.out.println(call.request().url() + ": failed: " + response.code());
return;
}
// Print this page's URL and title.
Page page = response.body();
HttpUrl base = response.raw().request().url();
System.out.println(base + ": " + page.title);
// Enqueue its links for visiting.
for (String link : page.links) {
HttpUrl linkUrl = base.resolve(link);
          // Set.add() returns true only if the URL wasn't already present.
          if (linkUrl != null && fetchedUrls.add(linkUrl)) {
crawlPage(linkUrl);
}
}
}
@Override public void onFailure(Call<Page> call, Throwable t) {
System.out.println(call.request().url() + ": failed: " + t);
}
});
}
public static void main(String... args) throws Exception {
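    // Allow up to 20 concurrent requests overall, but only one per host,
    // so no single server sees a burst of traffic from the crawler.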
Dispatcher dispatcher = new Dispatcher(Executors.newFixedThreadPool(20));
dispatcher.setMaxRequests(20);
dispatcher.setMaxRequestsPerHost(1);
OkHttpClient okHttpClient = new OkHttpClient.Builder()
.dispatcher(dispatcher)
.connectionPool(new ConnectionPool(100, 30, TimeUnit.SECONDS))
.build();
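
    // Retrofit requires a base URL, but it goes unused here: every request
    // supplies its own complete URL via the @Url parameter.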
Retrofit retrofit = new Retrofit.Builder()
.baseUrl(HttpUrl.parse("https://example.com/"))
.addConverterFactory(PageAdapter.FACTORY)
.client(okHttpClient)
.build();
PageService pageService = retrofit.create(PageService.class);
Crawler crawler = new Crawler(pageService);
crawler.crawlPage(HttpUrl.parse(args[0]));
}
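
  /** Fetches any web page. The complete request URL is supplied per call via {@code @Url}. */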
interface PageService {
@GET Call<Page> get(@Url HttpUrl url);
}
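
  /** The decoded document: its title and the raw href attribute of each of its links. */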
static class Page {
public final String title;
public final List<String> links;
public Page(String title, List<String> links) {
this.title = title;
this.links = links;
}
}
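
  /** Decodes HTML response bodies into {@link Page} objects using jsoup. */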
static final class PageAdapter implements Converter<ResponseBody, Page> {
static final Converter.Factory FACTORY = new Converter.Factory() {
@Override public Converter<ResponseBody, ?> responseBodyConverter(
Type type, Annotation[] annotations, Retrofit retrofit) {
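        // Only handle Page; returning null lets Retrofit try other converter factories.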
if (type == Page.class) return new PageAdapter();
return null;
}
};
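
    // ResponseBody.string() reads the entire body into memory and closes it.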
@Override public Page convert(ResponseBody responseBody) throws IOException {
Document document = Jsoup.parse(responseBody.string());
List<String> links = new ArrayList<>();
for (Element element : document.select("a[href]")) {
links.add(element.attr("href"));
}
return new Page(document.title(), Collections.unmodifiableList(links));
}
}
}