Fix some crawler bugs.
We were interpreting links relative to the URLs requested, rather than the redirect targets of those URLs.
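For context, a minimal sketch of the behavior this fixes. OkHttp follows redirects transparently, so the URL a document was actually served from is response.request().url(), not the URL originally passed to the call; relative links must be resolved against the former. The URLs below are hypothetical.

import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;
import java.net.URL;

public final class RedirectBaseDemo {
  public static void main(String[] args) throws Exception {
    OkHttpClient client = new OkHttpClient();
    Request request = new Request.Builder()
        .url("http://example.com/old-page") // hypothetical; suppose this 302s to /new/page
        .build();
    Response response = client.newCall(request).execute();
    response.body().close();

    // After redirects, response.request().url() is the final URL,
    // e.g. http://example.com/new/page.
    URL base = response.request().url();

    // Resolved against the requested URL, "photo.html" would become
    // http://example.com/photo.html (wrong). Resolved against the
    // redirect target it becomes http://example.com/new/photo.html.
    System.out.println(new URL(base, "photo.html"));
  }
}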
@@ -19,6 +19,7 @@ import com.squareup.okhttp.Cache;
 import com.squareup.okhttp.OkHttpClient;
 import com.squareup.okhttp.Request;
 import com.squareup.okhttp.Response;
+import com.squareup.okhttp.internal.NamedRunnable;
 import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
@@ -26,9 +27,11 @@ import java.net.URL;
 import java.util.Collections;
 import java.util.LinkedHashSet;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -40,6 +43,7 @@ public final class Crawler {
   private final OkHttpClient client;
   private final Set<URL> fetchedUrls = Collections.synchronizedSet(new LinkedHashSet<URL>());
   private final LinkedBlockingQueue<URL> queue = new LinkedBlockingQueue<>();
+  private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();
 
   public Crawler(OkHttpClient client) {
     this.client = client;
@@ -48,8 +52,8 @@ public final class Crawler {
   private void parallelDrainQueue(int threadCount) {
     ExecutorService executor = Executors.newFixedThreadPool(threadCount);
     for (int i = 0; i < threadCount; i++) {
-      executor.execute(new Runnable() {
-        @Override public void run() {
+      executor.execute(new NamedRunnable("Crawler %s", i) {
+        @Override protected void execute() {
           try {
             drainQueue();
           } catch (Exception e) {
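NamedRunnable is an internal OkHttp helper that renames the current thread for the duration of the task, so each crawler thread shows up as "Crawler 0", "Crawler 1", and so on in logs and thread dumps. The pattern is roughly this (a sketch, not necessarily the exact internal source):

public abstract class NamedRunnable implements Runnable {
  protected final String name;

  public NamedRunnable(String format, Object... args) {
    this.name = String.format(format, args);
  }

  @Override public final void run() {
    // Rename the thread while the task runs, then restore the old name.
    String oldName = Thread.currentThread().getName();
    Thread.currentThread().setName(name);
    try {
      execute();
    } finally {
      Thread.currentThread().setName(oldName);
    }
  }

  protected abstract void execute();
}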
@@ -76,12 +80,18 @@ public final class Crawler {
   }
 
   public void fetch(URL url) throws IOException {
+    // Skip hosts that we've visited many times.
+    AtomicInteger hostnameCount = new AtomicInteger();
+    AtomicInteger previous = hostnames.putIfAbsent(url.getHost(), hostnameCount);
+    if (previous != null) hostnameCount = previous;
+    if (hostnameCount.incrementAndGet() > 100) return;
+
     Request request = new Request.Builder()
         .url(url)
         .build();
     Response response = client.newCall(request).execute();
     String responseSource = response.networkResponse() != null
-        ? ("(network: " + response.networkResponse().code() + ")")
+        ? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")")
         : "(cache)";
     int responseCode = response.code();
 
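The new per-host cap uses the classic ConcurrentHashMap.putIfAbsent idiom to get-or-create a counter without locking: every thread races to install a fresh AtomicInteger, losers adopt the winner's counter, and all of them increment the same instance. The same idiom in isolation (HostThrottle and tryAcquire are hypothetical names, not part of the sample):

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

final class HostThrottle {
  private final ConcurrentHashMap<String, AtomicInteger> counts = new ConcurrentHashMap<>();

  /** Returns true while {@code host} has been visited fewer than {@code limit} times. */
  boolean tryAcquire(String host, int limit) {
    AtomicInteger count = new AtomicInteger();
    AtomicInteger existing = counts.putIfAbsent(host, count);
    if (existing != null) count = existing; // another thread installed a counter first
    return count.incrementAndGet() <= limit;
  }
}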
@@ -96,7 +106,7 @@ public final class Crawler {
     Document document = Jsoup.parse(response.body().string(), url.toString());
     for (Element element : document.select("a[href]")) {
       String href = element.attr("href");
-      URL link = parseUrl(url, href);
+      URL link = parseUrl(response.request().url(), href);
       if (link != null) queue.add(link);
     }
   }
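The parseUrl helper itself isn't shown in this diff. Presumably it resolves href against the base URL and drops anything unparseable or non-HTTP; a sketch of such a helper, under that assumption:

import java.net.MalformedURLException;
import java.net.URL;

final class Links {
  // Hypothetical reconstruction of the parseUrl helper this diff calls;
  // the sample's actual implementation may differ.
  static URL parseUrl(URL base, String href) {
    try {
      URL result = new URL(base, href); // resolves relative hrefs against base
      String protocol = result.getProtocol();
      return protocol.equals("http") || protocol.equals("https") ? result : null;
    } catch (MalformedURLException e) {
      return null; // unknown schemes such as "javascript:" make java.net.URL throw
    }
  }
}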