From a75193b134dfe02652bb90dc5cb3fedc794708df Mon Sep 17 00:00:00 2001 From: jwilson Date: Sun, 15 Mar 2015 22:02:37 -0400 Subject: [PATCH] Fix some crawler bugs. We were interpreting links relative to the URLs requested, rather than the redirect targets of those URLs. --- .../com/squareup/okhttp/sample/Crawler.java | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/samples/crawler/src/main/java/com/squareup/okhttp/sample/Crawler.java b/samples/crawler/src/main/java/com/squareup/okhttp/sample/Crawler.java index 24383fe47..21d11c7c3 100644 --- a/samples/crawler/src/main/java/com/squareup/okhttp/sample/Crawler.java +++ b/samples/crawler/src/main/java/com/squareup/okhttp/sample/Crawler.java @@ -19,6 +19,7 @@ import com.squareup.okhttp.Cache; import com.squareup.okhttp.OkHttpClient; import com.squareup.okhttp.Request; import com.squareup.okhttp.Response; +import com.squareup.okhttp.internal.NamedRunnable; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; @@ -26,9 +27,11 @@ import java.net.URL; import java.util.Collections; import java.util.LinkedHashSet; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.atomic.AtomicInteger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -40,6 +43,7 @@ public final class Crawler { private final OkHttpClient client; private final Set fetchedUrls = Collections.synchronizedSet(new LinkedHashSet()); private final LinkedBlockingQueue queue = new LinkedBlockingQueue<>(); + private final ConcurrentHashMap hostnames = new ConcurrentHashMap<>(); public Crawler(OkHttpClient client) { this.client = client; @@ -48,8 +52,8 @@ public final class Crawler { private void parallelDrainQueue(int threadCount) { ExecutorService executor = 
Executors.newFixedThreadPool(threadCount); for (int i = 0; i < threadCount; i++) { - executor.execute(new Runnable() { - @Override public void run() { + executor.execute(new NamedRunnable("Crawler %s", i) { + @Override protected void execute() { try { drainQueue(); } catch (Exception e) { @@ -76,12 +80,18 @@ public final class Crawler { } public void fetch(URL url) throws IOException { + // Skip hosts that we've visited many times. + AtomicInteger hostnameCount = new AtomicInteger(); + AtomicInteger previous = hostnames.putIfAbsent(url.getHost(), hostnameCount); + if (previous != null) hostnameCount = previous; + if (hostnameCount.incrementAndGet() > 100) return; + Request request = new Request.Builder() .url(url) .build(); Response response = client.newCall(request).execute(); String responseSource = response.networkResponse() != null - ? ("(network: " + response.networkResponse().code() + ")") + ? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")") : "(cache)"; int responseCode = response.code(); @@ -96,7 +106,7 @@ public final class Crawler { Document document = Jsoup.parse(response.body().string(), url.toString()); for (Element element : document.select("a[href]")) { String href = element.attr("href"); - URL link = parseUrl(url, href); + URL link = parseUrl(response.request().url(), href); if (link != null) queue.add(link); } }