1
0
mirror of https://github.com/square/okhttp.git synced 2026-01-17 08:42:25 +03:00

Fix some crawler bugs.

We were interpreting links relative to the URLs requested, rather
than the redirect targets of those URLs.
This commit is contained in:
jwilson
2015-03-15 22:02:37 -04:00
parent 20a27d8163
commit a75193b134

View File

@@ -19,6 +19,7 @@ import com.squareup.okhttp.Cache;
import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;
import com.squareup.okhttp.internal.NamedRunnable;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
@@ -26,9 +27,11 @@ import java.net.URL;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -40,6 +43,7 @@ public final class Crawler {
private final OkHttpClient client;
private final Set<URL> fetchedUrls = Collections.synchronizedSet(new LinkedHashSet<URL>());
private final LinkedBlockingQueue<URL> queue = new LinkedBlockingQueue<>();
private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();
public Crawler(OkHttpClient client) {
this.client = client;
@@ -48,8 +52,8 @@ public final class Crawler {
private void parallelDrainQueue(int threadCount) {
ExecutorService executor = Executors.newFixedThreadPool(threadCount);
for (int i = 0; i < threadCount; i++) {
executor.execute(new Runnable() {
@Override public void run() {
executor.execute(new NamedRunnable("Crawler %s", i) {
@Override protected void execute() {
try {
drainQueue();
} catch (Exception e) {
@@ -76,12 +80,18 @@ public final class Crawler {
}
public void fetch(URL url) throws IOException {
// Skip hosts that we've visited many times.
AtomicInteger hostnameCount = new AtomicInteger();
AtomicInteger previous = hostnames.putIfAbsent(url.getHost(), hostnameCount);
if (previous != null) hostnameCount = previous;
if (hostnameCount.incrementAndGet() > 100) return;
Request request = new Request.Builder()
.url(url)
.build();
Response response = client.newCall(request).execute();
String responseSource = response.networkResponse() != null
? ("(network: " + response.networkResponse().code() + ")")
? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")")
: "(cache)";
int responseCode = response.code();
@@ -96,7 +106,7 @@ public final class Crawler {
Document document = Jsoup.parse(response.body().string(), url.toString());
for (Element element : document.select("a[href]")) {
String href = element.attr("href");
URL link = parseUrl(url, href);
URL link = parseUrl(response.request().url(), href);
if (link != null) queue.add(link);
}
}