Fix some crawler bugs.
We were interpreting links relative to the URLs requested, rather than the redirect targets of those URLs.
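For context, a minimal sketch of the behavior this fixes. OkHttp follows redirects transparently, so the URL a document was actually served from is response.request().url(), not the URL originally passed to the call; relative links must be resolved against the former. The URLs below are hypothetical.

import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;
import java.net.URL;

public final class RedirectBaseDemo {
  public static void main(String[] args) throws Exception {
    OkHttpClient client = new OkHttpClient();
    Request request = new Request.Builder()
        .url("http://example.com/old-page") // hypothetical; suppose this 302s to /new/page
        .build();
    Response response = client.newCall(request).execute();
    response.body().close();

    // After redirects, response.request().url() is the final URL,
    // e.g. http://example.com/new/page.
    URL base = response.request().url();

    // Resolved against the requested URL, "photo.html" would become
    // http://example.com/photo.html (wrong). Resolved against the
    // redirect target it becomes http://example.com/new/photo.html.
    System.out.println(new URL(base, "photo.html"));
  }
}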
@@ -19,6 +19,7 @@ import com.squareup.okhttp.Cache;
 import com.squareup.okhttp.OkHttpClient;
 import com.squareup.okhttp.Request;
 import com.squareup.okhttp.Response;
+import com.squareup.okhttp.internal.NamedRunnable;
 import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
@@ -26,9 +27,11 @@ import java.net.URL;
 import java.util.Collections;
 import java.util.LinkedHashSet;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -40,6 +43,7 @@ public final class Crawler {
   private final OkHttpClient client;
   private final Set<URL> fetchedUrls = Collections.synchronizedSet(new LinkedHashSet<URL>());
   private final LinkedBlockingQueue<URL> queue = new LinkedBlockingQueue<>();
+  private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();
 
   public Crawler(OkHttpClient client) {
     this.client = client;
@@ -48,8 +52,8 @@ public final class Crawler {
   private void parallelDrainQueue(int threadCount) {
     ExecutorService executor = Executors.newFixedThreadPool(threadCount);
     for (int i = 0; i < threadCount; i++) {
-      executor.execute(new Runnable() {
-        @Override public void run() {
+      executor.execute(new NamedRunnable("Crawler %s", i) {
+        @Override protected void execute() {
           try {
             drainQueue();
           } catch (Exception e) {
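NamedRunnable is an internal OkHttp helper that renames the current thread for the duration of the task, so each crawler thread shows up as "Crawler 0", "Crawler 1", and so on in logs and thread dumps. The pattern is roughly this (a sketch, not necessarily the exact internal source):

public abstract class NamedRunnable implements Runnable {
  protected final String name;

  public NamedRunnable(String format, Object... args) {
    this.name = String.format(format, args);
  }

  @Override public final void run() {
    // Rename the thread while the task runs, then restore the old name.
    String oldName = Thread.currentThread().getName();
    Thread.currentThread().setName(name);
    try {
      execute();
    } finally {
      Thread.currentThread().setName(oldName);
    }
  }

  protected abstract void execute();
}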
@@ -76,12 +80,18 @@ public final class Crawler {
   }
 
   public void fetch(URL url) throws IOException {
+    // Skip hosts that we've visited many times.
+    AtomicInteger hostnameCount = new AtomicInteger();
+    AtomicInteger previous = hostnames.putIfAbsent(url.getHost(), hostnameCount);
+    if (previous != null) hostnameCount = previous;
+    if (hostnameCount.incrementAndGet() > 100) return;
+
     Request request = new Request.Builder()
         .url(url)
         .build();
     Response response = client.newCall(request).execute();
     String responseSource = response.networkResponse() != null
-        ? ("(network: " + response.networkResponse().code() + ")")
+        ? ("(network: " + response.networkResponse().code() + " over " + response.protocol() + ")")
         : "(cache)";
     int responseCode = response.code();
 
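The new per-host cap uses the classic ConcurrentHashMap.putIfAbsent idiom to get-or-create a counter without locking: every thread races to install a fresh AtomicInteger, losers adopt the winner's counter, and all of them increment the same instance. The same idiom in isolation (HostThrottle and tryAcquire are hypothetical names, not part of the sample):

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

final class HostThrottle {
  private final ConcurrentHashMap<String, AtomicInteger> counts = new ConcurrentHashMap<>();

  /** Returns true while {@code host} has been visited fewer than {@code limit} times. */
  boolean tryAcquire(String host, int limit) {
    AtomicInteger count = new AtomicInteger();
    AtomicInteger existing = counts.putIfAbsent(host, count);
    if (existing != null) count = existing; // another thread installed a counter first
    return count.incrementAndGet() <= limit;
  }
}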
@@ -96,7 +106,7 @@ public final class Crawler {
     Document document = Jsoup.parse(response.body().string(), url.toString());
     for (Element element : document.select("a[href]")) {
       String href = element.attr("href");
-      URL link = parseUrl(url, href);
+      URL link = parseUrl(response.request().url(), href);
       if (link != null) queue.add(link);
     }
   }
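The parseUrl helper itself isn't shown in this diff. Presumably it resolves href against the base URL and drops anything unparseable or non-HTTP; a sketch of such a helper, under that assumption:

import java.net.MalformedURLException;
import java.net.URL;

final class Links {
  // Hypothetical reconstruction of the parseUrl helper this diff calls;
  // the sample's actual implementation may differ.
  static URL parseUrl(URL base, String href) {
    try {
      URL result = new URL(base, href); // resolves relative hrefs against base
      String protocol = result.getProtocol();
      return protocol.equals("http") || protocol.equals("https") ? result : null;
    } catch (MalformedURLException e) {
      return null; // unknown schemes such as "javascript:" make java.net.URL throw
    }
  }
}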