mirror of
https://github.com/square/okhttp.git
synced 2025-11-26 06:43:09 +03:00
Make the crawler a little better.
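Name each crawler thread after the URL it is currently fetching, restoring the original name afterwards. Strip the fragment from each discovered link before enqueueing it, so the same page is not enqueued once per fragment.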
This commit is contained in:
@@ -72,10 +72,15 @@ public final class Crawler {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Thread currentThread = Thread.currentThread();
|
||||||
|
String originalName = currentThread.getName();
|
||||||
|
currentThread.setName("Crawler " + url.toString());
|
||||||
try {
|
try {
|
||||||
fetch(url);
|
fetch(url);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
System.out.printf("XXX: %s %s%n", url, e);
|
System.out.printf("XXX: %s %s%n", url, e);
|
||||||
|
} finally {
|
||||||
|
currentThread.setName(originalName);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -114,7 +119,8 @@ public final class Crawler {
|
|||||||
for (Element element : document.select("a[href]")) {
|
for (Element element : document.select("a[href]")) {
|
||||||
String href = element.attr("href");
|
String href = element.attr("href");
|
||||||
HttpUrl link = response.request().url().resolve(href);
|
HttpUrl link = response.request().url().resolve(href);
|
||||||
if (link != null) queue.add(link);
|
if (link == null) continue; // URL is either invalid or its scheme isn't http/https.
|
||||||
|
queue.add(link.newBuilder().fragment(null).build());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user