1
0
mirror of https://github.com/square/okhttp.git synced 2025-11-27 18:21:14 +03:00

Make the crawler a little better.

Name threads by the URLs they are fetching. Strip fragments from discovered links so the same URL isn't crawled once per fragment.
This commit is contained in:
jwilson
2017-01-07 15:07:46 -05:00
parent de8699b62d
commit ca76167504

View File

@@ -72,10 +72,15 @@ public final class Crawler {
continue; continue;
} }
Thread currentThread = Thread.currentThread();
String originalName = currentThread.getName();
currentThread.setName("Crawler " + url.toString());
try { try {
fetch(url); fetch(url);
} catch (IOException e) { } catch (IOException e) {
System.out.printf("XXX: %s %s%n", url, e); System.out.printf("XXX: %s %s%n", url, e);
} finally {
currentThread.setName(originalName);
} }
} }
} }
@@ -114,7 +119,8 @@ public final class Crawler {
for (Element element : document.select("a[href]")) { for (Element element : document.select("a[href]")) {
String href = element.attr("href"); String href = element.attr("href");
HttpUrl link = response.request().url().resolve(href); HttpUrl link = response.request().url().resolve(href);
if (link != null) queue.add(link); if (link == null) continue; // URL is either invalid or its scheme isn't http/https.
queue.add(link.newBuilder().fragment(null).build());
} }
} }