Crawler4j is an open-source Java library that provides a simple interface for crawling web pages. It can be used to build a multi-threaded web crawler.
Example code:
import java.util.ArrayList;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Filter out URLs that point to static or binary resources
    // (stylesheets, scripts, images, audio/video, archives, etc.).
    private final Pattern filters = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g"
            + "|png|tiff?|mid|mp2|mp3|mp4"
            + "|wav|avi|mov|mpeg|ram|m4v|pdf"
            + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    public MyCrawler() {
    }

    // Decide whether a discovered URL should be crawled: skip the filtered
    // file types and only follow links within www.ics.uci.edu.
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        if (filters.matcher(href).matches()) {
            return false;
        }
        if (href.startsWith("http://www.ics.uci.edu/")) {
            return true;
        }
        return false;
    }

    // Called after a page has been fetched and parsed; here we extract its
    // document id, URL, text content, and outgoing links for further processing.
    public void visit(Page page) {
        int docid = page.getWebURL().getDocid();
        String url = page.getWebURL().getURL();
        String text = page.getText();
        ArrayList<WebURL> links = page.getURLs();
    }
}
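The MyCrawler class above only defines the crawling logic; to actually run a crawl, crawler4j also needs a controller that registers seed URLs and launches the crawler threads. The sketch below is written against the same older crawler4j API as the example above (a CrawlController constructed with a storage-folder path); the storage path "/data/crawl/root", the seed URL, and the thread count of 10 are illustrative values, not requirements.

import edu.uci.ics.crawler4j.crawler.CrawlController;

public class Controller {
    public static void main(String[] args) throws Exception {
        // Folder where crawler4j stores its intermediate crawl data (illustrative path).
        CrawlController controller = new CrawlController("/data/crawl/root");

        // Seed URL from which the crawl starts; it must be accepted by shouldVisit().
        controller.addSeed("http://www.ics.uci.edu/");

        // Launch 10 crawler threads running the MyCrawler logic;
        // this call blocks until the crawl finishes.
        controller.start(MyCrawler.class, 10);
    }
}

With this controller in place, each of the 10 threads repeatedly takes URLs from the shared frontier, calls shouldVisit() to filter them, and passes fetched pages to visit(), which is what makes the crawler multi-threaded without any extra synchronization code in MyCrawler.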