from scrapy.spider import Spider
from scrapy.selector import Selector

from dirbot.items import Website


class DmozSpider(Spider):
    """Spider that scrapes website listings from two DMOZ Python directory pages.

    Only the URLs listed in ``start_urls`` are fetched: ``parse`` returns
    items and never issues follow-up requests, so links found on those pages
    are not crawled.
    """

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """Build one ``Website`` item per directory entry on the page.

        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        items = []
        for site in sites:
            item = Website()
            # extract() returns a list of matched strings, so each field
            # holds a list rather than a single value.
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            # Raw string: same compiled pattern as before, but "\s" is no
            # longer an invalid escape sequence in the string literal.
            item['description'] = site.xpath('text()').re(r'-\s[^\n]*\r')
            items.append(item)
        return items
但是为什么只抓取这两个网页呢?我看到代码里设置了 allowed_domains = ["dmoz.org"],而这两个页面还包含指向 dmoz.org 域内其他页面的链接!为什么爬虫不会去抓取那些页面呢?
allowed_domains = ["dmoz.org"]
dmoz.org
start_urls 类属性只包含起始网址——仅此而已。如果你提取到了其他想要抓取的页面的网址,请在 parse 回调中 yield 相应的请求(Request),并为这些请求指定[另一个]回调:
start_urlsclass
parse
class Spider(BaseSpider):
    """Three-stage crawl: main page -> category pages -> item pages."""

    name = 'my_spider'
    start_urls = ['http://www.domain.com/']
    allowed_domains = ['domain.com']

    def parse(self, response):
        """Handle the start page and request every category link on it.

        Each extracted href is resolved against ``response.url`` and yielded
        as a ``Request`` whose callback is ``parseCategory``.
        """
        selector = HtmlXPathSelector(response)
        category_hrefs = selector.select(
            "//*[@id='tSubmenuContent']/a[position()>1]/@href"
        ).extract()
        for href in category_hrefs:
            category_url = urlparse.urljoin(response.url, href)
            self.log('Found category url: %s' % category_url)
            yield Request(category_url, callback=self.parseCategory)

    def parseCategory(self, response):
        """Handle a category page and request every item link on it.

        Each extracted href is resolved against ``response.url`` and yielded
        as a ``Request`` whose callback is ``parseItem``.
        """
        selector = HtmlXPathSelector(response)
        item_hrefs = selector.select(
            "//*[@id='_list']//td[@class='tListDesc']/a/@href"
        ).extract()
        for href in item_hrefs:
            item_url = urlparse.urljoin(response.url, href)
            self.log('Found item link: %s' % item_url, log.DEBUG)
            yield Request(item_url, callback=self.parseItem)

    def parseItem(self, response):
        """Handle an item page (body left elided in this snippet)."""
        ...