我被这个错误困扰了一段时间,以下错误消息如下:
File "C:\Python27\lib\site-packages\scrapy-0.20.2-py2.7.egg\scrapy\http\request\__init__.py", line 61, in _set_url raise ValueError('Missing scheme in request url: %s' % self._url) exceptions.ValueError: Missing scheme in request url: h
Scrapy code:
from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.selector import Selector from scrapy.http import Request from spyder.items import SypderItem import sys import MySQLdb import hashlib from scrapy import signals from scrapy.xlib.pydispatch import dispatcher # _*_ coding: utf-8 _*_ class some_Spyder(CrawlSpider): name = "spyder" def __init__(self, *a, **kw): # catch the spider stopping # dispatcher.connect(self.spider_closed, signals.spider_closed) # dispatcher.connect(self.on_engine_stopped, signals.engine_stopped) self.allowed_domains = "domainname.com" self.start_urls = "http://www.domainname.com/" self.xpaths = '''//td[@class="CatBg" and @width="25%" and @valign="top" and @align="center"] /table[@cellspacing="0"]//tr/td/a/@href''' self.rules = ( Rule(SgmlLinkExtractor(restrict_xpaths=(self.xpaths))), Rule(SgmlLinkExtractor(allow=('cart.php?')), callback='parse_items'), ) super(spyder, self).__init__(*a, **kw) def parse_items(self, response): sel = Selector(response) items = [] listings = sel.xpath('//*[@id="tabContent"]/table/tr') item = IgeItem() item["header"] = sel.xpath('//td[@valign="center"]/h1/text()') items.append(item) return items
我很确定这与我要求scrapy在LinkExtractor中跟随的URL有关。当将它们提取到shell中时,它们看起来像这样:
data=u'cart.php?target=category&category_id=826'
与从工作的蜘蛛中提取的另一个URL相比:
data=u'/path/someotherpath/category.php?query=someval'
我看过一些有关Stack Overflow的问题,例如下载抓取的图片,但从阅读的角度来看,我认为我可能会有一个稍微不同的问题。
更改start_urls为:
start_urls
self.start_urls = ["http://www.bankofwow.com/"]