The following 6 code examples, extracted from open-source Python projects, illustrate how to use scrapy.item.Item().
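Before the extracted examples, here is a minimal sketch of how an Item subclass is declared and populated; the class and field names are purely illustrative and do not come from the projects below.

import scrapy

class BlogPostItem(scrapy.Item):
    # each class attribute is a Field(); the Item then behaves like a dict
    # restricted to exactly these keys
    title = scrapy.Field()
    author = scrapy.Field()

item = BlogPostItem()
item['title'] = 'Hello'
item['author'] = 'Jane'
print(item)            # prints the populated fields as a dict
print(item['title'])   # 'Hello'
# item['subtitle'] = 'x'  # would raise KeyError: 'subtitle' is not a declared field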
def main(): """Rutina principal para la ejecución del Spider""" # set up signal to catch items scraped def catch_item(sender, item, **kwargs): print "Item extracted:", item dispatcher.connect(catch_item, signal=signals.item_passed) settings = Settings() settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36") settings.set("LOG_ENABLED",False) # setup crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) # definir el spider para el crawler crawler.crawl(EuropythonSpyder()) # iniciar scrapy print "STARTING ENGINE" crawler.start() #iniciar el crawler llamando al spider definido print "ENGINE STOPPED"
def main(): """Rutina principal para la ejecución del Spider""" # set up signal to catch items scraped def catch_item(sender, item, **kwargs): print "Item Extraido:", item dispatcher.connect(catch_item, signal=signals.item_passed) settings = Settings() settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36") settings.set("LOG_ENABLED",False) # setup crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) # definir el spider para el crawler crawler.crawl(BloggerSpider()) # iniciar scrapy print "STARTING ENGINE" crawler.start() #iniciar el crawler llamando al spider definido print "ENGINE STOPPED"
def main():
    """Main routine for running the spider"""
    from scrapy.xlib.pydispatch import dispatcher

    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print "Item extracted:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # register the spider with the crawler
    crawler.crawl(PydataSpiderDetails())

    print "STARTING ENGINE"
    crawler.start()  # start the crawler
    print "ENGINE STOPPED"
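The three main() routines above are written against old Scrapy releases (Python 2 print statements, scrapy.xlib.pydispatch, the item_passed signal). A rough sketch of the same pattern for current Scrapy versions, with an illustrative DemoItem and MySpider standing in for the project spiders, could look like this:

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

class DemoItem(scrapy.Item):
    title = scrapy.Field()

class MySpider(scrapy.Spider):
    name = "demo"
    start_urls = ["https://example.com"]   # illustrative start URL

    def parse(self, response):
        item = DemoItem()
        item["title"] = response.xpath("//title/text()").get()
        yield item

def catch_item(item, response, spider):
    # the item_scraped signal delivers the item together with its response and spider
    print("Item extracted:", item)

def main():
    settings = Settings()
    settings.set("LOG_ENABLED", False)

    process = CrawlerProcess(settings)
    crawler = process.create_crawler(MySpider)
    crawler.signals.connect(catch_item, signal=signals.item_scraped)

    process.crawl(crawler)
    print("STARTING ENGINE")
    process.start()   # blocks until crawling finishes
    print("ENGINE STOPPED")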
def _parse_spider_response(self, spider_response):
    """
    :param spider_response: return of parse spider method
    :return: job item generator
    """
    for response_item in spider_response:
        if isinstance(response_item, Request):
            request = response_item
            file_path = self._dump_format % request.url.replace(self._replace, self._dump_dir)
            if file_path.find('file://') != -1:
                file_path = file_path.replace('file://', '')
            response = fake_response_from_file(
                file_path=file_path,
                request=request,
                response_class=HtmlResponse
            )
            # If there is a callback, it is a job-page request
            if request.callback:
                for item in request.callback(response):
                    yield item
            # Otherwise it is a next-page request
            else:
                for job_item in self._parse_spider_response(self._spider.parse(response)):
                    yield job_item
        elif isinstance(response_item, Item):
            yield response_item
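This test helper walks a spider's output, replays Request callbacks against responses built from local files, and yields the resulting Item objects. Note that fake_response_from_file is not a Scrapy API but a helper defined in the source project; a minimal sketch of what such a helper commonly looks like (assuming UTF-8 files, and possibly differing from the project's own version) is:

from scrapy.http import HtmlResponse, Request

def fake_response_from_file(file_path, request=None, response_class=HtmlResponse):
    # Wrap a local HTML file in a Response so spider callbacks can run offline.
    if request is None:
        request = Request(url='file://' + file_path)
    with open(file_path, 'rb') as f:
        body = f.read()
    return response_class(
        url=request.url,
        request=request,
        body=body,
        encoding='utf-8',
    )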
def _get_item_field_attr(self, field_name, key, default=None):
    # Item.fields maps each declared field name to its Field() metadata dict
    if isinstance(self.item, Item):
        value = self.item.fields[field_name].get(key, default)
    else:
        value = default
    return value
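The lookup works because Field is a plain dict subclass: any keyword arguments passed to Field() are kept as metadata and exposed through Item.fields. A small sketch (the field names and the output_name key are made up for illustration; only serializer is a key that Scrapy's exporters actually recognise):

import scrapy

class ProductItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field(serializer=float, output_name='unit_price')

item = ProductItem(name='Widget', price='9.99')

print(ProductItem.fields['price'].get('serializer'))      # <class 'float'>
print(item.fields['price'].get('output_name', 'n/a'))     # 'unit_price'
print(item.fields['name'].get('serializer', 'none set'))  # 'none set'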
def parse_blog(self, response):
    print 'parsed link %s' % response.url
    hxs = HtmlXPathSelector(response)
    item = HackerWayItem()
    item['title'] = hxs.select('//title/text()').extract()                     # XPath selector for the title
    item['author'] = hxs.select("//span[@class='author']/a/text()").extract()  # XPath selector for the author
    item['tag'] = hxs.select("//meta[@property='og:title']/text()").extract()  # XPath selector for the tag
    item['date'] = hxs.select("//span[@class='date']/text()").extract()        # XPath selector for the date
    return item  # return the populated Item
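HtmlXPathSelector and its select()/extract() calls belong to very old Scrapy releases; current versions expose the same XPath queries directly on the response object. A sketch of an equivalent callback with the modern API, using an assumed HackerWayItem definition and an illustrative start URL:

import scrapy

class HackerWayItem(scrapy.Item):   # assumed field set, mirroring the example above
    title = scrapy.Field()
    author = scrapy.Field()
    tag = scrapy.Field()
    date = scrapy.Field()

class HackerWaySpider(scrapy.Spider):
    name = 'hackerway'
    start_urls = ['https://example.com/blog']   # placeholder URL

    def parse(self, response):
        self.logger.info('Parsed link %s', response.url)
        item = HackerWayItem()
        item['title'] = response.xpath('//title/text()').getall()
        item['author'] = response.xpath("//span[@class='author']/a/text()").getall()
        # og:title lives in the content attribute, not in element text
        item['tag'] = response.xpath("//meta[@property='og:title']/@content").getall()
        item['date'] = response.xpath("//span[@class='date']/text()").getall()
        yield item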