Python scrapy.item module: Item() example source code

The following 6 code examples, extracted from open-source Python projects, illustrate how to use scrapy.item.Item().
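Before the project snippets, here is a minimal, self-contained sketch of the pattern they all rely on: declare an Item subclass whose class attributes are Field() objects, then fill an instance like a dict. The class and field names (BlogPostItem, title, author) are illustrative only and do not come from the projects below.

from scrapy.item import Item, Field

class BlogPostItem(Item):
    # Each Field() declares a key the item may carry; keyword arguments
    # passed to Field() are stored as metadata in BlogPostItem.fields.
    title = Field()
    author = Field()

item = BlogPostItem()
item['title'] = 'Example post'   # items behave like dicts for declared keys
item['author'] = 'jmortega'
# item['other'] = 'x'            # undeclared keys raise KeyError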

Project: pydata_webscraping    Author: jmortega
def main():
    """Rutina principal para la ejecución del Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print "Item extracted:", item
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED",False)   

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print "STARTING ENGINE"
    crawler.start()  # start the crawler by calling the defined spider
    print "ENGINE STOPPED"
Project: pydata_webscraping    Author: jmortega
def main():
    """Rutina principal para la ejecución del Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print "Item Extraido:", item
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED",False)   

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(BloggerSpider())

    # start scrapy
    print "STARTING ENGINE"
    crawler.start()  # start the crawler by calling the defined spider
    print "ENGINE STOPPED"
Project: pydata_webscraping    Author: jmortega
def main():
    """Main routine for running the Spider."""
    from scrapy.xlib.pydispatch import dispatcher

    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print "Item extracted:", item
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED",False)   

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(PydataSpiderDetails())

    print "STARTING ENGINE"
    crawler.start()  # start the crawler
    print "ENGINE STOPPED"
Project: crawlers    Author: pyjobs
def _parse_spider_response(self, spider_response):
        """
        :param spider_response: return of parse spider method
        :return: job item generator
        """



        for response_item in spider_response:
            if isinstance(response_item, Request):
                request = response_item
                file_path = self._dump_format % request.url.replace(self._replace, self._dump_dir)
                if file_path.find('file://') != -1:
                    file_path = file_path.replace('file://', '')
                response = fake_response_from_file(
                        file_path=file_path,
                        request=request,
                        response_class=HtmlResponse
                )
                # If there is a callback, this is a job page request
                if request.callback:
                    for item in request.callback(response):
                        yield item
                # Otherwise it's a next-page request
                else:
                    for job_item in self._parse_spider_response(self._spider.parse(response)):
                        yield job_item

            elif isinstance(response_item, Item):
                yield response_item
Project: scrapy-itemloader    Author: scrapy
def _get_item_field_attr(self, field_name, key, default=None):
        if isinstance(self.item, Item):
            value = self.item.fields[field_name].get(key, default)
        else:
            value = default
        return value
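For context, the helper above reads Item.fields, the class-level mapping from each declared field name to its Field() metadata dict. A small illustrative sketch follows; the Product class and the serializer key are examples, not part of scrapy-itemloader.

from scrapy.item import Item, Field

class Product(Item):
    name = Field(serializer=str)   # metadata kept in Product.fields['name']
    price = Field()

# The same lookup _get_item_field_attr performs:
Product.fields['name'].get('serializer', None)    # -> str
Product.fields['price'].get('serializer', None)   # -> None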
Project: pydata_webscraping    Author: jmortega
def parse_blog(self, response):
    print 'parsed link %s' % response.url
        hxs = HtmlXPathSelector(response)
        item = HackerWayItem()
    item['title'] = hxs.select('//title/text()').extract()  # XPath selector for the title
    item['author'] = hxs.select("//span[@class='author']/a/text()").extract()  # XPath selector for the author
    item['tag'] = hxs.select("//meta[@property='og:title']/text()").extract()  # XPath selector for the tag
    item['date'] = hxs.select("//span[@class='date']/text()").extract()  # XPath selector for the date
    return item  # Return the Item.
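parse_blog assumes a HackerWayItem declared elsewhere in the project. A minimal declaration consistent with the fields used above would look like the sketch below; the actual definition in the repository may differ.

from scrapy.item import Item, Field

class HackerWayItem(Item):
    # Fields filled in by parse_blog above
    title = Field()
    author = Field()
    tag = Field()
    date = Field()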