Python scrapy.signals module: spider_closed() example source code

From open-source Python projects, we extracted the following 50 code examples that illustrate how to use scrapy.signals.spider_closed().
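
Before the project examples, here is a minimal, self-contained sketch of the usual pattern: connect a handler to signals.spider_closed from the spider's from_crawler() class method. The spider name, URL, and log message below are illustrative placeholders, not taken from any of the projects listed.

import scrapy
from scrapy import signals


class CleanupSpider(scrapy.Spider):
    # illustrative names; not from any of the projects below
    name = 'cleanup_example'
    start_urls = ['http://example.com']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CleanupSpider, cls).from_crawler(crawler, *args, **kwargs)
        # register the handler; Scrapy calls it once when the spider is closed
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def parse(self, response):
        pass

    def spider_closed(self, spider):
        # runs after the crawl finishes (reason: finished/cancelled/shutdown)
        self.logger.info('spider %s closed', spider.name)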

Project: scrapyjiji    Author: sbourdelin
def spider_closed(self, spider):
        """Handle the spider_closed event to save the map"""

        # create the special marker for all the ads without geocode
        print "found %d items without geocode" % (len(self.no_geocode))
        if len(self.no_geocode) > 0:
            html = ""
            for x in self.no_geocode:
                html += "<a href=%s target=_blank>%s</a> : %s<br>" % (x["url"], x["title"], x["price"])
            iframe  = folium.element.IFrame(html=html, width=500, height=100)
            popup   = folium.Popup(iframe, max_width=500)
            folium.Marker(MAP_LATLNG,
                          popup=popup,
                          icon=folium.Icon()).add_to(self.m_map)

        print "found %d new items" % (self.new_items)
        pickle.dump(self.m_list, open(DATABASE, 'wb'))
        self.m_map.save('map.html')
Project: ip_proxy_pool    Author: leeyis
def __init__(self,rule):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.rule = rule
        self.name = rule.name
        self.allowed_domains = rule.allowed_domains.split(',')
        self.start_urls = rule.start_urls.split(',')
        rule_list = []

        # if a 'next page' XPath is configured, add a follow-only rule for it
        if len(rule.next_page):
            rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))

        rule_list.append(Rule(LinkExtractor(
            allow=rule.allow_url.split(','),
            unique=True),
            follow=True,
            callback='parse_item'))

        self.rules = tuple(rule_list)
        super(ProxySpiderSpider, self).__init__()
Project: feeds    Author: nblock
def from_crawler(cls, crawler):
        try:
            output_path = (
                crawler.settings.get('FEEDS_CONFIG')['feeds']['output_path']
            )
        except (KeyError, TypeError):
            output_path = 'output'
        try:
            output_url = (
                crawler.settings.get('FEEDS_CONFIG')['feeds']['output_url']
            )
        except (KeyError, TypeError):
            output_url = None
        pipeline = cls(output_path=output_path, output_url=output_url)
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Project: Spider    Author: poluo
def process_item(self, item, spider):
        if isinstance(item, AirbnbItem):
            self.room_count += 1
            if self.room_count > 100000:
                self.room_count = 0
                self.room_file_count += 1
                self.spider_closed(spider, mode=1)
                self.spider_opened(spider, mode=1)
            self.exporter_room.export_item(item)
        elif isinstance(item, UserItem):
            self.user_count += 1
            if self.user_count > 100000:
                self.user_count = 0
                self.user_file_count += 1
                self.spider_closed(spider, mode=2)
                self.spider_opened(spider, mode=2)
            self.exporter_user.export_item(item)
        else:
            logger.info('Some error happened!')
Project: Charlie    Author: nxintech
def run_spider():
    settings = Settings()
    settings.set('ITEM_PIPELINES', {
        '__main__.JsonWriterPipeline': 100
    })

    # enable remote sever certificate verification
    # see http://doc.scrapy.org/en/latest/topics/settings.html#downloader-clientcontextfactory
    settings.set('DOWNLOADER_CLIENTCONTEXTFACTORY',
                 'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'
                 )

    # uncomment below line to enable the logging for debug
    # configure_logging()

    crawler = Crawler(JenkinsJobSpider, settings)
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
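
The run_spider() above wires the callback on a bare Crawler and drives the Twisted reactor by hand. A roughly equivalent sketch using CrawlerProcess, which manages the reactor itself (JenkinsJobSpider and callback are assumed to be the same objects defined elsewhere in that project):

from scrapy import signals
from scrapy.crawler import CrawlerProcess


def run_spider_with_process(settings):
    # CrawlerProcess starts and stops the Twisted reactor for us
    process = CrawlerProcess(settings)
    crawler = process.create_crawler(JenkinsJobSpider)
    crawler.signals.connect(callback, signal=signals.spider_closed)
    process.crawl(crawler)
    process.start()  # blocks until every crawler has finished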
Project: alltheplaces    Author: alltheplaces
def spider_closed(spider):
        spider_stats[spider.name] = {
            'finish_reason': spider.crawler.stats.get_value('finish_reason'),
            'duration': (
                spider.crawler.stats.get_value('finish_time') -
                spider.crawler.stats.get_value('start_time')).total_seconds(),
            'item_scraped_count':
                spider.crawler.stats.get_value('item_scraped_count'),
        }

        print("Spider %s closed (%s) after %0.1f sec, %d items" % (
            spider.name,
            spider.crawler.stats.get_value('finish_reason'),
            (spider.crawler.stats.get_value('finish_time') -
                spider.crawler.stats.get_value('start_time')).total_seconds(),
            spider.crawler.stats.get_value('item_scraped_count') or 0,
        ))
Project: scrapy_site    Author: hl10502
def __init__(self):
        dispatcher.connect(self.spider_opended, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        dispatcher.connect(self.engine_stopped, signals.engine_stopped)
        dispatcher.connect(self.engine_started, signals.engine_started)

        # current working directory (the crawler is expected to run from the scrapy_site project root)
        self.curpath = os.getcwd()
        # directory holding the per-spider message files
        self.spidername_filepath = self.curpath + "/scrapy_site/msg/"

        # load the keyword list from keyword.conf
        self.keywordsDict = dict()
        self.getKeywords()

        # load the website name mapping
        self.webnamesDict = dict()
        self.getWebnames()

        # message cache
        self.msgDict = dict()

        SavePipeline.initCount = SavePipeline.initCount + 1
Project: ArticleSpider    Author: mtianyan
def __init__(self):
        self.fail_urls=[]
        dispatcher.connect(self.handle_spider_cosed, signals.spider_closed)
Project: NetEaseMusicCrawler    Author: yaochao
def __init__(self, settings):
        self.options = settings.get('PHANTOMJS_OPTIONS', {})  # PhantomJS options
        max_run = settings.get('PHANTOMJS_MAXRUN', 10)  # maximum number of PhantomJS instances running concurrently, default 10
        self.sem = defer.DeferredSemaphore(max_run)
        self.queue = Queue.LifoQueue(maxsize=max_run)  # LifoQueue: last in, first out
        SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)
Project: tipi-engine    Author: CIECODE-Madrid
def __init__(self,*a, **kw):
        super(StackSpider,self).__init__(*a, **kw)
        self.time = datetime.datetime.now()
        self.congress = Congress()
        self.members = self.congress.searchAll("diputados")
        self.groups = self.congress.searchAll("grupos")
        dispatcher.connect(self.whenFinish, signals.spider_closed)
Project: tipi-engine    Author: CIECODE-Madrid
def __init__(self, crawler):
        self.crawler = crawler
        self.initiatives = 0
        self.amendments = 0
        self.finishtext = 0
        self.responses = 0
        self.members = 0
        # connect the extension object to signals
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
Project: tipi-engine    Author: CIECODE-Madrid
def spider_closed(self, spider):
        self.crawler.stats.set_value('item/initiatives', self.initiatives)
        self.crawler.stats.set_value('item/amendments', self.amendments)
        self.crawler.stats.set_value('item/finishtext', self.finishtext)
        self.crawler.stats.set_value('item/responses', self.responses)
        self.crawler.stats.set_value('item/members', self.members)
Project: job_scraper    Author: wlabatey
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Project: job_scraper    Author: wlabatey
def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
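
Many of the exporter pipelines in this list split the same lifecycle across from_crawler(), spider_opened() and spider_closed(). A self-contained sketch of that pattern with a CsvItemExporter (the output file name and single-exporter layout are illustrative assumptions, not code from any listed project):

from scrapy import signals
from scrapy.exporters import CsvItemExporter


class CsvExportPipeline(object):

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # one output file per spider, opened when the spider starts
        file = open('%s_items.csv' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        # flush the exporter and close the file when the spider finishes
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item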
Project: scrapyjiji    Author: sbourdelin
def __init__(self, *a, **kw):
        """Attach a callback to the spider_closed signal"""
        super(Kijiji, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        if USE_DB is True:
            self.open_database()
            if DRAW_ALL_DB is True and DRAW_NEW_AD_ONLY is False:
                # add markers that are already known
                for x in self.m_list:
                    self.add_marker(x, False)
Project: weather    Author: awolfly9
def __init__(self, *a, **kw):
        super(TianqiSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        self.sql = SqlHelper()
        self.weather_table_name = config.weather_table
        self.citys = []

        self.init()
Project: weather    Author: awolfly9
def spider_closed(self, spider):
        self.log('spider_closed: weather crawl finished')
Project: ssp-transparencia    Author: eltermann
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Project: ssp-transparencia    Author: eltermann
def spider_closed(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for file in self.files:
            file.close()
Project: BlogSpider    Author: hack4code
def from_crawler(cls, crawler):
        ext = cls(crawler.stats)
        crawler.signals.connect(ext.spider_opened,
                                signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed,
                                signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped,
                                signal=signals.item_scraped)

        return ext
Project: BlogSpider    Author: hack4code
def spider_closed(self, spider):
        value = self.stats.get_value('item_scraped_count',
                                     0)
        save_stats(spider.settings['SPIDER_STATS_URL'],
                   spider._id,
                   value)
        if spider.settings['BOT_NAME'] != 'TestSpider':
            logger.info('spider[%s] crawled %d articles',
                        spider.name,
                        value)
            if value == 0:
                update_spider_stats(spider,
                                    {'fail': 1})
Project: ip_proxy_pool    Author: leeyis
def spider_closed(self, spider):
        print "spider is closed!"
        session = loadSession()
        log = session.query(SpiderCrawlLog).filter(SpiderCrawlLog.spiderID == self.rule.id
                                                   and SpiderCrawlLog.endTime is None
                                                   ).first()
        log.endTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        session.commit()

        pass
Project: finance_news_analysis    Author: pskun
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Project: finance_news_analysis    Author: pskun
def spider_closed(self, spider):
        file = self.files.pop(spider.name)
        file.close()
        pass
Project: alsam_mi_ki    Author: mersanuzun
def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Project: alsam_mi_ki    Author: mersanuzun
def spider_closed(self, spider):
        self.file.seek(-2, os.SEEK_END)
        self.file.truncate()
        self.file.write(']')
        self.file.close()
Project: scrapy-training    Author: scrapinghub
def from_crawler(cls, crawler):
        m = cls()
        if not crawler.settings.getbool('SELENIUM_ENABLED'):
            raise NotConfigured()
        crawler.signals.connect(m.spider_closed, signal=signals.spider_closed)
        return m
Project: scrapy-training    Author: scrapinghub
def spider_closed(self, spider):
        self.driver.close()
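
The scrapy-training excerpt above only shows the teardown half. A hedged sketch of a complete middleware with the matching setup, assuming Selenium with a Chrome driver (the original project may create a different browser elsewhere):

from scrapy import signals
from scrapy.exceptions import NotConfigured
from selenium import webdriver


class SeleniumMiddleware(object):

    def __init__(self):
        # illustrative driver choice; created once per crawl
        self.driver = webdriver.Chrome()

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('SELENIUM_ENABLED'):
            raise NotConfigured()
        m = cls()
        crawler.signals.connect(m.spider_closed, signal=signals.spider_closed)
        return m

    def spider_closed(self, spider):
        # release the browser when the spider finishes
        self.driver.quit()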
Project: feeds    Author: nblock
def spider_closed(self, spider):
        # Add feed header(s) at the end so they can be dynamic.
        for feed_header in iterate_spider_output(spider.feed_headers()):
            self._exporters[spider].export_item(feed_header)
        self._exporters[spider].finish_exporting()
        self._exporters.pop(spider)
Project: scrapy_project    Author: zhanghe06
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Project: scrapy_project    Author: zhanghe06
def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_json_lines = self.files.pop(spider)
        file_json_lines.close()
Project: scrapy_project    Author: zhanghe06
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)  # register the spider_closed handler
        return pipeline
Project: scrapy_project    Author: zhanghe06
def spider_closed(self, spider, reason):
        """
        Print crawl statistics when the spider closes.
        :param spider:
        :param reason: finished/cancelled/shutdown
        :return:
        """
        print(time.strftime("%Y-%m-%d %H:%M:%S"), 'StatsPipeline   Signals: spider_closed')
        print(spider.crawler.stats.get_stats())
        print(spider.crawler.stats.get_value('downloader/request_count', 0))  # requests sent
        print(spider.crawler.stats.get_value('downloader/response_count', 0))  # responses downloaded
        print(spider.crawler.stats.get_value('response_received_count', 0))  # responses received
        print(spider.crawler.stats.get_value('item_dropped_count', 0))  # items dropped
        print(spider.crawler.stats.get_value('item_scraped_count', 0))  # items scraped
Project: scrapy_project    Author: zhanghe06
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.engine_started, signals.engine_started)  # engine started
        crawler.signals.connect(pipeline.engine_stopped, signals.engine_stopped)  # engine stopped
        crawler.signals.connect(pipeline.item_scraped, signals.item_scraped)  # item scraped
        crawler.signals.connect(pipeline.item_dropped, signals.item_dropped)  # item dropped
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)  # spider opened
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)  # spider closed
        crawler.signals.connect(pipeline.spider_idle, signals.spider_idle)      # spider idle
        crawler.signals.connect(pipeline.spider_error, signals.spider_error)    # spider error
        crawler.signals.connect(pipeline.request_scheduled, signals.request_scheduled)    # request scheduled
        crawler.signals.connect(pipeline.request_dropped, signals.request_dropped)    # request dropped
        crawler.signals.connect(pipeline.response_received, signals.response_received)    # response received
        crawler.signals.connect(pipeline.response_downloaded, signals.response_downloaded)    # response downloaded
        return pipeline
Project: scrapy_project    Author: zhanghe06
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Project: scrapy_project    Author: zhanghe06
def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_csv = self.files.pop(spider)
        file_csv.close()
Project: scrapy_project    Author: zhanghe06
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Project: scrapy_project    Author: zhanghe06
def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_xml = self.files.pop(spider)
        file_xml.close()
Project: scrapy_project    Author: zhanghe06
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Project: malspider    Author: ciscocsirt
def __init__(self, *args, **kwargs):
        super(FullDomainSpider, self).__init__(*args, **kwargs)
        self.allowed_domains = kwargs.get('allowed_domains').split(',')
        self.org = kwargs.get('org')
        self.start_urls = kwargs.get('start_urls').split(',')
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Project: malspider    Author: ciscocsirt
def spider_closed(self, spider):
        try:
            self.conn.close()
        except:
            log.msg("Could not close database connection", level=log.ERROR)
Project: scrapy_rss    Author: woxcab
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Project: scrapy_rss    Author: woxcab
def spider_closed(self, spider):
        self.exporters[spider].finish_exporting()
        file = self.files.pop(spider)
        file.close()
Project: scrapy_rss    Author: woxcab
def __exit__(self, exc_type, exc_val, exc_tb):
        responses = self.crawler.signals.send_catch_log(signal=signals.spider_closed,
                                                        spider=self.spider, reason=None)
        for _, failure in responses:
            if failure:
                failure.raiseException()
Project: Spider    Author: poluo
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Project: Spider    Author: poluo
def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
Project: Spider    Author: poluo
def process_item(self, item, spider):
        self.count += 1
        if self.count > 1000:
            self.count = 0
            self.file_count += 1
            self.spider_closed()
            self.spider_opened()
        self.exporter.export_item(item)
        return item
Project: Spider    Author: poluo
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
Project: Spider    Author: poluo
def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
Project: Spider    Author: poluo
def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline