We extracted the following 50 code examples from open-source Python projects to illustrate how to use scrapy.signals.spider_closed().
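Before the project examples, here is a minimal sketch of the common pattern they all follow: register a handler for the spider_closed signal (typically in from_crawler, or via pydispatch's dispatcher in older code) and release resources there. The names below (DemoPipeline, the output filename) are illustrative assumptions, not taken from any of the quoted projects; only the signal API itself is standard Scrapy.

# minimal illustrative sketch; DemoPipeline and the filename are hypothetical
from scrapy import signals


class DemoPipeline(object):
    """Open a file when the spider starts and close it on spider_closed."""

    def __init__(self):
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        # connect handlers through the crawler's SignalManager
        crawler.signals.connect(pipeline.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signal=signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('%s_items.jl' % spider.name, 'w')

    def process_item(self, item, spider):
        self.file.write(str(item) + '\n')
        return item

    def spider_closed(self, spider):
        # called once the spider finishes; release resources here
        self.file.close()

Enable such a pipeline through the ITEM_PIPELINES setting as usual; the examples below show the same signal used from pipelines, extensions, downloader middlewares, and spiders themselves.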
def spider_closed(self, spider): """Handle the spider_closed event to save the map""" # create the special marker for all the ads without geocode print "found %d items without geocode" % (len(self.no_geocode)) if len(self.no_geocode) > 0: html = "" for x in self.no_geocode: html += "<a href=%s target=_blank>%s</a> : %s<br>" % (x["url"], x["title"], x["price"]) iframe = folium.element.IFrame(html=html, width=500, height=100) popup = folium.Popup(iframe, max_width=500) folium.Marker(MAP_LATLNG, popup=popup, icon=folium.Icon()).add_to(self.m_map) print "found %d new items" % (self.new_items) pickle.dump(self.m_list, open(DATABASE, 'wb')) self.m_map.save('map.html')
def __init__(self, rule):
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.rule = rule
    self.name = rule.name
    self.allowed_domains = rule.allowed_domains.split(',')
    self.start_urls = rule.start_urls.split(',')
    rule_list = []
    # add a rule for the "next page" link if one is configured
    if len(rule.next_page):
        rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))
    rule_list.append(Rule(LinkExtractor(allow=rule.allow_url.split(','), unique=True),
                          follow=True, callback='parse_item'))
    self.rules = tuple(rule_list)
    super(ProxySpiderSpider, self).__init__()
def from_crawler(cls, crawler):
    try:
        output_path = (
            crawler.settings.get('FEEDS_CONFIG')['feeds']['output_path']
        )
    except (KeyError, TypeError):
        output_path = 'output'
    try:
        output_url = (
            crawler.settings.get('FEEDS_CONFIG')['feeds']['output_url']
        )
    except (KeyError, TypeError):
        output_url = None
    pipeline = cls(output_path=output_path, output_url=output_url)
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def process_item(self, item, spider):
    if isinstance(item, AirbnbItem):
        self.room_count += 1
        if self.room_count > 100000:
            self.room_count = 0
            self.room_file_count += 1
            self.spider_closed(spider, mode=1)
            self.spider_opened(spider, mode=1)
        self.exporter_room.export_item(item)
    elif isinstance(item, UserItem):
        self.user_count += 1
        if self.user_count > 100000:
            self.user_count = 0
            self.user_file_count += 1
            self.spider_closed(spider, mode=2)
            self.spider_opened(spider, mode=2)
        self.exporter_user.export_item(item)
    else:
        logger.info('Some error happened!')
def run_spider():
    settings = Settings()
    settings.set('ITEM_PIPELINES', {
        '__main__.JsonWriterPipeline': 100
    })
    # enable remote server certificate verification
    # see http://doc.scrapy.org/en/latest/topics/settings.html#downloader-clientcontextfactory
    settings.set('DOWNLOADER_CLIENTCONTEXTFACTORY',
                 'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory')
    # uncomment the line below to enable logging for debugging
    # configure_logging()
    crawler = Crawler(JenkinsJobSpider, settings)
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
def spider_closed(spider):
    spider_stats[spider.name] = {
        'finish_reason': spider.crawler.stats.get_value('finish_reason'),
        'duration': (
            spider.crawler.stats.get_value('finish_time') -
            spider.crawler.stats.get_value('start_time')).total_seconds(),
        'item_scraped_count': spider.crawler.stats.get_value('item_scraped_count'),
    }
    print("Spider %s closed (%s) after %0.1f sec, %d items" % (
        spider.name,
        spider.crawler.stats.get_value('finish_reason'),
        (spider.crawler.stats.get_value('finish_time') -
         spider.crawler.stats.get_value('start_time')).total_seconds(),
        spider.crawler.stats.get_value('item_scraped_count') or 0,
    ))
def __init__(self):
    dispatcher.connect(self.spider_opended, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    dispatcher.connect(self.engine_stopped, signals.engine_stopped)
    dispatcher.connect(self.engine_started, signals.engine_started)
    # current working directory of the scrapy_site project
    self.curpath = os.getcwd()
    # directory where the spider message files are stored
    self.spidername_filepath = self.curpath + "/scrapy_site/msg/"
    # keywords loaded from keyword.conf
    self.keywordsDict = dict()
    self.getKeywords()
    # website names
    self.webnamesDict = dict()
    self.getWebnames()
    # collected messages
    self.msgDict = dict()
    SavePipeline.initCount = SavePipeline.initCount + 1
def __init__(self):
    self.fail_urls = []
    dispatcher.connect(self.handle_spider_cosed, signals.spider_closed)
def __init__(self, settings):
    self.options = settings.get('PHANTOMJS_OPTIONS', {})  # PhantomJS options
    max_run = settings.get('PHANTOMJS_MAXRUN', 10)  # maximum number of concurrent PhantomJS instances, default 10
    self.sem = defer.DeferredSemaphore(max_run)
    self.queue = Queue.LifoQueue(maxsize=max_run)  # LifoQueue: last in, first out
    SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)
def __init__(self, *a, **kw):
    super(StackSpider, self).__init__(*a, **kw)
    self.time = datetime.datetime.now()
    self.congress = Congress()
    self.members = self.congress.searchAll("diputados")
    self.groups = self.congress.searchAll("grupos")
    dispatcher.connect(self.whenFinish, signals.spider_closed)
def __init__(self, crawler):
    self.crawler = crawler
    self.initiatives = 0
    self.amendments = 0
    self.finishtext = 0
    self.responses = 0
    self.members = 0
    # connect the extension object to signals
    crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
def spider_closed(self, spider):
    self.crawler.stats.set_value('item/initiatives', self.initiatives)
    self.crawler.stats.set_value('item/amendments', self.amendments)
    self.crawler.stats.set_value('item/finishtext', self.finishtext)
    self.crawler.stats.set_value('item/responses', self.responses)
    self.crawler.stats.set_value('item/members', self.members)
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file = self.files.pop(spider)
    file.close()
def __init__(self, *a, **kw): """Attach a callback to the spider_closed signal""" super(Kijiji, self).__init__(*a, **kw) dispatcher.connect(self.spider_closed, signals.spider_closed) if USE_DB is True: self.open_database() if DRAW_ALL_DB is True and DRAW_NEW_AD_ONLY is False: # add already know marker for x in self.m_list: self.add_marker(x, False)
def __init__(self, *a, **kw):
    super(TianqiSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.sql = SqlHelper()
    self.weather_table_name = config.weather_table
    self.citys = []
    self.init()
def spider_closed(self, spider):
    self.log('spider_closed signal received')
def spider_closed(self, spider):
    for exporter in self.exporters.values():
        exporter.finish_exporting()
    for file in self.files:
        file.close()
def from_crawler(cls, crawler):
    ext = cls(crawler.stats)
    crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
    return ext
def spider_closed(self, spider):
    value = self.stats.get_value('item_scraped_count', 0)
    save_stats(spider.settings['SPIDER_STATS_URL'], spider._id, value)
    if spider.settings['BOT_NAME'] != 'TestSpider':
        logger.info('spider[%s] crawled %d articles', spider.name, value)
        if value == 0:
            update_spider_stats(spider, {'fail': 1})
def spider_closed(self, spider): print "spider is closed!" session = loadSession() log = session.query(SpiderCrawlLog).filter(SpiderCrawlLog.spiderID == self.rule.id and SpiderCrawlLog.endTime is None ).first() log.endTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S") session.commit() pass
def spider_closed(self, spider):
    file = self.files.pop(spider.name)
    file.close()
def __init__(self):
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def spider_closed(self, spider):
    # strip the trailing ",\n" written after the last item, then close the JSON array
    self.file.seek(-2, os.SEEK_END)
    self.file.truncate()
    self.file.write(']')
    self.file.close()
def from_crawler(cls, crawler):
    m = cls()
    if not crawler.settings.getbool('SELENIUM_ENABLED'):
        raise NotConfigured()
    crawler.signals.connect(m.spider_closed, signal=signals.spider_closed)
    return m
def spider_closed(self, spider):
    self.driver.close()
def spider_closed(self, spider):
    # Add feed header(s) at the end so they can be dynamic.
    for feed_header in iterate_spider_output(spider.feed_headers()):
        self._exporters[spider].export_item(feed_header)
    self._exporters[spider].finish_exporting()
    self._exporters.pop(spider)
def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file_json_lines = self.files.pop(spider)
    file_json_lines.close()
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)  # called when the spider is closed
    return pipeline
def spider_closed(self, spider, reason):
    """
    Called when the spider is closed and prints the collected stats.
    :param spider:
    :param reason: finished/cancelled/shutdown
    :return:
    """
    print time.strftime("%Y-%m-%d %H:%M:%S"), 'StatsPipeline Signals: spider_closed'
    print spider.crawler.stats.get_stats()
    print spider.crawler.stats.get_value('downloader/request_count', 0)   # requests sent
    print spider.crawler.stats.get_value('downloader/response_count', 0)  # responses downloaded
    print spider.crawler.stats.get_value('response_received_count', 0)    # responses received
    print spider.crawler.stats.get_value('item_dropped_count', 0)         # items dropped
    print spider.crawler.stats.get_value('item_scraped_count', 0)         # items scraped
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.engine_started, signals.engine_started)            # engine started
    crawler.signals.connect(pipeline.engine_stopped, signals.engine_stopped)            # engine stopped
    crawler.signals.connect(pipeline.item_scraped, signals.item_scraped)                # item passed through all pipelines
    crawler.signals.connect(pipeline.item_dropped, signals.item_dropped)                # item dropped by a pipeline
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)              # spider opened
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)              # spider closed
    crawler.signals.connect(pipeline.spider_idle, signals.spider_idle)                  # spider went idle
    crawler.signals.connect(pipeline.spider_error, signals.spider_error)                # spider callback raised an error
    crawler.signals.connect(pipeline.request_scheduled, signals.request_scheduled)      # request scheduled
    crawler.signals.connect(pipeline.request_dropped, signals.request_dropped)          # request dropped by the scheduler
    crawler.signals.connect(pipeline.response_received, signals.response_received)      # response received
    crawler.signals.connect(pipeline.response_downloaded, signals.response_downloaded)  # response downloaded
    return pipeline
def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file_csv = self.files.pop(spider)
    file_csv.close()
def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file_xml = self.files.pop(spider)
    file_xml.close()
def __init__(self, *args, **kwargs):
    super(FullDomainSpider, self).__init__(*args, **kwargs)
    self.allowed_domains = kwargs.get('allowed_domains').split(',')
    self.org = kwargs.get('org')
    self.start_urls = kwargs.get('start_urls').split(',')
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def spider_closed(self, spider):
    try:
        self.conn.close()
    except:
        log.msg("Could not close database connection", level=log.ERROR)
def spider_closed(self, spider):
    self.exporters[spider].finish_exporting()
    file = self.files.pop(spider)
    file.close()
def __exit__(self, exc_type, exc_val, exc_tb):
    responses = self.crawler.signals.send_catch_log(
        signal=signals.spider_closed, spider=self.spider, reason=None)
    for _, failure in responses:
        if failure:
            failure.raiseException()
def process_item(self, item, spider):
    self.count += 1
    if self.count > 1000:
        self.count = 0
        self.file_count += 1
        self.spider_closed()
        self.spider_opened()
    self.exporter.export_item(item)
    return item