The following 29 code examples, extracted from open source Python projects, illustrate how to use scrapy.signals.spider_idle().
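Before the examples, here is a minimal, self-contained sketch of the pattern most of them follow: connect a handler to signals.spider_idle, and raise DontCloseSpider inside it to veto shutdown. The spider name and URL are placeholders.

import scrapy
from scrapy import signals
from scrapy.exceptions import DontCloseSpider


class KeepAliveSpider(scrapy.Spider):
    """Hypothetical spider that stays alive after its queue drains."""
    name = 'keepalive'
    start_urls = ['http://example.com']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(KeepAliveSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Scrapy fires spider_idle whenever the scheduler and downloader
        # have no pending requests.
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        # Raising DontCloseSpider vetoes shutdown; without it the engine
        # closes the spider with reason 'finished'.
        self.logger.info('Idle: keeping the spider open.')
        raise DontCloseSpider

    def parse(self, response):
        pass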
def setup_redis(self): """Setup redis connection and idle signal. This should be called after the spider has set its crawler object. """ if not self.redis_key: self.redis_key = '%s:start_urls' % self.name self.server = connection.from_settings(self.crawler.settings) # idle signal is called when the spider has no requests left, # that's when we will schedule new requests from redis queue self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped) self.log("Reading URLs from redis list '%s'" % self.redis_key)
def spider_idle(self): """Schedules a request if available, otherwise waits.""" self.schedule_next_request() raise DontCloseSpider
def setup_rabbitmq(self):
    """Setup RabbitMQ connection.

    Call this method after the spider has set its crawler object.

    :return: None
    """
    if not self.rabbitmq_key:
        self.rabbitmq_key = '{}:start_urls'.format(self.name)
    self.server, self.redis_server = connection.from_settings(self.crawler.settings)
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
def spider_idle(self): """ Waits for request to be scheduled. :return: None """ self.crawler.engine.slot.scheduler.next_request() raise DontCloseSpider
def dm_setup(self):
    """Set up the handler for the spider idle state.

    It's implemented this way to support one or many instances
    of the mixin.
    """
    dispatcher.connect(
        self.dequeue_next_page_requests,
        signal=signals.spider_idle
    )
    self._was_setup_called = True
def dm_teardown(self):
    """Disconnect the method from the signal.

    This avoids conflicts when many instances of the mixin
    are being executed.
    """
    try:
        dispatcher.disconnect(
            self.dequeue_next_page_requests,
            signal=signals.spider_idle
        )
    except DispatcherKeyError:
        pass
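The two mixin methods above use the module-level dispatcher API rather than crawler.signals, and the excerpt omits its imports. With the standalone PyDispatcher package they would plausibly be as follows (older Scrapy versions vendored the same module as scrapy.xlib.pydispatch):

from pydispatch import dispatcher
from pydispatch.errors import DispatcherKeyError
from scrapy import signals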
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(BCSpider, cls).from_crawler(crawler, *args, **kwargs)
    spider._set_crawler(crawler)
    spider.crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
    return spider
def spider_idle(self):
    self.log("Spider idle signal caught.")
    raise DontCloseSpider
def spider_idle(self): """Schedules a request if available, otherwise waits.""" # XXX: Handle a sentinel to close the spider. self.schedule_next_requests() raise DontCloseSpider
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.engine_started, signals.engine_started)  # engine started
    crawler.signals.connect(pipeline.engine_stopped, signals.engine_stopped)  # engine stopped
    crawler.signals.connect(pipeline.item_scraped, signals.item_scraped)  # item scraped
    crawler.signals.connect(pipeline.item_dropped, signals.item_dropped)  # item dropped
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)  # spider opened
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)  # spider closed
    crawler.signals.connect(pipeline.spider_idle, signals.spider_idle)  # spider idle
    crawler.signals.connect(pipeline.spider_error, signals.spider_error)  # spider error
    crawler.signals.connect(pipeline.request_scheduled, signals.request_scheduled)  # request scheduled
    crawler.signals.connect(pipeline.request_dropped, signals.request_dropped)  # request dropped
    crawler.signals.connect(pipeline.response_received, signals.response_received)  # response received
    crawler.signals.connect(pipeline.response_downloaded, signals.response_downloaded)  # response downloaded
    return pipeline
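To activate a pipeline like this one, register it in the project settings; the module path below is a placeholder for wherever the class actually lives.

# settings.py -- 'myproject.pipelines.SignalDebugPipeline' is a placeholder path.
ITEM_PIPELINES = {
    'myproject.pipelines.SignalDebugPipeline': 300,
}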
def spider_idle(self, spider):
    """Called when the spider goes idle.

    :param spider:
    :return:
    """
    print(time.strftime("%Y-%m-%d %H:%M:%S"), 'Pipeline Signals: spider_idle')
def _set_crawler(self, crawler):
    super(StructureSpider, self)._set_crawler(crawler)
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
def spider_idle(self):
    if self.settings.getbool("IDLE", True):
        print("Don't close spider......")
        raise DontCloseSpider
def set_crawler(self, crawler):
    super(RedisSpider, self).set_crawler(crawler)
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
def spider_idle(self):
    raise DontCloseSpider
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(WebSpider, cls).from_crawler(crawler, *args, **kwargs)
    if settings.get('FULL_PAGERANK_COMPUTE', False):
        crawler.signals.connect(spider.on_idle, signals.spider_idle)
    return spider
def setup_redis(self, crawler=None):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if self.server is not None:
        return

    if crawler is None:
        # We allow optional crawler argument to keep backwards
        # compatibility.
        # XXX: Raise a deprecation warning.
        crawler = getattr(self, 'crawler', None)

    if crawler is None:
        raise ValueError("crawler is required")

    settings = crawler.settings

    if self.redis_key is None:
        self.redis_key = settings.get(
            'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY,
        )

    self.redis_key = self.redis_key % {'name': self.name}

    if not self.redis_key.strip():
        raise ValueError("redis_key must not be empty")

    if self.redis_batch_size is None:
        self.redis_batch_size = settings.getint(
            'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE,
        )

    try:
        self.redis_batch_size = int(self.redis_batch_size)
    except (TypeError, ValueError):
        raise ValueError("redis_batch_size must be an integer")

    self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                     "(batch size: %(redis_batch_size)s)", self.__dict__)

    self.server = connection.from_settings(crawler.settings)
    # The idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from redis queue
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
def setup_redis(self, crawler=None):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if self.server is not None:
        return

    if crawler is None:
        # We allow optional crawler argument to keep backwards
        # compatibility.
        # XXX: Raise a deprecation warning.
        crawler = getattr(self, 'crawler', None)

    if crawler is None:
        raise ValueError("crawler is required")

    settings = crawler.settings

    if self.redis_key is None:
        self.redis_key = settings.get(
            'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
        )

    self.redis_key = self.redis_key % {'name': self.name}

    if not self.redis_key.strip():
        raise ValueError("redis_key must not be empty")

    if self.redis_batch_size is None:
        # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
        self.redis_batch_size = settings.getint(
            'REDIS_START_URLS_BATCH_SIZE',
            settings.getint('CONCURRENT_REQUESTS'),
        )

    try:
        self.redis_batch_size = int(self.redis_batch_size)
    except (TypeError, ValueError):
        raise ValueError("redis_batch_size must be an integer")

    if self.redis_encoding is None:
        self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

    self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                     "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                     self.__dict__)

    self.server = connection.from_settings(crawler.settings)
    # The idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from redis queue
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
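Once a spider with this setup is running, it sits idle until URLs appear under its Redis key. With the default list-based queue, seeding it from redis-py looks like this (spider name and URL are placeholders):

import redis

r = redis.StrictRedis(host='localhost', port=6379)
# scrapy-redis pops from this list on each spider_idle cycle.
r.lpush('myspider:start_urls', 'http://example.com')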