The following 17 code examples, extracted from open-source Python projects, illustrate how to use scrapy.exceptions.NotConfigured().
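Before the examples, a minimal sketch of the common pattern may help: a middleware or extension raises NotConfigured from its from_crawler() hook when a required setting is missing or disabled, and Scrapy then skips that component instead of aborting the crawl. The MYEXT_ENABLED / MYEXT_INTERVAL settings and the MyExtension class below are illustrative placeholders, not taken from the projects listed here.

from scrapy import signals
from scrapy.exceptions import NotConfigured


class MyExtension:
    """Illustrative extension that disables itself when not configured."""

    def __init__(self, interval):
        self.interval = interval

    @classmethod
    def from_crawler(cls, crawler):
        # Raising NotConfigured tells Scrapy to silently drop this component.
        # MYEXT_ENABLED and MYEXT_INTERVAL are placeholder setting names.
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured('MYEXT_ENABLED is not set')
        interval = crawler.settings.getfloat('MYEXT_INTERVAL', 60.0)
        ext = cls(interval)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        return ext

    def spider_opened(self, spider):
        spider.logger.info('MyExtension enabled, interval=%s', self.interval)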
def spider_opened(self, spider):
    try:
        file = open(spider.settings.get('FEED_FILE'), 'wb')
    except TypeError:
        raise NotConfigured('FEED_FILE parameter is not a string or does not exist')
    except (IOError, OSError) as e:
        raise CloseSpider('Cannot open file {}: {}'.format(
            spider.settings.get('FEED_FILE', None), e))
    self.files[spider] = file

    feed_title = spider.settings.get('FEED_TITLE')
    if not feed_title:
        raise NotConfigured('FEED_TITLE parameter does not exist')

    feed_link = spider.settings.get('FEED_LINK')
    if not feed_link:
        raise NotConfigured('FEED_LINK parameter does not exist')

    feed_description = spider.settings.get('FEED_DESCRIPTION')
    if feed_description is None:
        raise NotConfigured('FEED_DESCRIPTION parameter does not exist')

    feed_exporter = spider.settings.get('FEED_EXPORTER', RssItemExporter)
    if isinstance(feed_exporter, six.string_types):
        feed_exporter = load_object(feed_exporter)
    if not issubclass(feed_exporter, RssItemExporter):
        raise TypeError("FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'"
                        .format(feed_exporter))
    self.exporters[spider] = feed_exporter(file, feed_title, feed_link, feed_description)
    self.exporters[spider].start_exporting()
def from_crawler(cls, crawler):
    s = crawler.settings
    proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
    if proxy_path is not None:
        with codecs.open(proxy_path, 'r', encoding='utf8') as f:
            proxy_list = [line.strip() for line in f if line.strip()]
    else:
        proxy_list = s.getlist('ROTATING_PROXY_LIST')
    if not proxy_list:
        raise NotConfigured()
    mw = cls(
        proxy_list=proxy_list,
        logstats_interval=s.getfloat('ROTATING_PROXY_LOGSTATS_INTERVAL', 30),
        stop_if_no_proxies=s.getbool('ROTATING_PROXY_CLOSE_SPIDER', False),
        max_proxies_to_try=s.getint('ROTATING_PROXY_PAGE_RETRY_TIMES', 5),
        backoff_base=s.getfloat('ROTATING_PROXY_BACKOFF_BASE', 300),
        backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600)
    )
    crawler.signals.connect(mw.engine_started, signal=signals.engine_started)
    crawler.signals.connect(mw.engine_stopped, signal=signals.engine_stopped)
    return mw
def from_crawler(cls, crawler):
    # This method is used by Scrapy to create your spiders.
    # s = cls()
    # crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    # return s
    user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])
    if not user_agents:
        raise NotConfigured("USER_AGENT_CHOICES not set or empty")
    o = cls(user_agents)
    crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
    return o
def from_crawler(cls, crawler):
    m = cls()
    if not crawler.settings.getbool('SELENIUM_ENABLED'):
        raise NotConfigured()
    crawler.signals.connect(m.spider_closed, signal=signals.spider_closed)
    return m
def from_crawler(cls, crawler):
    try:
        return cls(crawler)
    except Exception as e:
        raise NotConfigured('WEBDRIVER_BROWSER is misconfigured: %r (%r)'
                            % (crawler.settings.get('WEBDRIVER_BROWSER'), e))
def test_empty_feed(self):
    for partial_settings in itertools.chain.from_iterable(
            itertools.combinations(self.feed_settings.items(), r)
            for r in range(1, len(self.feed_settings))):
        partial_settings = dict(partial_settings)
        undefined_settings = [name.upper()
                              for name in set(self.feed_settings) - set(partial_settings)]
        with self.assertRaisesRegexp(
                NotConfigured,
                '({})'.format('|'.join(undefined_settings))
                if len(undefined_settings) > 1 else undefined_settings[0],
                msg='The feed file, title, link and description must be specified, '
                    'but the absence of {} is allowed'.format(undefined_settings)):
            with CrawlerContext(**partial_settings):
                pass

    with self.assertRaises(CloseSpider):
        feed_settings = dict(self.feed_settings)
        feed_settings['feed_file'] = 'non/existent/filepath'
        with CrawlerContext(**feed_settings):
            pass

    with CrawlerContext(**self.feed_settings):
        pass
    with open(self.feed_settings['feed_file']) as data, \
            open(os.path.join(os.path.dirname(__file__),
                              'expected_rss', 'empty_feed.rss')) as expected:
        self.assertUnorderedXmlEquivalentOutputs(data.read(), expected.read())
def __init__(self):
    # Open database connection
    self.db = mysql.connect(host=ROJAK_DB_HOST, port=ROJAK_DB_PORT,
                            user=ROJAK_DB_USER, passwd=ROJAK_DB_PASS,
                            db=ROJAK_DB_NAME)
    self.cursor = self.db.cursor()
    self.media = {}

    try:
        # Get media information from the database
        self.logger.info('Fetching media information')
        self.cursor.execute(sql_get_media, [self.name])
        row = self.cursor.fetchone()
        self.media['id'] = row[0]
        self.media['last_scraped_at'] = row[1]
    except mysql.Error as err:
        self.logger.error('Unable to fetch media data: %s', err)
        raise NotConfigured('Unable to fetch media data: %s' % err)

    if ROJAK_SLACK_TOKEN != '':
        self.is_slack = True
        self.slack = Slacker(ROJAK_SLACK_TOKEN)
    else:
        self.is_slack = False
        self.logger.info('Post error to #rojak-pantau-errors is disabled')

    # Capture the signal spider_opened and spider_closed
    # https://doc.scrapy.org/en/latest/topics/signals.html
def start_requests(self):
    if not self.page_clf and self.settings.get('QUEUE_MAX_RELEVANT_DOMAINS'):
        raise NotConfigured('Pass page_clf to spider')
    for request in super().start_requests():
        request.priority = self.initial_priority
        if self.queue is not None:
            self.queue.push(request)
        else:
            yield request
def from_crawler(cls, crawler) -> 'RequestLogMiddleware':
    log_path = crawler.settings.get('RESPONSE_LOG_FILE')
    if not log_path:
        raise NotConfigured('RESPONSE_LOG_FILE not defined')
    jl_logger = get_jl_logger(log_path)
    threshold = crawler.settings.getfloat('PAGE_RELEVANCY_THRESHOLD', 0.5)
    return cls(jl_logger=jl_logger, relevancy_threshold=threshold)
def from_crawler(cls, crawler):
    if crawler.settings.getbool('DOMAIN_LIMIT'):
        log_path = crawler.settings.get('RESPONSE_LOG_FILE')
        if not log_path:
            raise NotConfigured('RESPONSE_LOG_FILE not defined')
        mw = cls(get_jl_logger(log_path))
        crawler.signals.connect(mw.on_queues_changed, queues_changed)
        return mw
def from_crawler(cls, crawler):
    path_segments = crawler.settings.getint('MAX_DUPLICATE_PATH_SEGMENTS')
    query_segments = crawler.settings.getint('MAX_DUPLICATE_QUERY_SEGMENTS')
    if not (path_segments or query_segments):
        raise NotConfigured()
    return cls(path_segments, query_segments, crawler.stats)
def from_crawler(cls, crawler):
    splash_base_url = crawler.settings.get('SPLASH_URL', cls.default_splash_url)
    log_400 = crawler.settings.getbool('SPLASH_LOG_400', True)
    slot_policy = crawler.settings.get('SPLASH_SLOT_POLICY', cls.default_policy)
    if slot_policy not in SlotPolicy._known:
        raise NotConfigured("Incorrect slot policy: %r" % slot_policy)
    return cls(crawler, splash_base_url, slot_policy, log_400)