The following 6 code examples, extracted from open-source Python projects, illustrate how to use scrapy.spiders.Rule().
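Before the extracted examples, here is a minimal, self-contained sketch of the usual pattern: a CrawlSpider whose rules tuple pairs a LinkExtractor with a callback. The spider name, domain, start URLs, XPath, and regex here are placeholders for illustration only, not taken from any of the projects below.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(CrawlSpider):
    # Placeholder name/domain/URLs; replace with a real target site.
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = (
        # Follow pagination links without invoking a callback.
        Rule(LinkExtractor(restrict_xpaths='//a[@class="next"]'), follow=True),
        # Parse detail pages matched by the regex with parse_item.
        Rule(LinkExtractor(allow=(r'/item/\d+\.html',)), callback='parse_item'),
    )

    def parse_item(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}

Note that CrawlSpider reserves the default parse() method for its own link-following logic, which is why the callback is named parse_item.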
def __init__(self, rule):
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.rule = rule
    self.name = rule.name
    self.allowed_domains = rule.allowed_domains.split(',')
    self.start_urls = rule.start_urls.split(',')
    rule_list = []
    # If a "next page" XPath is configured, follow pagination links without a callback.
    if len(rule.next_page):
        rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))
    # Extract and parse the links that match the configured URL patterns.
    rule_list.append(Rule(
        LinkExtractor(allow=rule.allow_url.split(','), unique=True),
        follow=True,
        callback='parse_item'))
    self.rules = tuple(rule_list)
    super(ProxySpiderSpider, self).__init__()
def __init__(self, topic=None, newspaper=None, term='', *args, **kwargs):
    self.term = term
    if newspaper:
        sources = [source for source in SOURCE_NEWSPAPERS if newspaper == source['name']]
    else:
        sources = TOPIC_TO_SOURCES.get(topic, SOURCE_NEWSPAPERS)
    self.allowed_domains = [source['allowed_domains'] for source in sources]
    self.start_urls = [source['url'] for source in sources]
    self.rules = []
    for source in sources:
        if topic:
            allowed_domain_regex = (source['allowed_subdomains_regex'][topic],)
        else:
            # No topic given: allow the subdomain regexes of every topic for this source.
            allowed_domain_regex = tuple(source['allowed_subdomains_regex'].values())
        rule = Rule(link_extractor=LinkExtractor(allow=allowed_domain_regex),
                    callback='parse_with_term',
                    cb_kwargs={'term': self.term, 'newspaper': newspaper},
                    follow=True)
        self.rules.append(rule)
    return super(NewspaperCrawler, self).__init__(*args, **kwargs)
def __init__(self, *args, **kwargs):
    self.rules = [Rule(self.get_link_extractor(),
                       callback=self.parse_item,
                       process_links=self.limit_links,
                       follow=True)]
    super(WebSpider, self).__init__(*args, **kwargs)
    target_sites = settings.get('TARGET_SITES')
    if target_sites and os.path.isfile(target_sites):
        # Read the list of start URLs from the target file.
        with open(target_sites) as target_sites_file:
            # Make it a Python list, one URL per line.
            self.start_urls = target_sites_file.read().splitlines()
            # Remove empty strings.
            self.start_urls = [u for u in self.start_urls if u]
    else:
        self.start_urls = self.default_start_url
def __init__(self, forum_id=58, digit=1, *args, **kwargs):
    self.start_urls = [self.ip_format % d for d in [int(forum_id)]]
    self.rules = [
        Rule(sle(allow=("/forum/forum-" + str(forum_id) + "-[0-9]{," + str(digit) + r"}\.html")),
             follow=True,
             callback='parse_1'),
    ]
    super(sisSpider, self).__init__(*args, **kwargs)
def __init__(self, *args, **kwargs):
    self.rules = (
        spiders.Rule(SameBaseDomainLinkExtractor(allowed_domains=self.allowed_domains),
                     callback=self._parse_contents,
                     follow=True),
    )
    logging.getLogger('scrapy.core.engine').setLevel(logging.INFO)
    logging.getLogger('scrapy.downloadermiddlewares.redirect').setLevel(logging.INFO)
    logging.getLogger('scrapy.spidermiddlewares.depth').setLevel(logging.INFO)
    # We must set up self.rules before calling super, since super calls _compile_rules().
    super(AllStudiosScraper, self).__init__(*args, **kwargs)
def __init__(self, domains, urls, *args, **kwargs):
    """Constructor for SiteSpider.

    Parameters
    ----------
    domains : list
        A list of domains for the site.
    urls : list
        A list of sitemap URLs of the site.
    href_xpaths : list
        A list of XPath expressions indicating the ancestors of `<a>` elements.
    url_regex : string
        URL pattern regular expression.

    If you use this spider to store items into a database, additional
    keyword arguments are required:

    platform_id : int
        The id of a platform instance.
    session : object
        An instance of a SQLAlchemy session.
    """
    self.session = kwargs.pop('session', None)
    self.platform_id = kwargs.pop('platform_id', None)
    self.url_regex = kwargs.pop('url_regex', None)
    self.href_xpaths = kwargs.pop('href_xpaths', ())
    self.start_urls = urls
    self.allowed_domains = domains
    self.rules = (Rule(
        LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths=self.href_xpaths,
            unique=True),
        callback="parse_item",
        follow=True),)
    super(SiteSpider, self).__init__(*args, **kwargs)