Python scrapy.spiders module: Rule() example source code

The following 6 code examples, extracted from open source Python projects, illustrate how to use scrapy.spiders.Rule().
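
Before the project excerpts, here is a minimal, self-contained CrawlSpider wiring two Rule() instances into a crawl. The site, XPath, and CSS selectors are placeholders chosen for illustration, not taken from any of the projects below.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class QuotesSpider(CrawlSpider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']  # a public scraping sandbox, used as a placeholder
    start_urls = ['http://quotes.toscrape.com/']

    rules = (
        # Follow pagination links without invoking a callback
        Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'), follow=True),
        # Hand author pages to parse_author below
        Rule(LinkExtractor(allow=r'/author/'), callback='parse_author'),
    )

    def parse_author(self, response):
        yield {
            'name': response.css('h3.author-title::text').get(),
            'url': response.url,
        }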

Project: ip_proxy_pool    Author: leeyis    | Project source | File source
def __init__(self, rule):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.rule = rule
        self.name = rule.name
        self.allowed_domains = rule.allowed_domains.split(',')
        self.start_urls = rule.start_urls.split(',')
        rule_list = []

        # If a 'next page' XPath is configured, add a follow-only rule for pagination
        if rule.next_page:
            rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))

        rule_list.append(Rule(LinkExtractor(
            allow=rule.allow_url.split(','),
            unique=True),
            follow=True,
            callback='parse_item'))

        self.rules = tuple(rule_list)
        super(ProxySpiderSpider, self).__init__()
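
The rule argument above is a configuration record (in ip_proxy_pool it is loaded from a database). A hypothetical sketch of its shape, with field names taken from the excerpt:

from dataclasses import dataclass

@dataclass
class RuleConfig:              # hypothetical stand-in for the stored rule row
    name: str                  # spider name
    allowed_domains: str       # comma-separated, e.g. 'example.com,example.org'
    start_urls: str            # comma-separated start URLs
    next_page: str             # XPath of the 'next page' link; empty if none
    allow_url: str             # comma-separated allow patterns
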
Project: newspaper-scraper-couchbase    Author: aleonsan    | Project source | File source
def __init__(self, topic=None, newspaper=None, term='', *args, **kwargs):
        self.term = term
        if newspaper:
            sources = [source for source in SOURCE_NEWSPAPERS if newspaper == source['name']]
        else:
            sources = TOPIC_TO_SOURCES.get(topic, SOURCE_NEWSPAPERS)
        self.allowed_domains = [source['allowed_domains'] for source in sources]
        self.start_urls = [source['url'] for source in sources]
        self.rules = []
        for source in sources:
            if topic:
                allowed_domain_regex = (source['allowed_subdomains_regex'][topic],)
            else:
                # No topic given: use every topic's subdomain pattern for this source
                allowed_domain_regex = tuple(source['allowed_subdomains_regex'].values())
            rule = Rule(
                link_extractor=LinkExtractor(allow=allowed_domain_regex),
                callback='parse_with_term',
                cb_kwargs={
                    'term': self.term,
                    'newspaper': newspaper,
                },
                follow=True,
            )
            self.rules.append(rule)

        super(NewspaperCrawler, self).__init__(*args, **kwargs)
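
SOURCE_NEWSPAPERS and TOPIC_TO_SOURCES are defined elsewhere in the project. Based on how they are accessed above, they plausibly look like this; the names and patterns are illustrative assumptions:

SOURCE_NEWSPAPERS = [
    {
        'name': 'example-news',                  # matched against the `newspaper` argument
        'url': 'http://www.example-news.com',
        'allowed_domains': 'example-news.com',
        'allowed_subdomains_regex': {
            # topic -> subdomain pattern (illustrative values)
            'politics': r'politics\.example-news\.com',
            'sports': r'sports\.example-news\.com',
        },
    },
]

# Maps a topic to the subset of sources covering it
TOPIC_TO_SOURCES = {'politics': SOURCE_NEWSPAPERS}
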
Project: ahmia-crawler    Author: ahmia    | Project source | File source
def __init__(self, *args, **kwargs):
        self.rules = [Rule(self.get_link_extractor(),
                           callback=self.parse_item,
                           process_links=self.limit_links,
                           follow=True)]
        super(WebSpider, self).__init__(*args, **kwargs)
        target_sites = settings.get('TARGET_SITES')
        if target_sites and os.path.isfile(target_sites):
            # Read target URLs from the file, one per line
            with open(target_sites) as target_sites_file:
                self.start_urls = target_sites_file.read().splitlines()
                # Drop empty lines
                self.start_urls = [u for u in self.start_urls if u]
        else:
            self.start_urls = self.default_start_url
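
The excerpt calls two helpers defined elsewhere in the spider class. Hedged sketches of what they plausibly do, assuming LinkExtractor is imported from scrapy.linkextractors; the domain restriction and the cap of 100 are assumptions, not the project's actual logic:

    def get_link_extractor(self):
        # Plausibly restricts extraction to the spider's allowed domains
        return LinkExtractor(allow_domains=self.allowed_domains, unique=True)

    def limit_links(self, links):
        # process_links hook: cap how many links one page contributes (100 is illustrative)
        return links[:100]
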
Project: spiders    Author: poodarchu    | Project source | File source
def __init__(self, forum_id=58, digit=1, *args, **kwargs):
        self.start_urls = [self.ip_format % d for d in [int(forum_id)]]
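        # 'sle' is presumably scrapy's LinkExtractor imported under an alias,
        # e.g. "from scrapy.linkextractors import LinkExtractor as sle"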
        self.rules = [
            Rule(sle(allow=r"/forum/forum-" + str(forum_id) + r"-[0-9]{," + str(digit) + r"}\.html"),
                 follow=True,
                 callback='parse_1'),
        ]
        super(sisSpider, self).__init__(*args, **kwargs)
Project: dancedeets-monorepo    Author: mikelambert    | Project source | File source
def __init__(self, *args, **kwargs):
        self.rules = (
            spiders.Rule(SameBaseDomainLinkExtractor(allowed_domains=self.allowed_domains), callback=self._parse_contents, follow=True),
        )
        logging.getLogger('scrapy.core.engine').setLevel(logging.INFO)
        logging.getLogger('scrapy.downloadermiddlewares.redirect').setLevel(logging.INFO)
        logging.getLogger('scrapy.spidermiddlewares.depth').setLevel(logging.INFO)

        # We must set up self.rules before calling super, since super calls _compile_rules().
        super(AllStudiosScraper, self).__init__(*args, **kwargs)
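
SameBaseDomainLinkExtractor is a project-specific extractor. A plausible sketch is a LinkExtractor subclass that drops links leaving the allowed base domains; the filtering logic below is an assumption, not the project's actual code:

from urllib.parse import urlparse
from scrapy.linkextractors import LinkExtractor

class SameBaseDomainLinkExtractor(LinkExtractor):  # hypothetical reconstruction
    def __init__(self, allowed_domains=None, **kwargs):
        super(SameBaseDomainLinkExtractor, self).__init__(**kwargs)
        self._domains = tuple(allowed_domains or ())

    def extract_links(self, response):
        links = super(SameBaseDomainLinkExtractor, self).extract_links(response)
        if not self._domains:
            return links
        # Keep only links whose host ends with one of the allowed base domains
        return [l for l in links if urlparse(l.url).netloc.endswith(self._domains)]
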
Project: hoaxy-backend    Author: IUNetSci    | Project source | File source
def __init__(self, domains, urls, *args, **kwargs):
        """Constructor for SiteSpider.

        Parameters
        ----------
        domains : list
            A list of domains for the site.
        urls : list
            A list of sitemap URLs of the site.
        href_xpaths : list
            A list of XPath expressions indicating the ancestors of the
            `<a>` elements to extract links from.
        url_regex : string
            URL pattern regular expression.

        If you use this spider to store items into a database, additional
        keyword arguments are required:

        platform_id : int
            The id of a platform instance.
        session : object
            An instance of SQLAlchemy session.
        """
        self.session = kwargs.pop('session', None)
        self.platform_id = kwargs.pop('platform_id', None)
        self.url_regex = kwargs.pop('url_regex', None)
        self.href_xpaths = kwargs.pop('href_xpaths', ())
        self.start_urls = urls
        self.allowed_domains = domains
        self.rules = (Rule(
            LinkExtractor(
                allow_domains=self.allowed_domains,
                restrict_xpaths=self.href_xpaths,
                unique=True),
            callback="parse_item",
            follow=True),)

        super(SiteSpider, self).__init__(*args, **kwargs)
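
Putting it together, the spider could be launched with Scrapy's CrawlerProcess as below. The domains, URLs, XPath, and regex are placeholders; session and platform_id are only needed if items are persisted to a database.

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({'LOG_LEVEL': 'INFO'})
process.crawl(
    SiteSpider,
    domains=['example.com'],                   # placeholder domain
    urls=['http://example.com/sitemap.xml'],   # placeholder sitemap URL
    href_xpaths=['//div[@class="content"]'],   # illustrative XPath restriction
    url_regex=r'/article/\d+',                 # illustrative URL pattern
)
process.start()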