Python scrapy.exceptions module: IgnoreRequest() example source code

We extracted the following 28 code examples from open-source Python projects to illustrate how to use scrapy.exceptions.IgnoreRequest().

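Before the individual project excerpts, here is a minimal, self-contained sketch of the pattern they all share: a downloader middleware raises IgnoreRequest from process_request (or process_response) to drop a request it does not want handled. The class name, the BLOCKED_HOSTS setting, and the blocking rule below are illustrative assumptions, not code from any of the projects listed.

from urllib.parse import urlparse

from scrapy.exceptions import IgnoreRequest


class BlockedHostMiddleware(object):
    """Hypothetical downloader middleware that drops requests to blocked hosts."""

    def __init__(self, blocked_hosts):
        self.blocked_hosts = set(blocked_hosts)

    @classmethod
    def from_crawler(cls, crawler):
        # BLOCKED_HOSTS is an assumed custom setting, not a built-in Scrapy setting
        return cls(crawler.settings.getlist('BLOCKED_HOSTS'))

    def process_request(self, request, spider):
        host = urlparse(request.url).hostname
        if host in self.blocked_hosts:
            # drop this request: Scrapy calls request.errback if set and does not log it as an error
            raise IgnoreRequest('Blocked host: %s' % host)
        return None  # let the remaining middlewares and the downloader proceed

When a downloader middleware raises IgnoreRequest, Scrapy calls the request's errback if one is defined; otherwise the request is simply dropped and the exception is not logged as an error, which is why the projects below use it for dead domains, banned hosts, already-crawled URLs, and exhausted retries.
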
Project: freshonions-torscraper    Author: dirtyfilthy    | project source | file source
def process_request(self, request, spider):

        # don't use this middleware while testing whether the site is up
        if hasattr(spider, "test") and spider.test=="yes":
            #logger = logging.getLogger()
            #logger.info("Testing mode, dead domains disabled")
            return None

        if not Domain.is_onion_url(request.url):
            return None

        domain = Domain.find_by_url(request.url)
        if not domain or domain.is_up:
            return None

        raise IgnoreRequest('Domain %s is dead, skipping' % domain.host)
Project: freshonions-torscraper    Author: dirtyfilthy    | project source | file source
def process_request(self, request, spider): 
        parsed_url = urlparse.urlparse(request.url)

        if not self.test_mode or parsed_url.path not in ["/", ""]:
            return None

        if not Domain.is_onion_url(request.url):
            return None

        d = Domain.find_by_url(request.url)

        if d is None:
            return None

        now = datetime.now()

        if now > d.next_scheduled_check:
            return None
        else:
            raise IgnoreRequest('FilterNotScheduledMiddleware: %s is not scheduled to check' % d.host)
Project: structure_spider    Author: ShichaoMa    | project source | file source
def _redirect(self, redirected, request, spider, reason):

        reason = response_status_message(reason)
        redirects = request.meta.get('redirect_times', 0) + 1

        if redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                                               [request.url]
            redirected.meta['priority'] = redirected.meta['priority'] + self.priority_adjust
            self.logger.debug("Redirecting %s to %s from %s for %s times " % (
                reason, redirected.url, request.url, redirected.meta.get("redirect_times")))
            return redirected
        else:
            self.logger.info("Discarding %s: max redirections reached" % request.url)
            if request.meta.get("callback") == "parse":
                self.crawler.stats.inc_total_pages(crawlid=request.meta['crawlid'])
                self.logger.error(
                    " in redicrect request error to failed pages url:%s, exception:%s, meta:%s" % (
                        request.url, reason, request.meta))

            raise IgnoreRequest("max redirections reached:%s" % reason)
Project: structure_spider    Author: ShichaoMa    | project source | file source
def _retry(self, request, reason, spider):
        spider.change_proxy = True
        retries = request.meta.get('retry_times', 0) + 1

        if request.meta.get("if_next_page"):
            self.logger.debug("in _retry re-yield next_pages request: %s, reason: %s. " % (request.url, reason))
            return request.copy()
        elif retries <= self.max_retry_times:
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.meta['priority'] = retryreq.meta['priority'] + self.crawler.settings.get(
                "REDIRECT_PRIORITY_ADJUST")
            self.logger.debug("in _retry retries times: %s, re-yield request: %s, reason: %s" % (
            retries, request.url, reason))
            return retryreq

        else:
            if request.meta.get("callback") == "parse":
                spider.crawler.stats.inc_total_pages(crawlid=request.meta['crawlid'])
            self.logger.error(
                "retry request error to failed pages url:%s, exception:%s, meta:%s" % (
                    request.url, reason, request.meta))
            self.logger.info("Gave up retrying %s (failed %d times): %s" % (request.url, retries, reason))
            raise IgnoreRequest("%s %s" % (reason, "retry %s times. "%retries))
Project: structure_spider    Author: ShichaoMa    | project source | file source
def process_requset_method_wrapper(func):

    @wraps(func)
    def wrapper_method(*args, **kwds):
        self = args[0]
        request = kwds.get("request")
        spider = kwds.get("spider")
        try:
            return func(*args, **kwds)
        except Exception as e:
            spider.logger.error("error heppened in process_request method of %s in %s. Error:%s, processing %s," % (
            self.__class__.__name__, IP, traceback.format_exc(), request.url))
            spider.crawler.stats.set_failed_download(request.meta, str(e))
            raise IgnoreRequest(e)

    return wrapper_method
Project: structure_spider    Author: ShichaoMa    | project source | file source
def process_response_method_wrapper(func):

    @wraps(func)
    def wrapper_method(*args, **kwds):
        self = args[0]
        request = kwds.get("request")
        response = kwds.get("response")
        spider = kwds.get("spider")
        try:
            return func(*args, **kwds)
        except Exception as e:
            spider.logger.error("error heppened in process_response method of %s in %s. Error:%s, processing %s," % (
                self.__class__.__name__, IP, traceback.format_exc(), response.url))
            spider.crawler.stats.set_failed_download(request.meta, str(e))
            raise IgnoreRequest(e)

    return wrapper_method
Project: frontoxy    Author: fabienvauchelles    | project source | file source
def process_response(self, request, response, spider):
        try:
            if response.status in self._http_status_codes:
                raise BlacklistError(response, u'HTTP status {}'.format(response.status))

            self._counter += 1
            if self._counter > self._counter_max:
                logger.debug(u'Max requests: Change IP')
                self._reset_session()

            return response

        except BlacklistError as ex:
            logger.debug(
                u'Ignoring Blacklisted response %(response)r: %(message)r',
                {'response': response, 'message': ex.message}, extra={'spider': spider},
            )

            self._reset_session()
            self.scheduler.process_exception(request, ex, spider)

            raise IgnoreRequest()
Project: retr    Author: aikipooh    | project source | file source
def process_exception(self, request, exception, spider):
        if 'proxy' not in request.meta: return

        if isinstance(exception, IgnoreRequest): return # No problem

        mode=request.meta.get('proxy_mode', self.mode) # Possible override
        if mode == 'once': # Try once mode, quit here
            return

        # Simple downvote
        self.pp.set_status(self.map_proxy(request.meta['proxy']), None)
        del request.meta['proxy'] # Will pick new proxy on next request

        # List of conditions when we retry. Some of them may disable the proxy (TBD)
        if type(exception) in (
                ConnectionRefusedError, ConnectError, TimeoutError,
                TCPTimedOutError, NoRouteError, ResponseNeverReceived,
                ResponseFailed, TunnelError ):
            lg.error('{} on %s'.format(type(exception)), request.url)

            return request.replace(dont_filter = True)
Project: ahmia-crawler    Author: ahmia    | project source | file source
def process_response(self, request, response, spider): # pylint:disable=unused-argument
        """
        Only allow HTTP response types that match the given list of
        filtering regexes
        """
        # to specify on a per-spider basis
        # type_whitelist = getattr(spider, "response_type_whitelist", None)
        type_whitelist = (r'text', )
        content_type_header = response.headers.get('content-type', None)
        if content_type_header and self.is_valid_response(type_whitelist,
                                                          content_type_header):
            return response
        else:
            msg = "Ignoring request {}, content-type was not in whitelist" \
                  .format(response.url)
            logging.info(msg)
            raise IgnoreRequest()
Project: scrapy-amazon-robot-middleware    Author: ziplokk1    | project source | file source
def process_response(self, request, response, spider):

        if request.meta.get('crack_retry_count', 0) > self.MAX_RETRY:
            raise IgnoreRequest('Max retries exceeded %s' % request.meta.get('original_request', request))

        if isinstance(response, HtmlResponse) and 'robot check' in ''.join([x.strip().lower() for x in response.xpath('//title/text()').extract()]):
            self.cracking = True
            self.crawler.stats.inc_value('robot_check')
            # Log the url of the original request that got blocked
            self.logger.warning('robot check {}'.format(request.meta.get('original_request') or request))
            return self.request_image(request, response)
        elif request.meta.get('image_request', False):
            self.logger.debug('processing image {}'.format(request))
            return self.process_image(request, response)
        else:
            self.cracking = False
            return response
Project: freshonions-torscraper    Author: dirtyfilthy    | project source | file source
def process_request(self, request, spider):

        if not Domain.is_onion_url(request.url):
            return None
        parsed_url = urlparse.urlparse(request.url)
        host = parsed_url.hostname
        subdomains = host.count(".")
        if subdomains > 2:
            raise IgnoreRequest('Too many subdomains (%d > 2)' % subdomains)

        return None
Project: freshonions-torscraper    Author: dirtyfilthy    | project source | file source
def process_request(self, request, spider):

        parsed_url = urlparse.urlparse(request.url)
        host = parsed_url.hostname
        if self.counter[host] < self.max_pages:
            self.counter[host] += 1
            spider.logger.info('Page count is %d for %s' % (self.counter[host], host))
            return None                   
        else:
            raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' % request.url)
Project: lagouwang    Author: whaike    | project source | file source
def process_response(self,request,response,spider):
        logging.info('response url %s with proxy:%s got status %s '%(response.url,request.meta['proxy'],response.status))
        if response.status != 200:
            if response.status == 301 or response.status == 404:
                Sup.letpagesgo(response.url)
                raise IgnoreRequest('found no pages')
            else:
                Sup.deleteProxy(request)
                new_request = request.copy()
                new_request.dont_filter = True
                return new_request
        else:
            return response

Project: scrapy_project    Author: zhanghe06    | project source | file source
def process_request(self, request, spider):
        if not request.url:
            return None
        url_hash = hashlib.md5(request.url.encode("utf8")).hexdigest()
        if self.redis_client.sismember(spider.name, url_hash):
            raise IgnoreRequest("Spider : %s, IgnoreRequest : %s" % (spider.name, request.url))
        else:
            self.redis_client.sadd(spider.name, url_hash)
Project: malspider    Author: ciscocsirt    | project source | file source
def _process_requests(self, items_or_requests, start=False):
        """Acquire the webdriver manager when it's available for requests."""
        error_msg = "WebdriverRequests from start_requests can't be in-page."
        for request in iter(items_or_requests):
            if isinstance(request, WebdriverRequest):
                if start and isinstance(request, WebdriverActionRequest):
                    raise IgnoreRequest(error_msg)
                request = self.manager.acquire(request)
                if request is WebdriverRequest.WAITING:
                    continue  # Request has been enqueued, so drop it.
            yield request
Project: scrapy_redis_spider    Author: lymlhhj123    | project source | file source
def process_response(self, request, response, spider):
        # dispatch on the HTTP status code of the response
        http_code = response.status
        if http_code // 100 == 2:
            self.stats.inc_value('response/%d'%http_code, spider=spider)
            return response

        # handle 3xx redirects other than 304
        if http_code // 100 == 3 and http_code != 304:
            self.stats.inc_value('response/%d'%http_code, spider=spider)
            # get the redirect target url
            url = response.headers['location']
            domain = urlparse.urlparse(url).netloc
            # only follow the redirect if its domain is in allowed_domains
            if domain in spider.allowed_domains:
                return Request(url=url, meta=request.meta)
            else:
                raise IgnoreRequest(u'not allowed to crawl')

        if http_code // 100 == 4 and http_code != 403:
            self.stats.inc_value('response/%d'%http_code, spider=spider)
            # ignore 4xx responses other than 403
            raise IgnoreRequest(u'404')

        if http_code // 100 == 5:   
            self.stats.inc_value('response/%d'%http_code, spider=spider)                    
            return request

        # handle a meta refresh in the page, if present
        url = html.get_html_meta_refresh(response)
        if url:
            self.stats.inc_value('response/metarefresh', spider=spider)
            domain = urlparse.urlparse(url).netloc
            # only follow the meta refresh url if its domain is in allowed_domains
            if domain in spider.allowed_domains:
                return Request(url=url, meta=request.meta)
Project: scrapy_redis_spider    Author: lymlhhj123    | project source | file source
def callback(self, result):
        if result:
            self.logger.info('%s has been cached', self.request.url)
            raise IgnoreRequest('%s has been cached'%self.request.url)
Project: structure_spider    Author: ShichaoMa    | project source | file source
def process_exception(self, request, exception, spider):

        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
                and not request.meta.get('dont_retry', False):
            return self._retry(request, "%s:%s" % (exception.__class__.__name__, exception), spider)

        else:
            if request.meta.get("callback") == "parse":
                spider.crawler.stats.inc_total_pages(crawlid=request.meta['crawlid'])

            self.logger.error("in retry request error %s" % traceback.format_exc())
            raise IgnoreRequest("%s:%s unhandle error. " % (exception.__class__.__name__, exception))
Project: frontoxy    Author: fabienvauchelles    | project source | file source
def process_exception(self, request, exception, spider):
        logger.debug(
                u'Ignoring Exception: %(message)r',
                {'message': exception.message}, extra={'spider': spider},
            )

        self._reset_session()
        self.scheduler.process_exception(request, exception, spider)
        raise IgnoreRequest()
Project: ahmia-crawler    Author: ahmia    | project source | file source
def process_request(self, request, spider): # pylint:disable=unused-argument
        """Process incoming request."""
        parsed_uri = urlparse(request.url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        domain = domain.replace("http://", "").replace("https://", "") \
                                              .replace("/", "")
        banned_domains = settings.get('BANNED_DOMAINS')
        if hashlib.md5(domain).hexdigest() in banned_domains:
            # Do not execute this request
            request.meta['proxy'] = ""
            msg = "Ignoring request {}, This domain is banned." \
                  .format(request.url)
            logging.info(msg)
            raise IgnoreRequest()
Project: ahmia-crawler    Author: ahmia    | project source | file source
def process_request(self, request, spider): # pylint:disable=unused-argument
        """Process incoming request."""
        hostname = urlparse(request.url).hostname
        if len(hostname.split(".")) > 4:
            # Do not execute this request
            request.meta['proxy'] = ""
            msg = "Ignoring request {}, too many sub domains." \
                  .format(request.url)
            logging.info(msg)
            raise IgnoreRequest()
Project: domain-discovery-crawler    Author: TeamHG-Memex    | project source | file source
def _redirect(self, redirected, request, spider, reason):
        if self.domain_limit(spider) and \
                get_domain(redirected.url) != get_domain(request.url):
            raise IgnoreRequest('Redirecting off-domain')
        return super()._redirect(redirected, request, spider, reason)
Project: scrapy-qtwebkit    Author: ArturGaspar    | project source | file source
def process_request(self, request, spider):
        if request.meta.get('from_qtwebkit', False):
            ext = urlparse(request.url).path.rsplit('.', 1)[-1]
            if ext in {'css', 'gif', 'png'}:
                raise IgnoreRequest()
Project: multimedia_crawler    Author: JFluo2011    | project source | file source
def process_request(self, request, spider):
        if self.col.find_one({'$and': [
            {'host': spider.name},
            {'url': request.url},
            # {'download': {'$in': [0, 1, 2]}}
            {'download': {'$ne': -1}},
        ]}):
            logging.warning('the page is crawled, url is {0}'.format(request.url))
            raise IgnoreRequest()

        return None
Project: multimedia_crawler    Author: JFluo2011    | project source | file source
def process_request(self, request, spider):
        url = request.url.split('?')[0]
        if self.col.find_one({'$and': [
            {'host': spider.name},
            {'url': url},
            # {'download': {'$in': [0, 1, 2]}}
            {'download': {'$ne': -1}},
        ]}):
            logging.warning('the page is crawled, url is {0}'.format(request.url))
            raise IgnoreRequest()

        return None
Project: multimedia_crawler    Author: JFluo2011    | project source | file source
def process_request(self, request, spider):
        if 'http://v.youku.com/v_show/' in request.url:
            url = request.url.split('?')[0]
        else:
            url = request.url
        if self.col.find_one({'$and': [
            {'host': spider.name},
            {'url': url},
            # {'download': {'$in': [0, 1, 2]}}
            {'download': {'$ne': -1}},
        ]}):
            logging.warning('the page is crawled, url is {0}'.format(url))
            raise IgnoreRequest()

        return None
Project: multimedia_crawler    Author: JFluo2011    | project source | file source
def process_request(self, request, spider):
        if 'http://v.youku.com/v_show/' in request.url:
            url = request.url.split('?')[0]
        else:
            url = request.url
        if self.col.find_one({'$and': [
            {'host': spider.name},
            {'url': url},
            # {'download': {'$in': [0, 1, 2]}}
            {'download': {'$ne': -1}},
        ]}):
            logging.warning('the page is crawled, url is {0}'.format(url))
            raise IgnoreRequest()

        return None
Project: retr    Author: aikipooh    | project source | file source
def process_request(self, request, spider):
        def set_auth(request, proxy):
            if proxy.creds:
                request.headers['Proxy-Authorization'] = proxy.creds

        lg.debug('in process_request: {}, {}'.format(request, request.meta))

        pa=request.meta.pop('proxy_action', None)
        if pa == 'disable':
            self.pp.set_status(self.map_proxy(request.meta['proxy']), 'D')
            del request.meta['proxy'] # Make it pick another proxy
        elif pa == 'release':
            proxy=self.map_proxy(request.meta['proxy'])
            self.pp.release_proxy(proxy)
            raise IgnoreRequest

        # Don't overwrite with a random one (server-side state for IP)
        if 'proxy' in request.meta:
            proxy=self.map_proxy(request.meta['proxy'])
            set_auth(request, proxy)
            return # No fuss, we have a proxy already

        if self.mode == 'random':
            proxy = self.pp.get_proxy(True)        
        elif self.mode == 'sequential':
            proxy = self.pp.get_proxy()

        request.meta['proxy'] = proxy.p
        set_auth(request, proxy)

        lg.debug('Using proxy '+proxy.p)

        # Start setup_session anew wherever we are, fresh or recurring
        req=request.meta.get('ss_request')
        if req:
            # Store original request to use after the session is setup
            if 'original_request' not in request.meta:
                request.meta['original_request']=request
        else:
            req=request

        return req.replace(meta=request.meta, dont_filter=True)