The following 28 code examples, extracted from open-source Python projects, illustrate how to use scrapy.exceptions.IgnoreRequest().
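Before the project code, here is a minimal sketch of the usual pattern: IgnoreRequest is raised from a downloader middleware hook (process_request, process_response, or process_exception) to drop a request without treating it as an error. The class name, the BLOCKED_HOSTS setting, and the dotted path in the usage note below are illustrative assumptions, not taken from any of the projects listed here.

from urllib.parse import urlparse

from scrapy.exceptions import IgnoreRequest


class BlockedHostMiddleware:
    """Drop requests whose host appears in a configurable blocklist."""

    def __init__(self, blocked_hosts):
        self.blocked_hosts = set(blocked_hosts)

    @classmethod
    def from_crawler(cls, crawler):
        # BLOCKED_HOSTS is an assumed custom setting for this sketch,
        # not a built-in Scrapy setting
        return cls(crawler.settings.getlist('BLOCKED_HOSTS'))

    def process_request(self, request, spider):
        host = urlparse(request.url).hostname or ''
        if host in self.blocked_hosts:
            spider.logger.info('Ignoring request to blocked host: %s', request.url)
            raise IgnoreRequest('blocked host: %s' % host)
        return None  # let the request continue to the downloader

Like the examples below, such a middleware only takes effect once it is enabled in settings.py, for example DOWNLOADER_MIDDLEWARES = {'myproject.middlewares.BlockedHostMiddleware': 543} (the dotted path is hypothetical). When IgnoreRequest is raised in process_request, Scrapy calls the request's errback with the exception if one is set and otherwise ignores it silently, rather than logging it as a crawl error.
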
def process_request(self, request, spider):
    # don't use this middleware while testing if site is up
    if hasattr(spider, "test") and spider.test == "yes":
        #logger = logging.getLogger()
        #logger.info("Testing mode, dead domains disabled")
        return None

    if not Domain.is_onion_url(request.url):
        return None

    domain = Domain.find_by_url(request.url)
    if not domain or domain.is_up:
        return None

    raise IgnoreRequest('Domain %s is dead, skipping' % domain.host)

def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)
    if not self.test_mode or not parsed_url.path in ["/", ""]:
        return None

    if not Domain.is_onion_url(request.url):
        return None

    d = Domain.find_by_url(request.url)
    if d is None:
        return None

    now = datetime.now()
    if now > d.next_scheduled_check:
        return None
    else:
        raise IgnoreRequest('FilterNotScheduledMiddleware: %s is not scheduled to check' % d.host)

def _redirect(self, redirected, request, spider, reason):
    reason = response_status_message(reason)
    redirects = request.meta.get('redirect_times', 0) + 1
    if redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['priority'] = redirected.meta['priority'] + self.priority_adjust
        self.logger.debug("Redirecting %s to %s from %s for %s times " % (
            reason, redirected.url, request.url, redirected.meta.get("redirect_times")))
        return redirected
    else:
        self.logger.info("Discarding %s: max redirections reached" % request.url)
        if request.meta.get("callback") == "parse":
            self.crawler.stats.inc_total_pages(crawlid=request.meta['crawlid'])
            self.logger.error(
                "in redirect request error to failed pages url:%s, exception:%s, meta:%s" % (
                    request.url, reason, request.meta))
        raise IgnoreRequest("max redirections reached:%s" % reason)

def _retry(self, request, reason, spider):
    spider.change_proxy = True
    retries = request.meta.get('retry_times', 0) + 1
    if request.meta.get("if_next_page"):
        self.logger.debug("in _retry re-yield next_pages request: %s, reason: %s. " % (request.url, reason))
        return request.copy()
    elif retries <= self.max_retry_times:
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.meta['priority'] = retryreq.meta['priority'] + self.crawler.settings.get(
            "REDIRECT_PRIORITY_ADJUST")
        self.logger.debug("in _retry retries times: %s, re-yield request: %s, reason: %s" % (
            retries, request.url, reason))
        return retryreq
    else:
        if request.meta.get("callback") == "parse":
            spider.crawler.stats.inc_total_pages(crawlid=request.meta['crawlid'])
            self.logger.error(
                "retry request error to failed pages url:%s, exception:%s, meta:%s" % (
                    request.url, reason, request.meta))
        self.logger.info("Gave up retrying %s (failed %d times): %s" % (request.url, retries, reason))
        raise IgnoreRequest("%s %s" % (reason, "retry %s times. " % retries))

def process_requset_method_wrapper(func):
    @wraps(func)
    def wrapper_method(*args, **kwds):
        self = args[0]
        request = kwds.get("request")
        spider = kwds.get("spider")
        try:
            return func(*args, **kwds)
        except Exception as e:
            spider.logger.error("error happened in process_request method of %s in %s. Error:%s, processing %s," % (
                self.__class__.__name__, IP, traceback.format_exc(), request.url))
            spider.crawler.stats.set_failed_download(request.meta, str(e))
            raise IgnoreRequest(e)

    return wrapper_method

def process_response_method_wrapper(func):
    @wraps(func)
    def wrapper_method(*args, **kwds):
        self = args[0]
        request = kwds.get("request")
        response = kwds.get("response")
        spider = kwds.get("spider")
        try:
            return func(*args, **kwds)
        except Exception as e:
            spider.logger.error("error happened in process_response method of %s in %s. Error:%s, processing %s," % (
                self.__class__.__name__, IP, traceback.format_exc(), response.url))
            spider.crawler.stats.set_failed_download(request.meta, str(e))
            raise IgnoreRequest(e)

    return wrapper_method

def process_response(self, request, response, spider):
    try:
        if response.status in self._http_status_codes:
            raise BlacklistError(response, u'HTTP status {}'.format(response.status))
        self._counter += 1
        if self._counter > self._counter_max:
            logger.debug(u'Max requests: Change IP')
            self._reset_session()
        return response
    except BlacklistError as ex:
        logger.debug(
            u'Ignoring Blacklisted response %(response)r: %(message)r',
            {'response': response, 'message': ex.message},
            extra={'spider': spider},
        )
        self._reset_session()
        self.scheduler.process_exception(request, ex, spider)
        raise IgnoreRequest()

def process_exception(self, request, exception, spider):
    if 'proxy' not in request.meta:
        return
    if isinstance(exception, IgnoreRequest):
        return  # No problem

    mode = request.meta.get('proxy_mode', self.mode)  # Possible override
    if mode == 'once':  # Try once mode, quit here
        return

    # Simple downvote
    self.pp.set_status(self.map_proxy(request.meta['proxy']), None)
    del request.meta['proxy']  # Will pick new proxy on next request

    # List of conditions when we retry. Some of them may disable the proxy (TBD)
    if type(exception) in (
            ConnectionRefusedError, ConnectError, TimeoutError, TCPTimedOutError,
            NoRouteError, ResponseNeverReceived, ResponseFailed, TunnelError):
        lg.error('{} on %s'.format(type(exception)), request.url)
        return request.replace(dont_filter=True)

def process_response(self, request, response, spider):  # pylint:disable=unused-argument
    """
    Only allow HTTP response types that match the given list of filtering regexes
    """
    # to specify on a per-spider basis
    # type_whitelist = getattr(spider, "response_type_whitelist", None)
    type_whitelist = (r'text', )
    content_type_header = response.headers.get('content-type', None)
    if content_type_header and self.is_valid_response(type_whitelist, content_type_header):
        return response
    else:
        msg = "Ignoring request {}, content-type was not in whitelist".format(response.url)
        logging.info(msg)
        raise IgnoreRequest()

def process_response(self, request, response, spider):
    if request.meta.get('crack_retry_count', 0) > self.MAX_RETRY:
        raise IgnoreRequest('Max retries exceeded %s' % request.meta.get('original_request', request))

    if isinstance(response, HtmlResponse) and 'robot check' in ''.join(
            [x.strip().lower() for x in response.xpath('//title/text()').extract()]):
        self.cracking = True
        self.crawler.stats.inc_value('robot_check')
        # Log the url of the original request that got blocked
        self.logger.warning('robot check {}'.format(request.meta.get('original_request') or request))
        return self.request_image(request, response)
    elif request.meta.get('image_request', False):
        self.logger.debug('processing image {}'.format(request))
        return self.process_image(request, response)
    else:
        self.cracking = False
        return response

def process_request(self, request, spider):
    if not Domain.is_onion_url(request.url):
        return None

    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    subdomains = host.count(".")
    if subdomains > 2:
        raise IgnoreRequest('Too many subdomains (%d > 2)' % subdomains)
    return None

def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    if self.counter[host] < self.max_pages:
        self.counter[host] += 1
        spider.logger.info('Page count is %d for %s' % (self.counter[host], host))
        return None
    else:
        raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' % request.url)

def process_response(self, request, response, spider):
    logging.info('response url %s with proxy:%s got status %s ' % (response.url, request.meta['proxy'], response.status))
    if response.status != 200:
        if response.status == 301 or response.status == 404:
            Sup.letpagesgo(response.url)
            raise IgnoreRequest('found no pages')
        else:
            # other failures: discard this proxy and retry the request with a new proxy IP
            Sup.deleteProxy(request)
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request
    else:
        return response

def process_request(self, request, spider):
    if not request.url:
        return None

    url_hash = hashlib.md5(request.url.encode("utf8")).hexdigest()
    if self.redis_client.sismember(spider.name, url_hash):
        raise IgnoreRequest("Spider : %s, IgnoreRequest : %s" % (spider.name, request.url))
    else:
        self.redis_client.sadd(spider.name, url_hash)

def _process_requests(self, items_or_requests, start=False):
    """Acquire the webdriver manager when it's available for requests."""
    error_msg = "WebdriverRequests from start_requests can't be in-page."
    for request in iter(items_or_requests):
        if isinstance(request, WebdriverRequest):
            if start and isinstance(request, WebdriverActionRequest):
                raise IgnoreRequest(error_msg)
            request = self.manager.acquire(request)
            if request is WebdriverRequest.WAITING:
                continue  # Request has been enqueued, so drop it.
        yield request

def process_response(self, request, response, spider):
    # inspect the response status code
    http_code = response.status
    if http_code // 100 == 2:
        self.stats.inc_value('response/%d' % http_code, spider=spider)
        return response
    # 3xx redirect (other than 304): follow the Location header
    if http_code // 100 == 3 and http_code != 304:
        self.stats.inc_value('response/%d' % http_code, spider=spider)
        # get the redirect target url
        url = response.headers['location']
        domain = urlparse.urlparse(url).netloc
        # only follow it if the redirect url's domain is in allowed_domains
        if domain in spider.allowed_domains:
            return Request(url=url, meta=request.meta)
        else:
            raise IgnoreRequest(u'not allowed to crawl')
    if http_code // 100 == 4 and http_code != 403:
        self.stats.inc_value('response/%d' % http_code, spider=spider)
        # any 4xx other than 403: drop the request
        raise IgnoreRequest(u'404')
    if http_code // 100 == 5:
        self.stats.inc_value('response/%d' % http_code, spider=spider)
        return request
    # handle meta refresh redirects
    url = html.get_html_meta_refresh(response)
    if url:
        self.stats.inc_value('response/metarefresh', spider=spider)
        domain = urlparse.urlparse(url).netloc
        # only follow the meta refresh url if its domain is in allowed_domains
        if domain in spider.allowed_domains:
            return Request(url=url, meta=request.meta)

def callback(self, result):
    if result:
        self.logger.info('%s has been cached', self.request.url)
        raise IgnoreRequest('%s has been cached' % self.request.url)

def process_exception(self, request, exception, spider):
    if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
            and not request.meta.get('dont_retry', False):
        return self._retry(request, "%s:%s" % (exception.__class__.__name__, exception), spider)
    else:
        if request.meta.get("callback") == "parse":
            spider.crawler.stats.inc_total_pages(crawlid=request.meta['crawlid'])
        self.logger.error("in retry request error %s" % traceback.format_exc())
        raise IgnoreRequest("%s:%s unhandled error. " % (exception.__class__.__name__, exception))

def process_exception(self, request, exception, spider):
    logger.debug(
        u'Ignoring Exception: %(message)r',
        {'message': exception.message},
        extra={'spider': spider},
    )
    self._reset_session()
    self.scheduler.process_exception(request, exception, spider)
    raise IgnoreRequest()

def process_request(self, request, spider):  # pylint:disable=unused-argument
    """Process incoming request."""
    parsed_uri = urlparse(request.url)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    domain = domain.replace("http://", "").replace("https://", "") \
        .replace("/", "")
    banned_domains = settings.get('BANNED_DOMAINS')
    if hashlib.md5(domain).hexdigest() in banned_domains:
        # Do not execute this request
        request.meta['proxy'] = ""
        msg = "Ignoring request {}, This domain is banned." \
            .format(request.url)
        logging.info(msg)
        raise IgnoreRequest()

def process_request(self, request, spider):  # pylint:disable=unused-argument
    """Process incoming request."""
    hostname = urlparse(request.url).hostname
    if len(hostname.split(".")) > 4:
        # Do not execute this request
        request.meta['proxy'] = ""
        msg = "Ignoring request {}, too many sub domains." \
            .format(request.url)
        logging.info(msg)
        raise IgnoreRequest()

def _redirect(self, redirected, request, spider, reason):
    if self.domain_limit(spider) and \
            get_domain(redirected.url) != get_domain(request.url):
        raise IgnoreRequest('Redirecting off-domain')
    return super()._redirect(redirected, request, spider, reason)

def process_request(self, request, spider):
    if request.meta.get('from_qtwebkit', False):
        ext = urlparse(request.url).path.rsplit('.', 1)[-1]
        if ext in {'css', 'gif', 'png'}:
            raise IgnoreRequest()

def process_request(self, request, spider):
    if self.col.find_one({'$and': [
        {'host': spider.name},
        {'url': request.url},
        # {'download': {'$in': [0, 1, 2]}}
        {'download': {'$ne': -1}},
    ]}):
        logging.warning('the page is crawled, url is {0}'.format(request.url))
        raise IgnoreRequest()
    return None

def process_request(self, request, spider):
    url = request.url.split('?')[0]
    if self.col.find_one({'$and': [
        {'host': spider.name},
        {'url': url},
        # {'download': {'$in': [0, 1, 2]}}
        {'download': {'$ne': -1}},
    ]}):
        logging.warning('the page is crawled, url is {0}'.format(request.url))
        raise IgnoreRequest()
    return None

def process_request(self, request, spider):
    if 'http://v.youku.com/v_show/' in request.url:
        url = request.url.split('?')[0]
    else:
        url = request.url
    if self.col.find_one({'$and': [
        {'host': spider.name},
        {'url': url},
        # {'download': {'$in': [0, 1, 2]}}
        {'download': {'$ne': -1}},
    ]}):
        logging.warning('the page is crawled, url is {0}'.format(url))
        raise IgnoreRequest()
    return None

def process_request(self, request, spider):
    def set_auth(request, proxy):
        if proxy.creds:
            request.headers['Proxy-Authorization'] = proxy.creds

    lg.debug('in process_request: {}, {}'.format(request, request.meta))

    pa = request.meta.pop('proxy_action', None)
    if pa == 'disable':
        self.pp.set_status(self.map_proxy(request.meta['proxy']), 'D')
        del request.meta['proxy']  # Make it pick another proxy
    elif pa == 'release':
        proxy = self.map_proxy(request.meta['proxy'])
        self.pp.release_proxy(proxy)
        raise IgnoreRequest

    # Don't overwrite with a random one (server-side state for IP)
    if 'proxy' in request.meta:
        proxy = self.map_proxy(request.meta['proxy'])
        set_auth(request, proxy)
        return  # No fuss, we have a proxy already

    if self.mode == 'random':
        proxy = self.pp.get_proxy(True)
    elif self.mode == 'sequential':
        proxy = self.pp.get_proxy()

    request.meta['proxy'] = proxy.p
    set_auth(request, proxy)
    lg.debug('Using proxy ' + proxy.p)

    # Start setup_session anew wherever we are, fresh or recurring
    req = request.meta.get('ss_request')
    if req:
        # Store original request to use after the session is setup
        if 'original_request' not in request.meta:
            request.meta['original_request'] = request
    else:
        req = request

    return req.replace(meta=request.meta, dont_filter=True)