Python scrapy.exceptions module, CloseSpider() example source code

We extracted the following 43 code examples from open-source Python projects to illustrate how to use scrapy.exceptions.CloseSpider().

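Before the project examples, here is a minimal, self-contained sketch of the common pattern (the spider name, start URL, and item limit are illustrative assumptions, not taken from any of the projects below): raising CloseSpider inside a callback asks Scrapy to stop the crawl and records the given string as the close reason.

# Minimal sketch of the CloseSpider pattern; all names here are illustrative.
import scrapy
from scrapy.exceptions import CloseSpider

class LimitedQuotesSpider(scrapy.Spider):
    name = 'limited_quotes'
    start_urls = ['http://quotes.toscrape.com/']
    max_items = 50  # assumed limit, for demonstration only

    def __init__(self, *args, **kwargs):
        super(LimitedQuotesSpider, self).__init__(*args, **kwargs)
        self.item_count = 0

    def parse(self, response):
        for quote in response.css('div.quote'):
            if self.item_count >= self.max_items:
                # Scrapy catches this exception and shuts the spider down,
                # reporting 'item_limit_reached' as the close reason.
                raise CloseSpider('item_limit_reached')
            self.item_count += 1
            yield {'text': quote.css('span.text::text').extract_first()}

        next_page = response.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
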
Project: NewsScrapy    Author: yinzishao    | project source | file source
def parse_news(self,response):
        item = response.meta.get("item",None)
        # # (disabled) stop the crawl once articles fall outside the date window
        # news_date = item.get("news_date",None)
        # if news_date:
        #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
        #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        #
        #     delta = self.end_now-struct_date
        #     if delta.days == self.end_day:
        #         # pass
        #         raise CloseSpider('today scrapy end')
        soup = BeautifulSoup(response.body)
        news_content_group = soup.find("div",class_="entry-content group")
        # strip the related-posts block before extracting the text
        news_content_group.find("div",class_="related_posts").replace_with("")
        content = news_content_group.text.strip()
        item["content"] = content
        item["catalogue"] = u"????"
        yield item
Project: NewsScrapy    Author: yinzishao    | project source | file source
def parse_article(self,response):
        #content,news_no,crawl_date
        item = response.meta.get("item",NewsItem())
        # news_date = item.get("news_date",None)
        # if news_date:
        #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
        #     delta = self.end_now-struct_date
        #     print delta.days
        #     if delta.days == self.end_day:
        #         raise CloseSpider('today scrapy end')
        soup =BeautifulSoup(response.body)
        author = soup.find("span",class_="name").text if soup.find("span",class_="name") else None
        abstract =  soup.find("p",class_="excerpt").text if soup.find("p",class_="excerpt") else None
        content = soup.find("div",class_="detail").text if soup.find("div",class_="detail") else None
        news_no = response.url.split("/")[-1][:-5]
        item["author"] = author
        item["abstract"] = abstract
        item["content"] = content
        item["crawl_date"] = NOW
        item["news_no"] = news_no
        yield item
Project: wechat-crawler    Author: DMGbupt    | project source | file source
def parse_search(self, response):
        """
        @summary: parse the account search result page and build the request for the JSON article list
        @param response: response to the request yielded by start_requests()
        """
        # When the crawler is detected, Sogou redirects to a URL containing "antispider"
        # and asks for a verification code, so the spider has to stop and wait before retrying.
        if "antispider" in response.url:
            spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
            time.sleep(43200) # wait 12 hours before the spider restarts
            raise CloseSpider('antispider')
        # ext is the href of the first matched account; it is used to build the JSON article-list url
        ext = response.xpath(
            '//div[@class="wx-rb bg-blue wx-rb_v1 _item"][1]/@href').extract() # take only the first matched official account as ext
        if not ext:
            spider_logger.error("Faild searching {0} !".format(response.meta['query']))
            return
        # build the JSON url for the first result page (page=1); each page holds 10 results
        json_url = "".join(ext).replace('/gzh?','http://weixin.sogou.com/gzhjs?')+'&cb=sogou.weixin_gzhcb&page=1&gzhArtKeyWord='
        cookies = response.meta['cookies']
        yield Request(json_url, callback= self.parse_index, cookies=cookies, meta ={'cookies':cookies})
Project: scrapy_rss    Author: woxcab    | project source | file source
def spider_opened(self, spider):
        try:
            file = open(spider.settings.get('FEED_FILE'), 'wb')
        except TypeError:
            raise NotConfigured('FEED_FILE parameter is not a string or does not exist')
        except (IOError, OSError) as e:
            raise CloseSpider('Cannot open file {}: {}'.format(spider.settings.get('FEED_FILE', None), e))
        self.files[spider] = file
        feed_title = spider.settings.get('FEED_TITLE')
        if not feed_title:
            raise NotConfigured('FEED_TITLE parameter does not exist')
        feed_link = spider.settings.get('FEED_LINK')
        if not feed_link:
            raise NotConfigured('FEED_LINK parameter does not exist')
        feed_description = spider.settings.get('FEED_DESCRIPTION')
        if feed_description is None:
            raise NotConfigured('FEED_DESCRIPTION parameter does not exist')
        feed_exporter = spider.settings.get('FEED_EXPORTER', RssItemExporter)
        if isinstance(feed_exporter, six.string_types):
            feed_exporter = load_object(feed_exporter)
        if not issubclass(feed_exporter, RssItemExporter):
            raise TypeError("FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(feed_exporter))
        self.exporters[spider] = feed_exporter(file, feed_title, feed_link, feed_description)
        self.exporters[spider].start_exporting()
Project: YelpCrawlSpider    Author: yjp999    | project source | file source
def parse(self, response):
        if response.status ==503:
            raise CloseSpider("denied by remote server")
        sel = Selector(response)
        appends = response.meta['appends']
        cityname = appends['city']
        smexp = appends['cat']
        xpath_exp = '//a[text()="Search for more '+smexp+'"]/@href'
        if cityname=='??':
            moreLink = ['http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Hong+Kong', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=???%2C+Hong+Kong']
        elif cityname=='Adelaide':
            moreLink = ['http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Adelaide%2C+Adelaide+South+Australia%2C+Australia', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Adelaide+South+Australia+5000']
        elif cityname=='Park La Brea':
            moreLink = ['http://www.yelp.com/search?cflt='+self.cat+'&find_loc=South+La+Brea+Avenue%2C+Los+Angeles%2C+CA+90056', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=Mid-Wilshire%2C+Los+Angeles%2C+CA', 'http://www.yelp.com/search?cflt='+self.cat+'&find_loc=North+La+Brea+Avenue%2C+Los+Angeles%2C+CA']
        else:
            searchmore = sel.xpath(xpath_exp).extract()[0]
            moreLink = [response.urljoin(searchmore)]

        for link in moreLink:
            yield Request(url=link, callback=self.parseBegin, meta={'appends': appends}, dont_filter=True)
Project: scrapy-rotating-proxies    Author: TeamHG-Memex    | project source | file source
def process_request(self, request, spider):
        if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
            return
        proxy = self.proxies.get_random()
        if not proxy:
            if self.stop_if_no_proxies:
                raise CloseSpider("no_proxies")
            else:
                logger.warn("No proxies available; marking all proxies "
                            "as unchecked")
                self.proxies.reset()
                proxy = self.proxies.get_random()
                if proxy is None:
                    logger.error("No proxies available even after a reset.")
                    raise CloseSpider("no_proxies_after_reset")

        request.meta['proxy'] = proxy
        request.meta['download_slot'] = self.get_proxy_slot(proxy)
        request.meta['_rotating_proxy'] = True
Project: NewsScrapy    Author: yinzishao    | project source | file source
def parse_news(self,response):
        #content,news_date,news_no,crawl_date,referer_web
        item = response.meta.get("item",NewsItem())
        pageindex = response.meta.get("pageindex",1)
        soup = BeautifulSoup(response.body)
        # news_date = item.get("news_date",None)
        # parse the publish date of the article
        news_date = soup.find("span",class_="arial").text if soup.find("span",class_="arial") else None
        #http://info.meadin.com/PictureNews/2938_1.shtml Exception
        if news_date:

            # struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
            # delta = self.end_now-struct_date
            # if delta.days == self.end_day:
            #     raise CloseSpider('today scrapy end')
            referer_web = list(soup.find("p",class_="source").strings)[-1] if soup.find("p",class_="source") else None
            # article body
            art,content = None,None
            art = soup.find("div",class_="article js-article")
            if art:
                # drop the intro block before extracting the text
                art.find("div",class_="intro").replace_with("")
                content =art.text.strip()
            news_no =response.url.split("/")[-1].split("_")[0]
            item["news_date"]=news_date
            item["content"]=content
            item["referer_web"]=referer_web
            item["crawl_date"]=NOW
            item["news_no"]=news_no
            item = judge_news_crawl(item)
            if item:
                yield item
            else:
                self.flag = pageindex
        else:
            logger.warning("can't find news_date.the url is %s" % response.url)
Project: NewsScrapy    Author: yinzishao    | project source | file source
def parse(self, response):
        # parse the news list page
        html = response.body
        soup = BeautifulSoup(html,"lxml")
        # iterate over the news items on the page
        for i in self.fetch_newslist(soup):
            # raise CloseSpider(str(i['time'] == u"???"))
            # if i['time'] == "???": raise CloseSpider("today news end")
            request = scrapy.Request(i['news_url'],callback=self.parse_news)
            request.meta['item'] = i
            request.meta['pageindex'] = 1
            yield request

        # find the lasttime attribute used for paging
        lasttime = "nothing"
        for i in  soup.select('div[class="news_li"]'):
            if i.attrs.has_key("lasttime"):
                lasttime =  i["lasttime"]
                break
        # build the url of the next-page ajax request
        # ???load_chosen.jsp?nodeids=25949&topCids=1495258,1494171,1495064,1495130,1495285,&pageidx=
        load_chosen = re.search(r'data.:."(.*)".+.masonry',html)
        page = 2
        if load_chosen :
            tp_url = "http://www.thepaper.cn/load_chosen.jsp?%s%s&lastTime=%s" % (load_chosen.group(1),page,lasttime)
            yield scrapy.Request(tp_url, callback=self.next_page_parse)
Project: EasyGoSpider    Author: Karmenzind    | project source | file source
def start_requests(self):
        # while len(self.finished) < len(self.all_urls):
        current_hour = time.strftime("%Y%m%d%H", time.localtime())
        if current_hour != START_HOUR:
            self.logger.info("It's already %s. Stopping..." % current_hour)
            return
        for url, item_idx in self.all_urls.iteritems():
            if not self.cookies:
                raise CloseSpider("No enough cookies.")
            if item_idx in self.finished:
                continue
            else:
                yield Request(url, callback=self.parse_item)
                # self.logger.info(u'Crawled %s / %s. Done :)' % (len(self.finished), len(self.all_urls)))
Project: wechat-crawler    Author: DMGbupt    | project source | file source
def process_response(request, response, spider):
        if "antispider" in response.url:
            spider_logger.error("recieve verification code in %s" % response.url) 
            raise CloseSpider('antispider')
        return response
Project: wechat-crawler    Author: DMGbupt    | project source | file source
def __init__(self,
                 query=None,
                 start_time=None,
                 end_time=None,
                 index_pages=None):
        """
        @summary: initialize the crawl parameters; only the search keyword is required
        @param query: search keyword, a single word per run
        @param start_time: only articles published after start_time are collected (defaults to 100 days ago)
        @param end_time: only articles published before end_time are collected (defaults to now)
        @param index_pages: maximum number of index pages to crawl (defaults to 10)
        """
        # a single search keyword is required for each run
        if query:
            self.query = query # keyword used to query the search engine
        else:
            # abort when no search keyword is given
            spider_logger.error("Spider needs a single search word each time! Check input!")
            raise CloseSpider('invalid search word')
        # start of the crawl window; defaults to 100 days ago
        if start_time:
            self.from_time = start_time
        else:
            self.from_time = datetime.now()-timedelta(days=100)  # default: 100 days ago
        # end of the crawl window
        if end_time:
            self.end_time = end_time
        else:
            self.end_time = datetime.now()  # default: now
        # number of index pages to crawl
        if index_pages:
            self.index_pages = int(index_pages)
        else:
            self.index_pages = 10 # default: 10 pages
Project: wechat-crawler    Author: DMGbupt    | project source | file source
def parse_index(self, response):
        """
        @summary: parse the JSON article list and generate a Request for each article page
        @param response: response to the request yielded by parse_search()
        @return: list of Requests for article pages and, when needed, the next index page
        """
        if "antispider" in response.url:
            spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
            time.sleep(43200)
            raise CloseSpider('antispider')
        requests = []
        page_list = self._get_result(response)
        # stop when the result list is empty
        if not page_list:
            return requests
        next_page = True  # whether to request the next index page
        # walk through the parsed results
        for item in page_list:
            if isinstance(item, Request):  # _get_result() may return a Request directly
                requests.append(item)
                next_page = False
                break
            if item['publish_time'] <= self.from_time:  # published before self.from_time: stop paging
                next_page = False
                break
            elif item['publish_time'] > self.end_time:  # published after self.end_time: skip
                continue
            else:
                req = Request(item['url'], self.parse_page)
                # pass the parsed fields on to the article page callback
                req.meta["item"] = item
                requests.append(req)
        # request the next index page when all items on this page were new
        if next_page and self._next_result_page(response):
            cookies = response.meta['cookies']
            requests.append(Request(self._next_result_page(response),callback=self.parse_index,cookies=cookies, meta ={'cookies':cookies}))
        return requests
Project: wechat-crawler    Author: DMGbupt    | project source | file source
def parse_page(self, response):
        """
        @summary: parse a single article page
        @param response: response to the request yielded by parse_index()
        @return: the item completed by _finish_item()
        """
        if "antispider" in response.url:
            spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
            time.sleep(43200)
            raise CloseSpider('antispider')
        item = response.meta["item"]
        return self._finish_item(item, response)
Project: finance_news_analysis    Author: pskun    | project source | file source
def process_eastmoney_gubalist_item(self, item, spider):
        status = item.get('status')
        if status is not None and status != 200:
            self.error_count += 1
            if self.error_count * 5 > self.success_count:
                raise CloseSpider(
                    'too many error occurred, shutdown gracefully.')
            return item

        if 'ticker_id' not in item or item['ticker_id'] == "":
            raise DropItem('missing ticker_id')
        self.write_to_file(item, spider.name)
        pass
Project: scrapy_rss    Author: woxcab    | project source | file source
def test_empty_feed(self):
        for partial_settings in itertools.chain.from_iterable(
                itertools.combinations(self.feed_settings.items(), r)
                for r in range(1, len(self.feed_settings))):
            partial_settings = dict(partial_settings)
            undefined_settings = [name.upper() for name in set(self.feed_settings) - set(partial_settings)]
            with self.assertRaisesRegexp(NotConfigured,
                                         '({})'.format('|'.join(undefined_settings))
                                            if len(undefined_settings) > 1 else undefined_settings[0],
                                         msg='The feed file, title, link and description must be specified, but the absence of {} is allowed'
                                             .format(undefined_settings)):
                with CrawlerContext(**partial_settings):
                    pass

        with self.assertRaises(CloseSpider):
            feed_settings = dict(self.feed_settings)
            feed_settings['feed_file'] = 'non/existent/filepath'
            with CrawlerContext(**feed_settings):
                pass

        with CrawlerContext(**self.feed_settings):
            pass

        with open(self.feed_settings['feed_file']) as data, \
             open(os.path.join(os.path.dirname(__file__), 'expected_rss', 'empty_feed.rss')) as expected:
            self.assertUnorderedXmlEquivalentOutputs(data.read(), expected.read())
Project: Spider    Author: poluo    | project source | file source
def close_spider(self, reason):
        raise CloseSpider(reason=reason)

    # do something before spider close
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False
        published_at_wib = ''

        try:
            # Get list of news from the current page
            articles = json.loads(response.text)

            for article in articles['contents']:
                url = article['friendlyURL']
                date = article['publishTime']
                published_at_wib = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                published_at = wib_to_utc(published_at_wib)

                if self.media['last_scraped_at'] >= published_at:
                    is_no_update = True
                    break

                yield Request('http://pilkada.arah.com' + url, callback=self.parse_news)
        except:
            raise CloseSpider('article not found')

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # Get more
        try:
            next_date = published_at_wib - timedelta(seconds=1)

            if self.media['last_scraped_at'] < wib_to_utc(next_date):
                yield Request('http://pilkada.arah.com/api/article/8/' + str(next_date)[:19],
                        callback=self.parse)
        except:
            pass

    # Collect news item
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: %s' % response)
        has_no_update = False

        # Get list of news from the current page
        for article in response.css('.col-sm-16 > .row > .col-sm-16 > .row'):
            title = article.css('h4::text').extract_first()
            url = article.css('a::attr(href)').extract_first()            
            time = article.css('.indexTime::text').extract_first() # 16:51

            date = article.css('.indexDay::text').extract_first() # Sabtu, 15 Oktober 2016
            date = date.split(',')[-1].strip() # 15 Oktober 2016

            date_time = date + ' ' + time # 15 Oktober 2016 16:51
            date_time = date_time.split(' ')
            date_time = ' '.join([_(s) for s in date_time]) # Oktober => October

            # Parse date information
            try:
                published_at_wib = datetime.strptime(date_time, '%d %B %Y %H:%M')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                has_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if has_no_update:
            self.logger.info('Media have no update')
            return

        # Currently has no more pages
Project: rojak    Author: pyk    | project source | file source
def parse_news_pilkada(self, loader, response):
        date_selector = response.css('.block-judul-artikel > .tanggal::text')
        try:
            date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
            date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
            published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
        except Exception:
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)

        if (self.media['last_scraped_at'] >= published_at):
            is_no_update = True
            self.logger.info('Media have no update')
            raise CloseSpider('finished')
        loader.add_value('published_at', published_at)

        title_selector = response.css('.block-judul-artikel > .judul-artikel')
        loader.add_value('title', title_selector.extract()[0])

        raw_content_selector = response.css('.block-artikel .p-artikel')
        raw_content_selector = raw_content_selector.xpath('//p[not(iframe)]')
        raw_content = ''
        for rsl in raw_content_selector:
            raw_content = raw_content + rsl.extract().strip()
        loader.add_value('raw_content', raw_content)

        author_name = ''
        for author_name_selector in reversed(raw_content_selector):
            author_name_selector = author_name_selector.css('strong::text')
            for tmp in reversed(author_name_selector.extract()):
                tmp = tmp.strip()
                if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                    author_name = tmp
                    break
            if author_name:
                break
        author_name = ','.join(author_name.split(' | '))
        loader.add_value('author_name', author_name)
        loader.add_value('url', response.url)
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: {}'.format(response))
        is_no_update = False

        # Collect list of news from current page
        # Note: no next page button on cnnindonesia, all is loaded here
        article_selectors = response.css('a.list_kontribusi')
        if not article_selectors:
            raise CloseSpider('article_selectors not found')

        for article in article_selectors:
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example: Jumat, 23/09/2016 21:17
            info_selectors = article.css('div.text > div > span.tanggal::text')
            if not info_selectors:
                raise CloseSpider('info_selectors not found')
            info = info_selectors.extract()[0]
            info_time = info.split(',')[1].strip()

            # Parse date information
            try:
                # Example: 23/09/2016 21:17
                published_at_wib = datetime.strptime(info_time, '%d/%m/%Y %H:%M')
            except ValueError as err:
                raise CloseSpider('cannot_parse_date: {}'.format(err))
            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break
            # For each url we create new scrapy Request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: {}'.format(response))
        is_no_update = False

        # Collect list of news from current page
        articles = json.loads(response.body)['response']
        for article in articles:
            # Example: 2016-10-12 15:16:04
            date_time_str = article['news_date_publish']

            # Parse date information
            try:
                published_at_wib = datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
            except Exception as e:
                raise CloseSpider('cannot_parse_date: {}'.format(e))
            published_at = wib_to_utc(published_at_wib)

            if (self.media['last_scraped_at'] >= published_at):
                is_no_update = True
                break

            for sub_article in article['news_content']:
                yield self.parse_news(article, sub_article)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # Collect news on next page
        if len(articles) > 0:
            # Example: 'http://api.merdeka.com/mobile/gettag/pilgub-dki/0/20/L9pTAoWB269T&-E/'
            next_page_url = response.url.split('/')
            next_page_url[-4] = str(int(next_page_url[-4]) + 20)
            next_page_url = '/'.join(next_page_url)
            yield Request(next_page_url, callback=self.parse)

    # Collect news item
Project: frontoxy    Author: fabienvauchelles    | project source | file source
def check_error(self):
        # Stop spider if error has been raised in pipeline
        if hasattr(self, 'close_error'):
            raise CloseSpider(self.close_error)
Project: aliexpress    Author: yangxue088    | project source | file source
def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and FeedbackSpider.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['productId'][0])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')
Project: aliexpress    Author: yangxue088    | project source | file source
def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and self.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['storeId'][0])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')
Project: aliexpress    Author: yangxue088    | project source | file source
def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and self.ids.add(url[url.rfind('/') + 1:url.rfind('.')])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')
Project: aliexpress    Author: yangxue088    | project source | file source
def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and self.ids.add(url[url.rfind('/') + 1:])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')
Project: aliexpress    Author: yangxue088    | project source | file source
def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and OrderSpider.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['productId'][0])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')
Project: gaokao    Author: EasyData    | project source | file source
def parse(self, response):

        data = json.loads(response.body)
        total = int(data['totalRecord']['num'])
        total_page = int(math.ceil(total/float(self.page_size)))

        if total == 0:
            raise CloseSpider('blocked')

        for i in self.parse_item(response):
            yield i

        for page in range(2, total_page+1):
            yield Request(url=self.get_url(page), callback=self.parse_item)
Project: ws-backend-community    Author: lavalamp-    | project source | file source
def __check_for_close(self):
        """
        Check to see if this spider has been running for longer than the maximum amount
        of allowed time, and stop the spider if it has.
        :return: None
        """
        if self._start_time is None:
            self._start_time = DatetimeHelper.now()
        elapsed_time = (DatetimeHelper.now() - self.start_time).total_seconds()
        if elapsed_time > self.max_run_time:
            raise CloseSpider(
                "Spider run time exceeded maximum time of %s seconds. Closing."
                % (self.max_run_time,)
            )
Project: livetv_mining    Author: taogeT    | project source | file source
def open_spider(self, spider):
        site_setting = spider.settings.get('SITE')
        if not site_setting:
            error_msg = 'Can not find the website configuration from settings.'
            spider.logger.error(error_msg)
            raise CloseSpider(error_msg)
        self.session = self.session_maker()
        site = self.session.query(LiveTVSite).filter(LiveTVSite.code == site_setting['code']).one_or_none()
        if not site:
            site = LiveTVSite(code=site_setting['code'], name=site_setting['name'],
                              description=site_setting['description'], url=site_setting['url'],
                              image=site_setting['image'], show_seq=site_setting['show_seq'])
            self.session.add(site)
            self.session.commit()
        self.site[site.code] = {'id': site.id, 'starttime': datetime.utcnow(), 'channels': {}}
Project: NewsScrapy    Author: yinzishao    | project source | file source
def parse_news(self,response):
        # print response.url,"response"
        PageKey = response.meta.get("topic_id")
        PageNumber =response.meta.get("PageNumber")
        flag_id =str(int(PageKey)-40037910)
        soup =BeautifulSoup(response.body,"lxml")
        #2016-07-13
        news_date = soup.find("time").text if soup.find("time") else None
        # print self.flag[flag_id],int(PageNumber)
        """
        Parse only while self.flag[flag_id] is unset (0/None) or while the current page
        number equals the stored flag; once an article falls outside the crawl window,
        self.flag[flag_id] is set to that page number so later pages are skipped.
        """
        if not self.flag[flag_id] or int(PageNumber)==self.flag[flag_id]:
            # article may still be inside the crawl window: check the date


            struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
            # print self.end_now,struct_date,"time"
            delta = self.end_now-struct_date
            # print delta.days,"delta day ~~~~~~~~~~~~~~~~"
            if delta.days > self.end_day:
                self.flag[str(flag_id)]=int(PageNumber)
                # print flag_id,"stop ~~~~~~"
                # raise CloseSpider('today scrapy end')
            else:

                head = soup.find("div",class_="post-head")
                topic,title,abstract=None,None,None
                if head:
                    topic = head.find("span",class_="category").text if head.find("span",class_="category") else None
                    title =head.find("h1",class_="h1").text if head.find("h1",class_="h1") else None
                    abstract = head.find("span",class_="kicker").text if head.find("span",class_="kicker") else None
                content = soup.find("div",class_="post-body clearfix").text if soup.find("div",class_="post-body clearfix") else None
                news_no = response.url.split("/")[-1].split("?")[0]
                # TODO: handle content rendered by js
                item = NewsItem(title=title,topic=topic,
                                abstract=abstract,news_date=news_date,
                                content=content,news_no=news_no
                                ,crawl_date=NOW,news_url=response.url,catalogue='????')
                yield item
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: {}'.format(response))
        is_no_update = False

        # Collect list of news from current page
        articles_grid = response.css('li:not(.last) > div.grid')
        articles = zip(articles_grid, [NEWS_GRID] * len(articles_grid))
        articles += zip(response.css('div.topic'), [NEWS_HEADLINE])

        if not articles:
            raise CloseSpider('article not found')

        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = None
            if article[1] == NEWS_GRID:
                url_selectors = article[0].css('h2 > a::attr(href)')
            elif article[1] == NEWS_HEADLINE:
                url_selectors = article[0].css('h1 > a::attr(href)')

            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            self.logger.info('Url: {}'.format(url))

            # Example: Minggu, 09 Oct 2016 15:14
            info_selectors = article[0].css('div.reg::text')
            if not info_selectors:
                raise CloseSpider('info_selectors not found')
            info = info_selectors.extract()[1]
            # Example: 09 Oct 2016 15:14
            info_time = info.split(',')[1].strip()

            # Parse date information
            try:
                published_at_wib = datetime.strptime(info_time, '%d %b %Y %H:%M')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: {}'.format(e))

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break
            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # Collect news on next page
        if response.css('div.bu.fr > a'):
            next_page = response.css('div.bu.fr > a[rel="next"]::attr(href)').extract()[0]
            next_page_url = response.urljoin(next_page)
            yield Request(next_page_url, callback=self.parse)

    # Collect news item
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: {}'.format(response))
        is_no_update = False

        # Collect list of news from current page
        article_selectors = response.css('ul.indexlist > li')
        if not article_selectors:
            raise CloseSpider('article_selectors not found')
        for article in article_selectors:
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example: 7 Oktober 2016 19:37
            info_selectors = article.css('div.upperdeck::text')
            if not info_selectors:
                raise CloseSpider('info_selectors not found')
            info = info_selectors.extract()[1]
            info = info.split(',')[1].replace('\t','').strip()
            # Example: 7 October 2016 19:37
            info_time = info.split(' ')
            info_time = ' '.join([_(s) for s in info_time])

            # Parse date information
            try:
                published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
            except ValueError as err:
                raise CloseSpider('cannot_parse_date: {}'.format(err))
            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break
            # For each url we create new scrapy Request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # Collect news on next page
        tag_selectors = response.css('div.pagination > a')
        if not tag_selectors:
            raise CloseSpider('tag_selectors not found')
        for tag in tag_selectors:
            more_selectors = tag.css('a::text')
            if not more_selectors:
                raise CloseSpider('more_selectors not found')
            more = more_selectors.extract()[0]
            if more == 'NEXT':
                next_page = tag.css('a::attr(href)').extract()[0]
                next_page_url = response.urljoin(next_page)
                yield Request(next_page_url, callback=self.parse)

    # Collect news item
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('div.view-front > div.view-content > div.views-row')

        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('span.field-content a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example '19 Oct 2016'
            info_selectors = article.css('span.field-content::text')
            if not info_selectors:
                raise CloseSpider('info_selectors not found')
            info_time = info_selectors.extract()[1].strip()

            # Parse date information
            try:
                published_at_wib = datetime.strptime(info_time, '%d %b %Y')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request('http://www.qureta.com' + url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        if response.css('li.next'):
            next_page_url = response.css('li.next > a::attr(href)')[0].extract()
            yield Request('http://www.qureta.com' + next_page_url, callback=self.parse)

    # Collect news item
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('li.media')
        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example '02 November 2016'
            date_selectors = article.css('time::text')
            if not date_selectors:
                raise CloseSpider('date_selectors not found')

            # Parse date information
            try:
                date = date_selectors.extract()[0].split(' ')
                # Sanitize month - Indo month to Eng month
                # Example: 02 Nov 2016
                date[1] = sanitize(date[1])
                published_at_wib = datetime.strptime(' '.join(date),
                    '%d %b %Y')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request('http:' + url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # try getting next page
        try:
            next_page_url = response.xpath(
                    '//section[@class="pagination-numeric"]/span/a/@href')[-1].extract()

            if next_page_url and next_page_url != response.url:
                yield Request(next_page_url, callback=self.parse)
        except:
            pass

    # Collect news item
Project: rojak    Author: pyk    | project source | file source
def parse_news_metro(self, response):
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        date_selector = response.css('.artikel > div.block-tanggal::text')
        if not date_selector:
            return self.parse_news_pilkada(loader, response)
        try:
            date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
            date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
            published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
        except Exception:
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        if (self.media['last_scraped_at'] >= published_at):
            is_no_update = True
            self.logger.info('Media have no update')
            raise CloseSpider('finished')
        loader.add_value('published_at', published_at)

        title_selector = response.css('.artikel > h1::text')
        if not title_selector:
            return loader.load_item()
        loader.add_value('title', title_selector.extract()[0])

        # Select all p which don't have iframe inside it
        raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
        if not raw_content_selector:
            return loader.load_item()
        raw_content = ''
        for rsl in raw_content_selector:
            raw_content = raw_content + rsl.extract().strip()

        # Go to next page while there is next page button
        next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
        if next_page_selector:
            return Request(next_page_selector.extract()[0], callback=lambda x, loader=loader, raw_content=raw_content: self.parse_next_page_metro(x, loader, raw_content))

        loader.add_value('raw_content', raw_content)

        # The author usually put inside <strong> tag, however, some news is not using <strong> tag.
        # NOTE: this block of code may need revision in the future
        author_name = ''
        for author_name_selector in reversed(raw_content_selector):
            author_name_selector = author_name_selector.css('strong::text')
            for tmp in reversed(author_name_selector.extract()):
                tmp = tmp.strip()
                if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                    author_name = tmp
                    break
            if author_name:
                break
        author_name = ','.join(author_name.split(' | '))
        loader.add_value('author_name', author_name)
        return loader.load_item()
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: {}'.format(response))
        is_no_update = False

        for article in response.css('li > div.breaking-title'):
            # http://metro.sindonews.com/read/1146316/171/penyidik-bareskrim-mulai-dalami-video-dugaan-penistaan-agama-1476179831
            url_selectors = article.css('a::attr(href)')

            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example 'Kamis, 13 Oktober 2016 - 11:18 WIB'
            date_time_str_selectors = article.css('p::text')

            if not date_time_str_selectors:
                raise CloseSpider('date_time_str_selectors not found')

            date_time_str = date_time_str_selectors.extract()[0]

            # Parse date information
            # Example '13 Oktober 2016 - 11:18'
            date_time_str = date_time_str.split(',')[1].strip()[:-4]
            date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
            try:
                published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
            except Exception as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        for next_button in response.css('.mpaging > ul > li'):
            if len(next_button.css('a:not(.active) > .fa-angle-right')) > 0:
                next_page = next_button.css('a::attr(href)').extract()[0]
                next_page_url = response.urljoin(next_page)
                yield Request(next_page_url, callback=self.parse)
                break

    # Collect news item
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('article')
        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example: 'Monday, 24/11/2016 | 13:54'
            date_selectors = article.css('time::text')
            if not date_selectors:
                raise CloseSpider('date_selectors not found')

            # Parse date information
            try:
                date = date_selectors.extract()[0].split(' ')
                published_at_wib = datetime.strptime(' '.join(date[1:]), '%d/%m/%Y | %H:%M')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # try getting next page
        try:
            next_page_url = response.css('div.pagination > a.next::attr(href)').extract()[0]

            if next_page_url:
                yield Request(next_page_url, callback=self.parse)
        except:
            pass

    # Collect news item
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('div.wp-terhangat > div.item3')

        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example 'Wednesday, 02 November 2016'
            date_selectors = article.css('span.date::text')
            if not date_selectors:
                raise CloseSpider('date_selectors not found')

            # Parse date information
            try:
                date = date_selectors.extract()[0].split(' ')
                # Sanitize month - Indo month to Eng month
                # Example: Wednesday, 02 Nov 2016
                date[2] = sanitize(date[2])
                published_at_wib = datetime.strptime(' '.join(date[1:]),
                    '%d %b %Y')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            # if it's news from before 2015, drop them
            if self.media['last_scraped_at'] >= published_at or int(date[-1]) < 2015:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # try getting next page
        try:
            next_page_url = response.css('nav > ul > li > a::attr(href)').extract()[-1]

            if next_page_url:
                yield Request(next_page_url, callback=self.parse)
        except:
            pass

    # Collect news item
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        is_no_update = False

        news_selector = response.css("ul.clearfix > li > div.tleft")
        if not news_selector:
            raise CloseSpider('news_selectors not found')
        for news in news_selector:
            url_selectors = news.css("div.tleft > h3 > a::attr(href)")
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            # http://megapolitan.kompas.com/read/xml/2016/10/18/17244781/ini.alat.peraga.kampanye.yang.boleh.dibuat.cagub-cawagub.dki
            # http://api.kompas.com/external/?type=readdua&kanal=home&command=.xml.2016.10.15.07300081&format=json&APPSF0UNDRYBYPASS=%20HTTP/1.1
            url = url_selectors.extract()[0]
            url = 'http://api.kompas.com/external/?type=readdua&kanal=home&command=.xml.' + '.'.join(url.split('/')[-5:-1]) + '&format=json&APPSF0UNDRYBYPASS=%20HTTP/1.1'

            date_selectors = news.css("div.grey.small::text")
            if not date_selectors:
                raise CloseSpider('date_selectors not found')
            raw_date = date_selectors.extract()[0]

            # Parse date information
            try:
                published_at = self.convert_date(raw_date)
            except Exception as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url=url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # For kompas case, we don't rely on the pagination
        # Their pagination is max 17 pages, the truth is they have 25 pages
        if self.first_time:
            template_url = 'http://lipsus.kompas.com/topikpilihanlist/3754/{}/Pilkada.DKI.2017'
            for i in xrange(25):
                page = i + 1
                next_url = template_url.format(page)
                yield Request(next_url, callback=self.parse)
            self.first_time = False
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('div.article-snippet__info')
        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('a::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            info_selectors = article.css('div.article-snippet__date')
            info_selectors = info_selectors.css('.timeago::text')
            if not info_selectors:
                raise CloseSpider('info_selectors not found')
            # Example '13 Okt 2016 16:10'
            info_time = info_selectors.extract()[0]
            # Example '13 Oct 2016 16:10'
            info_time = ' '.join([_(w) for w in info_time.split(' ')])

            # Parse date information
            try:
                published_at_wib = datetime.strptime(info_time,
                    '%d %b %Y %H:%M')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: {}'.format(e))

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy Request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

    # TODO: Collect news item
Project: rojak    Author: pyk    | project source | file source
def parse(self, response):
        self.logger.info('parse: %s' % response)
        is_no_update = False

        # Get list of news from the current page
        articles = response.css('article > div > div.post-content')

        if not articles:
            raise CloseSpider('article not found')
        for article in articles:
            # Close the spider if we don't find the list of urls
            url_selectors = article.css('a.timestamp-link::attr(href)')
            if not url_selectors:
                raise CloseSpider('url_selectors not found')
            url = url_selectors.extract()[0]

            # Example 'Sabtu, November 19, 2016'
            date_selectors = article.css('a.timestamp-link > abbr::text')
            if not date_selectors:
                raise CloseSpider('date_selectors not found')

            # Parse date information
            try:
                date = date_selectors.extract()[0].split(' ')
                # Sanitize month - Indo month to Eng month
                # Example: Nov 19 2016
                date[1] = sanitize(date[1])
                published_at_wib = datetime.strptime(' '.join(date[1:]),
                    '%b %d, %Y')
            except ValueError as e:
                raise CloseSpider('cannot_parse_date: %s' % e)

            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            # For each url we create new scrapy request
            yield Request(url, callback=self.parse_news)

        if is_no_update:
            self.logger.info('Media have no update')
            return

        # try getting next page
        if len(articles) > 0:
            try:
                yield Request('http://www.nusanews.co/search/label/Pilkada?updated-max=' +
                        str(published_at_wib).replace(' ','T') + '%2B07:00&max-results=20', callback=self.parse)
            except Exception as e:
                pass

    # Collect news item
Project: YelpCrawlSpider    Author: yjp999    | project source | file source
def parseBegin(self, response):
        if response.status ==503:
            raise CloseSpider("denied by remote server")
        sel = Selector(response)
        appends = response.meta['appends']
        cityName = appends['city']
        category = appends['cat']

        locations = self.getLocations(response.body)

        if locations == []:
            # self.logger.error("location is []: %s\t%s", response.url, str(cityName))
            return


        div_a = sel.xpath('//li[@class="regular-search-result"]/div/div[@class="biz-listing-large"]')
        for ii, div in enumerate(div_a):
            # pdb.set_trace()
            main = div.xpath('./div[1]/div/div[2]/h3/span/a[@class="biz-name"]')
            item = FoodItem()
            url = main.xpath('./@href').extract()
            item['url'] = response.urljoin(url[0])
            item['name'] = main.xpath('./span/text()').extract()[0]
            # pdb.set_trace()
            second = div.xpath('./div[2]')
            address = second.xpath('./address').extract()
            region = second.xpath('./span[@class="neighborhood-str-list"]/text()').extract()
            if address:
                item['address'] = self.filtertags(address[0])
            else:
                item['address'] = ""
            if region:
                item['region'] = (region[0]).strip()
            else:
                item['region'] = ""
            item['city'] = cityName.strip()
            item['category'] = category
            item['location'] = eval(locations[ii])
            yield item

        time.sleep(1.0)
        nextPage = sel.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href').extract()
        if nextPage:
            nextLink = response.urljoin(nextPage[0])
            yield Request(url=nextLink, callback=self.parseBegin, meta={'appends':appends}, dont_filter=True)