The following 43 code examples, extracted from open-source Python projects, illustrate how to use scrapy.exceptions.CloseSpider().
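Before the extracted examples, a minimal self-contained sketch of the common pattern may help: raising CloseSpider from a spider callback stops the crawl gracefully, and the exception's reason string is logged and recorded as the crawl's finish reason. The spider name, start URL and page-limit condition below are hypothetical and not taken from any of the projects quoted afterwards.

import scrapy
from scrapy.exceptions import CloseSpider


class ExampleSpider(scrapy.Spider):
    """Minimal sketch: raise CloseSpider from any callback to stop the crawl."""
    name = 'closespider_example'              # hypothetical spider name
    start_urls = ['http://example.com/news']  # placeholder start URL
    max_pages = 5                             # hypothetical stop condition

    def parse(self, response):
        page = response.meta.get('page', 1)
        if page > self.max_pages:
            # The reason string is logged and stored in the 'finish_reason' stat
            raise CloseSpider('page_limit_reached')
        for href in response.css('a.next::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse,
                                 meta={'page': page + 1})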
def parse_news(self, response):
    item = response.meta.get("item", None)
    # Commented-out cut-off check: stop crawling once the news date reaches end_day
    # news_date = item.get("news_date", None)
    # if news_date:
    #     struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d")
    #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
    #     delta = self.end_now - struct_date
    #     if delta.days == self.end_day:
    #         raise CloseSpider('today scrapy end')
    soup = BeautifulSoup(response.body)
    news_content_group = soup.find("div", class_="entry-content group")
    # Drop the related-posts block before extracting the article text
    news_content_group.find("div", class_="related_posts").replace_with("")
    content = news_content_group.text.strip()
    item["content"] = content
    item["catalogue"] = u"????"
    yield item
def parse_article(self, response):
    # content, news_no, crawl_date
    item = response.meta.get("item", NewsItem())
    # news_date = item.get("news_date", None)
    # if news_date:
    #     struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M:%S")
    #     delta = self.end_now - struct_date
    #     print delta.days
    #     if delta.days == self.end_day:
    #         raise CloseSpider('today scrapy end')
    soup = BeautifulSoup(response.body)
    author = soup.find("span", class_="name").text if soup.find("span", class_="name") else None
    abstract = soup.find("p", class_="excerpt").text if soup.find("p", class_="excerpt") else None
    content = soup.find("div", class_="detail").text if soup.find("div", class_="detail") else None
    news_no = response.url.split("/")[-1][:-5]
    item["author"] = author
    item["abstract"] = abstract
    item["content"] = content
    item["crawl_date"] = NOW
    item["news_no"] = news_no
    yield item
def parse_search(self, response):
    """
    @summary: parse the search-result page and build the follow-up request
    @param response: response to a request generated by start_requests()
    """
    # Sogou redirects to an "antispider" page when it asks for a verification code.
    # In that case sleep for 12 hours, then close the spider so it can be restarted.
    if "antispider" in response.url:
        spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
        time.sleep(43200)
        raise CloseSpider('antispider')
    # ext is the href of the first search result; it is used to build the JSON index url
    ext = response.xpath('//div[@class="wx-rb bg-blue wx-rb_v1 _item"][1]/@href').extract()
    # If nothing was found for this query, log it and stop here
    if not ext:
        spider_logger.error("Failed searching {0}!".format(response.meta['query']))
        return
    # Build the url of the JSON result list (page=1 is the first index page)
    json_url = "".join(ext).replace('/gzh?', 'http://weixin.sogou.com/gzhjs?') + '&cb=sogou.weixin_gzhcb&page=1&gzhArtKeyWord='
    cookies = response.meta['cookies']
    yield Request(json_url, callback=self.parse_index, cookies=cookies, meta={'cookies': cookies})
def spider_opened(self, spider):
    try:
        file = open(spider.settings.get('FEED_FILE'), 'wb')
    except TypeError:
        raise NotConfigured('FEED_FILE parameter is not a string or does not exist')
    except (IOError, OSError) as e:
        raise CloseSpider('Cannot open file {}: {}'.format(spider.settings.get('FEED_FILE', None), e))
    self.files[spider] = file
    feed_title = spider.settings.get('FEED_TITLE')
    if not feed_title:
        raise NotConfigured('FEED_TITLE parameter does not exist')
    feed_link = spider.settings.get('FEED_LINK')
    if not feed_link:
        raise NotConfigured('FEED_LINK parameter does not exist')
    feed_description = spider.settings.get('FEED_DESCRIPTION')
    if feed_description is None:
        raise NotConfigured('FEED_DESCRIPTION parameter does not exist')
    feed_exporter = spider.settings.get('FEED_EXPORTER', RssItemExporter)
    if isinstance(feed_exporter, six.string_types):
        feed_exporter = load_object(feed_exporter)
    if not issubclass(feed_exporter, RssItemExporter):
        raise TypeError("FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format(feed_exporter))
    self.exporters[spider] = feed_exporter(file, feed_title, feed_link, feed_description)
    self.exporters[spider].start_exporting()
def parse(self, response):
    if response.status == 503:
        raise CloseSpider("denied by remote server")
    sel = Selector(response)
    appends = response.meta['appends']
    cityname = appends['city']
    smexp = appends['cat']
    xpath_exp = '//a[text()="Search for more ' + smexp + '"]/@href'
    if cityname == '??':
        moreLink = ['http://www.yelp.com/search?cflt=' + self.cat + '&find_loc=Hong+Kong',
                    'http://www.yelp.com/search?cflt=' + self.cat + '&find_loc=???%2C+Hong+Kong']
    elif cityname == 'Adelaide':
        moreLink = ['http://www.yelp.com/search?cflt=' + self.cat + '&find_loc=Adelaide%2C+Adelaide+South+Australia%2C+Australia',
                    'http://www.yelp.com/search?cflt=' + self.cat + '&find_loc=Adelaide+South+Australia+5000']
    elif cityname == 'Park La Brea':
        moreLink = ['http://www.yelp.com/search?cflt=' + self.cat + '&find_loc=South+La+Brea+Avenue%2C+Los+Angeles%2C+CA+90056',
                    'http://www.yelp.com/search?cflt=' + self.cat + '&find_loc=Mid-Wilshire%2C+Los+Angeles%2C+CA',
                    'http://www.yelp.com/search?cflt=' + self.cat + '&find_loc=North+La+Brea+Avenue%2C+Los+Angeles%2C+CA']
    else:
        searchmore = sel.xpath(xpath_exp).extract()[0]
        moreLink = [response.urljoin(searchmore)]
    for link in moreLink:
        yield Request(url=link, callback=self.parseBegin, meta={'appends': appends}, dont_filter=True)
def process_request(self, request, spider):
    if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
        return
    proxy = self.proxies.get_random()
    if not proxy:
        if self.stop_if_no_proxies:
            raise CloseSpider("no_proxies")
        else:
            logger.warn("No proxies available; marking all proxies "
                        "as unchecked")
            self.proxies.reset()
            proxy = self.proxies.get_random()
            if proxy is None:
                logger.error("No proxies available even after a reset.")
                raise CloseSpider("no_proxies_after_reset")
    request.meta['proxy'] = proxy
    request.meta['download_slot'] = self.get_proxy_slot(proxy)
    request.meta['_rotating_proxy'] = True
def parse_news(self, response):
    # content, news_date, news_no, crawl_date, referer_web
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", 1)
    soup = BeautifulSoup(response.body)
    # news_date = item.get("news_date", None)
    # Take the publication date from the page itself (span.arial);
    # http://info.meadin.com/PictureNews/2938_1.shtml has none and would raise otherwise
    news_date = soup.find("span", class_="arial").text if soup.find("span", class_="arial") else None
    if news_date:
        # struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M:%S")
        # delta = self.end_now - struct_date
        # if delta.days == self.end_day:
        #     raise CloseSpider('today scrapy end')
        # Source website of the article
        referer_web = list(soup.find("p", class_="source").strings)[-1] if soup.find("p", class_="source") else None
        art, content = None, None
        art = soup.find("div", class_="article js-article")
        if art:
            # Strip the intro block before extracting the article body
            art.find("div", class_="intro").replace_with("")
            content = art.text.strip()
        news_no = response.url.split("/")[-1].split("_")[0]
        item["news_date"] = news_date
        item["content"] = content
        item["referer_web"] = referer_web
        item["crawl_date"] = NOW
        item["news_no"] = news_no
        item = judge_news_crawl(item)
        if item:
            yield item
        else:
            self.flag = pageindex
    else:
        logger.warning("can't find news_date. The url is %s" % response.url)
def parse(self, response):
    # Parse the index page
    html = response.body
    soup = BeautifulSoup(html, "lxml")
    # Walk the news list on the current page
    for i in self.fetch_newslist(soup):
        # raise CloseSpider(str(i['time'] == u"???"))
        # if i['time'] == "???": raise CloseSpider("today news end")
        request = scrapy.Request(i['news_url'], callback=self.parse_news)
        request.meta['item'] = i
        request.meta['pageindex'] = 1
        yield request
    # Find the lasttime attribute used to request the next batch
    lasttime = "nothing"
    for i in soup.select('div[class="news_li"]'):
        if "lasttime" in i.attrs:
            lasttime = i["lasttime"]
            break
    # Build the url of the next page, e.g.
    # load_chosen.jsp?nodeids=25949&topCids=1495258,1494171,1495064,1495130,1495285,&pageidx=
    load_chosen = re.search(r'data.:."(.*)".+.masonry', html)
    page = 2
    if load_chosen:
        tp_url = "http://www.thepaper.cn/load_chosen.jsp?%s%s&lastTime=%s" % (load_chosen.group(1), page, lasttime)
        yield scrapy.Request(tp_url, callback=self.next_page_parse)
def start_requests(self):
    # while len(self.finished) < len(self.all_urls):
    current_hour = time.strftime("%Y%m%d%H", time.localtime())
    if current_hour != START_HOUR:
        self.logger.info("It's already %s. Stopping..." % current_hour)
        return
    for url, item_idx in self.all_urls.iteritems():
        if not self.cookies:
            raise CloseSpider("Not enough cookies.")
        if item_idx in self.finished:
            continue
        else:
            yield Request(url, callback=self.parse_item)
    # self.logger.info(u'Crawled %s / %s. Done :)' % (len(self.finished), len(self.all_urls)))
def process_response(request, response, spider):
    if "antispider" in response.url:
        spider_logger.error("received verification code in %s" % response.url)
        raise CloseSpider('antispider')
    return response
def __init__(self, query=None, start_time=None, end_time=None, index_pages=None):
    """
    @summary: initialize the spider; all arguments except query are optional
    @param query: the search keyword, a single word per run
    @param start_time: only articles published after start_time are kept; defaults to 100 days ago
    @param end_time: only articles published before end_time are kept; defaults to now
    @param index_pages: maximum number of index pages to crawl; defaults to 10
    """
    # A single search word is required for each run
    if query:
        self.query = query
    else:
        spider_logger.error("Spider needs a single search word each time! Check input!")
        raise CloseSpider('invalid search word')
    # Lower bound of the publish time; default is 100 days ago
    if start_time:
        self.from_time = start_time
    else:
        self.from_time = datetime.now() - timedelta(days=100)
    # Upper bound of the publish time; default is now
    if end_time:
        self.end_time = end_time
    else:
        self.end_time = datetime.now()
    # Maximum number of index pages to crawl; default is 10
    if index_pages:
        self.index_pages = int(index_pages)
    else:
        self.index_pages = 10
def parse_index(self, response):
    """
    @summary: parse an index (result-list) page and build follow-up Request objects
    @param response: response to a request generated by parse_search()
    @return: a list of Requests for article pages and, if needed, the next index page
    """
    if "antispider" in response.url:
        spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
        time.sleep(43200)
        raise CloseSpider('antispider')
    requests = []
    page_list = self._get_result(response)
    # Nothing on this page, so nothing to follow
    if not page_list:
        return requests
    next_page = True  # whether the next index page should be crawled
    # Walk the results and decide which articles to fetch
    for item in page_list:
        if isinstance(item, Request):  # already a Request, follow it directly
            requests.append(item)
            next_page = False
            break
        if item['publish_time'] <= self.from_time:  # older than self.from_time, stop here
            next_page = False
            break
        elif item['publish_time'] > self.end_time:  # newer than self.end_time, skip it
            continue
        else:
            req = Request(item['url'], self.parse_page)  # fetch the article page
            req.meta["item"] = item
            requests.append(req)
    # If needed, append a Request for the next index page
    if next_page and self._next_result_page(response):
        cookies = response.meta['cookies']
        requests.append(Request(self._next_result_page(response), callback=self.parse_index,
                                cookies=cookies, meta={'cookies': cookies}))
    return requests
def parse_page(self, response):
    """
    @summary: parse a single article page
    @param response: response to a request generated by parse_index()
    @return: the item completed by _finish_item()
    """
    if "antispider" in response.url:
        spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
        time.sleep(43200)
        raise CloseSpider('antispider')
    item = response.meta["item"]
    return self._finish_item(item, response)
def process_eastmoney_gubalist_item(self, item, spider):
    status = item.get('status')
    if status is not None and status != 200:
        self.error_count += 1
        if self.error_count * 5 > self.success_count:
            raise CloseSpider('too many errors occurred, shutdown gracefully.')
        return item
    if 'ticker_id' not in item or item['ticker_id'] == "":
        raise DropItem('missing ticker_id')
    self.write_to_file(item, spider.name)
def test_empty_feed(self):
    for partial_settings in itertools.chain.from_iterable(
            itertools.combinations(self.feed_settings.items(), r)
            for r in range(1, len(self.feed_settings))):
        partial_settings = dict(partial_settings)
        undefined_settings = [name.upper()
                              for name in set(self.feed_settings) - set(partial_settings)]
        with self.assertRaisesRegexp(NotConfigured,
                                     '({})'.format('|'.join(undefined_settings))
                                     if len(undefined_settings) > 1 else undefined_settings[0],
                                     msg='The feed file, title, link and description must be specified, '
                                         'but the absence of {} is allowed'.format(undefined_settings)):
            with CrawlerContext(**partial_settings):
                pass

    with self.assertRaises(CloseSpider):
        feed_settings = dict(self.feed_settings)
        feed_settings['feed_file'] = 'non/existent/filepath'
        with CrawlerContext(**feed_settings):
            pass

    with CrawlerContext(**self.feed_settings):
        pass

    with open(self.feed_settings['feed_file']) as data, \
            open(os.path.join(os.path.dirname(__file__), 'expected_rss', 'empty_feed.rss')) as expected:
        self.assertUnorderedXmlEquivalentOutputs(data.read(), expected.read())
def close_spider(self, reason):
    raise CloseSpider(reason=reason)
    # do something before spider close
def parse(self, response):
    self.logger.info('parse: %s' % response)
    is_no_update = False
    published_at_wib = ''

    try:
        # Get list of news from the current page
        articles = json.loads(response.text)
        for article in articles['contents']:
            url = article['friendlyURL']
            date = article['publishTime']
            published_at_wib = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
            published_at = wib_to_utc(published_at_wib)

            if self.media['last_scraped_at'] >= published_at:
                is_no_update = True
                break

            yield Request('http://pilkada.arah.com' + url, callback=self.parse_news)
    except:
        raise CloseSpider('article not found')

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # Get more
    try:
        next_date = published_at_wib - timedelta(seconds=1)
        if self.media['last_scraped_at'] < wib_to_utc(next_date):
            yield Request('http://pilkada.arah.com/api/article/8/' + str(next_date)[:19],
                          callback=self.parse)
    except:
        pass

    # Collect news item
def parse(self, response):
    self.logger.info('parse: %s' % response)
    has_no_update = False

    # Get list of news from the current page
    for article in response.css('.col-sm-16 > .row > .col-sm-16 > .row'):
        title = article.css('h4::text').extract_first()
        url = article.css('a::attr(href)').extract_first()
        time = article.css('.indexTime::text').extract_first()  # 16:51
        date = article.css('.indexDay::text').extract_first()  # Sabtu, 15 Oktober 2016
        date = date.split(',')[-1].strip()  # 15 Oktober 2016
        date_time = date + ' ' + time  # 15 Oktober 2016 16:51
        date_time = date_time.split(' ')
        date_time = ' '.join([_(s) for s in date_time])  # Oktober => October

        # Parse date information
        try:
            published_at_wib = datetime.strptime(date_time, '%d %B %Y %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)

        published_at = wib_to_utc(published_at_wib)

        if self.media['last_scraped_at'] >= published_at:
            has_no_update = True
            break

        # For each url we create new scrapy request
        yield Request(url, callback=self.parse_news)

    if has_no_update:
        self.logger.info('Media have no update')
        return

    # Currently has no more pages
def parse_news_pilkada(self, loader, response):
    date_selector = response.css('.block-judul-artikel > .tanggal::text')
    try:
        date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
        date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
    except Exception:
        return loader.load_item()

    published_at = wib_to_utc(published_at_wib)
    if (self.media['last_scraped_at'] >= published_at):
        is_no_update = True
        self.logger.info('Media have no update')
        raise CloseSpider('finished')
    loader.add_value('published_at', published_at)

    title_selector = response.css('.block-judul-artikel > .judul-artikel')
    loader.add_value('title', title_selector.extract()[0])

    raw_content_selector = response.css('.block-artikel .p-artikel')
    raw_content_selector = raw_content_selector.xpath('//p[not(iframe)]')
    raw_content = ''
    for rsl in raw_content_selector:
        raw_content = raw_content + rsl.extract().strip()
    loader.add_value('raw_content', raw_content)

    author_name = ''
    for author_name_selector in reversed(raw_content_selector):
        author_name_selector = author_name_selector.css('strong::text')
        for tmp in reversed(author_name_selector.extract()):
            tmp = tmp.strip()
            if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                author_name = tmp
                break
        if author_name:
            break
    author_name = ','.join(author_name.split(' | '))
    loader.add_value('author_name', author_name)
    loader.add_value('url', response.url)
def parse(self, response):
    self.logger.info('parse: {}'.format(response))
    is_no_update = False

    # Collect list of news from current page
    # Note: no next page button on cnnindonesia, all is loaded here
    article_selectors = response.css('a.list_kontribusi')
    if not article_selectors:
        raise CloseSpider('article_selectors not found')

    for article in article_selectors:
        url_selectors = article.css('a::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]

        # Example: Jumat, 23/09/2016 21:17
        info_selectors = article.css('div.text > div > span.tanggal::text')
        if not info_selectors:
            raise CloseSpider('info_selectors not found')
        info = info_selectors.extract()[0]
        info_time = info.split(',')[1].strip()

        # Parse date information
        try:
            # Example: 23/09/2016 21:17
            published_at_wib = datetime.strptime(info_time, '%d/%m/%Y %H:%M')
        except ValueError as err:
            raise CloseSpider('cannot_parse_date: {}'.format(err))

        published_at = wib_to_utc(published_at_wib)

        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break

        # For each url we create new scrapy Request
        yield Request(url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return
def parse(self, response):
    self.logger.info('parse: {}'.format(response))
    is_no_update = False

    # Collect list of news from current page
    articles = json.loads(response.body)['response']
    for article in articles:
        # Example: 2016-10-12 15:16:04
        date_time_str = article['news_date_publish']

        # Parse date information
        try:
            published_at_wib = datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
        except Exception as e:
            raise CloseSpider('cannot_parse_date: {}'.format(e))
        published_at = wib_to_utc(published_at_wib)

        if (self.media['last_scraped_at'] >= published_at):
            is_no_update = True
            break

        for sub_article in article['news_content']:
            yield self.parse_news(article, sub_article)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # Collect news on next page
    if len(articles) > 0:
        # Example: 'http://api.merdeka.com/mobile/gettag/pilgub-dki/0/20/L9pTAoWB269T&-E/'
        next_page_url = response.url.split('/')
        next_page_url[-4] = str(int(next_page_url[-4]) + 20)
        next_page_url = '/'.join(next_page_url)
        yield Request(next_page_url, callback=self.parse)

    # Collect news item
def check_error(self):
    # Stop spider if error has been raised in pipeline
    if hasattr(self, 'close_error'):
        raise CloseSpider(self.close_error)
def next_request(self):
    while True:
        try:
            url = next(self.redis_queue)
        except StopIteration:
            url = None
        if not (url and FeedbackSpider.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['productId'][0])):
            break
    if url:
        return self.make_requests_from_url(url)
    else:
        raise CloseSpider('redis queue has no url to request')
def next_request(self):
    while True:
        try:
            url = next(self.redis_queue)
        except StopIteration:
            url = None
        if not (url and self.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['storeId'][0])):
            break
    if url:
        return self.make_requests_from_url(url)
    else:
        raise CloseSpider('redis queue has no url to request')
def next_request(self):
    while True:
        try:
            url = next(self.redis_queue)
        except StopIteration:
            url = None
        if not (url and self.ids.add(url[url.rfind('/') + 1:url.rfind('.')])):
            break
    if url:
        return self.make_requests_from_url(url)
    else:
        raise CloseSpider('redis queue has no url to request')
def next_request(self):
    while True:
        try:
            url = next(self.redis_queue)
        except StopIteration:
            url = None
        if not (url and self.ids.add(url[url.rfind('/') + 1:])):
            break
    if url:
        return self.make_requests_from_url(url)
    else:
        raise CloseSpider('redis queue has no url to request')
def next_request(self):
    while True:
        try:
            url = next(self.redis_queue)
        except StopIteration:
            url = None
        if not (url and OrderSpider.ids.add(urlparse.parse_qs(urlparse.urlparse(url).query)['productId'][0])):
            break
    if url:
        return self.make_requests_from_url(url)
    else:
        raise CloseSpider('redis queue has no url to request')
def parse(self, response):
    data = json.loads(response.body)
    total = int(data['totalRecord']['num'])
    total_page = int(math.ceil(total / float(self.page_size)))
    if total == 0:
        raise CloseSpider('blocked')
    for i in self.parse_item(response):
        yield i
    for page in range(2, total_page + 1):
        yield Request(url=self.get_url(page), callback=self.parse_item)
def __check_for_close(self):
    """
    Check to see if this spider has been running for longer than the maximum amount
    of allowed time, and stop the spider if it has.
    :return: None
    """
    if self._start_time is None:
        self._start_time = DatetimeHelper.now()
    elapsed_time = (DatetimeHelper.now() - self.start_time).total_seconds()
    if elapsed_time > self.max_run_time:
        raise CloseSpider(
            "Spider run time exceeded maximum time of %s seconds. Closing."
            % (self.max_run_time,)
        )
def open_spider(self, spider):
    site_setting = spider.settings.get('SITE')
    if not site_setting:
        error_msg = 'Can not find the website configuration from settings.'
        spider.logger.error(error_msg)
        raise CloseSpider(error_msg)
    self.session = self.session_maker()
    site = self.session.query(LiveTVSite).filter(LiveTVSite.code == site_setting['code']).one_or_none()
    if not site:
        site = LiveTVSite(code=site_setting['code'], name=site_setting['name'],
                          description=site_setting['description'], url=site_setting['url'],
                          image=site_setting['image'], show_seq=site_setting['show_seq'])
        self.session.add(site)
        self.session.commit()
    self.site[site.code] = {'id': site.id, 'starttime': datetime.utcnow(), 'channels': {}}
def parse_news(self, response):
    # print response.url, "response"
    PageKey = response.meta.get("topic_id")
    PageNumber = response.meta.get("PageNumber")
    flag_id = str(int(PageKey) - 40037910)
    soup = BeautifulSoup(response.body, "lxml")
    # Publication date, e.g. 2016-07-13
    news_date = soup.find("time").text if soup.find("time") else None
    # print self.flag[flag_id], int(PageNumber)
    """
    self.flag[flag_id] stays falsy (0) while this topic is still being crawled.
    Once an article older than the cut-off date is seen, the current page number
    is stored in self.flag[flag_id] so that later pages of this topic are skipped.
    """
    if not self.flag[flag_id] or int(PageNumber) == self.flag[flag_id]:
        # Check whether this article is older than the cut-off date
        struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d")
        # print self.end_now, struct_date, "time"
        delta = self.end_now - struct_date
        # print delta.days, "delta day ~~~~~~~~~~~~~~~~"
        if delta.days > self.end_day:
            self.flag[str(flag_id)] = int(PageNumber)
            # print flag_id, "stop ~~~~~~"
            # raise CloseSpider('today scrapy end')
        else:
            head = soup.find("div", class_="post-head")
            topic, title, abstract = None, None, None
            if head:
                topic = head.find("span", class_="category").text if head.find("span", class_="category") else None
                title = head.find("h1", class_="h1").text if head.find("h1", class_="h1") else None
                abstract = head.find("span", class_="kicker").text if head.find("span", class_="kicker") else None
            content = soup.find("div", class_="post-body clearfix").text if soup.find("div", class_="post-body clearfix") else None
            news_no = response.url.split("/")[-1].split("?")[0]
            # TODO: part of the body is rendered by js and is not captured here
            item = NewsItem(title=title, topic=topic,
                            abstract=abstract, news_date=news_date,
                            content=content, news_no=news_no,
                            crawl_date=NOW, news_url=response.url, catalogue='????')
            yield item
def parse(self, response):
    self.logger.info('parse: {}'.format(response))
    is_no_update = False

    # Collect list of news from current page
    articles_grid = response.css('li:not(.last) > div.grid')
    articles = zip(articles_grid, [NEWS_GRID] * len(articles_grid))
    articles += zip(response.css('div.topic'), [NEWS_HEADLINE])
    if not articles:
        raise CloseSpider('article not found')

    for article in articles:
        # Close the spider if we don't find the list of urls
        url_selectors = None
        if article[1] == NEWS_GRID:
            url_selectors = article[0].css('h2 > a::attr(href)')
        elif article[1] == NEWS_HEADLINE:
            url_selectors = article[0].css('h1 > a::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]
        self.logger.info('Url: {}'.format(url))

        # Example: Minggu, 09 Oct 2016 15:14
        info_selectors = article[0].css('div.reg::text')
        if not info_selectors:
            raise CloseSpider('info_selectors not found')
        info = info_selectors.extract()[1]
        # Example: 09 Oct 2016 15:14
        info_time = info.split(',')[1].strip()

        # Parse date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d %b %Y %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: {}'.format(e))
        published_at = wib_to_utc(published_at_wib)

        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break

        # For each url we create new scrapy request
        yield Request(url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # Collect news on next page
    if response.css('div.bu.fr > a'):
        next_page = response.css('div.bu.fr > a[rel="next"]::attr(href)').extract()[0]
        next_page_url = response.urljoin(next_page)
        yield Request(next_page_url, callback=self.parse)

    # Collect news item
def parse(self, response):
    self.logger.info('parse: {}'.format(response))
    is_no_update = False

    # Collect list of news from current page
    article_selectors = response.css('ul.indexlist > li')
    if not article_selectors:
        raise CloseSpider('article_selectors not found')

    for article in article_selectors:
        url_selectors = article.css('a::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]

        # Example: 7 Oktober 2016 19:37
        info_selectors = article.css('div.upperdeck::text')
        if not info_selectors:
            raise CloseSpider('info_selectors not found')
        info = info_selectors.extract()[1]
        info = info.split(',')[1].replace('\t', '').strip()

        # Example: 7 October 2016 19:37
        info_time = info.split(' ')
        info_time = ' '.join([_(s) for s in info_time])

        # Parse date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d %B %Y %H:%M')
        except ValueError as err:
            raise CloseSpider('cannot_parse_date: {}'.format(err))
        published_at = wib_to_utc(published_at_wib)

        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break

        # For each url we create new scrapy Request
        yield Request(url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # Collect news on next page
    tag_selectors = response.css('div.pagination > a')
    if not tag_selectors:
        raise CloseSpider('tag_selectors not found')
    for tag in tag_selectors:
        more_selectors = tag.css('a::text')
        if not more_selectors:
            raise CloseSpider('more_selectors not found')
        more = more_selectors.extract()[0]
        if more == 'NEXT':
            next_page = tag.css('a::attr(href)').extract()[0]
            next_page_url = response.urljoin(next_page)
            yield Request(next_page_url, callback=self.parse)

    # Collect news item
def parse(self, response):
    self.logger.info('parse: %s' % response)
    is_no_update = False

    # Get list of news from the current page
    articles = response.css('div.view-front > div.view-content > div.views-row')
    if not articles:
        raise CloseSpider('article not found')

    for article in articles:
        # Close the spider if we don't find the list of urls
        url_selectors = article.css('span.field-content a::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]

        # Example '19 Oct 2016'
        info_selectors = article.css('span.field-content::text')
        if not info_selectors:
            raise CloseSpider('info_selectors not found')
        info_time = info_selectors.extract()[1].strip()

        # Parse date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d %b %Y')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)
        published_at = wib_to_utc(published_at_wib)

        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break

        # For each url we create new scrapy request
        yield Request('http://www.qureta.com' + url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    if response.css('li.next'):
        next_page_url = response.css('li.next > a::attr(href)')[0].extract()
        yield Request('http://www.qureta.com' + next_page_url, callback=self.parse)

    # Collect news item
def parse(self, response):
    self.logger.info('parse: %s' % response)
    is_no_update = False

    # Get list of news from the current page
    articles = response.css('li.media')
    if not articles:
        raise CloseSpider('article not found')

    for article in articles:
        # Close the spider if we don't find the list of urls
        url_selectors = article.css('a::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]

        # Example '02 November 2016'
        date_selectors = article.css('time::text')
        if not date_selectors:
            raise CloseSpider('date_selectors not found')

        # Parse date information
        try:
            date = date_selectors.extract()[0].split(' ')
            # Sanitize month - Indo month to Eng month
            # Example: 02 Nov 2016
            date[1] = sanitize(date[1])
            published_at_wib = datetime.strptime(' '.join(date), '%d %b %Y')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)
        published_at = wib_to_utc(published_at_wib)

        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break

        # For each url we create new scrapy request
        yield Request('http:' + url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # try getting next page
    try:
        next_page_url = response.xpath(
            '//section[@class="pagination-numeric"]/span/a/@href')[-1].extract()
        if next_page_url and next_page_url != response.url:
            yield Request(next_page_url, callback=self.parse)
    except:
        pass

    # Collect news item
def parse_news_metro(self, response):
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    date_selector = response.css('.artikel > div.block-tanggal::text')
    if not date_selector:
        return self.parse_news_pilkada(loader, response)
    try:
        date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
        date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
    except Exception:
        return loader.load_item()

    published_at = wib_to_utc(published_at_wib)
    if (self.media['last_scraped_at'] >= published_at):
        is_no_update = True
        self.logger.info('Media have no update')
        raise CloseSpider('finished')
    loader.add_value('published_at', published_at)

    title_selector = response.css('.artikel > h1::text')
    if not title_selector:
        return loader.load_item()
    loader.add_value('title', title_selector.extract()[0])

    # Select all p which don't have iframe inside it
    raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
    if not raw_content_selector:
        return loader.load_item()
    raw_content = ''
    for rsl in raw_content_selector:
        raw_content = raw_content + rsl.extract().strip()

    # Go to next page while there is next page button
    next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
    if next_page_selector:
        return Request(next_page_selector.extract()[0],
                       callback=lambda x, loader=loader, raw_content=raw_content: self.parse_next_page_metro(x, loader, raw_content))

    loader.add_value('raw_content', raw_content)

    # The author usually put inside <strong> tag, however, some news is not using <strong> tag.
    # NOTE: this block of code may need revision in the future
    author_name = ''
    for author_name_selector in reversed(raw_content_selector):
        author_name_selector = author_name_selector.css('strong::text')
        for tmp in reversed(author_name_selector.extract()):
            tmp = tmp.strip()
            if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                author_name = tmp
                break
        if author_name:
            break
    author_name = ','.join(author_name.split(' | '))
    loader.add_value('author_name', author_name)
    return loader.load_item()
def parse(self, response):
    self.logger.info('parse: {}'.format(response))
    is_no_update = False

    for article in response.css('li > div.breaking-title'):
        # http://metro.sindonews.com/read/1146316/171/penyidik-bareskrim-mulai-dalami-video-dugaan-penistaan-agama-1476179831
        url_selectors = article.css('a::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]

        # Example 'Kamis, 13 Oktober 2016 - 11:18 WIB'
        date_time_str_selectors = article.css('p::text')
        if not date_time_str_selectors:
            raise CloseSpider('date_time_str_selectors not found')
        date_time_str = date_time_str_selectors.extract()[0]

        # Parse date information
        # Example '13 Oktober 2016 - 11:18'
        date_time_str = date_time_str.split(',')[1].strip()[:-4]
        date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
        try:
            published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
        except Exception as e:
            raise CloseSpider('cannot_parse_date: %s' % e)
        published_at = wib_to_utc(published_at_wib)

        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break

        # For each url we create new scrapy request
        yield Request(url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    for next_button in response.css('.mpaging > ul > li'):
        if len(next_button.css('a:not(.active) > .fa-angle-right')) > 0:
            next_page = next_button.css('a::attr(href)').extract()[0]
            next_page_url = response.urljoin(next_page)
            yield Request(next_page_url, callback=self.parse)
            break

    # Collect news item
def parse(self, response):
    self.logger.info('parse: %s' % response)
    is_no_update = False

    # Get list of news from the current page
    articles = response.css('article')
    if not articles:
        raise CloseSpider('article not found')

    for article in articles:
        # Close the spider if we don't find the list of urls
        url_selectors = article.css('a::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]

        # Example: 'Monday, 24/11/2016 | 13:54'
        date_selectors = article.css('time::text')
        if not date_selectors:
            raise CloseSpider('date_selectors not found')

        # Parse date information
        try:
            date = date_selectors.extract()[0].split(' ')
            published_at_wib = datetime.strptime(' '.join(date[1:]), '%d/%m/%Y | %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)
        published_at = wib_to_utc(published_at_wib)

        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break

        # For each url we create new scrapy request
        yield Request(url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # try getting next page
    try:
        next_page_url = response.css('div.pagination > a.next::attr(href)').extract()[0]
        if next_page_url:
            yield Request(next_page_url, callback=self.parse)
    except:
        pass

    # Collect news item
def parse(self, response):
    self.logger.info('parse: %s' % response)
    is_no_update = False

    # Get list of news from the current page
    articles = response.css('div.wp-terhangat > div.item3')
    if not articles:
        raise CloseSpider('article not found')

    for article in articles:
        # Close the spider if we don't find the list of urls
        url_selectors = article.css('a::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]

        # Example 'Wednesday, 02 November 2016'
        date_selectors = article.css('span.date::text')
        if not date_selectors:
            raise CloseSpider('date_selectors not found')

        # Parse date information
        try:
            date = date_selectors.extract()[0].split(' ')
            # Sanitize month - Indo month to Eng month
            # Example: Wednesday, 02 Nov 2016
            date[2] = sanitize(date[2])
            published_at_wib = datetime.strptime(' '.join(date[1:]), '%d %b %Y')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)
        published_at = wib_to_utc(published_at_wib)

        # if it's news from before 2015, drop them
        if self.media['last_scraped_at'] >= published_at or int(date[-1]) < 2015:
            is_no_update = True
            break

        # For each url we create new scrapy request
        yield Request(url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # try getting next page
    try:
        next_page_url = response.css('nav > ul > li > a::attr(href)').extract()[-1]
        if next_page_url:
            yield Request(next_page_url, callback=self.parse)
    except:
        pass

    # Collect news item
def parse(self, response):
    is_no_update = False

    news_selector = response.css("ul.clearfix > li > div.tleft")
    if not news_selector:
        raise CloseSpider('news_selectors not found')

    for news in news_selector:
        url_selectors = news.css("div.tleft > h3 > a::attr(href)")
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        # http://megapolitan.kompas.com/read/xml/2016/10/18/17244781/ini.alat.peraga.kampanye.yang.boleh.dibuat.cagub-cawagub.dki
        # http://api.kompas.com/external/?type=readdua&kanal=home&command=.xml.2016.10.15.07300081&format=json&APPSF0UNDRYBYPASS=%20HTTP/1.1
        url = url_selectors.extract()[0]
        url = ('http://api.kompas.com/external/?type=readdua&kanal=home&command=.xml.'
               + '.'.join(url.split('/')[-5:-1])
               + '&format=json&APPSF0UNDRYBYPASS=%20HTTP/1.1')

        date_selectors = news.css("div.grey.small::text")
        if not date_selectors:
            raise CloseSpider('date_selectors not found')
        raw_date = date_selectors.extract()[0]

        # Parse date information
        try:
            published_at = self.convert_date(raw_date)
        except Exception as e:
            raise CloseSpider('cannot_parse_date: %s' % e)

        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break

        # For each url we create new scrapy request
        yield Request(url=url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # For kompas case, we don't rely on the pagination
    # Their pagination is max 17 pages, the truth is they have 25 pages
    if self.first_time:
        template_url = 'http://lipsus.kompas.com/topikpilihanlist/3754/{}/Pilkada.DKI.2017'
        for i in xrange(25):
            page = i + 1
            next_url = template_url.format(page)
            yield Request(next_url, callback=self.parse)
        self.first_time = False
def parse(self, response):
    self.logger.info('parse: %s' % response)
    is_no_update = False

    # Get list of news from the current page
    articles = response.css('div.article-snippet__info')
    if not articles:
        raise CloseSpider('article not found')

    for article in articles:
        # Close the spider if we don't find the list of urls
        url_selectors = article.css('a::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]

        info_selectors = article.css('div.article-snippet__date')
        info_selectors = info_selectors.css('.timeago::text')
        if not info_selectors:
            raise CloseSpider('info_selectors not found')
        # Example '13 Okt 2016 16:10'
        info_time = info_selectors.extract()[0]
        # Example '13 Oct 2016 16:10'
        info_time = ' '.join([_(w) for w in info_time.split(' ')])

        # Parse date information
        try:
            published_at_wib = datetime.strptime(info_time, '%d %b %Y %H:%M')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: {}'.format(e))
        published_at = wib_to_utc(published_at_wib)

        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break

        # For each url we create new scrapy Request
        yield Request(url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # TODO: Collect news item
def parse(self, response):
    self.logger.info('parse: %s' % response)
    is_no_update = False

    # Get list of news from the current page
    articles = response.css('article > div > div.post-content')
    if not articles:
        raise CloseSpider('article not found')

    for article in articles:
        # Close the spider if we don't find the list of urls
        url_selectors = article.css('a.timestamp-link::attr(href)')
        if not url_selectors:
            raise CloseSpider('url_selectors not found')
        url = url_selectors.extract()[0]

        # Example 'Sabtu, November 19, 2016'
        date_selectors = article.css('a.timestamp-link > abbr::text')
        if not date_selectors:
            raise CloseSpider('date_selectors not found')

        # Parse date information
        try:
            date = date_selectors.extract()[0].split(' ')
            # Sanitize month - Indo month to Eng month
            # Example: Nov 19 2016
            date[1] = sanitize(date[1])
            published_at_wib = datetime.strptime(' '.join(date[1:]), '%b %d, %Y')
        except ValueError as e:
            raise CloseSpider('cannot_parse_date: %s' % e)
        published_at = wib_to_utc(published_at_wib)

        if self.media['last_scraped_at'] >= published_at:
            is_no_update = True
            break

        # For each url we create new scrapy request
        yield Request(url, callback=self.parse_news)

    if is_no_update:
        self.logger.info('Media have no update')
        return

    # try getting next page
    if len(articles) > 0:
        try:
            yield Request('http://www.nusanews.co/search/label/Pilkada?updated-max='
                          + str(published_at_wib).replace(' ', 'T') + '%2B07:00&max-results=20',
                          callback=self.parse)
        except Exception as e:
            pass

    # Collect news item
def parseBegin(self, response):
    if response.status == 503:
        raise CloseSpider("denied by remote server")
    sel = Selector(response)
    appends = response.meta['appends']
    cityName = appends['city']
    category = appends['cat']
    locations = self.getLocations(response.body)
    if locations == []:
        # self.logger.error("location is []: %s\t%s", response.url, str(cityName))
        return
    div_a = sel.xpath('//li[@class="regular-search-result"]/div/div[@class="biz-listing-large"]')
    for ii, div in enumerate(div_a):
        # pdb.set_trace()
        main = div.xpath('./div[1]/div/div[2]/h3/span/a[@class="biz-name"]')
        item = FoodItem()
        url = main.xpath('./@href').extract()
        item['url'] = response.urljoin(url[0])
        item['name'] = main.xpath('./span/text()').extract()[0]
        # pdb.set_trace()
        second = div.xpath('./div[2]')
        address = second.xpath('./address').extract()
        region = second.xpath('./span[@class="neighborhood-str-list"]/text()').extract()
        if address:
            item['address'] = self.filtertags(address[0])
        else:
            item['address'] = ""
        if region:
            item['region'] = (region[0]).strip()
        else:
            item['region'] = ""
        item['city'] = cityName.strip()
        item['category'] = category
        item['location'] = eval(locations[ii])
        yield item
        time.sleep(1.0)
    nextPage = sel.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href').extract()
    if nextPage:
        nextLink = response.urljoin(nextPage[0])
        yield Request(url=nextLink, callback=self.parseBegin, meta={'appends': appends}, dont_filter=True)