The following 8 code examples, extracted from open-source Python projects, illustrate how to use scrapy.selector.HtmlXPathSelector().
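Before the examples: HtmlXPathSelector wraps a response object and is queried with .select(), which returns a selector list whose .extract() yields strings. It comes from older Scrapy releases and has since been deprecated in favor of scrapy.Selector and the response.xpath()/response.css() shortcuts, which several of the examples below already mix in. A minimal, self-contained sketch of the basic pattern, runnable only on a Scrapy version old enough to still ship HtmlXPathSelector (the URL and markup here are invented for illustration):

from scrapy.http import HtmlResponse
from scrapy.selector import HtmlXPathSelector

# Build a response by hand so the snippet runs without starting a crawl.
html = b'<html><body><span class="text">Hello</span></body></html>'
response = HtmlResponse(url='http://example.com', body=html)

hxs = HtmlXPathSelector(response)  # wrap the response in a selector
texts = hxs.select('//span[@class="text"]/text()').extract()
print(texts)  # ['Hello']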
def parse(self, response):
    # print response.url.split('/')
    # sel = HtmlXPathSelector(response)
    content = response.xpath('//div[@class="quote"]')
    for x in content:
        word = x.xpath('.//span[@class="text"]/text()').extract_first()
        print '\n'
        print word
        yield {'text': word}
    nextPage = response.css('li.next a::attr(href)').extract_first()
    if nextPage is not None:
        goNext = response.urljoin(nextPage)
        print "Go next: ", goNext
        yield scrapy.Request(url=goNext, callback=self.parse)
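This spider yields each quote's text, then follows the "next" link by joining the relative href and scheduling a new Request back into parse. Newer Scrapy (1.4+) shortens that last step with response.follow, which resolves relative URLs itself; a hedged equivalent of the final block:

# Pagination step rewritten with response.follow (Scrapy 1.4+),
# which accepts the relative href directly.
if nextPage is not None:
    yield response.follow(nextPage, callback=self.parse)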
def parse(self, response):
    """
    default parse method, rule is not useful now
    """
    # import pdb; pdb.set_trace()
    response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    log.msg("Parse: index level:" + str(index_level))
    if index_level in [1, 2, 3, 4]:
        self.save_to_file_system(index_level, response)
        relative_urls = self.get_follow_links(index_level, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                log.msg('yield process, url:' + url)
                yield Request(url, callback=self.parse)
    elif index_level == 5:
        personProfile = HtmlParser.extract_person_profile(hxs)
        linkedin_id = self.get_linkedin_id(response.url)
        linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
        if linkedin_id:
            personProfile['_id'] = linkedin_id
            personProfile['url'] = UnicodeDammit(response.url).markup
            yield personProfile
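One portability note on this example: urllib.unquote_plus is the Python 2 location; Python 3 moved the function to urllib.parse. A two-line illustration (the encoded string is made up):

from urllib.parse import unquote_plus  # Python 3 home of unquote_plus

print(unquote_plus('john%20doe+smith'))  # 'john doe smith'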
def show(self, response):
    # print(response)
    hxs = HtmlXPathSelector(response)
    news_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
    for new in news_list:
        # temp = new.xpath('div/div[@class="part2"]/@share-linkid').extract()
        link_id = new.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
        yield Request(
            url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
            method='POST',
            cookies=self.cookie_dict,
            callback=self.do_favor
        )
    page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
    for page in page_list:
        page_url = 'http://dig.chouti.com%s' % page
        import hashlib
        hash = hashlib.md5()
        hash.update(bytes(page_url, encoding='utf-8'))
        key = hash.hexdigest()
        if key in self.has_request_set:
            pass
        else:
            self.has_request_set[key] = page_url
            yield Request(
                url=page_url,
                method='GET',
                callback=self.show
            )
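The md5-keyed has_request_set here reimplements duplicate filtering that Scrapy's scheduler performs by default (requests are fingerprinted and repeats dropped unless dont_filter=True is passed). If you do want the manual version, the same idea isolated as a standalone helper looks like this (names are illustrative, not from the project):

import hashlib

seen = {}

def should_request(url):
    # md5 the URL so every dict key has a fixed length, as the example does.
    key = hashlib.md5(url.encode('utf-8')).hexdigest()
    if key in seen:
        return False
    seen[key] = url
    return True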
def parse_responsepage(self, response):
    hxs = HtmlXPathSelector(response)
    yum = hxs.select('//span')
    print(yum)
def parse(self, response):
    url = response.url
    # url='https://data.btcchina.com/data/ticker?market=all'
    # hxs=HtmlXPathSelector(response)
    hxs = json.loads(response.body_as_unicode())
    item = BTC.items.BtcItem()
    item['time'] = self._get_sys_time()
    item['now'] = hxs['ticker_btccny']['buy']
    item['height'] = hxs['ticker_btccny']['high']
    item['low'] = hxs['ticker_btccny']['low']
    yield item
    yield Request(url)
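Here the selector line stays commented out because the ticker endpoint returns JSON, not HTML, so json.loads on the response body replaces XPath entirely. A self-contained illustration with an invented payload (modern Scrapy, 2.2+, also offers response.json() for this):

import json

# Sample payload shaped like the ticker the spider reads; values are made up.
body = '{"ticker_btccny": {"buy": "100", "high": "110", "low": "90"}}'
data = json.loads(body)
print(data['ticker_btccny']['buy'])  # '100'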
def parse(self, response):
    self.log("fetch group home page: %s" % response.url)
    hxs = HtmlXPathSelector(response)
    item = douban_group.items.DoubanGroupItem()
    item['groupName'] = hxs.select('//*[@id="group-info"]/h1/text()').re('^\s+(.*)\s+$')[0]
    item['groupUrl'] = response.url
    group_id = self.__get_id_from_group_url(response.url)
    member_url = 'https://www.douban.com/group/%s/members' % group_id
    member_text = hxs.select('//a[contains(@href,"%s")]/text()' % member_url).re('(\d+)')
    item['totalNumber'] = member_text[0]
    groups = hxs.select('//div[contains(@class,"group-list-item")]')
    for group in groups:
        url = group.select('div[contains(@class,"title")]/a/@href').extract()[0]
        yield Request(url)
        time.sleep(1)
    yield item
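Beware the time.sleep(1) in the loop: it blocks Scrapy's event loop, stalling every in-flight request rather than just delaying this spider's next one. The non-blocking way to throttle is the DOWNLOAD_DELAY setting (the value here is illustrative):

# settings.py, or custom_settings on the spider class
DOWNLOAD_DELAY = 1  # wait about one second between requests to the same site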
def parse_blog(self, response):
    print 'link parseado %s' % response.url
    hxs = HtmlXPathSelector(response)
    item = HackerWayItem()
    item['title'] = hxs.select('//title/text()').extract()  # XPath selector for the title
    item['author'] = hxs.select("//span[@class='author']/a/text()").extract()  # XPath selector for the author
    item['tag'] = hxs.select("//meta[@property='og:title']/text()").extract()  # XPath selector for the tag
    item['date'] = hxs.select("//span[@class='date']/text()").extract()  # XPath selector for the date
    return item  # Return the item.
def parse_item(self, response):
    """ Parse a response into a DocumentItem. """
    doc_loader = ItemLoader(item=DocumentItem(), response=response)
    doc_loader.add_value('url', response.url)
    doc_loader.add_xpath('meta', '//meta[@name=\'description\']/@content')
    doc_loader.add_value('domain', urlparse(response.url).hostname)
    doc_loader.add_xpath('title', '//title/text()')

    hxs = HtmlXPathSelector(response)  # For HTML extractions

    # Extract links
    # For each link on this page
    links = []
    a_links = hxs.xpath('//a')
    for link in a_links:
        link_obj = {}
        # Extract the link's URL
        link_str = " ".join(link.xpath('@href').extract())
        link_obj['link'] = link_str.replace("\n", "")
        # Extract the link's value
        link_name_str = " ".join(link.xpath('text()').extract())
        link_name_str = link_name_str.replace("\n", "")
        link_name_str = link_name_str.lstrip()
        link_name_str = link_name_str.rstrip()
        link_obj['link_name'] = link_name_str
        links.append(link_obj)
    doc_loader.add_value('links', links)

    # Populate text field
    title_list = hxs.xpath('//title/text()').extract()
    title = ' '.join(title_list)
    body_text = self.html2string(response)
    text = title + " " + body_text
    doc_loader.add_value('content', text)
    doc_loader.add_value('raw_text', text)
    doc_loader.add_value('raw_title', title)
    doc_loader.add_value('raw_url', response.url)

    h1_list = hxs.xpath("//h1/text()").extract()
    doc_loader.add_value('h1', " ".join(h1_list))
    doc_loader.add_value('content_type', response.headers['Content-type'])
    doc_loader.add_value('updated_on', datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"))

    item = doc_loader.load_item()
    return item
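ItemLoader in this example queues values per field and materializes them when load_item() runs; with the default processors each field comes back as a list, which is why fields like title end up list-valued. A minimal runnable sketch under current Scrapy (the item class and markup are invented for illustration):

import scrapy
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader


class PageItem(scrapy.Item):
    # Hypothetical item holding only the fields the sketch fills.
    title = scrapy.Field()
    url = scrapy.Field()


html = b'<html><head><title>Demo</title></head></html>'
response = HtmlResponse(url='http://example.com', body=html)

loader = ItemLoader(item=PageItem(), response=response)
loader.add_xpath('title', '//title/text()')  # queued from the response
loader.add_value('url', response.url)        # queued literal value
item = loader.load_item()                    # default processors keep lists
print(dict(item))  # {'title': ['Demo'], 'url': ['http://example.com']}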