我正在使用scrapy框架来抓取网站,并且无法单击javascript链接来打开另一个页面。
我可以将页面上的代码标识为:
<a class="Page" alt="Click to view job description" title="Click to view job description" href="javascript:sysSubmitForm('frmSR1');">Accountant </a>
谁能建议我如何在Scrapy中执行该javascript,从而打开另一个页面,以便我可以从该页面中获取数据?
提前致谢
请查看以下有关如何将Scrapy与Selenium一起使用的代码片段。爬网速度会变慢,因为您不只是下载html,而是还要获得对DOM的完全访问权限。
注意:由于以前提供的链接不再起作用,因此我已复制粘贴此代码段。
# Snippet imported from snippets.scrapy.org (which no longer works)
import time

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.item import Item
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from selenium import selenium


class SeleniumSpider(CrawlSpider):
    """Crawl ``.html`` links with Scrapy and re-open each page in Selenium.

    Scrapy downloads and parses the raw HTML; the same URL is then opened
    through a Selenium RC session so that content produced by JavaScript can
    be read as well.  The Selenium session is created in ``__init__`` and
    stopped in ``__del__``.
    """

    name = "SeleniumSpider"
    start_urls = ["http://www.domain.com"]

    # Follow every link matching ``.html`` and hand the response to parse_page.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=(r'\.html', )),
            callback='parse_page',
            follow=True,
        ),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Selenium RC session.

        Any positional or keyword arguments are forwarded unchanged to
        ``CrawlSpider.__init__``.
        """
        CrawlSpider.__init__(self, *args, **kwargs)
        self.verificationErrors = []
        # Selenium RC client: server host, server port, browser, base URL.
        self.selenium = selenium(
            "localhost", 4444, "*chrome", "http://www.domain.com"
        )
        self.selenium.start()

    def __del__(self):
        """Stop the Selenium session and print collected verification errors."""
        self.selenium.stop()
        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print(self.verificationErrors)
        # Only chain to the base-class finalizer if one is actually defined;
        # calling a missing attribute would raise AttributeError here.
        parent_del = getattr(CrawlSpider, "__del__", None)
        if parent_del is not None:
            parent_del(self)

    def parse_page(self, response):
        """Process one crawled page with both Scrapy and Selenium.

        Args:
            response: the Scrapy response for a page matched by ``rules``.

        Yields:
            A single ``Item`` for the page (left empty in this snippet).
        """
        item = Item()

        hxs = HtmlXPathSelector(response)
        # Do some XPath selection with Scrapy
        hxs.select('//div').extract()

        sel = self.selenium
        sel.open(response.url)

        # Wait for javascript to load in Selenium
        time.sleep(2.5)

        # Do some crawling of javascript created content with Selenium
        sel.get_text("//div")
        yield item