The following 48 code examples, extracted from open-source Python projects, show how to use lxml.etree.HTMLParser().
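For context, here is a minimal sketch of the pattern most of these examples follow: build an HTMLParser (optionally with recover=True and an explicit encoding), parse markup with etree.HTML() or etree.parse(), then query the resulting tree with xpath() or find(). The markup and variable names below are illustrative only, not taken from any of the projects.

from io import StringIO
from lxml import etree

# A lenient parser: recover=True lets lxml repair broken markup.
parser = etree.HTMLParser(recover=True)

# Illustrative markup; any HTML string or file-like object works.
markup = '<html><body><p class="msg">hello</p></body></html>'

# Parse a string directly and query with XPath...
root = etree.HTML(markup, parser)
print(root.xpath('//p[@class="msg"]/text()'))  # ['hello']

# ...or parse a file-like object and get an ElementTree back.
tree = etree.parse(StringIO(markup), parser)
print(tree.getroot().find('.//p').text)  # 'hello'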
def set_nasa_wallpaper():
    st = datetime.fromtimestamp(time.time()).strftime('%y%m%d')
    url = URL07.format(st)
    r = requests.get(url)
    if r.status_code == 200:
        try:
            parser = etree.HTMLParser(recover=True)
            html = etree.HTML(r.content, parser)
            images = html.iter('img')
            if images is not None:
                images = list(images)
                if len(images) > 0:
                    image_url = images[0].getparent().attrib['href']
                    image_url = 'https://apod.nasa.gov/' + image_url
                    if download(image_url) is True:
                        set_background(comun.POTD)
        except Exception as e:
            print(e)
def debug_page():
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0'
    }
    url = 'http://m.qfang.com/guangzhou/rent/100001468?gardenId=1109818'
    r = requests.get(url=url, headers=headers)
    # r.encoding = 'gbk'
    print r.status_code
    print type(r.content)
    print r.content
    # print chardet.detect(r)
    tree = etree.HTML(r.text, parser=etree.HTMLParser(encoding='utf-8'))
    # print etree.tostring(tree)
    return tree, r.text
def get_account_names(saml_assertion):
    saml_url = "https://signin.aws.amazon.com:443/saml"
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
    response = requests.post(saml_url, headers=headers, data={
        'SAMLResponse': saml_assertion.assertion
    })
    response.raise_for_status()

    html_response = ET.fromstring(response.text, ET.HTMLParser())

    account_names = {}
    for element in html_response.findall('.//div[@class="saml-account-name"]'):
        account_id = element.text.split(' ')[2].replace('(', '').replace(')', '')
        account_name = element.text.split(' ')[1]
        account_names[account_id] = account_name

    return account_names
def list_courses(self):
    '''
    List courses available in Studio site
    '''
    self.ensure_studio_site()
    url = "%s/home/" % self.BASE
    ret = self.ses.get(url)
    parser = etree.HTMLParser()
    xml = etree.parse(StringIO(ret.content), parser).getroot()
    courses = []
    course_ids = []
    for course in xml.findall('.//li[@class="course-item"]'):
        cid = course.get("data-course-key")
        if self.verbose:
            print cid  # etree.tostring(course)
        courses.append(course)
        course_ids.append(cid)
    return {'xml': courses,
            'course_ids': course_ids,
            }
def _get_block_child_info_from_content_preview(self, block_id):
    '''
    Get child info dict from content preview
    '''
    xblock = self.get_xblock(usage_key=block_id, view="container_preview")
    html = xblock['html']
    parser = etree.HTMLParser()
    xml = etree.parse(StringIO(html), parser).getroot()
    ids = []
    child_blocks = []
    for elem in xml.findall('.//li[@class="studio-xblock-wrapper is-draggable"]'):
        cid = elem.get('data-locator')
        ids.append(cid)
        child_blocks.append(self.get_xblock(usage_key=cid))
    child_info = {'children': child_blocks,
                  'child_ids': ids,
                  }
    return child_info
def validate(self, source, parse_strict=False, filename=None):
    """
    Validate a hocr document

    Args:
        source (str): A filename or '-' to read from STDIN
        parse_strict (bool): Whether to be strict about broken HTML. Default: False
        filename (str): Filename to use in the reports. Set this if reading
            from STDIN for nicer output
    """
    parser = etree.HTMLParser(recover=parse_strict)
    if not filename:
        filename = source
    if source == '-':
        source = sys.stdin
    doc = etree.parse(source, parser)
    root = doc.getroot()
    report = HocrValidator.Report(filename)
    try:
        self.spec.check(report, root)
    except ValueError as e:
        sys.stderr.write("Validation errored\n")
    return report
def get_xml_data(req_string, headers, data=None):
    req = urllib2.Request(req_string, headers=headers)
    html_data = _get_html_data(req, data)
    # Clean chunked data
    html_data = clean_chunked_data(html_data)
    # log_user_action(req.get_host(), 'chunked data', html_data, {})
    try:
        data = etree.fromstring(html_data)
    except XMLSyntaxError:
        # lxml cannot handle encoding declarations :(
        data = etree.HTML(html_data, etree.HTMLParser())
        # data is None when it was not XML, like 404 page without 404 code
        if data is not None:
            data = data.getroottree()
        else:
            raise urllib2.HTTPError(req_string, 404, "Not an XML", None, None)
    # TODO: check valid
    # if not data.find('.//prestashop'):
    #     raise urllib2.HTTPError(req_string, 404, "Not an XML", None, None)
    return data
def _a_page_of_expired_login():
    return ET.fromstring(
        '''
        <!DOCTYPE html>
        <html>
        <head>
            <title>Amazon Web Services Sign-In</title>
            <meta name="viewport" content="width=device-width" />
            <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></head>
        <body>
            <div id="container">
                <h1 class="background">Amazon Web Services Login</h1>
                <div id="content">
                    <div id="main_error"></div>
                    <form id="saml_form" name="saml_form" action="/saml" method="post">
                        <input type="hidden" name="RelayState" value="" />
                        <input type="hidden" name="SAMLResponse" value="" />
                        <input type="hidden" name="name" value="" />
                        <p style="font-size: 16px; padding-left: 20px;">Select a role:</p>
                </div>
            </div>
        </body>
        </html>
        ''',
        ET.HTMLParser(),
    )
def parse_home(self, home_content):
    if home_content is None:
        return None
    home_content = home_content.encode('ISO-8859-1').decode('gbk')
    html = etree.HTML(home_content, parser=etree.HTMLParser(encoding='utf-8'))
    alinks = html.xpath('//a[@href]')
    pattern_capture = re.compile(ur"?(\d{6})?(.+)")
    l = []
    for alink in alinks:
        aa = alink.text
        if aa != None:
            match = pattern_capture.match(aa)
            if match:
                # l.append((match.group(1), match.group(2)))
                l.append(match.group(1))
    return l
def parse_ratio(self, info, content):
    # content = content.split('"')[1]
    html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
    tds = html.xpath('//td[@class="tor"]')
    if len(tds) > 2:
        # http://fund.eastmoney.com/f10/cyrjg_510090.html
        insito = tds[0].text
        if insito != '---':
            info.inratio += safe_to_float(insito.split("%")[0])
        # innerto = tds[2].text
        # if innerto != '---':
        #     self.inratio += safe_to_float(innerto.split("%")[0])
def parse_stocks(self, info, content):
    html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
    tbs = html.xpath('//table[@class="w782 comm tzxq"]')
    # pers = html.xpath('//table[@class="w782 comm tzxq"]')
    if len(tbs) > 0:
        stocktds = tbs[0].xpath('.//td[@class="tol"]/a')
        pers = tbs[0].xpath('.//td[@class="tor"]')
        front, interval = 2, 5
        if not '???' in content:
            front, interval = 0, 3
        for (index, stocked) in enumerate(stocktds):
            # info.stocks.append(stocked.text)
            per = pers[index*interval + front]
            if per == '---':
                continue
            stockname = stocked.text
            if not stockname is None and len(stockname) > 0:
                info.stocks.append(stockname + '-' + per.text)
def parse_index_list(self, index_list_content):
    index_list_content = index_list_content.encode('ISO-8859-1').decode('utf-8')
    parsed_content = etree.HTML(index_list_content, parser=etree.HTMLParser(encoding='utf-8'))
    trs = parsed_content.xpath('//tbody/tr')
    indexs = []
    for tr in trs:
        tds = tr.xpath('./td')
        if len(tds) == 5:
            index = IndexInfo()
            code = tds[0].text.strip()
            if len(code.split('.')) == 2:
                index.code = code.split('.')[0]
            index.full_code = code
            index.name = tds[1].text.strip()
            index.begin_time = tds[2].text.strip()
            index.short_name = tds[3].text.strip()
            weave = tds[4].xpath('./a')
            if len(weave) == 1:
                index.weave = weave[0].attrib['href'].strip()
            else:
                index.weave = tds[4].text.strip()
            indexs.append(index)
    return indexs
def get_tree(self, html):
    if not hasattr(self, '_tree'):
        # Pre-parse
        parser = etree.HTMLParser()
        html = etree.parse(BytesIO(html), parser).getroot()
        self._tree = Premoler(
            html,
            exclude_pseudoclasses=True,
            method="html",
            preserve_internal_links=True,
            base_path=self.kwargs.get('staticpath', '.'),
            include_star_selectors=True,
            strip_important=False,
            disable_validation=True,
            media_rules=self.kwargs.get('media_rules', [])
        ).transform()
    return self._tree
def get_cats_sync(full_urls=False, verbose=False):
    "Generate category URLs for free O'Reilly ebooks."

    base_url = 'http://shop.oreilly.com'
    url = base_url + '/category/ebooks.do'
    if verbose:
        print(url)
    p = etree.HTMLParser()
    tree = etree.parse(url, parser=p)
    xpath_expr = '//a[starts-with(@href, "/category/ebooks/")]/@href'
    cat_urls = tree.xpath(xpath_expr)
    cat_urls = [base_url + u for u in cat_urls if u.endswith('.do')]
    for u in cat_urls:
        if verbose:
            print(u)
        tree1 = etree.parse(u, parser=p)
        urls = tree1.xpath(xpath_expr)
        for u in urls:
            if not u.endswith('.do'):
                continue
            if full_urls:
                yield base_url + u
            else:
                pat = 'category/ebooks/(.*?).do'
                yield re.findall(pat, u)[0]
def xpath_selector(selector, html):
    """
    Returns Xpath match for `selector` within `html`.

    :param selector: XPath string
    :param html: Unicode content
    """
    from lxml import etree
    # lxml requires argument to be bytes
    # see https://github.com/kibitzr/kibitzr/issues/47
    encoded = html.encode('utf-8')
    root = etree.fromstring(encoded, parser=etree.HTMLParser())
    elements = root.xpath(selector)
    if elements:
        return True, etree.tostring(
            next(iter(elements)),
            method='html',
            pretty_print=True,
            encoding='unicode',
        )
    else:
        logger.warning('XPath selector not found: %r', selector)
        return False, html
def _get_elements_by_xpath(filter_, data, expression):
    try:
        from lxml import etree
    except ImportError:
        raise common.FilterError(filter_, "module lxml not found")
    # pylint: disable=no-member
    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)
    document = etree.fromstringlist([data], html_parser)
    for elem in document.xpath(expression):
        # pylint: disable=protected-access
        if isinstance(elem, etree._Element):
            text = etree.tostring(elem)
        else:
            text = str(elem)
        if isinstance(text, str):
            yield text
        else:
            yield text.decode('utf-8')
def _filter(self, item: str, result: common.Result) -> ty.Iterable[str]:
    try:
        from lxml import etree
    except ImportError:
        raise common.FilterError(self, "module lxml not found")
    # pylint: disable=no-member
    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)
    document = etree.fromstringlist([item], html_parser)
    for elem in document.findall(".//*[@id='" + self._conf["sel"] + "']"):
        # pylint: disable=protected-access
        if isinstance(elem, etree._Element):
            text = etree.tostring(elem)  # type: ty.Union[str, bytes]
            if text:
                if hasattr(text, 'decode'):
                    yield text.decode('utf-8')
                else:
                    yield str(text)
        else:
            yield str(elem)
def get_individual_atlantsolia_prices():
    relation = glob.ATLANTSOLIA_LOCATION_RELATION
    url = 'http://atlantsolia.is/stodvarverd.aspx'
    res = requests.get(url, headers=utils.headers())
    html_text = res.content
    html = etree.fromstring(html_text, etree.HTMLParser())
    div_prices = html.find(('.//*[@id="content"]/div/div/div/div[2]/div/div/'
                            'table/tbody'))
    prices = {}
    for div_price in div_prices:
        key = relation[div_price[0][0].text]
        bensin95 = float(div_price[1][0].text.replace(',', '.'))
        diesel = float(div_price[2][0].text.replace(',', '.'))
        bensin95_discount = bensin95 - glob.ATLANTSOLIA_MINIMUM_DISCOUNT
        diesel_discount = diesel - glob.ATLANTSOLIA_MINIMUM_DISCOUNT
        prices[key] = {
            'bensin95': bensin95,
            'diesel': diesel,
            'bensin95_discount': int(bensin95_discount * 10) / 10.0,
            'diesel_discount': int(diesel_discount * 10) / 10.0
        }
    return prices
def get_global_skeljungur_prices():
    url = 'http://www.skeljungur.is/einstaklingar/eldsneytisverd/'
    res = requests.get(url, headers=utils.headers())
    html = etree.fromstring(res.content, etree.HTMLParser())
    bensin95_text = html.find(('.//*[@id="st-container"]/div/div/div/div/'
                               'div[2]/div/div/div[1]/div[1]/div[1]/section/'
                               'div/div[2]/div[1]/div[2]/h2')).text
    diesel_text = html.find(('.//*[@id="st-container"]/div/div/div/div/div[2]/'
                             'div/div/div[1]/div[1]/div[1]/section/div/div[2]/'
                             'div[1]/div[4]/h2')).text
    bensin95 = float(bensin95_text.replace(' kr.', '').replace(',', '.'))
    diesel = float(diesel_text.replace(' kr.', '').replace(',', '.'))
    return {
        'bensin95': bensin95,
        'diesel': diesel,
        'bensin95_discount': bensin95 - glob.SKELJUNGUR_DISCOUNT,
        'diesel_discount': diesel - glob.SKELJUNGUR_DISCOUNT
    }
def bandcamp_markup_for_url(urlstr):
    url = urlparse.urlparse(urlstr)
    parser = etree.HTMLParser(no_network=False)
    req = urllib2.urlopen(urlstr)
    tree = etree.parse(req, parser)
    embed_meta = tree.xpath('//meta[@property="og:video:secure_url"]')
    embed_url = embed_meta[0].get('content')
    markup = ('<iframe class="bandcamp-embed" ' +
              'src="%s" ' % embed_url +
              'seamless>' +
              '<a href="%s">Embedded Bandcamp Link</a>' % urlstr +
              '</iframe>')
    return markup
def extract(self):
    d = self.data
    res = None
    if not d:
        return d
    elif isinstance(d, basestring):
        if d.startswith('http'):
            res = download_to_oss(d, OSS2_CONF["IMAGES_PATH"])
        else:
            htmlparser = etree.HTMLParser()
            tree = etree.parse(StringIO(d), htmlparser)
            srcs = tree.xpath("//img[starts-with(@src,'http')]/@src")
            data_srcs = tree.xpath("//img[starts-with(@data-src,'http')]/@data-src")
            srcs = list(set(srcs + data_srcs))
            new_srcs = [download_to_oss(item, OSS2_CONF["IMAGES_PATH"]) for item in srcs]
            res = replace_all(d, srcs, new_srcs)
    elif isinstance(d, list):
        res = [download_to_oss(item, OSS2_CONF["IMAGES_PATH"]) for item in d]
    return res
def find_element_by_xpath(self, xpath):
    """
    Find an element in the DOM by xpath.

    Args:
        xpath:

    Returns:
        WebElement
    """
    tree = etree.fromstring(self.current_response, etree.HTMLParser())
    element = tree.xpath(xpath)[0]
    self.current_response = etree.tostring(element)
    print(self.current_response)
    return WebElement(None, self.current_response, self.current_url, parent=self.id)

# %%%%%%%%%%%%%%%%%%% Find elements %%%%%%%%%%%%%%%%%%% #
def find_elements_by_xpath(self, xpath):
    """
    Find all elements in the DOM matching the given xpath.

    Args:
        xpath:

    Returns:
        list
    """
    tree = etree.fromstring(self.current_response, etree.HTMLParser())
    elements = tree.xpath(xpath)
    output_elements = []
    for element in elements:
        resp = etree.tostring(element)
        output_elements.append(WebElement(None, resp, self.current_url, parent=self.id))
    return output_elements
def __init__(self, url, pageEncoding=ENCODING):
    self.url = url
    self.html = restClient.get(self.url)
    if self.html is not None:
        self.valid = True
        self.encode = pageEncoding
        self.parser = etree.HTMLParser(encoding=self.encode)
        self.tree = etree.HTML(self.html, parser=self.parser)
    else:
        self.valid = False
        raise ValueError('could not fetch data from: "' + self.url + '"')
def set_fstoppers_wallpaper():
    r = requests.get(URL05)
    url = None
    image_url = None
    if r.status_code == 200:
        try:
            parser = etree.HTMLParser(recover=True)
            html = etree.HTML(r.content, parser)
            print(etree.tostring(html))
            print('===========')
            for element in html.iter('img'):
                # print(element.tag, element.attrib, element.text)
                try:
                    print(element.attrib['data-original'])
                    url = 'https://fstoppers.com' +\
                        element.getparent().attrib['href']
                    break
                except Exception as e:
                    print(e)
            if url is not None:
                print(url)
                r = requests.get(url)
                if r.status_code == 200:
                    html = etree.HTML(r.content, parser)
                    print(etree.tostring(html))
                    for element in html.iter('div'):
                        try:
                            if element.attrib['class'] == 'photo':
                                image_url = element.attrib['data-xlarge']
                                break
                        except Exception as e:
                            print(e)
        except Exception as e:
            print(e)
    if image_url is not None:
        if download(image_url) is True:
            set_background(comun.POTD)
def web(self, path, args={}, encoding=None, allow_return_none=False):
    self.logger.debug('????????')
    if allow_return_none:
        if path in self.web_cache and self.web_cache[path] == args:
            self.logger.debug('????? {} ????'.format(path))
            self.logger.debug('???{}'.format(args))
            return
    self.web_cache[path] = dict(args)
    url = urllib.parse.urljoin(self.web_url, urllib.parse.quote(path))
    if len(args) > 0:
        url += '?' + urllib.parse.urlencode(args)
    self.logger.debug('HTTP ?????{}'.format(url))
    data = io.BytesIO()
    self.curl.setopt(pycurl.URL, url)
    self.curl.setopt(pycurl.COOKIE, self.web_cookie)
    self.curl.setopt(pycurl.NOBODY, False)
    self.curl.setopt(pycurl.NOPROGRESS, True)
    self.curl.setopt(pycurl.WRITEDATA, data)
    self.curl.setopt(pycurl.HEADERFUNCTION, lambda *x: None)
    self.curl.setopt(pycurl.XFERINFOFUNCTION, lambda *x: None)
    self.curl.perform()
    status = self.curl.getinfo(pycurl.RESPONSE_CODE)
    if status != 200:
        raise ServerError(status)
    data.seek(io.SEEK_SET)
    return etree.parse(data, etree.HTMLParser(
        encoding=encoding, remove_comments=True))
def parsehtml():
    """
    Test HTML parsing.

    >>> # p = HTMLTreeBuilder.TreeBuilder()
    >>> p = ElementTree.HTMLParser()
    >>> p.feed("<p><p>spam<b>egg</b></p>")
    >>> serialize(p.close())
    '<p>spam<b>egg</b></p>'
    """
    # doesn't work with lxml.etree
def get_basic_course_info(self):
    '''
    Get basic course info (start date, end date, ...) from instructor dashboard
    '''
    url = "%s#view-course_info" % self.instructor_dashboard_url
    ret = self.ses.get(url)
    # print ret.content
    parser = etree.HTMLParser()
    xml = etree.parse(StringIO(ret.content), parser).getroot()
    bci_div = xml.find('.//div[@class="basic-wrapper"]')
    if bci_div is None:
        return None
    fields = ["course-organization", "course-number", "course-name",
              "course-display-name", "course-start-date", "course-end-date",
              "course-started", "course-num-sections", "grade-cutoffs"]
    # look for elements like:
    # <li class="field text is-not-editable" id="field-grade-cutoffs">
    data = {}
    for field in fields:
        felem = bci_div.find('.//li[@id="field-%s"]' % field)
        if felem is None:
            data[field] = None
        else:
            belem = felem.find('b')
            data[field] = belem.text
    if self.verbose:
        print json.dumps(data, indent=4)
    return data
def _extract_urls(self, content, base_url):
    '''
    Get the URLs out of a WAF index page
    '''
    try:
        parser = etree.HTMLParser()
        tree = etree.fromstring(content, parser=parser)
    except Exception, inst:
        msg = 'Couldn\'t parse content into a tree: %s: %s' \
              % (inst, content)
        raise Exception(msg)
    urls = []
    for url in tree.xpath('//a/@href'):
        url = url.strip()
        if not url:
            continue
        if '?' in url:
            log.debug('Ignoring link in WAF because it has "?": %s', url)
            continue
        if '/' in url:
            log.debug('Ignoring link in WAF because it has "/": %s', url)
            continue
        if '#' in url:
            log.debug('Ignoring link in WAF because it has "#": %s', url)
            continue
        if 'mailto:' in url:
            log.debug('Ignoring link in WAF because it has "mailto:": %s', url)
            continue
        log.debug('WAF contains file: %s', url)
        urls.append(url)
    base_url = base_url.rstrip('/').split('/')
    if 'index' in base_url[-1]:
        base_url.pop()
    base_url = '/'.join(base_url)
    base_url += '/'
    log.debug('WAF base URL: %s', base_url)
    return [base_url + i for i in urls]
def get_src_link_number(url, Html):
    Html = utf8_transfer(Html)
    page = etree.HTML(Html, parser=etree.HTMLParser(encoding='utf-8'))
    src_link_list = get_src_links(page, Html)
    src_link_number = {}
    for src_num in src_link_list:
        src_link_number[src_num[0]] = get_link_number(url, src_num[1])
    return src_link_number
def parse_item(self, response):
    super(NextBigWhatSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//header[contains(@class, 'entry-header')]/h1/text()")
        details = tree.xpath('.//div[contains(@class, "herald-entry-content")]/p/text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join([item.strip().encode('ascii', 'ignore') for item in details if item.strip()])
            img_urls = tree.xpath('.//div[contains(@class, "herald-post-thumbnail herald-post-thumbnail-single")]/span/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            meta_result = self.get_meta(tree)
            if 'description' in meta_result:
                news_item['blurb'] = meta_result['description']
            return news_item
    except:
        pass
    return None
def parse_item(self, response):
    super(BusinessStandardSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//h1[contains(@class,\'headline\')]//text()")
        details = tree.xpath('.//span[contains(@class,\'p-content\')]/div//text()[not(ancestor::script)]')
        if title and details:
            news_item['source'] = self.name
            news_item['source_url'] = response.url.split('?')[0]
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join([item.strip().encode('ascii', 'ignore') for item in details])
            img_urls = tree.xpath('.//img[contains(@class,\'imgCont\')]/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            published_date = tree.xpath('.//p[contains(@class,\'fL\')]//span//text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(published_date[3].split("\t")[0], '%B %d, %Y')
            related = tree.xpath('.//div[contains(@class,\'readmore_tagBG\')]//h2//a/text()')
            if related:
                news_item['tags'] = [item.strip() for item in related if item.strip()]
            cover_image = tree.xpath('.//img[contains(@class,\'imgCont\')]/@src')
            if cover_image:
                news_item['cover_image'] = cover_image
            return news_item
    except:
        pass
    return None
def parse_item(self, response):
    super(TechCrunchSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//h1[contains(@class,\'alpha tweet-title\')]//text()")
        details = tree.xpath('.//div[contains(@class,\'article-entry text\')]//p//text()')
        if title and details:
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join([det.strip().encode('ascii', 'ignore') for det in details])
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            img_urls = tree.xpath('.//div[contains(@class,\'article-entry text\')]/img/@src')
            if img_urls:
                news_item['img_urls'] = img_urls
            cover_image = tree.xpath('.//div[contains(@class,\'article-entry text\')]/img/@src')
            if cover_image:
                news_item['cover_image'] = cover_image[0]
            author = tree.xpath('/html/body/div[4]/article/div/div[1]/div/header/div[2]/div[1]/a/text()')
            if author:
                news_item['author'] = author
            return news_item
    except:
        pass
    return None
def parse_item(self, response):
    super(SmallBizTrendsSpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//div[@class='post-inner']/h1/text()")
        details = tree.xpath('.//div[@class=\"entry\"]/p/text()')
        if title and details:
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['source_url'] = response.url.split('?')[0]
            news_item['title'] = title[0].strip().decode('unicode_escape').encode('ascii', 'ignore')
            news_item['details'] = '\t'.join([item.strip().encode('ascii', 'ignore').decode('unicode_escape') for item in details if item.strip()])
            # ' '.join([item.strip().encode('ascii', 'ignore').decode('unicode_escape') for item in details if item.strip()])
            if tree.xpath('.//span[@class=\'full-span-featured-image\']/span/img/@src'):
                news_item['img_urls'] = tree.xpath('.//span[@class=\'full-span-featured-image\']/span/img/@src')
            elif tree.xpath('.//img[contains(@class,\'size-full\')]/@src'):
                news_item['img_urls'] = tree.xpath('.//img[contains(@class,\'size-full\')]/@src')
            elif tree.xpath('.//img[contains(@class,\'aligncenter\')]/@src'):
                news_item['img_urls'] = tree.xpath('.//img[contains(@class,\'aligncenter\')]/@src')
            meta_result = self.get_meta(tree)
            if 'description' in meta_result:
                news_item['blurb'] = meta_result['description']
            published_date = tree.xpath('.//span[contains(@class,\'article-date\')]/text()')
            if published_date:
                news_item['published_date'] = datetime.strptime(published_date[0], '%b %d, %Y')
            author = tree.xpath('.//span[contains(@itemprop,\'name\')]/a/text()')
            if author:
                news_item['author'] = author
            return news_item
    except:
        pass
    return None
def parse_item(self, response):
    super(DealCurrySpider, self).parse_item(response)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(BytesIO(response.body), htmlparser)
    news_item = NewsItem()
    try:
        title = tree.xpath(".//h1/text()")
        details = tree.xpath('.//div[contains(@class, "articleSpacer")]/p//text()')
        if title and details:
            news_item['source_url'] = response.url.split('?')[0]
            news_item['source'] = self.name
            news_item['crawled_date'] = datetime.now()
            news_item['title'] = title[0].strip().encode('ascii', 'ignore')
            news_item['details'] = "\t".join([x.strip().encode('ascii', 'ignore') for x in details]).strip()
            # "\t".join([item.strip().encode('ascii', 'ignore') for item in details if item.strip()])
            tags = tree.xpath('.//div[contains(@style, "padding-bottom:10px")]/span[contains(@style, "color:#346f9a; float:left; text-align:left")]/a/text()')
            news_item['tags'] = tags[0].strip().encode('ascii', 'ignore')
            published_date = tree.xpath(".//span[contains(@style, 'color:#6b6b6b;float:left; text-align:left; margin-left:5px')]/text()")
            news_item['published_date'] = datetime.strptime(published_date[0].encode('ascii', 'ignore'), '%d %B %Y')
            author = tree.xpath('.//div[contains(@style, "")]/span[contains(@style, "color:#6b6b6b; float:left; text-align:left;")]/text()')
            news_item['author'] = author[0].split('by')[1].strip().encode('ascii', 'ignore')
            img_urls = tree.xpath('.//div[contains(@style, "padding-bottom:10px")]/img/@src')
            if img_urls:
                news_item['img_urls'] = get_stripped_list(img_urls)
            meta_result = self.get_meta(tree)
            if 'description' in meta_result:
                news_item['blurb'] = meta_result['description']
            return news_item
    except:
        pass
    return None
def _retrieve_roles_page(roles_page_url, context, session, ssl_verification_enabled,
                         vip_security_code):
    response = session.post(
        roles_page_url,
        verify=ssl_verification_enabled,
        allow_redirects=True,
        data={
            'AuthMethod': 'VIPAuthenticationProviderWindowsAccountName',
            'Context': context,
            'security_code': vip_security_code,
        }
    )
    logging.debug(u'''Request:
    * url: {}
    * headers: {}
Response:
    * status: {}
    * headers: {}
    * body: {}
'''.format(roles_page_url, response.request.headers, response.status_code,
           response.headers, response.text))

    if response.status_code != 200:
        raise click.ClickException(
            u'Issues during redirection to aws roles page. The error response {}'.format(
                response
            )
        )

    html_response = ET.fromstring(response.text, ET.HTMLParser())
    return roles_assertion_extractor.extract(html_response)
def _retrieve_roles_page(roles_page_url, context, session, ssl_verification_enabled,
                         signed_response):
    logging.debug('context: {}'.format(context))
    logging.debug('sig_response: {}'.format(signed_response))
    response = session.post(
        roles_page_url,
        verify=ssl_verification_enabled,
        headers=_headers,
        allow_redirects=True,
        data={
            'AuthMethod': 'DuoAdfsAdapter',
            'Context': context,
            'sig_response': signed_response,
        }
    )
    logging.debug(u'''Request:
    * url: {}
    * headers: {}
Response:
    * status: {}
    * headers: {}
    * body: {}
'''.format(roles_page_url, response.request.headers, response.status_code,
           response.headers, response.text))

    if response.status_code != 200:
        raise click.ClickException(
            u'Issues during redirection to aws roles page. The error response {}'.format(
                response
            )
        )

    html_response = ET.fromstring(response.text, ET.HTMLParser())
    return roles_assertion_extractor.extract(html_response)
def _strategy(response, config, session):
    html_response = ET.fromstring(response.text, ET.HTMLParser())

    def _plain_extractor():
        def extract():
            return roles_assertion_extractor.extract(html_response)
        return extract

    def _duo_extractor():
        def extract():
            return duo_auth.extract(html_response, config.ssl_verification, session)
        return extract

    def _symantec_vip_extractor():
        def extract():
            return symantec_vip_access.extract(html_response, config.ssl_verification, session)
        return extract

    chosen_strategy = _plain_extractor
    if _is_duo_authentication(html_response):
        chosen_strategy = _duo_extractor
    elif _is_symantec_vip_authentication(html_response):
        chosen_strategy = _symantec_vip_extractor

    return chosen_strategy()
def parse_statistic(self, info, content):
    html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
    nums = html.xpath(u'//table[@class="fxtb"]//td')
    length = len(nums)
    if length % 4 == 3:
        nums = nums[0:length - 3]
        length -= 3
    if (length > 0 and length % 4 == 0):
        for i in range(0, length / 4):
            tds = nums[i*4:(i+1)*4]
            if tds[0].text == FundInfo.STD_CHINESE_KEY:
                stds = reversed(tds[1:4])
                for stdnum in stds:
                    if stdnum.text != '--':
                        info.std = safe_to_float(stdnum.text.split('%')[0])
                        break
            elif tds[0].text == FundInfo.SHARPERATIO_CHINESE_KEY:
                sharpes = reversed(tds[1:4])
                for sharpenum in sharpes:
                    if sharpenum.text != '--':
                        info.sharperatio = safe_to_float(sharpenum.text)
                        break
            elif tds[0].text == FundInfo.INFORATIO_CHINESE_KEY:
                infos = reversed(tds[1:4])
                for infonum in infos:
                    if infonum.text != '--':
                        info.inforatio = safe_to_float(infonum.text)
                        break
    trackbias = html.xpath(u'//div[@id="jjzsfj"]//table[@class="fxtb"]//td')
    if len(trackbias) == 3:
        info.bias = safe_to_float(trackbias[1].text.split('%')[0])
    styles = html.xpath('//table[@class="fgtb"]//td')
    if len(styles) >= 2:
        info.style = styles[1].text.strip()
def parse_annual(self, info, content):
    html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
    trs = html.xpath('//table/tbody/tr')
    if len(trs) == 5:
        yieldtds = trs[0].xpath('./td')
        yieldvalue = 1.0
        yieldpow = 0.0
        for yearyield in yieldtds[1:]:
            y = yearyield.text
            if y != '---':
                yieldvalue *= (1 + safe_to_float(y.split('%')[0]) / 100)
                yieldpow += 1
        if yieldpow != 0.0:
            info.annualyield = yieldvalue ** (1.0 / yieldpow) - 1
        ranktds = trs[3].xpath('./td')
        rankcount = 0
        rankvalue = 0.0
        for ranktd in ranktds[1:]:
            r = ''.join(ranktd.itertext()).strip()
            if r != '---':
                rankvalue += safe_to_float(r.split('|')[0]) / safe_to_float(r.split('|')[1])
                rankcount += 1
        if rankcount > 0:
            info.annualrank = rankvalue / rankcount
def parse_stock(self, content):
    stock = StockInfo()
    html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
    ths = html.xpath('//th[@class="tips-fieldnameL"]')
    tds = html.xpath('//td[contains(@class, "tips-dataL")]')
    for (index, th) in enumerate(ths):
        key = th.text.strip()
        value = tds[index].text.strip()
        if value == '--':
            value = ''
        if key == StockInfo.FULL_NAME_CHINESE_KEY:
            stock.fullname = value
        elif key == StockInfo.USED_NAME_CHINESE_KEY:
            stock.used_names = value.split('->')
        elif key == StockInfo.CODE_CHINESE_KEY:
            stock.code = value
        elif key == StockInfo.SHORT_NAME_CHINESE_KEY:
            stock.shortname = value
        elif key == StockInfo.MARKET_CHINESE_KEY:
            stock.market = value
        elif key == StockInfo.INDUSTRY_CHINESE_KEY:
            stock.industry = value
        elif key == StockInfo.AREA_CHINESE_KEY:
            stock.area = value
        elif key == StockInfo.RELEASE_DATE_CHINESE_KEY:
            stock.releasedate = value
    return stock
def main():
    global parser
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--output-html', help='Output result page HTML to file')
    parser.add_argument('--saisies', dest='saisie_variables', metavar='nom=valeur', nargs='+',
                        help='Variables saisies')
    parser.add_argument('--year', default='2015', type=int,
                        help='Calculer les impôts de l\'année N sur les revenus de l\'année N-1')
    args = parser.parse_args()

    cgi_url = 'http://www3.finances.gouv.fr/cgi-bin/calc-{}.cgi'.format(args.year)
    headers = {'User-Agent': 'Calculette-Impots-Python'}
    saisie_variables = {} if args.saisie_variables is None \
        else dict(iter_saisie_variables(args.saisie_variables))
    default_saisie_variables = {
        # '0DA': '1965',
        # '1AJ': '15000',
        'pre_situation_famille': 'C',
        'pre_situation_residence': 'M',
        # 'simplifie': '1',
    }
    data = merge(default_saisie_variables, saisie_variables)
    response = requests.post(cgi_url, headers=headers, data=data)
    if args.output_html is not None:
        with open(args.output_html, 'w') as output_html_file:
            output_html_file.write(re.sub(
                pattern=r'=(.)/calcul_impot/2015/',
                repl=r'=\1http://www3.finances.gouv.fr/calcul_impot/2015/',
                string=response.text,
            ))
    root_node = etree.fromstring(response.text, etree.HTMLParser())
    results = list(iter_results(root_node))
    print(json.dumps(results, ensure_ascii=False, indent=2, sort_keys=True))
    return 0
def setup_document(self):
    # Determine the path of the fixture to load.
    filename = getattr(type(self), "__fixture__")
    fixture_path = os.path.join(_FIXTURE_DIR, filename)
    parser = etree.HTMLParser(encoding="UTF-8")

    # Open the fixture file in the browser.
    self.document = etree.parse(fixture_path, parser)
def __init__(self):
    self.parser = etree.HTMLParser()
def _request(self):
    """Makes requests to vegasinsider odds pages to get game odds

    Returns:
        dict: values are self._scrape()
    """
    if not memcache.add(type(self).__name__, True, 3):
        time.sleep(3)
    logging.info('Scraping VegasInsider for %s' % (self.league))
    url = "http://www.vegasinsider.com/%s/odds/las-vegas/" % (self.league)
    response = urlfetch.fetch(url)
    # time.sleep(3)
    # url = "http://www.vegasinsider.com/%s/odds/offshore/" % (self.vi_league)
    # response = urlfetch.fetch(url)
    # offshore_tree = etree.fromstring(response.content, etree.HTMLParser())
    try:
        vegas_odds = self._scrape(response.content, 1)
        # offshore_odds = self._scrape(offshore_tree, 8)
    except IndexError as e:
        logging.exception(e)
        vegas_odds = {}
    return {
        'vegas': vegas_odds,
        # 'offshore': offshore_odds
    }
def download_filelist_sync(cat, verbose=False):
    "Generate URLs for free O'Reilly ebooks in PDF format."

    url = 'http://shop.oreilly.com/category/ebooks/%s.do' % cat
    if verbose:
        print(url)
    p = etree.HTMLParser()
    t1 = etree.parse(url, parser=p)
    table_pag1 = t1.xpath('//table[@class="pagination"]')[0]
    xp = '//td[@class="default"]/select[@name="dirPage"]/option/@value'
    page_urls = set(table_pag1.xpath(xp))
    for i, page_url in enumerate(page_urls):
        # if verbose:
        #     print(page_url)
        t2 = etree.parse('http://shop.oreilly.com' + page_url, parser=p)
        xp = '//span[@class="price"][contains(., "$0.00")]/'\
             '../../../../div[@class="thumbheader"]/a/@href'
        paths = t2.xpath(xp)
        for j, path in enumerate(paths):
            url = 'http://shop.oreilly.com' + path
            html = requests.get(url).text
            url_csps = re.findall('path_info\:\s+(.*?\.csp)', html)
            if len(url_csps) != 1:
                continue
            url_csp = url_csps[0]
            url_csp = re.sub('\?.*', '', url_csp)
            url_pdf = re.sub('\.csp', '.pdf', url_csp)
            url_pdf = re.sub('/free/', '/free/files/', url_pdf)
            u = 'http://www.oreilly.com/%s' % url_pdf
            if verbose:
                print(u)
            yield u
def get_as_etree(url):
    response = requests.get(url)
    parser = etree.HTMLParser()
    return etree.parse(StringIO(response.text), parser)
def get_global_olis_prices():
    url = 'http://www.olis.is/solustadir/thjonustustodvar/eldsneytisverd/'
    res = requests.get(url, headers=utils.headers())
    html = etree.fromstring(res.content, etree.HTMLParser())
    bensin95_text = html.find('.//*[@id="gas-price"]/span[1]').text
    diesel_text = html.find('.//*[@id="gas-price"]/span[2]').text
    bensin_discount_text = html.find('.//*[@id="gas-price"]/span[4]').text
    diesel_discount_text = html.find('.//*[@id="gas-price"]/span[5]').text
    return {
        'bensin95': float(bensin95_text.replace(',', '.')),
        'diesel': float(diesel_text.replace(',', '.')),
        'bensin95_discount': float(bensin_discount_text.replace(',', '.')),
        'diesel_discount': float(diesel_discount_text.replace(',', '.'))
    }