The following 33 code examples, extracted from open-source Python projects, illustrate how to use html5lib.parse().
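Before the project examples, here is a minimal, self-contained sketch of the basic call. The sample HTML string is made up purely for illustration; by default html5lib.parse() returns an xml.etree.ElementTree element, and passing namespaceHTMLElements=False keeps the tag names plain so find()/iter() stay simple.

import html5lib

# Parse an HTML fragment into an ElementTree element.
# namespaceHTMLElements=False gives plain tag names such as "a" instead of
# "{http://www.w3.org/1999/xhtml}a".
doc = html5lib.parse("<p>Hello <a href='https://example.com'>link</a></p>",
                     namespaceHTMLElements=False)

for a in doc.iter("a"):
    print(a.get("href"))  # -> https://example.com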
def get_links(url, html):
    '''Return the links found in an HTML document.'''
    try:
        links = []
        document = html5lib.parse(html, namespaceHTMLElements=False)
        for link_elem in document.iter('a'):
            link = link_elem.get('href')
            # the element has no URL
            if not link:
                continue
            # skip internal references
            if '#' in link:
                continue
            # local link: join it with the base URL
            if '://' not in link:
                link = urllib.parse.urljoin(url, link)
            # still malformed, skip it
            if '://' not in link:
                continue
            # normalize the URL
            link = urllib.parse.urljoin(link, '.')
            links += [link]
        return links
    except:
        return []
def web_scraper(self, roots, match=None, session=None):
    """Yield URLs in a directory which start with `match`.
    If `match` is None, all links are yielded."""
    for root in roots:
        if session is not None:
            response = session.get(root)
        else:
            response = requests.get(root)
        try:
            response.raise_for_status()
        except requests.HTTPError as exc:
            log.error("web_scraper error: %s raised %s", root, str(exc))
            continue
        # source = ElementTree.fromstring(response.content)
        source = html5lib.parse(response.content, namespaceHTMLElements=False)
        a_tags = source.findall(".//a")
        for a in a_tags:
            url = a.attrib["href"]
            if match is None or url[slice(0, len(match))] == match:
                yield urljoin(root, url)
def web_scraper(roots, match=None, session=None):
    """Yield URLs in a directory which start with `match`.
    If `match` is None, all links are yielded."""
    for root in roots:
        if session is not None:
            response = session.get(root)
        else:
            response = requests.get(root)
        try:
            response.raise_for_status()
        except requests.HTTPError as exc:
            log.error("web_scraper error: %s raised %s", root, str(exc))
            continue
        # source = ElementTree.fromstring(response.content)
        source = html5lib.parse(response.content, namespaceHTMLElements=False)
        a_tags = source.findall(".//a")
        for a in a_tags:
            url = a.attrib["href"]
            if match is None or url[slice(0, len(match))] == match:
                yield urljoin(root, url)
def parse_new_details(self, root, mi, non_hero):
    table = non_hero.xpath('descendant::table')[0]
    for tr in table.xpath('descendant::tr'):
        cells = tr.xpath('descendant::td')
        if len(cells) == 2:
            name = self.totext(cells[0])
            val = self.totext(cells[1])
            if not val:
                continue
            if name in self.language_names:
                ans = self.lang_map.get(val, None)
                if not ans:
                    ans = canonicalize_lang(val)
                if ans:
                    mi.language = ans
            elif name in self.publisher_names:
                pub = val.partition(';')[0].partition('(')[0].strip()
                if pub:
                    mi.publisher = pub
                date = val.rpartition('(')[-1].replace(')', '').strip()
                try:
                    from calibre.utils.date import parse_only_date
                    date = self.delocalize_datestr(date)
                    mi.pubdate = parse_only_date(date, assume_utc=True)
                except:
                    self.log.exception('Failed to parse pubdate: %s' % val)
            elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:
                ans = check_isbn(val)
                if ans:
                    self.isbn = mi.isbn = ans
def parse_rss(url=None, **kwargs):
    try:
        f = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        parsed = rssparser.parse(url)
    else:
        content = f.read() if speedparser else f

        try:
            parsed = rssparser.parse(content)
        finally:
            f.close()

    return parsed
def xml2etree(f, xml=True, html5=False):
    if xml:
        element_tree = etree.parse(f)
    elif html5 and html5parser:
        element_tree = html5parser.parse(f)
    elif html5parser:
        element_tree = html.parse(f)
    else:
        # html5lib's parser returns an Element, so we must convert it into an
        # ElementTree
        element_tree = ElementTree(html.parse(f))

    return element_tree
def html(html, baseurl, encoding=None):
    # html5lib rebuilds possibly mal-formed html
    try:
        if type(html) is str:
            return lxml.html.fromstring(
                lxml.etree.tostring(html5lib.parse(html, treebuilder="lxml")),
                baseurl)
        else:
            return lxml.html.fromstring(
                lxml.etree.tostring(
                    html5lib.parse(html, likely_encoding=encoding, treebuilder="lxml")),
                baseurl)
    except ValueError:
        return lxml.html.fromstring(
            lxml.etree.tostring(html5lib.parse(to_str(html, encoding), treebuilder="lxml")),
            baseurl)
def etree(html, encoding=None):
    try:
        return html5lib.parse(html, encoding=encoding, treebuilder="lxml")
    except ValueError:
        return html5lib.parse(to_str(html, encoding), treebuilder="lxml")
def select_nodes(resp, selector):
    """Given a response from the app, return just the HTML fragment defined by `selector`"""
    h = html5lib.parse(resp.content.decode('utf-8'), treebuilder='lxml',
                       namespaceHTMLElements=False)
    return h.getroot().cssselect(selector)
def get_player_rosters(league, season, results_array=None, multiple_teams=False):
    league = str(league)
    season = str(season)

    if results_array is None:
        results_array = []

    player_ids = []
    team_urls = []

    """ Get the league link """
    team_search_url = "http://www.eliteprospects.com/standings.php?league={0}&startdate={1}".format(
        league, str(int(season) - 1))
    team_search_request = requests.get(team_search_url)

    # All tag names have this prepended to them
    html_prefix = '{http://www.w3.org/1999/xhtml}'

    team_search_page = html5lib.parse(team_search_request.text)

    # /html/body/div/table[3]/tbody/tr/td[5]/table[3]
    team_table = team_search_page.find(
        './{0}body/{0}div[2]/{0}div/{0}table[3]/{0}tbody/{0}tr/{0}td[5]/{0}table[3]'.format(html_prefix))
    teams = team_table.findall('.//{0}tbody/{0}tr/{0}td[2]/{0}a'.format(html_prefix))

    on_first_row = True
    for team in teams:
        if on_first_row:
            on_first_row = False
            continue
        team_urls.append(team.attrib['href'])

    """ Get the players """
    for team_url in team_urls:
        teamroster.get_team_roster(team_url, season, player_ids, results_array,
                                   multiple_teams, league)

    return results_array
def bench_html5lib(html_file):
    html_file.seek(0)
    html5lib.parse(html_file)
def _egg_info_matches(self, egg_info, search_name, link):
    match = self._egg_info_re.search(egg_info)
    if not match:
        logger.debug('Could not parse version from link: %s' % link)
        return None
    name = match.group(0).lower()
    # To match the "safe" name that pkg_resources creates:
    name = name.replace('_', '-')
    # project name and version must be separated by a dash
    look_for = search_name.lower() + "-"
    if name.startswith(look_for):
        return match.group(0)[len(look_for):]
    else:
        return None
def __init__(self, content, url, headers=None, trusted=None):
    self.content = content
    self.parsed = html5lib.parse(self.content, namespaceHTMLElements=False)
    self.url = url
    self.headers = headers
    self.trusted = trusted
def resolve(self, response):
    parsed = html5lib.parse(response.content)
    return BoundTag(self._driver, response, _strip_namespace(parsed))
def compile_file(filename, output):
    with open(filename, 'rb') as f:
        tree = html5lib.parse(f, treebuilder='dom')
    compile_tree(tree, output)
def content_parser(data, url=None, response_info=None, transport_encoding=None,
                   default_encoding=DEFAULT_ENCODING, is_html=True):
    '''
    Parse data (a bytes object) into an etree representation such as
    :py:mod:`xml.etree.ElementTree` or `lxml.etree`

    :param bytes data: The data to parse
    :param url: The URL of the document being parsed or None
    :param response_info: Information about the document
        (contains all HTTP headers as :class:`mimetools.Message`)
    :param transport_encoding: The character encoding for the document being
        parsed as specified in the HTTP headers or None.
    :param default_encoding: The character encoding to use if no encoding
        could be detected and no transport_encoding is specified
    :param is_html: If the document is to be parsed as HTML.
    '''
    if not is_html:
        return
    try:
        from html5_parser import parse
    except ImportError:
        from html5lib import parse
        return parse(data, transport_encoding=transport_encoding,
                     namespaceHTMLElements=False)
    else:
        return parse(data, transport_encoding=transport_encoding)
def process_document(self, document, path):
    if isinstance(document, basestring):
        document = StringIO(document)
    doc = html5lib.parse(document, treebuilder='lxml', namespaceHTMLElements=False)
    return self.process_tree(doc, path)
def get_file_url(modpack_url):
    try:
        with urllib.request.urlopen(modpack_url) as f:
            dom = html5lib.parse(f, "etree")
    except html5lib.html5parser.ParseError:
        print("WARNING: Error parsing modpack page for download url")
    except urllib.request.URLError:
        print("WARNING: Error downloading modpack page for download url")
    else:
        urls = [a.get("href")
                for a in dom.find(dom.tag[:-4] + "body").iter(dom.tag[:-4] + "a")
                if "Server Download" in " ".join(" ".join(a.itertext()).strip().split())]
        if len(urls) >= 1:
            if len(urls) > 1:
                print("WARNING: Multiple download urls found choosing first found.")
            return urls[0]
    return None
def parse_details_page(url, log, timeout, browser):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('URL malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % url
            log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r' % url)
        return

    try:
        root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r' % url
        log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
def fetch_raw(self, log, url, br, testing,  # {{{
              identifiers={}, timeout=30):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    from lxml.html import tostring
    import html5lib
    try:
        raw = br.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('Query malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = _('DangDang timed out. Try again later.')
            log.error(msg)
        else:
            msg = 'Failed to make identify query: %r' % url
            log.exception(msg)
        return as_unicode(msg)

    raw = clean_ascii_chars(xml_to_unicode(raw,
                            strip_encoding_pats=True, resolve_entities=True)[0])

    if testing:
        import tempfile
        with tempfile.NamedTemporaryFile(prefix='dangdang_results_',
                                         suffix='.html', delete=False) as f:
            f.write(raw.encode('utf-8'))
        print('Downloaded html for results page saved in', f.name)

    matches = []
    found = '<title>?????????????????' not in raw

    if found:
        try:
            root = html5lib.parse(raw, treebuilder='lxml',
                                  namespaceHTMLElements=False)
        except:
            msg = 'Failed to parse DangDang page for query: %r' % url
            log.exception(msg)
            return msg

    return found, root
def fetch_data(user, pw, download_dir=None):
    # Fetch the index page to get a CSRF token.
    r = requests.get('https://www.runtastic.com/')
    if r.status_code != 200:
        raise 'Sucks'
    cookies = dict(r.cookies)
    doc = html5lib.parse(r.text, treebuilder='dom')
    csrf = get_csrf_token(doc)

    # Now log in.
    # user, pw = read_user_pass()
    login = dict(csrf)
    login['user[email]'] = user
    login['user[password]'] = pw
    login['grant_type'] = 'password'
    r2 = requests.post('https://www.runtastic.com/de/d/benutzer/sign_in',
                       data=login)  # , cookies=cookies)
    if r2.status_code != 200:
        raise 'Sucks 2'
    cookies.update(r2.cookies)
    print r2.content
    j = r2.json()
    if not j['success']:
        raise 'Login failed'
    doc = html5lib.parse(j['update'], treebuilder='dom')

    # Find the sport-sessions page and fetch it to get a User ID
    # and a list of session IDs.
    links = [l.getAttribute('href') for l in doc.getElementsByTagName('a')
             if l.getAttribute('href').endswith('/sport-sessions')]
    sessions_url = urlparse.urljoin(r2.url, links[0])
    r3 = requests.get(sessions_url, cookies=cookies)
    if r3.status_code != 200:
        raise 'Sucks 3'
    cookies.update(r3.cookies)
    doc = html5lib.parse(r3.text, treebuilder='dom')
    uid = get_user_id(doc)
    data = get_data(doc)

    # Now hit the API to get data about each session.
    request_data = dict(csrf)
    request_data['user_id'] = uid
    request_data['items'] = ','.join(str(d[0]) for d in data)
    r4 = requests.post('https://www.runtastic.com/api/run_sessions/json',
                       cookies=cookies, data=request_data)
    if r4.status_code != 200:
        raise 'Sucks 4'
    cookies.update(r4.cookies)
    sessions = r4.json()
    print sessions
    # known_sessions = read_known_sessions()
    # for s in sessions:
    #     if s['id'] in known_sessions:
    #         continue
    #     if check_download_session(urlparse.urljoin(r4.url, s['page_url']) + '.tcx', download_dir, cookies):
    #         known_sessions.add(s['id'])
    # write_known_sessions(known_sessions)
def update_search_index_for(report, index):
    # Find the most recent HTML text that we'll perform text matching on.
    text_fn = None
    text = None
    for version in reversed(report["versions"]):
        for versionformat in version["formats"]:
            if versionformat["format"] == "HTML":
                text_fn = os.path.join("reports", versionformat['filename'])
    if text_fn:
        try:
            with open(text_fn) as f:
                # Parse the page as HTML5. html5lib gives some warnings about malformed
                # content that we don't care about -- hide warnings.
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    dom = html5lib.parse(f.read(), treebuilder="lxml")
                # Convert to plain text.
                text = lxml.etree.tostring(dom, method='text', encoding=str)
        except FileNotFoundError:
            print("Missing HTML", report["number"], version["date"])

    # There's a quota on the size of the index_data, 10KB minified JSON
    # according to the docs, although we seem to be able to push more
    # than that. Limit the amount of text we send up.
    if text:
        text = text[:20000]

    # Construct index data.
    index_data = {
        "objectID": ("crs:%s" % report["number"]),
        "type": report["type"],
        "reportNumber": report["number"],
        "title": report["versions"][0]["title"],
        "lastPubDate": report["versions"][0]["date"],
        "firstPubDate": report["versions"][-1]["date"],
        "lastPubYear": int(report["versions"][0]["date"][0:4]),
        "firstPubYear": int(report["versions"][-1]["date"][0:4]),
        "date": parse_dt(report["versions"][0]["date"]).strftime("%b. %-d, %Y"),
        "summary": report["versions"][0]["summary"][0:10000],
        "topics": report["topics"],
        "isUpdated": len(report["versions"]) > 1,
        "text": text,
        "url": "https://www.everycrsreport.com/reports/%s.html" % report["number"],
    }
    # print(json.dumps(index_data, indent=2))
    # print()
    index.add_object(index_data, index_data["objectID"])
def process_tree(self, tree, path):
    docs = []
    doc = {}
    root = tree.getroot()
    head = root.find('head')
    if head is None:
        raise ProcessingError('Document does not parse correctly.')
    title = head.find('title')
    doc['path'] = path
    doc['title'] = self.process_title_tag(title)

    priority = str(path).split("/")[0]
    if priority and priority in self.content_scoring:
        doc['priority'] = int(self.content_scoring[priority])
    else:
        doc['priority'] = 0

    buf = []
    for sel in self.content_selectors:
        for el in sel(root):
            buf.append(self.process_content_tag(el))
    doc['text'] = u''.join(buf).rstrip()
    docs.append(doc)

    for sel in self.content_sections:
        for el in sel(root):
            if el.attrib['id'] and el.attrib['id'] not in path:
                p = str(path).split("/")[0]
                if p and p in self.content_scoring:
                    priority = int(self.content_scoring[p])
                else:
                    priority = 0
                title = [w.capitalize() for w in el.attrib['id'].split("-")]
                docs.append({
                    'path': path + "#" + el.attrib['id'],
                    'title': u' '.join(title),
                    'text': self.process_content_tag(el),
                    'priority': priority + 1
                })
    return docs