The following 33 code examples, extracted from open-source Python projects, illustrate how to use html5lib.parse().
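Before the project examples, here is a minimal, self-contained sketch of the basic call. The sample HTML string is made up purely for illustration; by default html5lib.parse() returns an xml.etree.ElementTree element, and passing namespaceHTMLElements=False keeps the tag names plain so find()/iter() stay simple.

import html5lib

# Parse an HTML fragment into an ElementTree element.
# namespaceHTMLElements=False gives plain tag names such as "a" instead of
# "{http://www.w3.org/1999/xhtml}a".
doc = html5lib.parse("<p>Hello <a href='https://example.com'>link</a></p>",
                     namespaceHTMLElements=False)

for a in doc.iter("a"):
    print(a.get("href"))  # -> https://example.com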
def get_links(url, html):
    '''Return the links found in an HTML document.'''
    try:
        links = []
        document = html5lib.parse(html, namespaceHTMLElements=False)
        for link_elem in document.iter('a'):
            link = link_elem.get('href')
            # the element has no URL
            if not link:
                continue
            # skip internal references
            if '#' in link:
                continue
            # local link: join it with the base URL
            if '://' not in link:
                link = urllib.parse.urljoin(url, link)
            # still malformed, skip it
            if '://' not in link:
                continue
            # normalize the URL
            link = urllib.parse.urljoin(link, '.')
            links += [link]
        return links
    except:
        return []
def web_scraper(self, roots, match=None, session=None):
    """Yield URLs in a directory which start with `match`.
    If `match` is None, all links are yielded."""
    for root in roots:
        if session is not None:
            response = session.get(root)
        else:
            response = requests.get(root)
        try:
            response.raise_for_status()
        except requests.HTTPError as exc:
            log.error("web_scraper error: %s raised %s", root, str(exc))
            continue
        # source = ElementTree.fromstring(response.content)
        source = html5lib.parse(response.content, namespaceHTMLElements=False)
        a_tags = source.findall(".//a")
        for a in a_tags:
            url = a.attrib["href"]
            if match is None or url[slice(0, len(match))] == match:
                yield urljoin(root, url)
def web_scraper(roots, match=None, session=None):
    """Yield URLs in a directory which start with `match`.
    If `match` is None, all links are yielded."""
    for root in roots:
        if session is not None:
            response = session.get(root)
        else:
            response = requests.get(root)
        try:
            response.raise_for_status()
        except requests.HTTPError as exc:
            log.error("web_scraper error: %s raised %s", root, str(exc))
            continue
        # source = ElementTree.fromstring(response.content)
        source = html5lib.parse(response.content, namespaceHTMLElements=False)
        a_tags = source.findall(".//a")
        for a in a_tags:
            url = a.attrib["href"]
            if match is None or url[slice(0, len(match))] == match:
                yield urljoin(root, url)
def parse_new_details(self, root, mi, non_hero):
    table = non_hero.xpath('descendant::table')[0]
    for tr in table.xpath('descendant::tr'):
        cells = tr.xpath('descendant::td')
        if len(cells) == 2:
            name = self.totext(cells[0])
            val = self.totext(cells[1])
            if not val:
                continue
            if name in self.language_names:
                ans = self.lang_map.get(val, None)
                if not ans:
                    ans = canonicalize_lang(val)
                if ans:
                    mi.language = ans
            elif name in self.publisher_names:
                pub = val.partition(';')[0].partition('(')[0].strip()
                if pub:
                    mi.publisher = pub
                date = val.rpartition('(')[-1].replace(')', '').strip()
                try:
                    from calibre.utils.date import parse_only_date
                    date = self.delocalize_datestr(date)
                    mi.pubdate = parse_only_date(date, assume_utc=True)
                except:
                    self.log.exception('Failed to parse pubdate: %s' % val)
            elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:
                ans = check_isbn(val)
                if ans:
                    self.isbn = mi.isbn = ans
def parse_rss(url=None, **kwargs):
    try:
        f = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        parsed = rssparser.parse(url)
    else:
        content = f.read() if speedparser else f

        try:
            parsed = rssparser.parse(content)
        finally:
            f.close()

    return parsed
def xml2etree(f, xml=True, html5=False):
    if xml:
        element_tree = etree.parse(f)
    elif html5 and html5parser:
        element_tree = html5parser.parse(f)
    elif html5parser:
        element_tree = html.parse(f)
    else:
        # html5lib's parser returns an Element, so we must convert it into an
        # ElementTree
        element_tree = ElementTree(html.parse(f))

    return element_tree
def html(html, baseurl, encoding=None):
    # html5lib rebuilds possibly mal-formed html
    try:
        if type(html) is str:
            return lxml.html.fromstring(
                lxml.etree.tostring(html5lib.parse(html, treebuilder="lxml")),
                baseurl)
        else:
            return lxml.html.fromstring(
                lxml.etree.tostring(
                    html5lib.parse(html, likely_encoding=encoding, treebuilder="lxml")),
                baseurl)
    except ValueError:
        return lxml.html.fromstring(
            lxml.etree.tostring(html5lib.parse(to_str(html, encoding), treebuilder="lxml")),
            baseurl)
def etree(html, encoding=None):
    try:
        return html5lib.parse(html, encoding=encoding, treebuilder="lxml")
    except ValueError:
        return html5lib.parse(to_str(html, encoding), treebuilder="lxml")
def select_nodes(resp, selector):
    """Given a response from the app, return just the HTML fragment defined by `selector`"""
    h = html5lib.parse(resp.content.decode('utf-8'), treebuilder='lxml',
                       namespaceHTMLElements=False)
    return h.getroot().cssselect(selector)
def get_player_rosters(league, season, results_array=None, multiple_teams=False):
    league = str(league)
    season = str(season)

    if results_array is None:
        results_array = []

    player_ids = []
    team_urls = []

    """ Get the league link """
    team_search_url = "http://www.eliteprospects.com/standings.php?league={0}&startdate={1}".format(
        league, str(int(season) - 1))
    team_search_request = requests.get(team_search_url)

    # All tag names have this prepended to them
    html_prefix = '{http://www.w3.org/1999/xhtml}'

    team_search_page = html5lib.parse(team_search_request.text)

    # /html/body/div/table[3]/tbody/tr/td[5]/table[3]
    team_table = team_search_page.find(
        './{0}body/{0}div[2]/{0}div/{0}table[3]/{0}tbody/{0}tr/{0}td[5]/{0}table[3]'.format(html_prefix))
    teams = team_table.findall('.//{0}tbody/{0}tr/{0}td[2]/{0}a'.format(html_prefix))

    on_first_row = True
    for team in teams:
        if on_first_row:
            on_first_row = False
            continue
        team_urls.append(team.attrib['href'])

    """ Get the players """
    for team_url in team_urls:
        teamroster.get_team_roster(team_url, season, player_ids, results_array,
                                   multiple_teams, league)

    return results_array
def bench_html5lib(html_file):
    html_file.seek(0)
    html5lib.parse(html_file)
def _egg_info_matches(self, egg_info, search_name, link):
    match = self._egg_info_re.search(egg_info)
    if not match:
        logger.debug('Could not parse version from link: %s' % link)
        return None
    name = match.group(0).lower()
    # To match the "safe" name that pkg_resources creates:
    name = name.replace('_', '-')
    # project name and version must be separated by a dash
    look_for = search_name.lower() + "-"
    if name.startswith(look_for):
        return match.group(0)[len(look_for):]
    else:
        return None
def __init__(self, content, url, headers=None, trusted=None):
    self.content = content
    self.parsed = html5lib.parse(self.content, namespaceHTMLElements=False)
    self.url = url
    self.headers = headers
    self.trusted = trusted
def resolve(self, response):
    parsed = html5lib.parse(response.content)
    return BoundTag(self._driver, response, _strip_namespace(parsed))
def compile_file(filename, output):
    with open(filename, 'rb') as f:
        tree = html5lib.parse(f, treebuilder='dom')
    compile_tree(tree, output)
def content_parser(data, url=None, response_info=None, transport_encoding=None,
                   default_encoding=DEFAULT_ENCODING, is_html=True):
    '''
    Parse data (a bytes object) into an etree representation such as
    :py:mod:`xml.etree.ElementTree` or `lxml.etree`

    :param bytes data: The data to parse
    :param url: The URL of the document being parsed or None
    :param response_info: Information about the document
        (contains all HTTP headers as :class:`mimetools.Message`)
    :param transport_encoding: The character encoding for the document being
        parsed as specified in the HTTP headers or None.
    :param default_encoding: The character encoding to use if no encoding
        could be detected and no transport_encoding is specified
    :param is_html: If the document is to be parsed as HTML.
    '''
    if not is_html:
        return
    try:
        from html5_parser import parse
    except ImportError:
        from html5lib import parse
        return parse(data, transport_encoding=transport_encoding,
                     namespaceHTMLElements=False)
    else:
        return parse(data, transport_encoding=transport_encoding)
def process_document(self, document, path):
    if isinstance(document, basestring):
        document = StringIO(document)
    doc = html5lib.parse(document, treebuilder='lxml', namespaceHTMLElements=False)
    return self.process_tree(doc, path)
def get_file_url(modpack_url):
    try:
        with urllib.request.urlopen(modpack_url) as f:
            dom = html5lib.parse(f, "etree")
    except html5lib.html5parser.ParseError:
        print("WARNING: Error parsing modpack page for download url")
    except urllib.request.URLError:
        print("WARNING: Error downloading modpack page for download url")
    else:
        urls = [a.get("href")
                for a in dom.find(dom.tag[:-4] + "body").iter(dom.tag[:-4] + "a")
                if "Server Download" in " ".join(" ".join(a.itertext()).strip().split())]
        if len(urls) >= 1:
            if len(urls) > 1:
                print("WARNING: Multiple download urls found choosing first found.")
            return urls[0]
    return None
def parse_details_page(url, log, timeout, browser):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('URL malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % url
            log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r' % url)
        return

    try:
        root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r' % url
        log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
def fetch_raw(self, log, url, br, testing,  # {{{
              identifiers={}, timeout=30):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    from lxml.html import tostring
    import html5lib
    try:
        raw = br.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('Query malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = _('DangDang timed out. Try again later.')
            log.error(msg)
        else:
            msg = 'Failed to make identify query: %r' % url
            log.exception(msg)
        return as_unicode(msg)

    raw = clean_ascii_chars(xml_to_unicode(raw,
                            strip_encoding_pats=True, resolve_entities=True)[0])

    if testing:
        import tempfile
        with tempfile.NamedTemporaryFile(prefix='dangdang_results_',
                                         suffix='.html', delete=False) as f:
            f.write(raw.encode('utf-8'))
        print('Downloaded html for results page saved in', f.name)

    matches = []
    found = '<title>?????????????????' not in raw

    if found:
        try:
            root = html5lib.parse(raw, treebuilder='lxml',
                                  namespaceHTMLElements=False)
        except:
            msg = 'Failed to parse DangDang page for query: %r' % url
            log.exception(msg)
            return msg

    return found, root
def fetch_data(user, pw, download_dir=None):
    # Fetch the index page to get a CSRF token.
    r = requests.get('https://www.runtastic.com/')
    if r.status_code != 200:
        raise 'Sucks'
    cookies = dict(r.cookies)
    doc = html5lib.parse(r.text, treebuilder='dom')
    csrf = get_csrf_token(doc)

    # Now log in.
    # user, pw = read_user_pass()
    login = dict(csrf)
    login['user[email]'] = user
    login['user[password]'] = pw
    login['grant_type'] = 'password'
    r2 = requests.post('https://www.runtastic.com/de/d/benutzer/sign_in',
                       data=login)  # , cookies=cookies)
    if r2.status_code != 200:
        raise 'Sucks 2'
    cookies.update(r2.cookies)
    print r2.content
    j = r2.json()
    if not j['success']:
        raise 'Login failed'
    doc = html5lib.parse(j['update'], treebuilder='dom')

    # Find the sport-sessions page and fetch it to get a User ID
    # and a list of session IDs.
    links = [l.getAttribute('href') for l in doc.getElementsByTagName('a')
             if l.getAttribute('href').endswith('/sport-sessions')]
    sessions_url = urlparse.urljoin(r2.url, links[0])
    r3 = requests.get(sessions_url, cookies=cookies)
    if r3.status_code != 200:
        raise 'Sucks 3'
    cookies.update(r3.cookies)
    doc = html5lib.parse(r3.text, treebuilder='dom')
    uid = get_user_id(doc)
    data = get_data(doc)

    # Now hit the API to get data about each session.
    request_data = dict(csrf)
    request_data['user_id'] = uid
    request_data['items'] = ','.join(str(d[0]) for d in data)
    r4 = requests.post('https://www.runtastic.com/api/run_sessions/json',
                       cookies=cookies, data=request_data)
    if r4.status_code != 200:
        raise 'Sucks 4'
    cookies.update(r4.cookies)
    sessions = r4.json()
    print sessions
    # known_sessions = read_known_sessions()
    # for s in sessions:
    #     if s['id'] in known_sessions:
    #         continue
    #     if check_download_session(urlparse.urljoin(r4.url, s['page_url']) + '.tcx', download_dir, cookies):
    #         known_sessions.add(s['id'])
    # write_known_sessions(known_sessions)
def update_search_index_for(report, index):
    # Find the most recent HTML text that we'll perform text matching on.
    text_fn = None
    text = None
    for version in reversed(report["versions"]):
        for versionformat in version["formats"]:
            if versionformat["format"] == "HTML":
                text_fn = os.path.join("reports", versionformat['filename'])
    if text_fn:
        try:
            with open(text_fn) as f:
                # Parse the page as HTML5. html5lib gives some warnings about malformed
                # content that we don't care about -- hide warnings.
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    dom = html5lib.parse(f.read(), treebuilder="lxml")
                # Convert to plain text.
                text = lxml.etree.tostring(dom, method='text', encoding=str)
        except FileNotFoundError:
            print("Missing HTML", report["number"], version["date"])

    # There's a quota on the size of the index_data, 10KB minified JSON
    # according to the docs, although we seem to be able to push more
    # than that. Limit the amount of text we send up.
    if text:
        text = text[:20000]

    # Construct index data.
    index_data = {
        "objectID": ("crs:%s" % report["number"]),
        "type": report["type"],
        "reportNumber": report["number"],
        "title": report["versions"][0]["title"],
        "lastPubDate": report["versions"][0]["date"],
        "firstPubDate": report["versions"][-1]["date"],
        "lastPubYear": int(report["versions"][0]["date"][0:4]),
        "firstPubYear": int(report["versions"][-1]["date"][0:4]),
        "date": parse_dt(report["versions"][0]["date"]).strftime("%b. %-d, %Y"),
        "summary": report["versions"][0]["summary"][0:10000],
        "topics": report["topics"],
        "isUpdated": len(report["versions"]) > 1,
        "text": text,
        "url": "https://www.everycrsreport.com/reports/%s.html" % report["number"],
    }
    # print(json.dumps(index_data, indent=2))
    # print()
    index.add_object(index_data, index_data["objectID"])
def process_tree(self, tree, path):
    docs = []
    doc = {}
    root = tree.getroot()
    head = root.find('head')
    if head is None:
        raise ProcessingError('Document does not parse correctly.')
    title = head.find('title')
    doc['path'] = path
    doc['title'] = self.process_title_tag(title)

    priority = str(path).split("/")[0]
    if priority and priority in self.content_scoring:
        doc['priority'] = int(self.content_scoring[priority])
    else:
        doc['priority'] = 0

    buf = []
    for sel in self.content_selectors:
        for el in sel(root):
            buf.append(self.process_content_tag(el))
    doc['text'] = u''.join(buf).rstrip()
    docs.append(doc)

    for sel in self.content_sections:
        for el in sel(root):
            if el.attrib['id'] and el.attrib['id'] not in path:
                p = str(path).split("/")[0]
                if p and p in self.content_scoring:
                    priority = int(self.content_scoring[p])
                else:
                    priority = 0
                title = [w.capitalize() for w in el.attrib['id'].split("-")]
                docs.append({
                    'path': path + "#" + el.attrib['id'],
                    'title': u' '.join(title),
                    'text': self.process_content_tag(el),
                    'priority': priority + 1
                })
    return docs