我们从 Python 开源项目中提取了以下 50 个代码示例,用于说明如何使用 html5lib.HTMLParser()。
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, strip=False, strip_comments=True):
    """Clean an HTML fragment and return it"""
    if not text:
        return u''
    text = force_unicode(text)
    # A fragment that begins with a comment trips up the parser; pad it
    # with a leading space so the comment is not the very first token.
    if text.startswith(u'<!--'):
        text = u' ' + text

    class FragmentSanitizer(BleachSanitizer):
        # Per-call sanitizer configuration, captured from the arguments.
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    fragment = html5lib.HTMLParser(tokenizer=FragmentSanitizer).parseFragment(text)
    return _render(fragment).strip()
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, styles=ALLOWED_STYLES, strip=False, strip_comments=True):
    """Clean an HTML fragment and return it"""
    if not text:
        return ''
    text = force_unicode(text)

    class FragmentSanitizer(BleachSanitizer):
        # Per-call sanitizer configuration, captured from the arguments.
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    fragment = html5lib.HTMLParser(tokenizer=FragmentSanitizer).parseFragment(text)
    return _render(fragment)
async def taskTx(sock, message, mtype):
    """Sanitize *message* as an HTML fragment and transmit it over *sock*.

    FIX: the body awaits, so the function must be declared ``async def`` —
    a plain ``def`` containing ``await`` is a SyntaxError.

    Control messages ``b"200"`` (goodbye + close) and ``b"202"``
    (admin authentication) are handled specially; everything else is
    sent either raw (legacy protocol) or wrapped in a JSON envelope.
    """
    # a poor implementation of an output coroutine.
    global revertProtocol
    # Parse the message, filter it through the html5lib sanitizer, and
    # re-serialize so only safe markup is ever transmitted.
    tree_builder = html5lib.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=tree_builder)
    walker = html5lib.getTreeWalker("dom")
    parsed_tx = parser.parseFragment(message)
    clean_tx = sanitizer.Filter(walker(parsed_tx))
    html_serializer = html5lib.serializer.HTMLSerializer()
    # serialize() yields chunks; join them instead of looping with +=.
    tx = ''.join(html_serializer.serialize(clean_tx))
    if message == b"200":
        await sock.send("Goodbye.")
        await sock.close()
        return
    if message == b"202":
        await sock.send("Authentication Successful, you are now the admin terminal.")
    else:
        if revertProtocol:
            # Legacy peers expect the bare sanitized payload.
            await sock.send(tx)
            return
        else:
            await sock.send(json.dumps({"MSG_TYPE":mtype, "MSG":tx}))
            return
def feed(self, markup):
    """Parse *markup* with html5lib into this builder's tree.

    html5lib has no equivalent of parse_only, so a warning is issued
    if one was supplied and the whole document is parsed anyway.
    """
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    document = parser.parse(markup, encoding=self.user_specified_encoding)
    # Record the encoding the tokenizer detected.
    if isinstance(markup, unicode):
        # html5lib claims UTF-8 for Unicode input; there was no real
        # byte-level detection, so report None instead.
        document.original_encoding = None
    else:
        document.original_encoding = parser.tokenizer.stream.charEncoding[0]
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no Beautiful
    Soup code is running.
    """
    announcer = AnnouncingParser()
    announcer.feed(data)
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    # Time BeautifulSoup with each supported backend parser.
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        start = time.time()
        try:
            BeautifulSoup(data, parser)
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, time.time() - start))
    # Baseline: raw lxml without the BeautifulSoup layer on top.
    from lxml import etree
    start = time.time()
    etree.HTML(data)
    print("Raw lxml parsed the markup in %.2fs." % (time.time() - start))
    # Baseline: raw html5lib without the BeautifulSoup layer on top.
    import html5lib
    start = time.time()
    html5lib.HTMLParser().parse(data)
    print("Raw html5lib parsed the markup in %.2fs." % (time.time() - start))
def feed(self, markup):
    """Parse *markup* with html5lib into this builder's tree.

    html5lib has no equivalent of parse_only, so a warning is issued
    if one was supplied and the whole document is parsed anyway.
    """
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    kwargs = dict()
    if not isinstance(markup, str):
        # Byte input: pass along the caller's requested encoding under
        # whichever keyword this html5lib version understands.
        key = 'override_encoding' if new_html5lib else 'encoding'
        kwargs[key] = self.user_specified_encoding
    doc = parser.parse(markup, **kwargs)
    # Record the encoding the tokenizer detected.
    if isinstance(markup, str):
        # html5lib claims UTF-8 for Unicode input; there was no real
        # byte-level detection, so report None instead.
        doc.original_encoding = None
    else:
        detected = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(detected, str):
            # In 0.99999999 and up the encoding is an html5lib Encoding
            # object; use its name for compatibility with other builders.
            detected = detected.name
        doc.original_encoding = detected
def delinkify(text, allow_domains=None, allow_relative=False):
    """Remove links from text, except those allowed to stay."""
    text = force_unicode(text)
    if not text:
        return u''
    forest = html5lib.HTMLParser(tokenizer=HTMLSanitizer).parseFragment(text)
    if allow_domains is None:
        allow_domains = []
    elif isinstance(allow_domains, basestring):
        allow_domains = [allow_domains]

    def _strip_links(tree):
        """Replace <a> nodes with their children, honoring the allow lists."""
        for node in tree.childNodes:
            if node.name == 'a':
                if 'href' not in node.attributes:
                    continue
                host = urlparse.urlparse(node.attributes['href']).hostname
                if any(_domain_match(host, d) for d in allow_domains):
                    continue
                if host is None and allow_relative:
                    continue
                # <a> tags can't nest (html5lib enforces that during tree
                # building), so hoisting the children directly in front of
                # the node before removing it is safe.
                for child in node.childNodes:
                    tree.insertBefore(child, node)
                tree.removeChild(node)
            elif node.type != NODE_TEXT:
                # Don't try to delinkify text.
                _strip_links(node)

    _strip_links(forest)
    return _render(forest)
def buildTestSuite():
    """Build the html5lib encoding test suite.

    Generates one test method per case in every ``encoding`` data file and
    attaches it to ``Html5EncodingTestCase``; adds a chardet-based test when
    the ``chardet`` package is importable.

    FIXES: corrected the "Recieved" typo in the failure message, replaced the
    deprecated ``assertEquals`` alias with ``assertEqual``, closed the test
    data file via a context manager, dropped the unused local ``t``, and used
    the parenthesized ``print`` form that is valid in Python 2 and 3.
    """
    for filename in html5lib_test_files("encoding"):
        test_name = os.path.basename(filename).replace('.dat', '').replace('-', '')
        tests = TestData(filename, "data")
        for idx, test in enumerate(tests):
            # Bind data/encoding as defaults so each generated test captures
            # its own case rather than the loop's final values.
            def encodingTest(self, data=test['data'], encoding=test['encoding']):
                p = HTMLParser()
                p.parse(data, useChardet=False)
                errorMessage = ("Input:\n%s\nExpected:\n%s\nReceived\n%s\n" %
                                (data, repr(encoding.lower()),
                                 repr(p.tokenizer.stream.charEncoding)))
                self.assertEqual(encoding.lower(),
                                 p.tokenizer.stream.charEncoding[0],
                                 errorMessage)
            setattr(Html5EncodingTestCase,
                    'test_%s_%d' % (test_name, idx + 1),
                    encodingTest)
    try:
        import chardet

        def test_chardet(self):
            # Use a context manager so the fixture file is always closed.
            with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt")) as f:
                data = f.read()
            encoding = inputstream.HTMLInputStream(data).charEncoding
            assert encoding[0].lower() == "big5"
        setattr(Html5EncodingTestCase, 'test_chardet', test_chardet)
    except ImportError:
        print("chardet not found, skipping chardet tests")
    return unittest.defaultTestLoader.loadTestsFromName(__name__)
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    # NOTE(review): Python 2-only syntax (print statements,
    # "except Exception, e") — this snippet will not run on Python 3.
    print "Comparative parser benchmark on Beautiful Soup %s" % __version__
    data = rdoc(num_elements)
    print "Generated a large invalid HTML document (%d bytes)." % len(data)
    # Time BeautifulSoup with each supported backend parser; a parser that
    # raises is reported and skipped rather than aborting the benchmark.
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
            traceback.print_exc()
        if success:
            print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
    # Baseline: raw lxml without the BeautifulSoup layer on top.
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print "Raw lxml parsed the markup in %.2fs." % (b-a)
    # Baseline: raw html5lib without the BeautifulSoup layer on top.
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print "Raw html5lib parsed the markup in %.2fs." % (b-a)
def test_xml_render():
    """Rendering an empty parsed fragment must yield the empty string."""
    fragment = html5lib.HTMLParser().parseFragment('')
    eq_(bleach._render(fragment), '')
def feed(self, markup):
    """Parse *markup* with html5lib into this builder's tree.

    html5lib has no equivalent of parse_only, so a warning is issued
    if one was supplied and the whole document is parsed anyway.
    """
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    kwargs = dict()
    if not isinstance(markup, unicode):
        # Byte input: pass along the caller's requested encoding under
        # whichever keyword this html5lib version understands.
        key = 'override_encoding' if new_html5lib else 'encoding'
        kwargs[key] = self.user_specified_encoding
    doc = parser.parse(markup, **kwargs)
    # Record the encoding the tokenizer detected.
    if isinstance(markup, unicode):
        # html5lib claims UTF-8 for Unicode input; there was no real
        # byte-level detection, so report None instead.
        doc.original_encoding = None
    else:
        detected = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(detected, basestring):
            # In 0.99999999 and up the encoding is an html5lib Encoding
            # object; use its name for compatibility with other builders.
            detected = detected.name
        doc.original_encoding = detected
def feed(self, markup):
    """Parse *markup* with html5lib into this builder's tree.

    html5lib has no equivalent of parse_only, so a warning is issued
    if one was supplied and the whole document is parsed anyway.
    """
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    document = parser.parse(markup, encoding=self.user_specified_encoding)
    # Record the encoding the tokenizer detected.
    if isinstance(markup, str):
        # html5lib claims UTF-8 for Unicode input; there was no real
        # byte-level detection, so report None instead.
        document.original_encoding = None
    else:
        document.original_encoding = parser.tokenizer.stream.charEncoding[0]