我们从 Python 开源项目中提取了以下 10 个代码示例，用于说明如何使用 BeautifulSoup.Comment()。
def getNodeDetails(self, node):
    """Classify a BeautifulSoup node and return its details as a tuple.

    The first item of the returned tuple is always a ``_base`` node-type
    constant; the remaining items depend on the kind of node:

    * ``(_base.DOCUMENT,)`` for a ``BeautifulSoup`` root object
    * ``(_base.DOCTYPE, name, publicId, systemId)`` for a ``Declaration``
    * ``(_base.COMMENT, text)`` for a ``Comment``
    * ``(_base.TEXT, node)`` for any other ``unicode`` instance
    * ``(_base.ELEMENT, namespace, name, attribute_items, contents)`` for
      a ``Tag``
    * ``(_base.UNKNOWN, class_name)`` for everything else

    The checks run in a fixed order: ``Declaration`` and ``Comment`` are
    tested before the generic ``unicode`` check, and ``BeautifulSoup``
    before ``Tag``.

    Raises:
        AssertionError: if a doctype string does not match
            ``self.doctype_regexp``.
    """
    # Document or DocumentFragment
    if isinstance(node, BeautifulSoup):
        return (_base.DOCUMENT,)

    # DocumentType
    if isinstance(node, Declaration):
        text = unicode(node.string)
        # Slice needed to remove markup added during unicode conversion,
        # but only in some versions of BeautifulSoup/Python.
        if text.startswith('<!') and text.endswith('>'):
            text = text[2:-1]
        # This regexp approach seems wrong and fragile, but BeautifulSoup
        # stores the doctype as a single thing and we want the separate
        # bits. It should work as long as the tree is created by html5lib
        # itself, but may be wrong if it has been modified at all.
        # We could just feed it to an html5lib tokenizer, I guess...
        match = self.doctype_regexp.match(text)
        assert match is not None, "DOCTYPE did not match expected format"

        name = match.group('name')
        publicId = match.group('publicId')
        # The system identifier lives in a different group depending on
        # whether a public identifier was present.
        if publicId is None:
            systemId = match.group('systemId2')
        else:
            systemId = match.group('systemId1')
        return _base.DOCTYPE, name, publicId or "", systemId or ""

    if isinstance(node, Comment):
        text = unicode(node.string)
        # Strip the comment delimiters when the conversion included them.
        if text.startswith('<!--') and text.endswith('-->'):
            text = text[4:-3]
        return _base.COMMENT, text

    # TextNode
    if isinstance(node, unicode):
        return _base.TEXT, node

    # Element
    if isinstance(node, Tag):
        attribute_items = dict(node.attrs).items()
        return (_base.ELEMENT, namespaces["html"], node.name,
                attribute_items, node.contents)

    return _base.UNKNOWN, node.__class__.__name__
def commentClass(self, data):
    """Return a ``TextNode`` wrapping a ``Comment`` built from *data*.

    The new node is associated with ``self.soup``.
    """
    comment = Comment(data)
    return TextNode(comment, self.soup)
def expand_html(html, cdict=None):
    """Return *html* with link-like text expanded via ``expand_one``.

    The markup is parsed with BeautifulSoup, every HTML comment is removed,
    and each remaining text node is run through ``regex_link``: every match
    is replaced by ``expand_one(matched_text, cdict)``. The resulting text
    is re-parsed and substituted for the original text node. Text whose
    direct parent is one of ``a``, ``script``, ``pre``, ``code``, ``embed``,
    ``object``, ``audio`` or ``video`` is left untouched.

    Args:
        html: the markup to process.
        cdict: passed through unchanged as the second argument of
            ``expand_one``.

    Returns:
        The processed document serialized with ``str(soup)``.

    Raises:
        RuntimeError: if BeautifulSoup is not available (``have_soup`` is
            false).
    """
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")
    soup = BeautifulSoup(html)

    # Remove comments before walking the text nodes, so their content is
    # neither expanded nor emitted.
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    for txt in soup.findAll(text=True):
        if txt.parent.name in ('a', 'script', 'pre', 'code', 'embed',
                               'object', 'audio', 'video'):
            continue
        ntxt = regex_link.sub(
            lambda match: expand_one(match.group(0), cdict), txt)
        txt.replaceWith(BeautifulSoup(ntxt))
    return str(soup)
def testSerializer(element):
    """Serialize a BeautifulSoup tree into an indented, one-node-per-line dump.

    Each node contributes one line (plus one line per attribute for tags):

    * ``Declaration``: ``|<pad><!DOCTYPE name>`` or, when a public or system
      identifier was matched, ``|<pad><!DOCTYPE name "public" "system">``
    * ``BeautifulSoup`` root: ``#document-fragment`` if its name is
      ``[document_fragment]``, otherwise ``#document``
    * ``Comment``: ``|<pad><!-- text -->``
    * other ``unicode`` text: ``|<pad>"text"``
    * anything else (tags): ``|<pad><name>`` followed by its attributes in
      sorted order, each indented two further spaces

    Children (``contents``) are emitted recursively, indented two spaces
    deeper than their parent.

    Returns:
        All lines joined with newlines.

    Raises:
        AssertionError: if a doctype declaration does not match the expected
            ``DOCTYPE ...`` format.
    """
    import re

    doctype_pattern = re.compile(
        r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?')
    lines = []

    def describeDoctype(node, pad):
        # Render a doctype declaration as a single output line.
        found = doctype_pattern.match(node.string)
        assert found is not None, "DOCTYPE did not match expected format"
        name = found.group('name')
        publicId = found.group('publicId')
        if publicId is None:
            systemId = found.group('systemId2')
        else:
            systemId = found.group('systemId1') or ""

        if publicId is None and systemId is None:
            return "|%s<!DOCTYPE %s>" % (pad, name)
        return """|%s<!DOCTYPE %s "%s" "%s">""" % (
            pad, name, publicId or "", systemId or "")

    def serializeElement(node, indent=0):
        pad = ' ' * indent
        # Declaration and Comment are tested before the generic unicode
        # check, and the BeautifulSoup root before falling through to tags.
        if isinstance(node, Declaration):
            lines.append(describeDoctype(node, pad))
        elif isinstance(node, BeautifulSoup):
            if node.name == "[document_fragment]":
                lines.append("#document-fragment")
            else:
                lines.append("#document")
        elif isinstance(node, Comment):
            lines.append("|%s<!-- %s -->" % (pad, node.string))
        elif isinstance(node, unicode):
            lines.append("|%s\"%s\"" % (pad, node))
        else:
            lines.append("|%s<%s>" % (pad, node.name))
            if node.attrs:
                attr_pad = ' ' * (indent + 2)
                for name, value in sorted(node.attrs):
                    lines.append('|%s%s="%s"' % (attr_pad, name, value))

        if hasattr(node, "contents"):
            child_indent = indent + 2
            for child in node.contents:
                serializeElement(child, child_indent)

    serializeElement(element, 0)
    return "\n".join(lines)