我们从Python开源项目中,提取了以下25个代码示例,用于说明如何使用HTMLParser.HTMLParser.feed()。
def remove(self, item):
    """
    Delete item from this container.

    The position is obtained from self.index(item) and that slot is
    deleted; any exception raised by the lookup propagates unchanged.

    Example:

    data = '<a><b></b><b></b></a>'
    html = Html()
    dom = html.feed(data)

    for root, ind in dom.sail_with_root():
        if ind.name == 'b':
            root.remove(ind)

    print dom

    It should print.

    <a ></a>
    """
    del self[self.index(item)]
def take(self, *args):
    """
    Return the first object produced by self.match(*args), or None.

    args are (key, value) pairs and are forwarded unchanged to
    self.match. When the match sequence is empty, None is returned.

    Example:

    data = '<a><b id="foo" size="1"></b></a>'
    html = Html()
    dom = html.feed(data)

    print dom.take(('id', 'foo'))
    print dom.take(('id', 'foo'), ('size', '2'))
    """
    # The builtin next() with a default replaces the Python 2 only
    # seq.next() call and its StopIteration handler; it works on both
    # Python 2.6+ and Python 3.
    return next(self.match(*args), None)
def walk_with_root(self):
    """
    Like walk but each item also carries its root.

    For every (root, ind) pair produced by self.sail_with_root() this
    yields a pair of triples:

        ((root, root.name, root.attr), (ind, ind.name, ind.attr))

    Example:

    html = Html()
    data = '<body><em>alpha</em></body>'
    dom = html.feed(data)

    for (root, name, attr), (ind, name, attr) in dom.walk_with_root():
        print root, name, ind, name
    """
    for parent, node in self.sail_with_root():
        parent_info = (parent, parent.name, parent.attr)
        node_info = (node, node.name, node.attr)
        yield (parent_info, node_info)
def __init__(self, data):
    """
    Initialise the object and store the characters it holds.

    Root.__init__ is invoked with the DATA constant, after which the
    given characters are kept on the instance as self.data.

    Example:

    html = Html()
    data = '<body><em>alpha</em></body>'
    dom = html.feed(data)
    x = dom.fst('em')
    x.append(Data('\\nbeta'))

    It outputs.

    <body ><em >alpha
    beta</em></body>
    """
    # Base initialisation runs first; self.data is assigned afterwards.
    Root.__init__(self, DATA)
    self.data = data
def feed(self, data):
    """
    Reset the parser and then feed it data.

    self.reset() is called before data is handed to HTMLParser.feed.
    Nothing is returned explicitly.
    """
    # reset() must come before the base-class feed.
    self.reset()
    HTMLParser.feed(self, data)
def feed(self, chars):
    """
    Feed chars to the underlying HTMLParser.

    If self.phase is self.TERMINATED or self.FOUND, self._terminate()
    is called first. The result of HTMLParser.feed is returned.
    """
    # [8]
    # NOTE(review): "[8]" appears to point at a numbered step in an
    # external reference -- confirm against the rest of the file.
    already_done = self.phase in (self.TERMINATED, self.FOUND)
    if already_done:
        self._terminate()
    return HTMLParser.feed(self, chars)
def findHTMLMeta(stream):
    """Look for a meta http-equiv tag with the YADIS header name.

    The stream is read CHUNK_SIZE at a time and each chunk is fed to a
    YadisHTMLParser. Everything read is accumulated so that it can be
    attached to the MetaNotFound error.

    @param stream: Source of the html text
    @type stream: Object that implements a read() method that works
        like file.read

    @return: The URI from which to fetch the XRDS document
    @rtype: str

    @raises MetaNotFound: raised with the content that was
        searched as the first parameter.
    """
    parser = YadisHTMLParser()
    chunks = []

    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            # End of file
            break

        chunks.append(chunk)
        try:
            parser.feed(chunk)
        except HTMLParseError:
            # HTML parse error, so bail; keep the remainder of the
            # stream for the MetaNotFound content.
            chunks.append(stream.read())
            break
        except ParseDone as why:
            # "except ... as" and .args[0] replace the Python 2 only
            # "except X, why" / why[0] spellings; the first exception
            # argument carries the URI (or None).
            uri = why.args[0]
            if uri is None:
                # Parse finished, but we may need the rest of the file
                chunks.append(stream.read())
                break
            return uri

    content = ''.join(chunks)
    raise MetaNotFound(content)
def feed(self, in_html):
    """
    Parse in_html and return the accumulated output.

    self.output is set to the empty string, in_html is passed to
    HTMLParser.feed, and whatever self.output holds afterwards is
    returned.
    """
    # Start from an empty buffer on every call.
    self.output = ""
    HTMLParser.feed(self, in_html)
    return self.output
def html_to_md(h):
    """Feed h to a new MyHTMLParser and return what its feed() returns."""
    return MyHTMLParser().feed(h)
def sail(self):
    """
    Iterate over every object below this one.

    Each child's own sail() results are yielded before the child
    itself, so nested objects come out before the object that contains
    them. The loop runs over a copy (self[:]), presumably so callers
    can modify this container while iterating.

    Example:

    data = '<a> <b> </b> </a>'

    html = Html()
    dom = html.feed(data)

    for ind in dom.sail():
        print type(ind),',', ind.name

    It would output.

    <class 'ehp.Root'> , a
    <class 'ehp.Root'> , b
    """
    for child in self[:]:
        for descendant in child.sail():
            yield descendant
        yield child
def index(self, item):
    """
    Return the position of item, comparing by identity.

    Elements are compared with "is" and not with equality, so an
    element that is merely equal to item is not a match. ValueError is
    raised when no element is item.

    Example:

    data = '<a><b></b><b></b></a>'
    html = Html()
    dom = html.feed(data)

    for root, ind in dom.sail_with_root():
        print root.name, ind.name, root.index(ind)

    It would print.

    a b 0
    a b 1
     a 0
    """
    for position, candidate in enumerate(self):
        if candidate is item:
            return position
    raise ValueError
def find(self, name, *args):
    """
    Yield every object from self.sail() whose name equals name.

    Each extra argument is a (key, value) pair; an object is yielded
    only if object.attr[key] equals value for all of them.

    Example 1:

    data = '<a><b></b><b></b></a>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.find('b'):
        print ind

    Example 2:

    data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p> </body>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.find('p', ('style', 'color:green')):
        print ind
    """
    for node in self.sail():
        if node.name == name and not any(
                node.attr[key] != value for key, value in args):
            yield node
def find_with_root(self, name, *args):
    """
    Like find, but yields (root, object) pairs.

    Pairs come from self.sail_with_root(). A pair is yielded when the
    object's name equals name and object.attr[key] equals value for
    every (key, value) pair in args.

    Example:

    from ehp import *

    html = Html()
    dom = html.feed('''<body> <p> alpha </p> <p> beta </p> </body>''')

    for root, ind in dom.find_with_root('p'):
        root.remove(ind)

    print dom
    """
    for parent, node in self.sail_with_root():
        if node.name == name and not any(
                node.attr[key] != value for key, value in args):
            yield (parent, node)
def match(self, *args):
    """
    Yield every object from self.sail() whose attributes match.

    args are (key, value) pairs; an object is yielded when
    object.attr[key] equals value for all of them. With no pairs every
    object is yielded.

    Example:

    data = '<a size="1"><b size="1"></b></a>'
    html = Html()
    dom = html.feed(data)

    for ind in dom.match(('size', '1')):
        print ind

    It would print.

    <b size="1" ></b>
    <a size="1" ><b size="1" ></b></a>
    """
    for node in self.sail():
        mismatch = any(node.attr[key] != value for key, value in args)
        if not mismatch:
            yield node
def match_with_root(self, *args):
    """
    Like match, but yields (root, object) pairs.

    Pairs come from self.sail_with_root(). A pair is yielded when
    object.attr[key] equals value for every (key, value) pair in args.

    Example:

    from ehp import *

    html = Html()
    dom = html.feed('''<body> <p style="color:black"> xxx </p>
    <p style = "color:black"> mmm </p></body>''')

    for root, ind in dom.match_with_root(('style', 'color:black')):
        del ind.attr['style']

    item = dom.fst('body')
    item.attr['style'] = 'color:black'

    print dom
    """
    for parent, node in self.sail_with_root():
        mismatch = any(node.attr[key] != value for key, value in args)
        if not mismatch:
            yield (parent, node)
def text(self):
    """
    Return the text held by the objects whose name is DATA.

    This simply returns self.join('', DATA), i.e. the matching objects
    joined with an empty delimiter.

    Example:

    html = Html()
    data = '<body><em>This is all the text.</em></body>'
    dom = html.feed(data)

    print dom.fst('em').text()

    It outputs.

    This is all the text.

    Notice that if you call text() on an item with children
    then it returns all the *printable* characters for that node.
    """
    return self.join('', DATA)
def walk(self):
    """
    Like sail, but each item also carries its name and attr.

    For every object produced by self.sail() this yields the triple
    (object, object.name, object.attr).

    Example:

    html = Html()
    data = '<body> <em> This is all the text.</em></body>'
    dom = html.feed(data)

    for ind, name, attr in dom.walk():
        print 'TAG:', ind
        print 'NAME:', name
        print 'ATTR:', attr
    """
    for node in self.sail():
        triple = (node, node.name, node.attr)
        yield triple
def fromfile(self, filename):
    """
    Build a structure from the contents of a file.

    The file named by filename is opened in text mode ('r'), read
    completely, and its contents are passed to self.feed. The value
    returned by self.feed is returned.
    """
    # The original opened the undefined name `fname`, which raised
    # NameError on every call; the parameter is `filename`. The with
    # block also guarantees the file is closed if read() fails.
    with open(filename, 'r') as fd:
        data = fd.read()
    return self.feed(data)
def join(self, delim, *args):
    """
    Join the string form of all objects whose name appears in args.

    Objects are taken from self.sail() in order. delim is written in
    front of every selected object, including the first one, so a
    non-empty delim also starts the result. An empty string is
    returned when nothing is selected.

    Example 1:

    html = Html()
    data = '<a><b> This is cool. </b><b> That is. </b></a>'
    dom = html.feed(data)

    print dom.join('', 'b')

    It would print.

    <b > This is cool. </b><b > That is. </b>

    Example 2:

    html = Html()
    data = '<a><b> alpha</b><c>beta</c> <b>gamma</a>'
    dom = html.feed(data)

    print dom.join('', 'b', 'c')

    It would print.

    <b > alpha</b><c >beta</c><b >gamma</b>
    """
    pieces = ['%s%s' % (delim, node)
              for node in self.sail()
              if node.name in args]
    return ''.join(pieces)
def fst(self, name, *args):
    """
    Return the first object produced by self.find(name, *args), or None.

    name and the (key, value) pairs in args are forwarded unchanged to
    self.find. When nothing is found, None is returned.

    Example 1:

    html = Html()
    data = '<body> <em> Cool. </em></body>'
    dom = html.feed(data)

    print dom.fst('em')

    It outputs.

    <em > Cool. </em>

    Example 2:

    data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p> </body>'
    html = Html()
    dom = html.feed(data)

    print dom.fst('p', ('style', 'color:green'))

    Output:

    <p style="color:green" > beta.</p>
    """
    # The builtin next() with a default replaces the Python 2 only
    # seq.next() call and its StopIteration handler; it works on both
    # Python 2.6+ and Python 3.
    return next(self.find(name, *args), None)