The following 49 code examples, extracted from open-source Python projects, illustrate how to use HTMLParser.HTMLParseError().
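Before the project examples, here is a minimal sketch of the basic pattern they all share: feed markup to a parser and catch HTMLParseError when the input is malformed. This is Python 2 code (HTMLParseError was deprecated in Python 3.3 and removed in 3.5); the helper name is ours, and the malformed input shown is one kind of markup the strict Python 2.7 parser is known to reject.

# A minimal sketch (Python 2): HTMLParser raises HTMLParseError on
# markup it cannot parse, e.g. a declaration containing junk characters.
from HTMLParser import HTMLParser, HTMLParseError

def is_well_formed(markup):
    parser = HTMLParser()
    try:
        parser.feed(markup)
        parser.close()
    except HTMLParseError, e:
        # The exception carries .msg, .lineno and .offset describing
        # where parsing failed (used by several examples below).
        print "parse failed: %s" % e
        return False
    return True

print is_well_formed("<p>hello</p>")        # True
print is_well_formed("<!DOCTYPE foo $ >")   # False: junk in declaration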
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as a CData object."""
    j = None
    if self.rawdata[i:i+9] == '<![CDATA[':
        k = self.rawdata.find(']]>', i)
        if k == -1:
            k = len(self.rawdata)
        data = self.rawdata[i+9:k]
        j = k+3
        self._toStringSubclass(data, CData)
    else:
        try:
            j = HTMLParser.parse_declaration(self, i)
        except HTMLParseError:
            toHandle = self.rawdata[i:]
            self.handle_data(toHandle)
            j = i + len(toHandle)
    return j
def feed(self, markup):
    args, kwargs = self.parser_args
    parser = BeautifulSoupHTMLParser(*args, **kwargs)
    parser.soup = self.soup
    try:
        parser.feed(markup)
    except HTMLParseError, e:
        warnings.warn(RuntimeWarning(
            "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
        raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def feed(self, markup):
    args, kwargs = self.parser_args
    parser = BeautifulSoupHTMLParser(*args, **kwargs)
    parser.soup = self.soup
    try:
        parser.feed(markup)
    except HTMLParseError, e:
        warnings.warn(RuntimeWarning(
            "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
        raise e
    parser.already_closed_empty_element = []

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def feeds(page_url):
    """Search the given URL for possible feeds, returning a list of them."""

    # If the URL is a feed, there's no need to scan it for links.
    if is_feed(page_url):
        return [page_url]

    data = fetch_url(page_url)
    parser = FeedFinder(page_url)
    try:
        parser.feed(data)
    except HTMLParser.HTMLParseError:
        pass
    found = parser.urls()

    # Return only feeds that feedparser can understand.
    return [feed for feed in found if is_feed(feed)]
def http_response(self, request, response):
    if not hasattr(response, "seek"):
        response = response_seek_wrapper(response)
    http_message = response.info()
    url = response.geturl()
    ct_hdrs = http_message.getheaders("content-type")
    if is_html(ct_hdrs, url, self._allow_xhtml):
        try:
            try:
                html_headers = parse_head(response, self.head_parser_class())
            finally:
                response.seek(0)
        except (HTMLParser.HTMLParseError, sgmllib.SGMLParseError):
            pass
        else:
            for hdr, val in html_headers:
                # add a header
                http_message.dict[hdr.lower()] = val
                text = hdr + ": " + val
                for line in text.split("\n"):
                    http_message.headers.append(line + "\n")
    return response
def dehtml(text):
    try:
        parser = _DeHTMLParser()
        parser.feed(text.encode(UTF8))
        parser.close()
        return parser.text()
    except HTMLParseError:
        from traceback import print_exc
        print_exc(file=sys.stderr)
        return text

# Format a key value list
# key, value -> "key: value" + ", " if not last item
# key, ''    -> "key:" + ", " if not last item
# key, None  -> "key" + " " if not last item
def findHTMLMeta(stream):
    """Look for a meta http-equiv tag with the YADIS header name.

    @param stream: Source of the html text
    @type stream: Object that implements a read() method that works
        like file.read

    @return: The URI from which to fetch the XRDS document
    @rtype: str

    @raises MetaNotFound: raised with the content that was searched as
        the first parameter.
    """
    parser = YadisHTMLParser()
    chunks = []

    while 1:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            # End of file
            break

        chunks.append(chunk)
        try:
            parser.feed(chunk)
        except HTMLParseError, why:
            # HTML parse error, so bail
            chunks.append(stream.read())
            break
        except ParseDone, why:
            uri = why[0]
            if uri is None:
                # Parse finished, but we may need the rest of the file
                chunks.append(stream.read())
                break
            else:
                return uri

    content = ''.join(chunks)
    raise MetaNotFound(content)
def _parse_error(self, source):
    def parse(source=source):
        parser = HTMLParser.HTMLParser()
        parser.feed(source)
        parser.close()
    self.assertRaises(HTMLParser.HTMLParseError, parse)
def parse(self):
    """Generator that parses the HTML source, yielding markup events.

    :return: a markup event stream
    :raises ParseError: if the HTML text is not well formed
    """
    def _generate():
        if self.encoding:
            reader = codecs.getreader(self.encoding)
            source = reader(self.source)
        else:
            source = self.source
        try:
            bufsize = 4 * 1024  # 4K
            done = False
            while 1:
                while not done and len(self._queue) == 0:
                    data = source.read(bufsize)
                    if not data:  # end of data
                        self.close()
                        done = True
                    else:
                        if not isinstance(data, unicode):
                            raise UnicodeError("source returned bytes, but no encoding specified")
                        self.feed(data)
                for kind, data, pos in self._queue:
                    yield kind, data, pos
                self._queue = []
                if done:
                    open_tags = self._open_tags
                    open_tags.reverse()
                    for tag in open_tags:
                        yield END, QName(tag), pos
                    break
        except html.HTMLParseError, e:
            msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
            raise ParseError(msg, self.filename, e.lineno, e.offset)
    return Stream(_generate()).filter(_coalesce)
def test_namespaced_system_doctype(self):
    self.assertRaises(HTMLParseError, self._test_doctype,
                      'xsl:stylesheet SYSTEM "htmlent.dtd"')
def test_namespaced_public_doctype(self):
    self.assertRaises(HTMLParseError, self._test_doctype,
                      'xsl:stylesheet PUBLIC "htmlent.dtd"')