我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用lxml.etree.ParserError()。
def feed(self, markup):
    """Push ``markup`` through a newly created parser, one chunk at a time.

    ``bytes`` and ``str`` input is wrapped in an in-memory stream; any
    other object is used as-is and must provide ``read(size)``.

    Args:
        markup: The document to parse (bytes, str, or a readable object).

    Raises:
        ParserRejectedMarkup: If creating, feeding or closing the parser
            raises UnicodeDecodeError, LookupError or etree.ParserError.
    """
    if isinstance(markup, bytes):
        stream = BytesIO(markup)
    elif isinstance(markup, str):
        stream = StringIO(markup)
    else:
        stream = markup

    # The first chunk is always handed to the parser, even when it is
    # empty: without at least one feed() call the parser is never
    # initialized.
    chunk = stream.read(self.CHUNK_SIZE)
    try:
        self.parser = self.parser_for(self.soup.original_encoding)
        self.parser.feed(chunk)
        if len(chunk) != 0:
            # Keep reading until the stream reports it is exhausted;
            # the terminating empty chunk is not passed on.
            while True:
                chunk = stream.read(self.CHUNK_SIZE)
                if len(chunk) == 0:
                    break
                self.parser.feed(chunk)
        self.parser.close()
    except (UnicodeDecodeError, LookupError, etree.ParserError) as err:
        raise ParserRejectedMarkup(str(err))
def feed(self, markup):
    """Feed ``markup`` to a newly created parser in CHUNK_SIZE pieces.

    Byte strings and text strings are wrapped in an in-memory stream
    first; any other object is assumed to already provide ``read(size)``.

    Args:
        markup: The document to parse (byte string, text string, or a
            readable object).

    Raises:
        ParserRejectedMarkup: If creating, feeding or closing the parser
            raises UnicodeDecodeError, LookupError or etree.ParserError.
    """
    if isinstance(markup, bytes):
        markup = BytesIO(markup)
    elif isinstance(markup, type(u"")):
        # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3,
        # so the text-string check works on both without a bare
        # ``unicode`` reference (a NameError on Python 3).
        markup = StringIO(markup)

    # Call feed() at least once, even if the markup is empty,
    # or the parser won't be initialized.
    data = markup.read(self.CHUNK_SIZE)
    try:
        self.parser = self.parser_for(self.soup.original_encoding)
        self.parser.feed(data)
        while len(data) != 0:
            # Now call feed() on the rest of the data, chunk by chunk.
            data = markup.read(self.CHUNK_SIZE)
            if len(data) != 0:
                self.parser.feed(data)
        self.parser.close()
    # ``except ... as e`` is accepted by Python 2.6+ and Python 3; the
    # older ``except ..., e`` spelling is a SyntaxError on Python 3.
    except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
        raise ParserRejectedMarkup(str(e))
def extract_html_content(self, html_body, fix_html=True):
    """Parse an HTML body and emit it together with its extracted text.

    Does nothing when ``html_body`` is None. Otherwise the body is parsed,
    passed through the header extractor and the cleaner, and the result
    is flagged as HTML and emitted alongside the extracted text.

    Args:
        html_body: The HTML markup to process, or None.
        fix_html: Accepted for interface compatibility; not read here.

    Raises:
        ProcessingException: If the markup cannot be parsed.
    """
    if html_body is None:
        return

    try:
        try:
            doc = html.fromstring(html_body)
        except ValueError:
            # The parser refused the input as given; drop the first
            # match of the XML encoding pattern and try once more.
            # See https://stackoverflow.com/questions/3402520
            html_body = self.RE_XML_ENCODING.sub('', html_body, count=1)
            doc = html.fromstring(html_body)
    except (ParserError, ParseError, ValueError):
        raise ProcessingException("HTML could not be parsed.")

    self.extract_html_header(doc)
    self.cleaner(doc)
    text = self.extract_html_text(doc)

    self.result.flag(self.result.FLAG_HTML)
    # The emitted body is the one that was actually parsed, i.e. with
    # the encoding declaration removed if the retry path was taken.
    self.result.emit_html_body(html_body, text)
def ingest(self, file_path):
    """Parse an XML file, render it to HTML via XSLT and emit the result.

    Args:
        file_path: Path of the XML file to ingest.

    Raises:
        ProcessingException: If the file is larger than ``MAX_SIZE`` or
            cannot be parsed as XML.
    """
    # Use the size already recorded on the result when it is truthy,
    # otherwise ask the filesystem.
    file_size = self.result.size
    if not file_size:
        file_size = os.path.getsize(file_path)
    if file_size > self.MAX_SIZE:
        raise ProcessingException("XML file is too large.")

    try:
        doc = etree.parse(file_path)
    except (ParserError, ParseError):
        raise ProcessingException("XML could not be parsed.")

    # Text is taken from the parsed tree before the stylesheet is applied.
    text = self.extract_html_text(doc.getroot())

    html_doc = etree.XSLT(self.XSLT)(doc)
    html_body = html.tostring(
        html_doc,
        encoding='unicode',
        pretty_print=True,
    )

    self.result.flag(self.result.FLAG_HTML)
    self.result.emit_html_body(html_body, text)
def check_form(body):
    """Report whether the given markup contains at least one form element.

    Args:
        body (string): markup to inspect

    Returns:
        bool: True if a ``form`` element is found; False when the body is
        blank, cannot be parsed, or contains no form.
    """
    # http://lxml.de/parsing.html - Python unicode strings
    encoded = body.encode("utf-8")

    # Nothing but whitespace: there is no markup to inspect.
    if not encoded.strip():
        return False

    try:
        tree = html.fromstring(encoded)
    except ParserError:
        return False

    return bool(tree.xpath('//form'))