The following six code examples, extracted from open-source Python projects, illustrate how to use lxml.etree.LxmlError().
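Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the pattern they all share: lxml.etree.XMLSyntaxError, raised on malformed input, is a subclass of lxml.etree.LxmlError, so catching LxmlError covers any parsing failure raised by lxml itself. The helper name parse_or_none is hypothetical, not part of lxml's API.

from lxml import etree

def parse_or_none(data):
    # Hypothetical helper: return the parsed root element, or None if lxml
    # rejects the input. XMLSyntaxError is a subclass of LxmlError, so the
    # broader exception class catches it.
    try:
        return etree.fromstring(data)
    except etree.LxmlError as e:
        print('parse failed: %s' % e)
        return None

parse_or_none(b'<ok/>')          # returns an Element
parse_or_none(b'<broken><tag>')  # prints the error and returns None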
Example 1

def select_elements(doc, select):
    """
    Return the elements within ``doc`` that match the selector ``select``.

    The selector can be an index, a CSS selector, or an XPath expression.
    """
    try:
        int(select)
        elements = doc.xpath("(//table)[{}]".format(int(select)))
    except ValueError:
        # Expression wasn't a valid integer so try to use it as a CSS selector.
        try:
            elements = doc.cssselect(select)
        except SelectorError:
            # Nope, not a valid CSS expression. Last attempt is to try it as an
            # XPath expression.
            try:
                elements = doc.xpath(select)
            except LxmlError:
                # Catch the specific LXML error and raise a more generic error
                # because the problem could lie with any of the index, CSS
                # selector, or XPath expression.
                raise ValueError("'{}' not an index, CSS selector, or XPath "
                                 "expression".format(select))
    return elements
Example 2

def extract(pagedata, pagefile):
    ret = []
    try:
        root = etree.HTML(pagedata)
    except etree.LxmlError as e:
        log.error('<%s> malformed: %s', pagefile, e)
        return ret
    for table in root.findall('.//table[@class="mirrortable"]'):
        country = None
        for e in table.iter('a', 'td', 'th'):
            if e.tag == 'th' and e.text:
                country = e.text
            elif e.tag == 'a' and e.text.startswith('http'):
                url = e.get('href')
                if not url.endswith('/'):
                    url += '/'
                ret.append((url, country))
    return ret
Example 3

def from_xml(self, content, forbid_dtd=True, forbid_entities=True):
    """
    Given some XML data, returns a Python dictionary of the decoded data.

    By default, XML entity declarations and DTDs will raise a BadRequest
    exception, but subclasses may choose to override this if necessary.
    """
    if lxml is None:
        raise ImproperlyConfigured("Usage of the XML aspects requires lxml and defusedxml.")
    try:
        parsed = parse_xml(StringIO(content), forbid_dtd=forbid_dtd,
                           forbid_entities=forbid_entities)
    except (LxmlError, DefusedXmlException):
        raise BadRequest
    return self.from_etree(parsed.getroot())
Example 4

def parse_html(html_file):
    """
    Read the HTML file using lxml's HTML parser, but convert to Unicode using
    Beautiful Soup's UnicodeDammit class.

    Can raise LxmlError or TypeError if the file can't be opened or parsed.
    """
    unicode_html = UnicodeDammit(html_file, smart_quotes_to="html", is_html=True)
    if unicode_html.unicode_markup is None:
        raise ValueError("no HTML provided")
    if not unicode_html.unicode_markup:
        raise ValueError("could not detect character encoding")
    return lxml.html.fromstring(unicode_html.unicode_markup)
Example 5

def extract(pagedata, pagefile):
    ret = []
    try:
        root = etree.HTML(pagedata)
    except etree.LxmlError as e:
        log.error('<%s> malformed: %s', pagefile, e)
        return ret
    table = root.find('.//table[@summary]')
    if table is None:
        log.error('<%s> malformed: summary table not found', pagefile)
        return ret
    country = None
    for e in table.iter('a', 'td'):
        if e.tag == 'td':
            cc = None
            for se in e:
                if se.tag == 'img':
                    cc = se.get('alt')
                    break
            if cc:
                country = '%s (%s)' % (e.xpath('string()').strip(), cc)
        elif e.tag == 'a':
            if e.text == 'HTTP':
                url = e.get('href')
                if not url.endswith('/'):
                    url += '/'
                ret.append((url, country))
    return ret
Example 6

def generate(url, pagedata, pagefile, redirfile, repl):
    log.info('generate %s', redirfile)
    mtime = os.stat(pagefile).st_mtime
    try:
        root = etree.HTML(pagedata)
    except etree.LxmlError as e:
        log.error('<%s> malformed: %s', pagefile, e)
        return 2
    table = root.find('.//table[@summary]')
    if table is None:
        log.error('<%s> malformed: summary table not found', pagefile)
        return 3
    try:
        fd = open(redirfile, 'w')
    except Exception as e:
        log.error(e)
    else:
        fd.write('''#
# this file was automatically generated based on
# %s from %s
#
#abort .html
#abort .jpg
#abort .png
#abort .jpeg
#abort .gif
#abort .html
#abort .shtml
#abort .java
#abort .jar
#abort .htm
# openSUSE Headquarter
regexi ^http://download.opensuse.org/(.*)$ %s
''' % (url, email.utils.formatdate(mtime, localtime=True), repl))
        country = None
        for e in table.iter('a', 'td'):
            if e.tag == 'td':
                cc = None
                for se in e:
                    if se.tag == 'img':
                        cc = se.get('alt')
                        break
                if cc:
                    c = '# %s (%s)\n' % (e.xpath('string()').strip(), cc)
                    if c != country:
                        country = c
                        fd.write(c)
            elif e.tag == 'a':
                if e.text == 'HTTP':
                    url = e.get('href')
                    if not url.endswith('/'):
                        url += '/'
                    fd.write('regexi ^%s(.*)$ %s\n' % (url, repl))
        fd.write('\n')
        fd.close()
        return 0