我们从Python开源项目中,提取了以下41个代码示例,用于说明如何使用xml.etree.ElementTree.XMLParser()。
# Doctest holder: the function body is only a docstring. The examples use
# the names ET and summarize, which are not defined in this block.
def bug_200708_close():
    """
    Test default builder.

    >>> parser = ET.XMLParser() # default
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    Test custom builder.

    >>> class EchoTarget:
    ...     def close(self):
    ...         return ET.Element("element") # simulate root
    >>> parser = ET.XMLParser(EchoTarget())
    >>> parser.feed("<element>some text</element>")
    >>> summarize(parser.close())
    'element'

    """
def __init__(self, xml=None, rootnode=None):
    """Wrap an OME-XML document, parsing ``xml`` unless ``rootnode`` is given.

    :param xml: XML text to parse. When both arguments are None the
        module-level ``default_xml`` is parsed instead.
    :param rootnode: an existing root element. When supplied it is stored
        as-is and ``xml`` is not parsed.
    :raises Exception: if no 'ome' namespace is found -- but only while this
        module is being run as a script (see the note below).
    """
    if rootnode is not None:
        self.dom = rootnode
    else:
        if xml is None:
            xml = default_xml
        # The parser's declared encoding depends on the platform.
        enc = 'ISO-8859-1' if sys.platform.startswith('win') else 'UTF-8'
        self.dom = ElementTree.fromstring(
            xml, ElementTree.XMLParser(encoding=enc))

    # determine OME namespaces
    self.ns = get_namespaces(self.dom)
    # NOTE(review): this format check only fires when the module itself is
    # the entry point; importing callers never get it. Confirm that the
    # __main__ guard is intentional.
    if __name__ == '__main__' and self.ns['ome'] is None:
        raise Exception("Error: String not in OME-XML format")

    # Give the root element a UUID attribute if it has none, e.g.
    # <OME UUID="urn:uuid:ef8af211-b6c1-44d4-97de-daca46f16346">
    root_element = self.dom
    if not root_element.get('UUID'):
        root_element.set('UUID', 'urn:uuid:' + str(uuid.uuid4()))
    self.uuidStr = root_element.get('UUID')
def _parse_documentation(xml):
    """Turn an XML documentation fragment into plain docstring text.

    The children of the root element are scanned: ``summary``, ``param``,
    ``returns`` and ``remarks`` nodes are rendered via
    ``_parse_documentation_content`` and the non-empty sections are joined
    with blank lines, in the order summary, Args, Returns, Note.

    :param xml: the XML text; a blank string yields ``''``.
    :return: the assembled text ('' when no section has content).
    """
    if xml.strip() == '':
        return ''

    utf8_parser = ElementTree.XMLParser(encoding='UTF-8')
    root = ElementTree.XML(xml.encode('UTF-8'), parser=utf8_parser)

    summary = ''
    returns = ''
    note = ''
    param_lines = []
    for node in root:
        tag = node.tag
        if tag == 'summary':
            summary = _parse_documentation_content(node)
        elif tag == 'param':
            # Parameter descriptions are flattened onto a single line.
            text = _parse_documentation_content(node).replace('\n', '')
            param_lines.append(
                '%s: %s' % (snake_case(node.attrib['name']), text))
        elif tag == 'returns':
            flattened = _parse_documentation_content(node).replace('\n', '')
            returns = 'Returns:\n %s' % flattened
        elif tag == 'remarks':
            note = 'Note: %s' % _parse_documentation_content(node)

    if param_lines:
        params = 'Args:\n%s' % '\n'.join(' ' + line for line in param_lines)
    else:
        params = ''

    sections = [summary, params, returns, note]
    return '\n\n'.join(section for section in sections if section != '')
def __init__(self, source, java_binary, tabsize):
    """Run koopa on ``source`` and load the XML it produces.

    :param source: the input handed to ``run_koopa``; its ``name``
        attribute, when present, is recorded as the source path.
    :param java_binary: forwarded to ``run_koopa``.
    :param tabsize: forwarded to ``run_koopa``.
    """
    self._perform_stmts = []
    self._source_path = source.name if hasattr(source, 'name') else '<string>'

    # Only the temp file's *name* is needed: create it, close it right
    # away, and remove it once the results have been read back.
    with NamedTemporaryFile(mode='wb', suffix='.xml', delete=False) as handle:
        result_path = handle.name

    try:
        self._code = run_koopa(source, result_path,
                               java_binary=java_binary, tabsize=tabsize)
        comment_parser = ET.XMLParser(target=CommentTreeBuilder())
        self._tree = ET.parse(result_path, parser=comment_parser)
    finally:
        os.remove(result_path)

    self._parse()
def parsexml_(infile, parser=None, **kwargs):
    """Parse ``infile`` and return the resulting tree.

    :param infile: whatever ``etree_.parse`` accepts as its source.
    :param parser: parser to use; when None a default one is created.
    :param kwargs: passed through to ``etree_.parse``.
    :return: the object returned by ``etree_.parse``.
    """
    if parser is None:
        try:
            # Prefer lxml's ElementTree-compatible parser so that, e.g.,
            # comments are ignored.
            parser = etree_.ETCompatXMLParser()
        except AttributeError:
            # etree_ has no ETCompatXMLParser: fall back to xml.etree.
            parser = etree_.XMLParser()
    return etree_.parse(infile, parser=parser, **kwargs)

#
# User methods
#
# Calls to the methods in these classes are generated by generateDS.py.
# You can replace these methods by re-implementing the following class
#   in a module named generatedssuper.py.
def parse_xml(file_name, lh, include_comments=False):
    """Parse ``file_name`` and return its element tree, or None on failure.

    :param file_name: path of the XML file to read.
    :param lh: writable log handle; a parse failure is reported to it.
    :param include_comments: when true the tree is built with
        ``CommentedTreeBuilder``, otherwise with
        ``DoctypeSafeCallbackTarget``.
    :return: the parsed tree after ``ElementInclude.include`` has been run
        on its root, or None if parsing raised.
    """
    # The context manager closes the handle on every path, replacing the
    # two manual close() calls.
    with open(file_name, 'r') as fh:
        try:
            if include_comments:
                target = CommentedTreeBuilder()
            else:
                target = DoctypeSafeCallbackTarget()
            tree = XmlET.parse(fh, parser=XmlET.XMLParser(target=target))
        except Exception as e:
            # Deliberately broad: any failure is logged and reported to the
            # caller as None instead of propagating.
            lh.write("Exception attempting to parse %s:\n%s\n\n" % (file_name, str(e)))
            return None
    root = tree.getroot()
    ElementInclude.include(root)
    return tree
def search_for_image(self):
    """Use LastFM's API to obtain a URL for the album cover art.

    Reads ``self.url`` and looks for an ``album`` element whose ``artist``
    text matches ``self.artist_name`` (case-insensitively).

    :return: the text of that album's 'extralarge' ``image`` element, or
        None when it is empty or nothing matches.
    """
    response = urllib.urlopen(self.url).read()  # Send HTTP request to LastFM

    # Due to a change in the API the namespace prefix is not defined, which
    # causes errors. Hotfix: use "lxml" (recover=True) instead of "xml".
    parser = ETree.XMLParser(recover=True)
    xml_data = ETree.fromstring(response, parser)  # Read in XML data

    # NOTE(review): the original indentation was lost; the return is assumed
    # to sit inside the 'extralarge' branch -- confirm against upstream.
    for album in xml_data.getiterator("album"):
        if album.find('artist').text.lower() != self.artist_name.lower().encode("utf-8"):
            continue
        for image in album.findall('image'):
            if image.attrib['size'] != 'extralarge':
                continue
            url = image.text
            return url if url else None
def parse_vv_xml(xmlfile):
    """Parse the course-catalogue file ``xmlfile`` and import its contents.

    The file's first line is rewritten (via ``replace_first_line``) so that
    the ``nbsp`` entity is declared, the ``CourseCatalogueArea`` element is
    located, and it is handed to ``parse_vv_recurse`` together with a newly
    created root ``ImportCategory``.

    :param xmlfile: the XML file to import; it is modified in place by
        ``replace_first_line``.
    """
    parse_vv_clear()

    # Fix in XML File @see http://stackoverflow.com/a/7265260
    # @see http://effbot.org/elementtree/elementtree-xmlparser.htm#tag-ET.XMLParser.entity
    parser = ElementTree.XMLParser()
    # TODO: UseForeignDTD is not compatible with Python 3
    # parser.parser.UseForeignDTD(True)
    # nbsp is U+00A0 (decimal 160). The previous chr(0x160) produced U+0160
    # ('S' with caron) instead of a no-break space.
    parser.entity['nbsp'] = chr(0xA0)
    etree = ElementTree.ElementTree()

    # Workaround for the UseForeignDTD problem:
    # if missing, adds DOCTYPE tag to handle and changes encoding to utf-8
    # NOTE(review): the entity value below is reproduced as it appears in the
    # source; confirm whether it should be a literal U+00A0.
    doctype = '''<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE CourseCatalogue [<!ENTITY nbsp " ">]>\n'''
    replace_first_line(xmlfile, doctype)

    root = etree.parse(xmlfile, parser=parser).find('CourseCatalogueArea')
    root_cat = ImportCategory.objects.create(parent=None, name='root', rel_level=None)
    parse_vv_recurse(root, root_cat)
def verify_xml(self, incoming_xml, respond_obj, file=False):
    """
    Verify the incoming_xml data against either
        a. whole xml files, or
        b. tag/text pairs

    :param:
        incoming_xml: an xml string
        respond_obj: contains the verification detail from datafile
        file: indicate if comparing whole file or just pairs
    :return:
        True if whole file match/all pairs match
        False if not match
    """
    if not file:
        parsed = ET.fromstring(incoming_xml, parser=ET.XMLParser(encoding="utf-8"))
        for element_pair in respond_obj["request_verify"]:
            fields = element_pair.split(",")
            # Fixed-width prefixes are sliced off each field -- presumably
            # "tag=" and ",value=" style markers; verify against the datafile.
            xpath = fields[0][4:]
            value = fields[1][6:]
            node = getChildElementWithSpecificXpath(parsed, xpath)
            if node is None or value != node.text:
                return False
        return True

    # NOTE(review): only the status of the LAST expected file is returned;
    # an earlier mismatch is overwritten. Confirm this is intended.
    status = False
    for expected in respond_obj["request_verify_data"]:
        expected_path = getAbsPath(expected, getDirName(self.datafile))
        status, _, _, _ = compare_xml(incoming_xml, expected_path,
                                      output_file=False, sorted_json=False,
                                      remove_namespaces=True)
    return status
def loadXML(xml_string, normalize = True):
    """Load XML from string.

    :param xml_string: the XML text to parse.
    :param normalize: when true, newlines are turned into spaces and the
        space pattern below is then removed before parsing.
    :return: the root element produced by ``ET.fromstring``.
    """
    if normalize:
        # NOTE(review): as written the second step removes every single
        # space, including those separating attributes. The source may have
        # lost a wider literal (e.g. two spaces) -- confirm.
        for old, new in (("\n", " "), (" ", "")):
            xml_string = xml_string.replace(old, new)
    return ET.fromstring(xml_string, parser=ET.XMLParser(encoding='utf-8'))
# Doctest holder: the function body is only a docstring. The examples use
# ET, serialize, normalize_exception and ENTITY_XML from outside this block.
# NOTE(review): section 1 looks like it lost its character references in
# transit. A us-ascii serialisation cannot contain a raw U+8230 character, so
# the input was presumably "&#x8230;" and the expected bytes "&#33328;".
# Confirm against the upstream test file before running these doctests.
def entity():
    """
    Test entity handling.

    1) good entities

    >>> e = ET.XML("<document title='舰'>test</document>")
    >>> serialize(e, encoding="us-ascii")
    b'<document title="舰">test</document>'
    >>> serialize(e)
    '<document title="\u8230">test</document>'

    2) bad entities

    >>> normalize_exception(ET.XML, "<document>&entity;</document>")
    Traceback (most recent call last):
    ParseError: undefined entity: line 1, column 10

    >>> normalize_exception(ET.XML, ENTITY_XML)
    Traceback (most recent call last):
    ParseError: undefined entity &entity;: line 5, column 10

    3) custom entity

    >>> parser = ET.XMLParser()
    >>> parser.entity["entity"] = "text"
    >>> parser.feed(ENTITY_XML)
    >>> root = parser.close()
    >>> serialize(root)
    '<document>text</document>'
    """
def parseMeta(self, schid, clid):
    """Extract this object's tagged element from a client's meta data.

    The client's CLIENT_META_DATA string is searched for
    ``<tag>...</tag>`` (with ``tag`` taken from ``self.tag``); the matched
    text, tags included, is parsed as XML.

    :param schid: passed to ``ts3.getClientVariable``.
    :param clid: passed to ``ts3.getClientVariable``.
    :return: the first child element of the parsed ``<tag>`` element, or
        False when the meta data contains no such element.
    :raises IndexError: if the matched element has no children.
    """
    (_error, meta) = ts3.getClientVariable(
        schid, clid, ts3defines.ClientProperties.CLIENT_META_DATA)
    try:
        # group(0) keeps the surrounding tags so the text is well-formed XML.
        meta = re.search('<{0}>(.*)</{0}>'.format(self.tag), meta).group(0)
    except AttributeError:
        # re.search() returned None: the tag is not present.
        return False
    element = xml.fromstring(meta.strip(), parser=xml.XMLParser(encoding="utf-8"))
    # Index the element directly: Element.getchildren() was removed in
    # Python 3.9, and indexing returns the same first child.
    return element[0]
# Doctest holder: the function body is only a docstring. The examples use
# ET, serialize and ENTITY_XML from outside this block.
# NOTE(review): section 1 is headed "good entities" but contains a raw
# character rather than a character reference. The text was presumably
# "&#x8230;" in the input before being decoded in transit -- confirm against
# the upstream test file before running these doctests.
def entity():
    """
    Test entity handling.

    1) good entities

    >>> e = ET.XML("<document title='舰'>test</document>")
    >>> serialize(e)
    '<document title="舰">test</document>'

    2) bad entities

    >>> ET.XML("<document>&entity;</document>")
    Traceback (most recent call last):
    ParseError: undefined entity: line 1, column 10

    >>> ET.XML(ENTITY_XML)
    Traceback (most recent call last):
    ParseError: undefined entity &entity;: line 5, column 10

    3) custom entity

    >>> parser = ET.XMLParser()
    >>> parser.entity["entity"] = "text"
    >>> parser.feed(ENTITY_XML)
    >>> root = parser.close()
    >>> serialize(root)
    '<document>text</document>'
    """
def parseXML(self):
    """Load the annotation file at ``self.filepath`` and register its shapes.

    Every ``object`` element whose ``type`` is 'bndbox' is passed to
    ``addShape``; 'robndbox' objects go to ``addRotatedShape``; any other
    type is ignored. ``self.verified`` is set from the root's ``verified``
    attribute.

    :return: True.
    """
    assert self.filepath.endswith(XML_EXT), "Unsupport file format"
    utf8_parser = etree.XMLParser(encoding='utf-8')
    xmltree = ElementTree.parse(self.filepath, parser=utf8_parser).getroot()
    # The value is not used afterwards, but the lookup still fails loudly
    # when the file has no <filename> element.
    filename = xmltree.find('filename').text

    if 'verified' not in xmltree.attrib:
        self.verified = False
    elif xmltree.attrib['verified'] == 'yes':
        self.verified = True

    for entry in xmltree.findall('object'):
        kind = entry.find('type').text
        if kind not in ('bndbox', 'robndbox'):
            continue
        box = entry.find(kind)
        label = entry.find('name').text
        difficult_node = entry.find('difficult')
        difficult = False
        if difficult_node is not None:
            difficult = bool(int(difficult_node.text))
        if kind == 'bndbox':
            self.addShape(label, box, difficult)
        else:
            # Rotated bounding boxes.
            self.addRotatedShape(label, box, difficult)
    return True
def __init__(self, file_name):
    """Read a TEI file, keeping the text around the root element verbatim.

    The text before ``<TEI`` and after ``</TEI>`` is stored in
    ``self.before_root`` / ``self.after_root``; the part in between is
    parsed with a ``CommentedTreeBuilder`` target and stored in
    ``self.root``.

    :param file_name: path of the TEI XML file (read as UTF-8).
    :raises ValueError: if the file does not mention a UTF-8 encoding, or
        if ``<TEI`` / ``</TEI>`` cannot be found.
    :raises SystemExit: with status 15 when the TEI section fails to parse.
    """
    with open(file_name, encoding='utf-8') as handle:
        content = handle.read()
    if not any(u in content for u in ('utf-8', 'utf8', 'UTF8', 'UTF-8')):
        raise ValueError("XML file is not encoded in UTF-8. Please recode "
                         "the file or extend this parser and XML writer.")

    tei_start = content.find('<TEI')
    if tei_start < 0:
        raise ValueError("Couldn't find string `<TEI` in the XML file. Please extend this parser.")
    self.before_root = content[:tei_start]
    content = content[tei_start:]

    tei_end = content.find('</TEI>')
    if tei_end < 0:
        raise ValueError("Couldn't find `</TEI>` in the input file, please extend the parser.")
    tei_end += len('</TEI>')
    self.after_root = content[tei_end:]
    content = content[:tei_end]

    parser = ET.XMLParser(target = CommentedTreeBuilder())
    try:
        parser.feed(content)
    except ET.ParseError as e:
        sys.stderr.write("Error while parsing input file\n")
        # Write the message as text: the previous bytes + str concatenation
        # raised TypeError and hid the real parse error.
        sys.stderr.write(str(e) + '\n')
        sys.exit(15)
    self.root = parser.close()
def parse_xml(path):
    """Parse the XML file at ``path``, tolerating the ``&nbsp;`` entity.

    ``&nbsp;`` is not predefined in XML, so it is replaced by the UTF-8
    encoded no-break space (U+00A0) before the data reaches the parser.

    :param path: path of the XML file.
    :return: the root element.
    """
    # NOTE(review): the source showed a bare whitespace character where the
    # entity name belongs (apparently decoded in transit); '&nbsp;' is
    # assumed to be the intended literal -- confirm.
    entities = [(b'&nbsp;', u'\u00a0')]
    # Work on bytes so the substitution behaves the same on Python 2 and 3,
    # and let the context manager close the file.
    with open(path, 'rb') as handle:
        data = handle.read()
    for before, after in entities:
        data = data.replace(before, after.encode('utf8'))
    # The XMLParser the previous version configured was never passed to
    # fromstring(), so it has been dropped.
    return ET.fromstring(data)
def parse_domain_xml(xml):
    """Feed ``xml`` to an XMLParser driven by a ``GuestXmlParser`` target.

    :param xml: the XML data to parse.
    :return: whatever the parser's ``close()`` returns.
    """
    parser = XMLParser(target=GuestXmlParser())
    parser.feed(xml)
    return parser.close()
def get_diphteria_data(
        pattern='/Users/deborah/Documents/scripts/python_work/project2016/MOH tables.xml/*/*/*.xml',
        output_path="table_stuff.csv"):
    """Collect diphtheria mortality table rows/columns from XML files into a CSV.

    Every file matching ``pattern`` is parsed as windows-1252 XML. A file is
    skipped when it cannot be parsed, does not mention "diphtheria", does
    not mention "death" or "mortality", uses colspan/rowspan, or mentions
    "causes of death" without "cause of death". For the remaining files the
    table rows and columns containing "diphtheria" are passed to
    ``save_csv_entry`` together with the file's metadata.

    :param pattern: glob pattern of the input files (defaults to the
        original hard-coded location).
    :param output_path: CSV file to write (defaults to "table_stuff.csv").
    """
    # newline='' is what the csv module asks for on output files; the
    # context manager closes the file even if a document raises.
    with open(output_path, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["publisher_loc", "object_ID", "year", "label", "title", "file"])

        for path in glob(pattern):
            try:
                parser = XMLParser(encoding="windows-1252")
                root = ET.parse(path, parser=parser).getroot()
            except (ET.ParseError, OSError, ValueError):
                # Skip unreadable files. The old bare `except: pass` fell
                # through and reused the previous file's root (or hit a
                # NameError on the first file).
                continue

            document_text = str(ET.tostring(root).lower())

            # If diphtheria nowhere in xml, skip the whole file
            if "diphtheria" not in document_text:
                continue
            # If death/mortality nowhere in xml, skip the whole file
            if "death" not in document_text and "mortality" not in document_text:
                continue
            # It doesn't handle colspans/rowspans yet
            if "colspan" in document_text or "rowspan" in document_text:
                continue
            # to only extract cause of death
            if "causes of death" in document_text and "cause of death" not in document_text:
                continue

            # Find all matching rows and columns
            rows = [[(e.text.strip() if e.text else "") for e in row.findall("*")]
                    for row in root.findall("*//table/*/tr")]
            cols = list(map(list, zip(*rows)))
            diphtheria_rows = [row for row in rows[1:]
                               if "diphtheria" in " ".join(row).lower()]
            diphtheria_cols = [col for col in cols[1:]
                               if "diphtheria" in " ".join(col).lower()]

            metadata = {
                "publisher_loc": root.find("*//publisher-loc").text,
                "object_ID": root.find("*//object-id").text,
                "year": root.find("*//pub-date/year").text,
                "label": root.find("*//label").text,
                "title": root.find("*//book-title").text,
                "file": path,
            }

            if diphtheria_rows:
                save_csv_entry(csvwriter, rows[0], diphtheria_rows, metadata)
            if diphtheria_cols:
                save_csv_entry(csvwriter, cols[0], diphtheria_cols, metadata)
# Doctest holder: the function body is only a docstring. The examples use
# ET, check_element and check_method from outside this block.
def interface():
    """
    Test element tree interface.

    >>> element = ET.Element("tag")
    >>> check_element(element)
    >>> tree = ET.ElementTree(element)
    >>> check_element(tree.getroot())

    >>> element = ET.Element("t\\xe4g", key="value")
    >>> tree = ET.ElementTree(element)
    >>> repr(element) # doctest: +ELLIPSIS
    "<Element 't\\xe4g' at 0x...>"
    >>> element = ET.Element("tag", key="value")

    Make sure all standard element methods exist.

    >>> check_method(element.append)
    >>> check_method(element.extend)
    >>> check_method(element.insert)
    >>> check_method(element.remove)
    >>> check_method(element.getchildren)
    >>> check_method(element.find)
    >>> check_method(element.iterfind)
    >>> check_method(element.findall)
    >>> check_method(element.findtext)
    >>> check_method(element.clear)
    >>> check_method(element.get)
    >>> check_method(element.set)
    >>> check_method(element.keys)
    >>> check_method(element.items)
    >>> check_method(element.iter)
    >>> check_method(element.itertext)
    >>> check_method(element.getiterator)

    These methods return an iterable. See bug 6472.

    >>> check_method(element.iter("tag").__next__)
    >>> check_method(element.iterfind("tag").__next__)
    >>> check_method(element.iterfind("*").__next__)
    >>> check_method(tree.iter("tag").__next__)
    >>> check_method(tree.iterfind("tag").__next__)
    >>> check_method(tree.iterfind("*").__next__)

    These aliases are provided:

    >>> assert ET.XML == ET.fromstring
    >>> assert ET.PI == ET.ProcessingInstruction
    >>> assert ET.XMLParser == ET.XMLTreeBuilder
    """
# Doctest holder: the function body is only a docstring. The examples use
# ET, sys, serialize, normalize_crlf and the SIMPLE_*XMLFILE constants from
# outside this block.
# NOTE(review): line breaks and indentation inside the expected XML output
# were lost in the source and are reconstructed here (three-space indent
# assumed) -- confirm against the fixture files before running the doctests.
def parsefile():
    """
    Test parsing from file.

    >>> tree = ET.parse(SIMPLE_XMLFILE)
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout, encoding='unicode')
    <root>
       <element key="value">text</element>
       <element>text</element>tail
       <empty-element />
    </root>
    >>> tree = ET.parse(SIMPLE_NS_XMLFILE)
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout, encoding='unicode')
    <ns0:root xmlns:ns0="namespace">
       <ns0:element key="value">text</ns0:element>
       <ns0:element>text</ns0:element>tail
       <ns0:empty-element />
    </ns0:root>

    >>> with open(SIMPLE_XMLFILE) as f:
    ...     data = f.read()

    >>> parser = ET.XMLParser()
    >>> parser.version # doctest: +ELLIPSIS
    'Expat ...'
    >>> parser.feed(data)
    >>> print(serialize(parser.close()))
    <root>
       <element key="value">text</element>
       <element>text</element>tail
       <empty-element />
    </root>

    >>> parser = ET.XMLTreeBuilder() # 1.2 compatibility
    >>> parser.feed(data)
    >>> print(serialize(parser.close()))
    <root>
       <element key="value">text</element>
       <element>text</element>tail
       <empty-element />
    </root>

    >>> target = ET.TreeBuilder()
    >>> parser = ET.XMLParser(target=target)
    >>> parser.feed(data)
    >>> print(serialize(parser.close()))
    <root>
       <element key="value">text</element>
       <element>text</element>tail
       <empty-element />
    </root>
    """
# Doctest holder: the function body is only a docstring. The examples use
# ET and the SIMPLE_*XMLFILE constants from outside this block.
def custom_builder():
    """
    Test parser w. custom builder.

    >>> with open(SIMPLE_XMLFILE) as f:
    ...     data = f.read()
    >>> class Builder:
    ...     def start(self, tag, attrib):
    ...         print("start", tag)
    ...     def end(self, tag):
    ...         print("end", tag)
    ...     def data(self, text):
    ...         pass
    >>> builder = Builder()
    >>> parser = ET.XMLParser(target=builder)
    >>> parser.feed(data)
    start root
    start element
    end element
    start element
    end element
    start empty-element
    end empty-element
    end root

    >>> with open(SIMPLE_NS_XMLFILE) as f:
    ...     data = f.read()
    >>> class Builder:
    ...     def start(self, tag, attrib):
    ...         print("start", tag)
    ...     def end(self, tag):
    ...         print("end", tag)
    ...     def data(self, text):
    ...         pass
    ...     def pi(self, target, data):
    ...         print("pi", target, repr(data))
    ...     def comment(self, data):
    ...         print("comment", repr(data))
    >>> builder = Builder()
    >>> parser = ET.XMLParser(target=builder)
    >>> parser.feed(data)
    pi pi 'data'
    comment ' comment '
    start {namespace}root
    start {namespace}element
    end {namespace}element
    start {namespace}element
    end {namespace}element
    start {namespace}empty-element
    end {namespace}empty-element
    end {namespace}root

    """
# Doctest holder (Python 2 flavour: iterators expose .next): the function
# body is only a raw docstring. The examples use ET, check_element and
# check_method from outside this block.
def interface():
    r"""
    Test element tree interface.

    >>> element = ET.Element("tag")
    >>> check_element(element)
    >>> tree = ET.ElementTree(element)
    >>> check_element(tree.getroot())

    >>> element = ET.Element("t\xe4g", key="value")
    >>> tree = ET.ElementTree(element)
    >>> repr(element) # doctest: +ELLIPSIS
    "<Element 't\\xe4g' at 0x...>"
    >>> element = ET.Element("tag", key="value")

    Make sure all standard element methods exist.

    >>> check_method(element.append)
    >>> check_method(element.extend)
    >>> check_method(element.insert)
    >>> check_method(element.remove)
    >>> check_method(element.getchildren)
    >>> check_method(element.find)
    >>> check_method(element.iterfind)
    >>> check_method(element.findall)
    >>> check_method(element.findtext)
    >>> check_method(element.clear)
    >>> check_method(element.get)
    >>> check_method(element.set)
    >>> check_method(element.keys)
    >>> check_method(element.items)
    >>> check_method(element.iter)
    >>> check_method(element.itertext)
    >>> check_method(element.getiterator)

    These methods return an iterable. See bug 6472.

    >>> check_method(element.iter("tag").next)
    >>> check_method(element.iterfind("tag").next)
    >>> check_method(element.iterfind("*").next)
    >>> check_method(tree.iter("tag").next)
    >>> check_method(tree.iterfind("tag").next)
    >>> check_method(tree.iterfind("*").next)

    These aliases are provided:

    >>> assert ET.XML == ET.fromstring
    >>> assert ET.PI == ET.ProcessingInstruction
    >>> assert ET.XMLParser == ET.XMLTreeBuilder
    """
# Doctest holder (Python 2 flavour: print statements): the function body is
# only a docstring. The examples use ET, sys, serialize, normalize_crlf and
# the SIMPLE_*XMLFILE constants from outside this block.
# NOTE(review): line breaks and indentation inside the expected XML output
# were lost in the source and are reconstructed here (three-space indent
# assumed) -- confirm against the fixture files before running the doctests.
def parsefile():
    """
    Test parsing from file.

    >>> tree = ET.parse(SIMPLE_XMLFILE)
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout)
    <root>
       <element key="value">text</element>
       <element>text</element>tail
       <empty-element />
    </root>
    >>> tree = ET.parse(SIMPLE_NS_XMLFILE)
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout)
    <ns0:root xmlns:ns0="namespace">
       <ns0:element key="value">text</ns0:element>
       <ns0:element>text</ns0:element>tail
       <ns0:empty-element />
    </ns0:root>

    >>> with open(SIMPLE_XMLFILE) as f:
    ...     data = f.read()

    >>> parser = ET.XMLParser()
    >>> parser.version # doctest: +ELLIPSIS
    'Expat ...'
    >>> parser.feed(data)
    >>> print serialize(parser.close())
    <root>
       <element key="value">text</element>
       <element>text</element>tail
       <empty-element />
    </root>

    >>> parser = ET.XMLTreeBuilder() # 1.2 compatibility
    >>> parser.feed(data)
    >>> print serialize(parser.close())
    <root>
       <element key="value">text</element>
       <element>text</element>tail
       <empty-element />
    </root>

    >>> target = ET.TreeBuilder()
    >>> parser = ET.XMLParser(target=target)
    >>> parser.feed(data)
    >>> print serialize(parser.close())
    <root>
       <element key="value">text</element>
       <element>text</element>tail
       <empty-element />
    </root>
    """
# Doctest holder (Python 2 flavour: print statements): the function body is
# only a docstring. The examples use ET and the SIMPLE_*XMLFILE constants
# from outside this block.
def custom_builder():
    """
    Test parser w. custom builder.

    >>> with open(SIMPLE_XMLFILE) as f:
    ...     data = f.read()
    >>> class Builder:
    ...     def start(self, tag, attrib):
    ...         print "start", tag
    ...     def end(self, tag):
    ...         print "end", tag
    ...     def data(self, text):
    ...         pass
    >>> builder = Builder()
    >>> parser = ET.XMLParser(target=builder)
    >>> parser.feed(data)
    start root
    start element
    end element
    start element
    end element
    start empty-element
    end empty-element
    end root

    >>> with open(SIMPLE_NS_XMLFILE) as f:
    ...     data = f.read()
    >>> class Builder:
    ...     def start(self, tag, attrib):
    ...         print "start", tag
    ...     def end(self, tag):
    ...         print "end", tag
    ...     def data(self, text):
    ...         pass
    ...     def pi(self, target, data):
    ...         print "pi", target, repr(data)
    ...     def comment(self, data):
    ...         print "comment", repr(data)
    >>> builder = Builder()
    >>> parser = ET.XMLParser(target=builder)
    >>> parser.feed(data)
    pi pi 'data'
    comment ' comment '
    start {namespace}root
    start {namespace}element
    end {namespace}element
    start {namespace}element
    end {namespace}element
    start {namespace}empty-element
    end {namespace}empty-element
    end {namespace}root

    """
def tei_to_chapters(fname):
    """
    Convert a TEI 2 xml into an array of chapters with text, and return the title.

    :param fname: path of the TEI file (read as UTF-8).
    :return: a tuple ``(author, title, chapters, all_text)`` where
        ``chapters`` is a list of ``[chapter_title, chapter_text]`` pairs
        and ``all_text`` is the concatenated text of every chapter that was
        closed by a later chapter ``div``. ``author`` and ``title`` are ''
        when the document has none.
    """
    # NOTE(review): the source showed a bare whitespace character as the
    # replace() target, which would delete every space and break the markup;
    # '&nbsp;' (an entity XML does not predefine) is assumed to be the
    # intended literal -- confirm.
    with codecs.open(fname, 'r', 'utf-8') as handle:
        data = handle.read().replace('&nbsp;', '')
    utf8_parser = etree.XMLParser(encoding='utf-8')
    book = etree.fromstring(data.encode('utf-8'), parser=utf8_parser)

    all_text = u""
    chapters = []
    chap_title = ''
    text = ''
    title = ''
    # Initialised up front: a document without <author> used to fail with
    # UnboundLocalError at the return statement.
    author = ''

    for item in book.iter():
        tag = item.tag
        attrib = item.attrib
        has_text = item.text is not None

        if tag == 'author':
            author = item.text
        if tag == 'title' and not title and attrib.get('type') == 'main':
            title = item.text
        if tag == 'head' and has_text:
            rend = attrib.get('rend')
            if rend == 'h2':
                chap_title = item.text
            elif rend == 'h3':
                # An h3 heading is appended as a second title line.
                chap_title += '\n' + item.text
        if tag == 'div' and attrib.get('type') == 'chapter':
            # A new chapter starts: flush what has been gathered so far.
            all_text += text
            chapters.append([chap_title, text])
            text = ''
            chap_title = ''
        if 'rend' in attrib and has_text:
            text += item.text + "\n"
        if tag == "p" and has_text:
            text += item.text + "\n"

    chapters.append([chap_title, text])
    return author, title, chapters, all_text