Python xml.etree.cElementTree 模块,iterparse() 实例源码

我们从Python开源项目中,提取了以下46个代码示例,用于说明如何使用xml.etree.cElementTree.iterparse()

项目:Orator-Google-App-Engine    作者:MakarenaLabs    | 项目源码 | 文件源码
def _find_elements(self, result, elements):
    """Find interesting elements from XML.

    This function tries to only look for specified elements
    without parsing the entire XML. The specified elements is better
    located near the beginning.

    Args:
      result: response XML.
      elements: a set of interesting element tags.

    Returns:
      A dict from element tag to element value.
    """
    element_mapping = {}
    result = StringIO.StringIO(result)
    for _, e in ET.iterparse(result, events=('end',)):
      if not elements:
        break
      if e.tag in elements:
        element_mapping[e.tag] = e.text
        elements.remove(e.tag)
    return element_mapping
项目:lopeningent_backend    作者:oSoc17    | 项目源码 | 文件源码
def load_osm(osm_file):
    """Load all node coordinates from an .osm (XML) file.

    Args:
        osm_file: filename or file object containing OSM XML data.

    Returns:
        dict mapping node id (int) to a (lat, lon) tuple of floats.
    """
    nodes = dict()

    for event, elem in iterparse(osm_file, events=("start", "end")):
        # Whenever the iterator encounters an opening tag
        if event == "start":
            if elem.tag == "node":
                curr_id = int(elem.attrib["id"])
                lat = float(elem.attrib["lat"])
                lon = float(elem.attrib["lon"])
                curr_elem = (lat, lon)

        # Whenever the iterator encounters a closing tag
        elif event == "end":
            if elem.tag == "node":
                nodes[curr_id] = curr_elem
            # Free the parsed element: without this the whole document
            # tree accumulates in memory on large .osm files.
            elem.clear()

    return nodes
项目:route-plotter    作者:perimosocordiae    | 项目源码 | 文件源码
def gpx_parser(fh):
  """Yield (latlon, time, elev) triples for every track point in *fh*.

  Raises ValueError early if the document's root is not <gpx>.
  Missing elevation is reported as NaN; missing time as None.
  """
  stream = ElementTree.iterparse(fh, events=('start', 'end'))

  # Fail fast: the first start event must be the namespaced <gpx> root.
  for ev, node in stream:
    if ev == 'start' and node.tag.endswith('}gpx'):
      break
  else:
    raise ValueError('Not a gpx file: %s' % fh.name)

  # Stream the track points.
  for ev, node in stream:
    if not (ev == 'end' and node.tag.endswith('}trkpt')):
      continue
    position = (float(node.attrib['lat']), float(node.attrib['lon']))
    elevation = np.nan
    timestamp = None
    for child in node:
      name = child.tag.rsplit('}', 1)[1]
      if name == 'ele':
        elevation = float(child.text)
      elif name == 'time':
        timestamp = child.text
    yield position, timestamp, elevation
    node.clear()
项目:deb-python-pysaml2    作者:openstack    | 项目源码 | 文件源码
def parse_nsmap(fil):
    """Parse *fil*, attaching the active namespace map to every element.

    Each element receives an NS_MAP attribute holding a dict of the
    namespace prefixes in scope where that element starts.

    Returns an ElementTree rooted at the document's root element.
    """
    root = None
    ns_stack = []

    for event, payload in ElementTree.iterparse(
            fil, ("start", "start-ns", "end-ns")):
        if event == "start-ns":
            # payload is a (prefix, uri) pair entering scope.
            ns_stack.append(payload)
        elif event == "end-ns":
            ns_stack.pop()
        else:  # "start" event: payload is an element
            if root is None:
                root = payload
            payload.set(NS_MAP, dict(ns_stack))

    return ElementTree.ElementTree(root)
项目:enkiWS    作者:juliettef    | 项目源码 | 文件源码
def _find_elements(self, result, elements):
    """Find interesting elements from XML.

    This function tries to only look for specified elements
    without parsing the entire XML. The specified elements is better
    located near the beginning.

    Args:
      result: response XML.
      elements: a set of interesting element tags.

    Returns:
      A dict from element tag to element value.
    """
    element_mapping = {}
    result = StringIO.StringIO(result)
    for _, e in ET.iterparse(result, events=('end',)):
      if not elements:
        break
      if e.tag in elements:
        element_mapping[e.tag] = e.text
        elements.remove(e.tag)
    return element_mapping
项目:abusehelper    作者:Exploit-install    | 项目源码 | 文件源码
def _poll(self, url):
        """Download the feed at *url*, parse it and emit parsed events.

        idiokit coroutine: yields fetch/send operations.  Stops the
        coroutine with False when the download fails.
        """
        request = urllib2.Request(url)
        for key, value in self.http_headers:
            request.add_header(key, value)

        try:
            self.log.info('Downloading feed from: "%s"', url)
            _, fileobj = yield utils.fetch_url(request)
        except utils.FetchUrlFailed as e:
            self.log.error('Failed to download feed "%s": %r', url, e)
            idiokit.stop(False)

        self.log.info("Finished downloading the feed.")

        # Skip any leading junk (e.g. whitespace or a BOM) up to the
        # first "<", then rewind one byte so the parser sees the whole
        # document from its opening tag.
        byte = fileobj.read(1)
        while byte and byte != "<":
            byte = fileobj.read(1)

        if byte == "<":
            fileobj.seek(-1, 1)
            try:
                # Default iterparse events: one "end" per element.
                for _, elem in etree.iterparse(fileobj):
                    for event in self._parse(elem, url):
                        if event:
                            yield idiokit.send(event)
            except ParseError as e:
                self.log.error('Invalid format on feed: "%s", "%r"', url, e)
项目:abusehelper    作者:Exploit-install    | 项目源码 | 文件源码
def poll(self):
        """Poll the feed URL, honoring ETag, and handle each <entry>.

        idiokit coroutine.  Raises bot.PollSkipped when there is no new
        data, the download fails, or the report cannot be parsed.
        """
        url = self.feed_url % self.application_key

        try:
            self.log.info("Checking if {0!r} has new data".format(url))
            # HEAD request first: skip the full download when the ETag
            # matches the one from the previous successful poll.
            info, _ = yield utils.fetch_url(HeadRequest(url))

            etag = info.get("etag", None)
            if etag is not None and self._etag == etag:
                raise bot.PollSkipped("no new data detected (ETag stayed the same)")

            self.log.info("Downloading data from {0!r}".format(url))
            _, fileobj = yield utils.fetch_url(url)
        except utils.FetchUrlFailed as error:
            raise bot.PollSkipped("failed to download {0!r} ({1})".format(url, error))

        self.log.info("Downloaded data from {0!r}".format(url))

        # NOTE(review): BZ2Reader is defined elsewhere — presumably a
        # decompressing wrapper around fileobj; confirm its semantics.
        reader = BZ2Reader(fileobj)
        try:
            depth = 0
            sites = dict()

            for event, element in etree.iterparse(reader, events=("start", "end")):
                if event == "start" and element.tag == "entry":
                    depth += 1

                if event == "end" and element.tag == "entry":
                    yield self._handle_entry(element, sites)
                    depth -= 1

                if event == "end" and depth == 0:
                    # Outside any <entry>: safe to free parsed elements.
                    element.clear()
        except SyntaxError as error:
            raise bot.PollSkipped("syntax error in report {0!r} ({1})".format(url, error))
        else:
            # Remember the ETag only after a fully successful parse.
            self._etag = etag
项目:setlr    作者:tetherless-world    | 项目源码 | 文件源码
def iterparse(self, file):
        """Build a fresh filter automaton and iterparse *file* with it."""
        automaton = self.create_fa()
        return automaton.iterparse(file, self.validate_dtd)
    # I need a better name
项目:setlr    作者:tetherless-world    | 项目源码 | 文件源码
def handler_parse(self, file, state=None):
        """Drive parse() to exhaustion purely for its callback side effects."""
        for _ in self.parse(file, state):
            pass

    # I plan to implement 'iterparse' as a near copy of 'parse'
    # but without any references to callbacks
项目:setlr    作者:tetherless-world    | 项目源码 | 文件源码
def iterparse(self, file, validate_dtd=False):
        """Return the parse-event iterator for *file* without callbacks."""
        return self.parse(file, None, validate_dtd)
项目:setlr    作者:tetherless-world    | 项目源码 | 文件源码
def test_parse():
    """Smoke-test IterParseFilter against a local iTunes library file.

    Skips silently when the hard-coded library path does not exist.
    Python 2 code (print statements, iterator .next()).
    """
    import os
    filename = "/Users/dalke/Music/iTunes/iTunes Music Library.xml"
    if not os.path.exists(filename):
        print "Cannot find %r: skipping test" % (filename,)
        return

    # Work through callbacks
    ef = IterParseFilter()
    def print_info(event, ele, state):
        d = {}
        children = iter(ele)
        # plist dicts alternate <key>/<value> children; pair them up.
        for child in children:
            key = child.text
            value = children.next().text
            d[key] = value
        print "%r is by %r" % (d["Name"], d.get("Artist", "<unknown>"))
        ele.clear()

    ef.on_end("/plist/dict/dict/dict", print_info)
    ef.handler_parse(open(filename))

    # Work through iterators
    ef = IterParseFilter()
    ef.iter_end("/plist/dict/dict/dict")
    for (event, ele) in ef.iterparse(open(filename)):
        d = {}
        children = iter(ele)
        for child in children:
            key = child.text
            value = children.next().text
            d[key] = value
        print "%r is a %r song" % (d["Name"], d.get("Genre", "<unknown>"))
        ele.clear()
项目:AlexaPi    作者:alexa-pi    | 项目源码 | 文件源码
def parse_new_asx(data):
    """Yield the fixed href of every <entry>/<ref> in an ASX playlist.

    Copied from mopidy.audio.playlists.  Malformed XML yields nothing.
    """
    try:
        for _unused, elem in elementtree.iterparse(data):
            elem.tag = elem.tag.lower()  # normalize tag case
            for ref in elem.findall('entry/ref[@href]'):
                yield fix_asf_uri(ref.get('href', '').strip())
            for entry in elem.findall('entry[@href]'):
                yield fix_asf_uri(entry.get('href', '').strip())
    except elementtree.ParseError:
        return
项目:ns3-rdma    作者:bobzhuyb    | 项目源码 | 文件源码
def main(argv):
    """Read a FlowMonitor XML file (argv[1]) and print per-flow stats.

    Python 2 code (print statements).
    """
    file_obj = open(argv[1])
    print "Reading XML file ",

    sys.stdout.flush()        
    level = 0
    sim_list = []
    for event, elem in ElementTree.iterparse(file_obj, events=("start", "end")):
        if event == "start":
            level += 1
        if event == "end":
            level -= 1
            # A closing FlowMonitor back at depth 0 marks one complete
            # simulation; wrap it and free the parsed subtree.
            if level == 0 and elem.tag == 'FlowMonitor':
                sim = Simulation(elem)
                sim_list.append(sim)
                elem.clear() # won't need this any more
                sys.stdout.write(".")
                sys.stdout.flush()
    print " done."


    for sim in sim_list:
        for flow in sim.flows:
            t = flow.fiveTuple
            # Map the IP protocol number to a display name.
            proto = {6: 'TCP', 17: 'UDP'} [t.protocol]
            print "FlowID: %i (%s %s/%s --> %s/%i)" % \
                (flow.flowId, proto, t.sourceAddress, t.sourcePort, t.destinationAddress, t.destinationPort)
            print "\tTX bitrate: %.2f kbit/s" % (flow.txBitrate*1e-3,)
            print "\tRX bitrate: %.2f kbit/s" % (flow.rxBitrate*1e-3,)
            print "\tMean Delay: %.2f ms" % (flow.delayMean*1e3,)
            print "\tPacket Loss Ratio: %.2f %%" % (flow.packetLossRatio*100)
项目:coquery    作者:gkunter    | 项目源码 | 文件源码
def process_file(self, file_name):
        """Load *file_name*, build an iterparse stream over it and hand
        the stream to process_tree().
        """
        data = self.read_file(file_name, self.encoding)
        data = self.preprocess_data(data)
        try:
            # Re-encode the (possibly modified) lines into a byte stream
            # for the incremental parser.
            stream = IO_Stream(bytearray("\n".join(data), encoding="utf-8"))
            self.tree = ET.iterparse(stream)
            if self._strip_namespace:
                # NOTE(review): this loop consumes the iterparse iterator,
                # so process_tree() below receives an exhausted iterator in
                # the strip-namespace case — confirm that is intended.
                for _, element in self.tree:
                    element.tag = element.tag.rpartition("}")[-1]
        except Exception as e:
            print(self._current_file)
            print_error_context(str(e), "\n".join(data).split("\n"))
            raise e
        self.process_tree(self.tree)
项目:wos_parser    作者:alexander-belikov    | 项目源码 | 文件源码
def parse_wos_xml(fp, global_year, good_cf, bad_cf, ntest=None):
    """
    driver func, parse file fp, push good and bad records
    accordingly to good_cf and bad_cf

    :param fp: filepointer to be parsed
    :param global_year: apriori known year
    :param good_cf: chunk flusher of good records
    :param bad_cf: chunk flusher of bad records
    :param ntest: number of records for test mode
    :return: None
    """
    events = ('start', 'end')
    tree = cET.iterparse(fp, events)
    context = iter(tree)
    # Grab the root element so it can be cleared after each record.
    event, root = next(context)
    rec_ = 'REC'
    it = 0

    for event, pub in context:
        # One complete publication record per closing <REC> tag.
        if event == "end" and pub.tag == rec_:
            ans = parse_record(pub, global_year)
            if ans[0]:
                good_cf.push(ans[1])
            else:
                msg = ' parse_wos_xml() : wos_id {0} failed ' \
                      'to parse, placed in the bad heap'.format(ans[1]['id'])
                logging.error(msg)
                bad_cf.push(ans[1])
            # NOTE(review): stops when either flusher is not ready() —
            # confirm the chunk-flusher contract elsewhere in the project.
            if not good_cf.ready() or not bad_cf.ready():
                break
            # Clearing the root frees processed records, keeping memory flat.
            root.clear()
            it += 1
            if ntest and it >= ntest:
                break
项目:lopeningent_backend    作者:oSoc17    | 项目源码 | 文件源码
def load_osm(osm_file):
    """Read an .osm (XML) file and wrap its contents into objects.

    :param osm_file: filename of the .osm file containing the map data.
    :return: nodes (dict of node id -> (lat, lon)), edges (list of Edge)
    """
    nodes = {}
    edges = []

    for event, elem in iterparse(osm_file, events=("start", "end")):
        if event == "start":
            # Opening tags: build the object for the current element.
            tag = elem.tag
            if tag == "node":
                curr_id = int(elem.attrib["id"])
                curr_elem = (float(elem.attrib["lat"]),
                             float(elem.attrib["lon"]))
            elif tag == "way":
                curr_elem = Edge(int(elem.attrib["id"]))
            elif tag == "nd":
                # <nd> children list the node refs making up the way.
                curr_elem.nodes.append(elem.attrib["ref"])
        elif event == "end":
            # Closing tags: commit the finished object.
            if elem.tag == "node":
                nodes[curr_id] = curr_elem
            elif elem.tag == "way":
                edges.append(curr_elem)

    return nodes, edges
项目:Udacity-DAND-Project-3    作者:sfox1975    | 项目源码 | 文件源码
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each element of *osm_file* whose tag is in *tags*.

    The root is cleared after every yield so processed elements do not
    accumulate in memory.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    stream = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _unused, root = next(stream)
    for ev, node in stream:
        if ev != 'end':
            continue
        if node.tag in tags:
            yield node
            root.clear()
项目:Udacity-DAND-Project-3    作者:sfox1975    | 项目源码 | 文件源码
def count_tags(filename):
    """Count how many times each element tag occurs in an XML file.

    Args:
        filename: path or file object of the XML document.

    Returns:
        dict mapping tag name -> number of occurrences.
    """
    tags = {}

    for event, elem in ET.iterparse(filename, events=("start",)):
        # dict.get avoids the `tag in tags.keys()` anti-idiom, which
        # performed a redundant membership test plus a second lookup.
        tags[elem.tag] = tags.get(elem.tag, 0) + 1

    return tags
项目:Udacity-DAND-Project-3    作者:sfox1975    | 项目源码 | 文件源码
def process_map(filename):
    """Categorize every tag's key via key_type() and print the tallies.

    Python 2 code (print statements).  Returns the counts dict with the
    four categories: lower, lower_colon, problemchars, other.
    """
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}

    print "Sample 'other' tags, randomly (2%) selected:"
    print "\n"

    # key_type() tallies each element's key category (and presumably
    # prints the sampled 'other' tags announced above — defined elsewhere).
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    print "\n"
    print "Count of the four Tag Categories:"
    print "\n"
    print keys
    return keys
项目:Udacity-DAND-Project-3    作者:sfox1975    | 项目源码 | 文件源码
def process_map(filename):
    """Collect the distinct user identifiers appearing in an OSM file.

    Args:
        filename: path or file object of the OSM XML document.

    Returns:
        set of user identifiers as extracted by get_user().
    """
    users = set()
    for _, element in ET.iterparse(filename):
        # Call get_user() once per element; the original called it twice
        # (once in the test, once in the add).
        user = get_user(element)
        if user:
            users.add(user)

    return users
项目:Udacity-DAND-Project-3    作者:sfox1975    | 项目源码 | 文件源码
def audit_street(osmfile):
    """Audit street-name <tag> values found under node/way elements.

    Args:
        osmfile: path to the OSM XML file.

    Returns:
        defaultdict(set) populated by audit_street_type() with the
        street types and names encountered.
    """
    street_types = defaultdict(set)
    # `with` guarantees the file is closed even if iterparse raises;
    # the original leaked the handle on a parse error.
    with open(osmfile, "r") as osm_file:
        for event, elem in ET.iterparse(osm_file, events=("start",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types
项目:Udacity-DAND-Project-3    作者:sfox1975    | 项目源码 | 文件源码
def audit_state(osmfile):
    """Collect state-tag values other than the expected 'HI'.

    Args:
        osmfile: path to the OSM XML file.

    Returns:
        set of unexpected state values found on node/way tag elements.
    """
    prob_state = set()
    # `with` guarantees the file is closed even if iterparse raises;
    # the original leaked the handle on a parse error.
    with open(osmfile, "r") as osm_file:
        for event, elem in ET.iterparse(osm_file, events=("start",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    # Hawaii data: anything other than 'HI' is a problem.
                    if is_state(tag) and tag.attrib['v'] != 'HI':
                        prob_state.add(tag.attrib['v'])
    return prob_state
项目:Udacity-DAND-Project-3    作者:sfox1975    | 项目源码 | 文件源码
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each element whose tag is one of *tags*, clearing the root
    between yields to keep memory usage flat."""
    events = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(events)
    for ev, node in events:
        if ev == 'end' and node.tag in tags:
            yield node
            root.clear()
项目:route-plotter    作者:perimosocordiae    | 项目源码 | 文件源码
def tcx_parser(fh):
  """Yield (latlon, time, elev) triples for every Trackpoint in *fh*.

  Raises ValueError early if the root is not TrainingCenterDatabase.
  Points without a Position are skipped; missing altitude is NaN.
  """
  stream = ElementTree.iterparse(fh, events=('start', 'end'))

  # Fail fast: the first start event must be the TCX root element.
  for ev, node in stream:
    if ev == 'start' and node.tag.endswith('}TrainingCenterDatabase'):
      break
  else:
    raise ValueError('Not a tcx file: %s' % fh.name)

  # Stream the track points.
  for ev, node in stream:
    if not (ev == 'end' and node.tag.endswith('}Trackpoint')):
      continue
    position = None
    elevation = np.nan
    timestamp = None
    for child in node:
      name = child.tag.rsplit('}', 1)[1]
      if name == 'Time':
        timestamp = child.text
      elif name == 'AltitudeMeters':
        elevation = float(child.text)
      elif name == 'Position':
        coords = {c.tag.rsplit('}', 1)[1]: float(c.text) for c in child}
        position = (coords['LatitudeDegrees'], coords['LongitudeDegrees'])
    if position is not None:
      yield position, timestamp, elevation
    node.clear()
项目:ryu-lagopus-ext    作者:lagopus    | 项目源码 | 文件源码
def parse_root(raw):
    """Return (qualified tag, attribute dict) of *raw*'s root element.

    Only the root's start event is consumed, so the remainder of the
    document is never parsed.
    """
    stream = StringIO(raw)
    for _event, root in ET.iterparse(stream, events=('start',)):
        return (root.tag, root.attrib)
项目:aio    作者:pavhofman    | 项目源码 | 文件源码
def detect_xspf_header(data):
    """Return True if *data* starts like an XSPF playlist document.

    Inspects only the first 150 bytes: they must mention 'xspf' and the
    root element must be <playlist> in the XSPF namespace.  Malformed
    XML counts as "not XSPF".
    """
    data = data[0:150]
    if b'xspf' not in data.lower():
        return False

    try:
        data = io.BytesIO(data)
        # Event names must be str, not bytes: on Python 3 ElementTree
        # raises ValueError("unknown event") for b'start'.
        for event, element in elementtree.iterparse(data, events=('start',)):
            return element.tag.lower() == '{http://xspf.org/ns/0/}playlist'
    except elementtree.ParseError:
        pass
    return False
项目:aio    作者:pavhofman    | 项目源码 | 文件源码
def detect_asx_header(data: bytes):
    """Return True if *data* starts like an ASX playlist document.

    Inspects only the first 50 bytes: they must mention 'asx' and the
    root element's tag must be <asx> (case-insensitive).  Malformed XML
    counts as "not ASX".
    """
    data = data[0:50]
    if b'asx' not in data.lower():
        return False

    try:
        bytesIO = io.BytesIO(data)
        # Event names must be str, not bytes: on Python 3 ElementTree
        # raises ValueError("unknown event") for b'start'.
        for event, element in elementtree.iterparse(bytesIO, events=('start',)):
            return element.tag.lower() == 'asx'
    except elementtree.ParseError:
        pass
    return False
项目:aio    作者:pavhofman    | 项目源码 | 文件源码
def parse_xspf(data: bytes):
    """Yield the <location> text of every track in an XSPF playlist.

    Tag names are lower-cased during parsing; malformed XML yields
    nothing.
    """
    try:
        root = None
        # iterparse delivers the root last, so `root` ends as the root.
        for _event, node in elementtree.iterparse(io.BytesIO(data)):
            node.tag = node.tag.lower()  # normalize
            root = node
        if root is not None:
            ns = 'http://xspf.org/ns/0/'
            for track in root.iterfind('{%s}tracklist/{%s}track' % (ns, ns)):
                yield track.findtext('{%s}location' % ns)
    except elementtree.ParseError:
        return
项目:aio    作者:pavhofman    | 项目源码 | 文件源码
def parse_asx(data):
    try:
        # Last element will be root.
        element = None
        for event, element in elementtree.iterparse(io.BytesIO(data)):
            element.tag = element.tag.lower()  # normalize

        if element is not None:
            for ref in element.findall('entry/ref[@href]'):
                yield ref.get('href', '').strip()

            for entry in element.findall('entry[@href]'):
                yield entry.get('href', '').strip()
    except elementtree.ParseError:
        return
项目:mywebsite    作者:areebbeigh    | 项目源码 | 文件源码
def is_svg(self, f):
        """Return True when the file object *f* parses as an SVG document."""
        f.seek(0)
        root_tag = None
        try:
            # Only the root's start event is needed to identify SVG.
            for _event, node in et.iterparse(f, ('start',)):
                root_tag = node.tag
                break
        except et.ParseError:
            pass
        return root_tag == '{http://www.w3.org/2000/svg}svg'
项目:osm-data-wrangling    作者:yajiez    | 项目源码 | 文件源码
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield every element of *osm_file* whose tag is in *tags*.

    The root is cleared after each yield so already-processed elements
    do not accumulate in memory.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    stream = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _unused, root = next(stream)
    for event, element in stream:
        if event != 'end' or element.tag not in tags:
            continue
        yield element
        root.clear()
项目:osm-data-wrangling    作者:yajiez    | 项目源码 | 文件源码
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield elements of the requested *tags*, keeping memory bounded
    by clearing the root after each yield."""
    iterator = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(iterator)
    for ev, el in iterator:
        if ev == 'end' and el.tag in tags:
            yield el
            root.clear()
项目:liveqa2017    作者:codekansas    | 项目源码 | 文件源码
def iterate_qa_pairs(num_iter=None):
    """Iterates through question-answer pairs in a single file.

    Args:
        num_iter: int (default: None), number of times to iterate. If None,
            iterates infinitely.

    Yields:
        subject: the question title (max length = QUESTION_TITLE_MAXLEN)
        bestanswer: the body of the best answer
            (max length = ANSWER_MAXLEN)
    """

    def _parse_document(elem):
        # Missing <subject>/<bestanswer> children become empty strings.
        subject = elem.find('subject')
        bestanswer = elem.find('bestanswer')

        return ('' if subject is None else subject.text,
                '' if bestanswer is None else bestanswer.text)

    if num_iter is None:
        # itertools.count() provides an endless pass counter.
        iterator = itertools.count()
    else:
        iterator = xrange(num_iter)  # Python 2 (xrange)

    for _ in iterator:
        # Re-open the file each pass so every iteration starts fresh.
        with open(DATA_PATH, 'r') as f:
            parser = ET.iterparse(f)
            for event, elem in parser:
                if elem.tag == 'document':
                    yield _parse_document(elem)
                    elem.clear()  # Important for avoiding memory issues.
项目:file-metadata    作者:pywikibot-catfiles    | 项目源码 | 文件源码
def is_svg(_file):
    """
    Check is a given file is SVG or not. A file is considered to be SVG if:

    - Its mimetype is "application/svg+xml" or "image/svg+xml".
    - Its mimetype is "text/html" or "application/xml" or "text/xml" or
      "text/plain" and it has the svg tag with xmlns http://www.w3.org/2000/svg

    :param _file: A GenericFile object that should be checked for SVG.
    :return:      Boolean corresponding to whether the file is SVG.
    """
    mime = _file.mime()
    if mime in ('application/svg+xml', 'image/svg+xml'):
        return True
    elif mime in ('application/xml', 'text/xml', 'text/html', 'text/plain'):
        tag = None
        with open(_file.fetch('filename'), "r") as f:
            # cElementTree needs the events as bytes in python2
            items = cElementTree.iterparse(f, events=(str('start'),))
            try:
                # Python 2 iterator protocol (.next()); the first start
                # event carries the document's root element.
                _, el = items.next()
                tag = el.tag
            except cElementTree.ParseError:
                return False
        return tag == '{http://www.w3.org/2000/svg}svg'
    # NOTE(review): any other mimetype falls through and returns None
    # (falsy) instead of False — confirm callers treat this as "not SVG".
项目:wos_builder    作者:yadudoc    | 项目源码 | 文件源码
def load_data(datafile):
    """Open an iterparse (start/end) stream over *datafile* and return
    its iterator."""
    context = iter(ET.iterparse(datafile, events=("start", "end")))
    logging.debug("Got context")
    return context

# uid -> wos_id, citedAuthor, year , page, volume, citedTitle, citedWork, doi
项目:wos_builder    作者:yadudoc    | 项目源码 | 文件源码
def load_data(datafile):
    """Return a (start, end) iterparse event iterator for *datafile*."""
    stream = ET.iterparse(datafile, events=("start", "end"))
    logging.debug("Got context")
    return iter(stream)

# uid -> wos_id, citedAuthor, year , page, volume, citedTitle, citedWork, doi
项目:processtap    作者:firodj    | 项目源码 | 文件源码
def read( self ):
        """Stream self.gccxml_file, forwarding SAX-style callbacks.

        Calls startElement on each opening tag, endElement on each
        closing tag (clearing the element afterwards to save memory),
        and endDocument once the whole file has been consumed.
        """
        events = ElementTree.iterparse(self.gccxml_file, events=("start", "end"))
        for event, node in events:
            if event == 'start':
                self.startElement(node.tag, node.attrib)
            else:
                self.endElement(node.tag)
                node.clear()  # drop processed elements to bound memory
        self.endDocument()
项目:Alexa_MMDAgent    作者:jianmliu    | 项目源码 | 文件源码
def parse_new_asx(data):
    """Yield fixed ASF URIs referenced by an ASX playlist document.

    Copied from mopidy.audio.playlists.  Malformed XML yields nothing.
    """
    try:
        for _event, element in elementtree.iterparse(data):
            element.tag = element.tag.lower()  # normalize
    except elementtree.ParseError:
        return

    # After the loop `element` is the last element parsed, i.e. the root.
    for ref in element.findall('entry/ref[@href]'):
        yield fix_asf_uri(ref.get('href', '').strip())

    for entry in element.findall('entry[@href]'):
        yield fix_asf_uri(entry.get('href', '').strip())
项目:Tenable.io-SDK-for-Python    作者:tenable    | 项目源码 | 文件源码
def parse(path, tag=REPORT_HOST):
        """Parse Nessus XML export from Workbench API into dicts.

        Generator: yields one dict per ReportHost (when tag is
        REPORT_HOST) or one dict per ReportItem (when tag is
        REPORT_ITEM).

        :param path: The file path.
        :param tag: The XML tag to iterate on. It should be WorkbenchParser.REPORT_HOST or WorkbenchParser.REPORT_ITEM.
        """
        assert tag in [WorkbenchParser.REPORT_HOST, WorkbenchParser.REPORT_ITEM], u'Valid tag for parsing.'

        report_host = None
        host_properties = None
        # report_items accumulates per-host items only in REPORT_HOST mode.
        report_items = [] if tag == WorkbenchParser.REPORT_HOST else None

        try:
            for event, elem in ET.iterparse(path, events=('start', 'end')):

                if event == 'start':
                    if elem.tag == 'ReportHost':
                        report_host = WorkbenchParser._from_report_host(elem)

                if event == 'end':

                    if elem.tag == WorkbenchParser.REPORT_HOST:
                        elem.clear()
                        if tag == elem.tag:
                            # One complete host: emit it with its
                            # properties and accumulated report items.
                            yield {
                                'report_host': report_host,
                                'host_properties': host_properties,
                                'report_items': report_items,
                            }
                            report_items = []

                    if elem.tag == WorkbenchParser.HOST_PROPERTIES:
                        host_properties = WorkbenchParser._from_host_properties(elem)
                        elem.clear()

                    if elem.tag == WorkbenchParser.REPORT_ITEM:
                        report_item = WorkbenchParser._from_report_item(elem)
                        elem.clear()
                        if tag == elem.tag:
                            yield report_item
                        elif tag == WorkbenchParser.REPORT_HOST:
                            report_items.append(report_item)
        except ET.ParseError as e:
            logging.warn(u'Failed to parse Nessus XML: ' + e.msg)
            # TODO The service return malformed XML for empty set, for now we won't raise an exception for what should
            # TODO be a normal state. However, this might masked out real error from bubble up (unlikely).
            # raise TenableIOException(u'Failed to parse Nessus XML: ' + e.message)
项目:appcompatprocessor    作者:mbevilacqua    | 项目源码 | 文件源码
def read_mir(xml_file, quiet=False):
    """Extract Shim Cache rows from a MIR registry-audit XML file.

    Python 2 code (print statements, old-style except clause).

    :param xml_file: path or file object of the MIR output XML.
    :param quiet: passed through to read_cache().
    :return: (error message, rows) -- rows is None on failure or when
             no AppCompatCache data was found.
    """
    out_list = []
    tmp_list = []
    error = ""

    # Open the MIR output file.
    try:
        for (_, reg_item) in et.iterparse(xml_file, events=('end',)):
            if reg_item.tag != 'RegistryItem':
                continue

            path_name = reg_item.find("Path").text
            if not path_name:
                print "[-] Error XML missing Path"
                print et.tostring(reg_item)
                reg_item.clear()
                continue
            path_name = path_name.lower()

            # Check to see that we have the right registry value.
            if 'control\\session manager\\appcompatcache\\appcompatcache' in path_name \
                or 'control\\session manager\\appcompatibility\\appcompatcache' in path_name:
                # return the base64 decoded value data.
                bin_data = binascii.a2b_base64(reg_item.find('Value').text)
                tmp_list = read_cache(bin_data, quiet)

                if tmp_list:
                    for row in tmp_list:
                        if g_verbose:
                            row.append(path_name)
                        if row not in out_list:
                            out_list.append(row)
            # Free each processed RegistryItem to keep memory bounded.
            reg_item.clear()
    except (AttributeError, TypeError, IOError),  err:
        error = "[-] Error reading MIR XML: %s" % str(err)
        print error
        return (error, None)

    if len(out_list) == 0:
        return (error, None)
    else:
        # Add the header and return the list.
        if g_verbose:
            out_list.insert(0, output_header + ['Key Path'])
        else:
        # Only return unique entries.
            out_list = unique_list(out_list)
            out_list.insert(0, output_header)

    return (error, out_list)

# Get Shim Cache data from .reg file.
# Finds the first key named "AppCompatCache" and parses the
# Hex data that immediately follows. It's a brittle parser,
# but the .reg format doesn't change too often.
项目:appcompatprocessor    作者:mbevilacqua    | 项目源码 | 文件源码
def processFile(self, file_fullpath, hostID, instanceID, rowsData):
        """Parse an XML persistence report and append an EntriesFields
        row to *rowsData* for every valid PersistenceItem element.
        Python 2 code (print statements).
        """
        rowNumber = 0
        check_tags = ['LastModified', 'FilePath']
        # the 'end' event signifies when the end of the XML node has been reached,
        # and therefore when all values can be parsed
        try:
            xml_data = loadFile(file_fullpath)
            for event, element in etree.iterparse(xml_data, events=("end",)):
                skip_entry = False
                tag_dict = {}
                if element.tag == "PersistenceItem":
                    self._processElement(element, tag_dict)

                    # Check we have everything we need and ignore entries with critical XML errors on them
                    for tag in check_tags:
                        if tag in tag_dict:
                            if tag_dict[tag] is None:
                                if 'AppCompatPath' in tag_dict:
                                    logger.warning("Malformed tag [%s: %s] in %s, entry: %s (skipping entry)" % (tag, tag_dict[tag], tag_dict['AppCompatPath'], file_fullpath))
                                else:
                                    logger.warning(
                                        "Malformed tag [%s: %s] in %s, entry: Unknown (skipping entry)" % (tag, tag_dict[tag], file_fullpath))
                                skip_entry = True
                                break
                    # If the entry is valid do some housekeeping:
                    if not skip_entry:
                        # ExecutionFlag '1'/'0' become booleans; anything
                        # else is passed through unchanged.
                        if tag_dict['ExecutionFlag'] == '1':
                            tmpExecFlag = True
                        elif tag_dict['ExecutionFlag'] == '0':
                            tmpExecFlag = False
                        else: tmpExecFlag = tag_dict['ExecutionFlag']
                        namedrow = settings.EntriesFields(HostID=hostID, EntryType=settings.__APPCOMPAT__,
                              RowNumber=rowNumber,
                              InstanceID=instanceID,
                              LastModified=(tag_dict['LastModified'].replace("T"," ").replace("Z","") if 'LastModified' in tag_dict else '0001-01-01 00:00:00'),
                              LastUpdate=(tag_dict['LastUpdate'].replace("T"," ").replace("Z","") if 'LastUpdate' in tag_dict else '0001-01-01 00:00:00'),
                              FileName=ntpath.basename(tag_dict['FilePath']),
                              FilePath=ntpath.dirname(tag_dict['FilePath']),
                              Size=(tag_dict['Size'] if 'Size' in tag_dict else 'N/A'),
                              ExecFlag=tmpExecFlag)
                        rowsData.append(namedrow)
                        rowNumber += 1
            else:
                # NOTE(review): this is a for-else — it runs only when the
                # loop finishes without break and clears just the final
                # element; confirm per-iteration clearing was not intended.
                pass
                element.clear()
            xml_data.close()
        except Exception as e:
            print e.message
            print traceback.format_exc()
            pass
项目:appcompatprocessor    作者:mbevilacqua    | 项目源码 | 文件源码
def processFile(self, file_fullpath, hostID, instanceID, rowsData):
        """Parse a MIR AppCompatItemExtended XML audit into EntriesFields rows.

        Streams the XML with iterparse so huge audits are never loaded whole,
        validates each entry, and appends one settings.EntriesFields row per
        valid entry to rowsData (mutated in place). Errors are logged and
        swallowed so one corrupt audit cannot abort a batch run.

        Args:
            file_fullpath: path of the XML audit, opened via loadFile().
            hostID: host identifier stamped on every generated row.
            instanceID: instance identifier stamped on every generated row.
            rowsData: list that receives the generated EntriesFields rows.
        """
        rowNumber = 0
        # Tags that must be present and non-None for an entry to be usable.
        # 'FilePath' is included because it feeds FileName/FilePath below;
        # previously a missing FilePath raised KeyError inside the broad
        # except and silently dropped the whole file instead of one entry.
        check_tags = ['LastModified', 'AppCompatPath', 'FilePath']
        try:
            xml_data = loadFile(file_fullpath)
            for event, element in etree.iterparse(xml_data, events=("end",)):
                if element.tag != "AppCompatItemExtended":
                    continue
                skip_entry = False
                tag_dict = {}
                self._processElement(element, tag_dict)
                # Release the parsed element immediately to keep memory flat.
                # (The old for/else placement cleared only the last element.)
                element.clear()

                # From time to time we get entries with no real data on them
                # for some unknown reason; bail out of this file for now.
                if tag_dict.get('AppCompatPath') == 'N/A':
                    logger.debug("ShimCache entry with no AppCompatPath [ControlSetSeq: %s], entry: %s. (skipping entry)" % (tag_dict.get('ControlSetSeq'), file_fullpath))
                    break

                # Check we have everything we need and ignore entries with
                # critical XML errors on them.
                for tag in check_tags:
                    if tag not in tag_dict:
                        if 'AppCompatPath' in tag_dict:
                            logger.warning("Missing tag [%s] in %s, entry: %s (skipping entry)" % (tag, tag_dict['AppCompatPath'], file_fullpath))
                        else:
                            logger.warning("Malformed tag [%s] in %s, entry: Unknown (skipping entry)" % (tag, file_fullpath))
                        skip_entry = True
                        break
                    if tag_dict[tag] is None:
                        if 'AppCompatPath' in tag_dict:
                            logger.warning("Malformed tag [%s: %s] in %s, entry: %s (skipping entry)" % (tag, tag_dict[tag], tag_dict['AppCompatPath'], file_fullpath))
                        else:
                            logger.warning("Malformed tag [%s: %s] in %s, entry: Unknown (skipping entry)" % (tag, tag_dict[tag], file_fullpath))
                        skip_entry = True
                        break
                if skip_entry:
                    continue

                # Normalize ExecutionFlag to a bool; some sources omit it, so
                # default to 'unk' instead of raising (mirrors the
                # ShimCacheItem parser's behavior). Unknown values pass through.
                raw_flag = tag_dict.get('ExecutionFlag', 'unk')
                tmpExecFlag = {'1': True, '0': False}.get(raw_flag, raw_flag)
                namedrow = settings.EntriesFields(HostID=hostID, EntryType=settings.__APPCOMPAT__,
                      RowNumber=rowNumber,
                      InstanceID=instanceID,
                      LastModified=(tag_dict['LastModified'].replace("T"," ").replace("Z","") if 'LastModified' in tag_dict else '0001-01-01 00:00:00'),
                      LastUpdate=(tag_dict['LastUpdate'].replace("T"," ").replace("Z","") if 'LastUpdate' in tag_dict else '0001-01-01 00:00:00'),
                      FileName=ntpath.basename(tag_dict['FilePath']),
                      FilePath=ntpath.dirname(tag_dict['FilePath']),
                      Size=tag_dict.get('Size', 'N/A'),
                      ExecFlag=tmpExecFlag)
                rowsData.append(namedrow)
                rowNumber += 1
            xml_data.close()
        except Exception:
            # Swallow so one bad file can't kill the batch, but log the full
            # traceback (the old `print e.message` was Python-2-only and
            # wrote to stdout instead of the log).
            logger.exception("Error processing %s" % file_fullpath)
项目:appcompatprocessor    作者:mbevilacqua    | 项目源码 | 文件源码
def processFile(self, file_fullpath, hostID, instanceID, rowsData):
        """Parse a Shim Shady ShimCacheItem XML file into EntriesFields rows.

        Streams the XML with iterparse, validates each ShimCacheItem entry,
        and appends one settings.EntriesFields row per valid entry to
        rowsData (mutated in place). Errors are logged and swallowed so one
        corrupt file cannot abort a batch run.

        Args:
            file_fullpath: path of the XML file, opened via loadFile().
            hostID: host identifier stamped on every generated row.
            instanceID: instance identifier stamped on every generated row.
            rowsData: list that receives the generated EntriesFields rows.
        """
        rowNumber = 0
        # Tags that must be present and non-None for an entry to be usable.
        check_tags = ['LastModified', 'AppCompatPath']
        try:
            xml_data = loadFile(file_fullpath)
            for event, element in etree.iterparse(xml_data, events=("end",)):
                if element.tag != "ShimCacheItem":
                    continue
                skip_entry = False
                tag_dict = {}
                self._processElement(element, tag_dict)
                # Release the parsed element immediately to keep memory flat.
                # (The old for/else placement cleared only the last element.)
                element.clear()

                # Check we have everything we need and ignore entries with
                # critical XML errors on them.
                for tag in check_tags:
                    if tag not in tag_dict or tag_dict[tag] is None:
                        if 'AppCompatPath' in tag_dict:
                            logger.warning("Malformed tag [%s] in %s, entry: %s (skipping entry)" % (tag, tag_dict['AppCompatPath'], file_fullpath))
                        else:
                            # .get() avoids the KeyError the old code raised
                            # here when the tag was absent entirely.
                            logger.warning("Malformed tag [%s: %s] in %s, entry: Unknown (skipping entry)" % (tag, tag_dict.get(tag), file_fullpath))
                        skip_entry = True
                        break
                if skip_entry:
                    continue

                # Note that Shim Shady does not extract ExecFlag on some
                # platforms (at least Windows 10); fall back to 'unk'.
                tmpExecFlag = tag_dict.get('ExecutionFlag', 'unk')
                namedrow = settings.EntriesFields(HostID=hostID, EntryType=settings.__APPCOMPAT__,
                      RowNumber=rowNumber,
                      InstanceID=instanceID,
                      LastModified=(tag_dict['LastModified'].replace("T"," ").replace("Z","") if 'LastModified' in tag_dict else '0001-01-01 00:00:00'),
                      LastUpdate=(tag_dict['LastUpdate'].replace("T"," ").replace("Z","") if 'LastUpdate' in tag_dict else '0001-01-01 00:00:00'),
                      FileName=ntpath.basename(tag_dict['AppCompatPath']),
                      FilePath=ntpath.dirname(tag_dict['AppCompatPath']),
                      Size=tag_dict.get('Size', 'N/A'),
                      ExecFlag=tmpExecFlag)
                rowsData.append(namedrow)
                rowNumber += 1
            xml_data.close()
        except Exception:
            # Swallow so one bad file can't kill the batch, but log the full
            # traceback (the old `print e.message` was Python-2-only).
            logger.exception("Error processing %s" % file_fullpath)
项目:knowledgediscovery    作者:jakelever    | 项目源码 | 文件源码
def processAbstractFile(abstractFile, outFile, processFunction):
    """Stream MedlineCitation records from a (huge) Medline XML file.

    For each citation, extract the PubMed ID, publication year, title and
    abstract, clean the text, and hand it to
    processFunction(outFile, texts, sourceInfo).
    """
    citationCount = 0

    # Incremental parse: these files are far too large to load whole.
    for event, citation in etree.iterparse(abstractFile, events=('start', 'end', 'start-ns', 'end-ns')):
        # Only closed MedlineCitation elements are of interest. Checking the
        # event first also keeps us from touching .tag on the (prefix, uri)
        # tuples emitted for namespace events.
        if event != 'end' or citation.tag != 'MedlineCitation':
            continue
        citationCount += 1

        # Locate the PubMed ID and publication-date elements.
        pmidNodes = citation.findall('./PMID')
        yearNodes = citation.findall('./Article/Journal/JournalIssue/PubDate/Year')
        medlineDateNodes = citation.findall('./Article/Journal/JournalIssue/PubDate/MedlineDate')

        # PubMed ID (space-joined if several); no PMC ID in this format.
        pmidText = ''
        if pmidNodes:
            pmidText = " ".join(node.text.strip() for node in pmidNodes if node.text)
        pmcidText = ''

        # Publication year: prefer the MedlineDate prefix when present.
        pubYear = 0
        if yearNodes:
            pubYear = yearNodes[0].text
        if medlineDateNodes:
            pubYear = medlineDateNodes[0].text[0:4]

        # Title text, with legacy square-bracket wrappers stripped.
        titleNodes = citation.findall('./Article/ArticleTitle')
        titleText = [removeWeirdBracketsFromOldTitles(t) for t in extractTextFromElemList(titleNodes)]

        # Abstract text.
        abstractText = extractTextFromElemList(citation.findall('./Article/Abstract/AbstractText'))

        # Combine and clean everything we want to process: drop empties,
        # unescape HTML entities, remove wordless bracketed fragments.
        allText = [removeBracketsWithoutWords(htmlUnescape(t)) for t in titleText + abstractText if len(t) > 0]

        # Provenance for this text.
        textSourceInfo = {'pmid': pmidText, 'pmcid': pmcidText, 'pubYear': pubYear}

        # Delegate the actual processing (e.g. co-occurrence counting).
        processFunction(outFile, allText, textSourceInfo)

        # Important: drop the record from memory to keep usage flat.
        citation.clear()
项目:wos_builder    作者:yadudoc    | 项目源码 | 文件源码
def tabulate(datafile, cnx, limit=100):
   """Stream a WoS XML export and dump each REC record's children to stdout.

   Exploratory/debug helper: walks the file with iterparse, prints the tag
   and attributes of every child of each REC element as it opens, and stops
   after `limit` parse events so huge exports stay cheap to inspect.

   Args:
      datafile: path or file-like object containing the XML.
      cnx: database connection (currently unused; kept so existing callers
           keep working).
      limit: number of parse events to examine before bailing out
             (default 100, matching the previous hard-coded cap).
   """
   print("Tabulate")
   count = 0

   # Incremental parse: yields (event, element) pairs as tags open/close.
   context = iter(ET.iterparse(datafile, events=("start", "end")))
   logging.debug("Got context")

   # The first event hands us the document root; keep it so it can be
   # cleared each iteration to bound memory. next() works on both
   # Python 2 and 3, unlike the old context.next().
   event, root = next(context)
   logging.debug("Got root")
   for event, elem in context:
      if elem.tag == "REC" and event == "start":
         print("Foo")
         for child in elem:
            # %-format keeps the output identical to the old Python 2
            # `print child.tag, child.attrib` statement.
            print("%s %s" % (child.tag, child.attrib))
      count = count + 1
      root.clear()
      if count == limit:
         break

   return
项目:wos_builder    作者:yadudoc    | 项目源码 | 文件源码
def tabulate(datafile, cnx):
   """Debug helper: incrementally parse datafile and print the children of
   each REC element as it opens, stopping after 100 parse events.

   cnx is accepted for interface compatibility but not used here.
   """
   print("Tabulate")

   eventCount = 0

   # Pull-parser over start/end tag events; avoids loading the whole file.
   parser = iter(ET.iterparse(datafile, events=("start", "end")))
   logging.debug("Got context")

   # The first event yields the document root; hold on to it so it can be
   # cleared on every pass to keep memory bounded.
   _, documentRoot = next(parser)
   logging.debug("Got root")

   for event, node in parser:
      if event == "start" and node.tag == "REC":
         print("Foo")
         for sub in node:
            print("%s %s" % (sub.tag, sub.attrib))
      eventCount += 1
      documentRoot.clear()
      # Cap the walk at 100 events -- this is only a spot-check.
      if eventCount == 100:
         break

   return