We extracted the following 34 code examples from open-source Python projects to illustrate how to use _collections.defaultdict().
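Before the project examples, here is a minimal sketch (not taken from any of the projects below; the record data is invented for illustration) of the two patterns most of them rely on: defaultdict(list) for grouping and defaultdict(int) for counting, neither of which needs an explicit "if key not in d" check.

from collections import defaultdict

# defaultdict(list) creates an empty list the first time a key is accessed,
# so grouping records by key needs no membership test.
records = [('a.wav', 1), ('b.wav', 2), ('a.wav', 3)]
grouped = defaultdict(list)
for filename, value in records:
    grouped[filename].append(value)

# defaultdict(int) starts every missing key at 0, which makes counting a one-liner.
counts = defaultdict(int)
for filename, _ in records:
    counts[filename] += 1

print(dict(grouped))   # {'a.wav': [1, 3], 'b.wav': [2]}
print(dict(counts))    # {'a.wav': 2, 'b.wav': 1}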
def __init__(self, *args):
    super(IDADebugger, self).__init__(*args)
    self.hooked = False
    self.trace = Trace()
    self._module_name = 'IDADbg'
    self.arch = get_arch_dynamic()
    # init the cpu context with 0
    if self.arch == 32:
        self.ctx = {c: '0' for c in ['eax', 'ebx', 'edx', 'ecx', 'ebp', 'esp', 'eip', 'edi', 'esi',
                                     'cf', 'zf', 'sf', 'of', 'pf', 'af', 'tf', 'df']}
    elif self.arch == 64:
        self.ctx = {c: '0' for c in ['rax', 'rbx', 'rdx', 'rcx', 'rbp', 'rsp', 'rip', 'edi', 'rsi',
                                     'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15',
                                     'cf', 'zf', 'sf', 'of', 'pf', 'af', 'tf', 'df']}
    self.IAT = []
    self.func_args = defaultdict(lambda: set())
def get_named_entities(documents, mincount=10):
    '''
    Given a list of texts, find words that start with a capital letter
    more than 70% of the time and return them as named entities (NEs).
    '''
    word_count = defaultdict(int)
    word_capital = defaultdict(int)
    NEs = []
    token_pattern = r'(?u)(?<![#@])\b\w+\b'
    tp = re.compile(token_pattern)
    for doc in documents:
        words = tp.findall(doc)
        for word in words:
            if word[0].isupper():
                word_capital[word.lower()] += 1
            word_count[word.lower()] += 1
    for word, count in word_count.iteritems():
        if count < mincount:
            continue
        capital = word_capital[word]
        percent = float(capital) / count
        if percent > 0.7:
            NEs.append(word)
    return NEs
def calc_log_prob_for_files(self, annotations):
    '''
    Calculate the logprobs for the classification windows given in annotations.
    @param annotations: Annotations as read from annotation file
    @return: tuple (features, labels). features is a list of logprobs-matrices for the windows.
             labels is a numpy array of the labels for the respective windows.
    '''
    features = []
    labels = []
    annotation_dict = defaultdict(list)
    for anno in annotations:
        annotation_dict[anno[3]].append(anno)
    for filename, annos in annotation_dict.items():
        path = self.basepath + '/audio/' + filename
        self._calc_log_probs_for_windows(path, annos, features, labels)
    return features, np.array(labels)
def read_test_files(self, annotation_file):
    '''
    Read files for testing.
    '''
    features_test = []
    labels_test = []
    annotation_file = self.basepath + '/annotations/general/' + annotation_file
    annotations = self._read_annotations(annotation_file)
    annotation_dict = defaultdict(list)
    for anno in annotations:
        annotation_dict[anno[3]].append(anno)
    for filename, annos in annotation_dict.items():
        path = self.basepath + '/audio/' + filename
        features, labels = self._read_test_windows(path, annos)
        features_test.extend(features)
        labels_test.extend(labels)
    return features_test, labels_test
def get_citation_positions(db, paper_id):
    query = """SELECT r.paper_id, cg.start, cg.end
               FROM refs r
               JOIN citations c ON r.id=c.ref_id
               JOIN citation_groups cg ON c.group_id=cg.id
               WHERE cited_paper_id='%s' """ % paper_id
    cursor = db.query(query)
    rows = cursor.fetchall()
    # Group citations by paper
    citations = defaultdict(list)
    for citing_paper, start, end in rows:
        citations[citing_paper].append((start, end))
    return citations
def __getitem__(self, key):
    for mapping in self.maps:
        try:
            return mapping[key]             # can't use 'key in mapping' with defaultdict
        except KeyError:
            pass
    return self.__missing__(key)            # support subclasses that define __missing__
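The comment in this ChainMap-style __getitem__ is worth unpacking: a membership test returns False for a key a defaultdict does not yet hold, even though indexing the same defaultdict would succeed via its factory and should count as a hit. A minimal sketch (not from the original project) of that difference:

from collections import defaultdict

d = defaultdict(int)
print('x' in d)   # False -- membership testing never calls the default factory
print(d['x'])     # 0     -- indexing does, and it also inserts the key
print('x' in d)   # True  -- the key now exists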
def find_vm_addr(trace):
    """
    Find the virtual machine addr
    :param trace: instruction trace
    :return: virtual function start addr
    """
    push_dict = defaultdict(lambda: 0)
    vm_func_dict = defaultdict(lambda: 0)
    # try to find the vm Segment via series of push commands, which identify the vm_addr also
    for line in trace:
        try:
            if line.disasm[0] == 'push':
                push_dict[GetFunctionAttr(line.addr, FUNCATTR_START)] += 1
        except:
            pass
    vm_func = max(push_dict, key=push_dict.get)
    vm_seg_start = SegStart(vm_func)
    vm_seg_end = SegEnd(vm_func)
    # test whether the vm_func is the biggest func in the Segment
    vm_funcs = Functions(vm_seg_start, vm_seg_end)
    for f in vm_funcs:
        vm_func_dict[f] = GetFunctionAttr(f, FUNCATTR_END) - GetFunctionAttr(f, FUNCATTR_START)
    if max(vm_func_dict, key=vm_func_dict.get) != vm_func:
        return AskAddr(vm_func, "Found two possible addresses for the VM function start address: %s and %s. Choose one!" %
                       (vm_func, max(vm_func_dict, key=vm_func_dict.get)))
    else:
        return vm_func
def find_virtual_regs(trace, manual=False, update=None):
    """
    Maps the virtual registers on the stack to the actual registers after the vm exit.
    :param trace: instruction trace
    :return: virtual registers dict which maps the real regs onto virtual ones via stack addresses
    """
    vmr = get_vmr()
    assert isinstance(trace, Trace)
    virt_regs = defaultdict(lambda: False)
    # trace, vm_seg_start, vm_seg_end = extract_vm_segment(trace)
    while trace:
        try:
            elem = trace.pop(len(trace) - 1)
            if len(elem.disasm) > 0 and elem.disasm[0] == 'pop':
                opnd = elem.disasm[1]
                if get_reg_class(opnd) is None:
                    # if not a register it is a mem_loc
                    pass
                elif virt_regs[opnd]:
                    pass
                else:
                    # the context always shows the registers after the execution,
                    # so we need the SP from the instruction before
                    stack_addr = trace[len(trace) - 1].ctx[get_reg('rsp', trace.ctx_reg_size)]
                    virt_regs[opnd] = stack_addr
        except:
            pass
    if update is not None:
        update.pbar_update(60)
    vmr.vm_stack_reg_mapping = virt_regs
    if manual:
        print ''.join('%s:%s\n' % (c, virt_regs[c]) for c in virt_regs.keys())
    return virt_regs
def city_dialect_words(model, vocab, filename='./city_ranking.txt'):
    # load named entities
    ne_file = './dumps/ne_' + dataset_name + '.json'
    with codecs.open(ne_file, 'r', encoding='utf-8') as fout:
        NEs = json.load(fout)
    NEs = set(NEs['nes'])
    k = 200
    with open('./data/cities.json', 'r') as fin:
        cities = json.load(fin)
    all_locs = np.array([[city['latitude'], city['longitude']] for city in cities]).astype('float32')
    all_probs = model.predict(all_locs)
    all_logprobs = np.log(all_probs)
    all_logprobs_mean = np.mean(all_logprobs, axis=0)
    city_dialectwords = defaultdict(list)
    cities = cities[0:200]
    for city in cities:
        name = city['city']
        lat, lon = city['latitude'], city['longitude']
        loc = np.array([[lat, lon]]).astype('float32')
        city_probs = model.predict(loc)
        city_logprobs = np.log(city_probs)
        normalized_city_logprobs = city_logprobs - all_logprobs_mean
        sorted_vocab_indices = np.argsort(normalized_city_logprobs)
        topwords = list(reversed(np.array(vocab)[sorted_vocab_indices][0].tolist()))[0:k]
        # if a topword is a named entity, mark it with an NE_ prefix
        dialect_words = []
        for topword in topwords:
            if topword in NEs:
                topword = "NE_" + topword
            dialect_words.append(topword)
        city_dialectwords[name] = dialect_words
    # write the city_dialectwords to file
    with codecs.open(filename, 'w', encoding='utf-8') as fout:
        json.dump(city_dialectwords, fout, indent=4, sort_keys=True)
def read_files(self, annotations, channels):
    '''
    Read all files in the datapath and create features_windows dictionary.
    @param annotations: Annotations as read from annotation file
    @param channels: 1D numpy array of channel indices to use.
    @return: A dictionary containing a feature matrix [windows x features] with the classnames as keys
    '''
    if type(channels) == int or type(channels) == np.int64:
        channels = np.array([channels])
    elif type(channels) == list:
        channels = np.array(channels)
    features_frames = {}
    for classname in self.classes:
        features_frames[classname] = []
    features_windows = {}
    for classname in self.classes:
        features_windows[classname] = []
    annotation_dict = defaultdict(list)
    for anno in annotations:
        annotation_dict[anno[3]].append(anno)
    for filename, annos in annotation_dict.items():
        path = self.basepath + '/audio/' + filename
        self._read_windows(path, annos, features_windows, features_frames, channels)
    return features_windows, features_frames
def __init__(self):
    self.root = defaultdict()
def insertUtil(self, minHeap, word, duplicate):
    if self.root == None:
        self.root = defaultdict()
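A note on the pattern in the two snippets above (and in get_jar further down): calling defaultdict() with no arguments leaves default_factory set to None, so missing keys raise KeyError exactly as with a plain dict. A minimal sketch, independent of those projects, illustrating this:

from collections import defaultdict

d = defaultdict()          # no default_factory supplied
print(d.default_factory)   # None

try:
    d['missing']
except KeyError as e:
    print('KeyError raised as with a plain dict:', e)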
def read(self, fn):
    d = defaultdict(lambda: [])
    with open(fn) as fin:
        for line in fin:
            data = line.strip().split('\t')
            text, base_rel, rel = data[:3]
            args = data[3:]
            confidence = 1
            curExtraction = Extraction(pred=rel, sent=text, confidence=float(confidence))
            for arg in args:
                curExtraction.addArg(arg)
            d[text].append(curExtraction)
    self.oie = d
def gen_trace(self, trace_start=BeginEA(), trace_end=BADADDR):
    """
    Generate trace for the loaded binary.
    :param trace_start:
    :param trace_end:
    :return:
    """
    vmr = get_vmr()
    self.trace_init()
    # reset color
    heads = Heads(SegStart(ScreenEA()), SegEnd(ScreenEA()))
    for i in heads:
        SetColor(i, CIC_ITEM, 0xFFFFFF)
    # start exec
    RunTo(BeginEA())
    event = GetDebuggerEvent(WFNE_SUSP, -1)
    # enable tracing
    EnableTracing(TRACE_STEP, 1)
    if vmr.sys_libs:
        pass
    event = GetDebuggerEvent(WFNE_ANY | WFNE_CONT, -1)
    while True:
        event = GetDebuggerEvent(WFNE_ANY, -1)
        addr = GetEventEa()
        # change color of executed line
        current_color = GetColor(addr, CIC_ITEM)
        new_color = self.get_new_color(current_color)
        SetColor(addr, CIC_ITEM, new_color)
        # break by exception
        if event <= 1:
            break
    # standardize the difference between ida_trace.txt files and generated trace files by debugger hook:
    # since dbg_trace returns the cpu context before the instruction execution and trace files the ctx after
    for line in self.trace:
        try:
            line.ctx = self.trace[self.trace.index(line) + 1].ctx
        except IndexError:
            line.ctx = defaultdict(lambda: '0')
    # return the trace, for population see dbg_trace() below
    msg('[*] Trace generated!\n')
    if vmr.extract_param:
        vmr.func_args = self.func_args
        for key in self.func_args.keys():
            print 'Function %s call args:' % key, ''.join('%s, ' % arg for arg in self.func_args[key]).rstrip(', ')
    return self.trace
def dbg_trace(self, tid, ea):
    """
    :param tid:
    :param ea:
    :return:
    """
    vmr = get_vmr()
    try:
        if vmr.extract_param and GetDisasm(ea).__contains__('call'):
            run_var = 0
            key = GetDisasm(ea).split('call')[1].strip()
            while True:
                # traverse trace backwards and get sequential push and mov params
                line = self.trace[-(run_var + 1)]
                if line.is_push and line.disasm_len == 2:
                    try:
                        self.func_args[key].add(line.ctx[get_reg(line.disasm[1], self.arch)])
                    except:
                        self.func_args[key].add(line.disasm[1])
                elif line.is_mov:
                    try:
                        self.func_args[key].add(line.ctx[get_reg(line.disasm[2], self.arch)])
                    except:
                        self.func_args[key].add(line.disasm[2])
                else:
                    break
                run_var += 1
        # TODO mmx xmmx ymmx
        # compute next ctx
        if self.arch == 32:
            self.ctx = defaultdict(lambda: '0',
                                   {'eax': self.convert(cpu.eax), 'ebx': self.convert(cpu.ebx),
                                    'edx': self.convert(cpu.edx), 'ecx': self.convert(cpu.ecx),
                                    'ebp': self.convert(cpu.ebp), 'esp': self.convert(cpu.esp),
                                    'eip': self.convert(cpu.eip), 'edi': self.convert(cpu.edi),
                                    'esi': self.convert(cpu.esi), 'cf': self.convert(cpu.cf),
                                    'zf': self.convert(cpu.zf), 'sf': self.convert(cpu.sf),
                                    'of': self.convert(cpu.of), 'pf': self.convert(cpu.pf),
                                    'af': self.convert(cpu.af), 'tf': self.convert(cpu.tf),
                                    'df': self.convert(cpu.df)})
        elif self.arch == 64:
            self.ctx = defaultdict(lambda: '0',
                                   {'rax': self.convert(cpu.eax), 'rbx': self.convert(cpu.ebx),
                                    'rdx': self.convert(cpu.edx), 'rcx': self.convert(cpu.ecx),
                                    'rbp': self.convert(cpu.ebp), 'rsp': self.convert(cpu.esp),
                                    'rip': self.convert(cpu.eip), 'edi': self.convert(cpu.edi),
                                    'rsi': self.convert(cpu.rsi), 'r8': self.convert(cpu.r8),
                                    'r9': self.convert(cpu.r9), 'r10': self.convert(cpu.r10),
                                    'r11': self.convert(cpu.r11), 'r12': self.convert(cpu.r12),
                                    'r13': self.convert(cpu.r13), 'r14': self.convert(cpu.r14),
                                    'r15': self.convert(cpu.r15), 'cf': self.convert(cpu.cf),
                                    'zf': self.convert(cpu.zf), 'sf': self.convert(cpu.sf),
                                    'of': self.convert(cpu.of), 'pf': self.convert(cpu.pf),
                                    'af': self.convert(cpu.af), 'tf': self.convert(cpu.tf),
                                    'df': self.convert(cpu.df)})
        self.trace.append(Traceline(thread_id=tid, addr=ea, disasm=self.disconv(GetDisasm(ea)), ctx=deepcopy(self.ctx)))
    except Exception, e:
        print e.message
    # return values:
    #   1 - do not log this trace event;
    #   0 - log it
    return 0
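The two debugger snippets above seed a defaultdict with both a factory and an initial mapping; after the factory argument, defaultdict accepts the same initializer arguments as dict. A minimal sketch of that two-argument form (the register names and values here are invented for illustration, not taken from the project):

from collections import defaultdict

# A factory returning a constant: unknown keys come back as '0' instead of raising.
ctx = defaultdict(lambda: '0', {'eax': '0xdeadbeef', 'esp': '0x7ffc0000'})

print(ctx['eax'])   # '0xdeadbeef'  -- present in the initial mapping
print(ctx['r15'])   # '0'           -- missing key falls back to the factory value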
def get_jar( self, idc = -1 ):
    """
    Get the content of all files present in the JAR file stored in the field 9.184.
    The returned dictionary is structured as follows::

        {
            'file name': 'file content',
            ...
        }

    The content of the files is not parsed, but returned as a string value.

    :param idc: IDC value.
    :type idc: int

    :return: Content of all files stored in the JAR file.
    :rtype: dict
    """
    idc = self.checkIDC( 9, idc )

    data = self.get_field( "9.184", idc )
    if data != None:
        data = base64.decodestring( data )

        buffer = StringIO()
        buffer.write( data )

        ret = defaultdict()

        with zipfile.ZipFile( buffer, "r" ) as zip:
            for f in zip.namelist():
                name, _ = os.path.splitext( f )

                with zip.open( f, "r" ) as fp:
                    ret[ name ] = fp.read()

        return dict( ret )
    else:
        return None

############################################################################
#
#    User defined fields
#
############################################################################
def set_pairing( self, pairing = None, idc = -1, **options ):
    """
    Function to set the pairing information in the User-defined field 9.255.
    The pairing information is stored as follows:

        minutia id <US> minutia name <RS> ...

    :param pairing: Pairing information.
    :type pairing: AnnotationList

    Let the pairing information be defined as follows:

        >>> from NIST.fingerprint.functions import AnnotationList
        >>> data = [
        ...     ( '1', '1' ), # Minutia '1' named '1'
        ...     ( '2', '2' ), # Minutia '2' named '2'
        ...     ( '3', '3' )  # Minutia '3' named '3'
        ... ]

    The pairing is set as follows:

        >>> mark2 = mark.get()
        >>> mark2.set_pairing( data )

    The pairing can also be set with an AnnotationList object:

        >>> pairing = AnnotationList()
        >>> pairing.from_list( data, format = "in", type = "Pairing" )
        >>> pairing # doctest: +NORMALIZE_WHITESPACE
        [
            Pairing( i='1', n='1' ),
            Pairing( i='2', n='2' ),
            Pairing( i='3', n='3' )
        ]

    The pairing is set as follows:

        >>> mark2.set_pairing( pairing )
    """
    if pairing != None:
        def n():
            return None

        pai = defaultdict( n )

        for p in pairing:
            try:
                if isinstance( p, Annotation ):
                    i, n = p.i, p.n
                else:
                    i, n = p

                pai[ int( i ) ] = int( n )
            except:
                continue

        lst = []
        for m in self.get_minutiae():
            lst.append( ( m.i, pai[ int( m.i ) ] ) )

        self.set_field( "9.255", join_r( [ US, RS ], lst ), idc )
def read(self):
    who = matlab.whosmat(self.filename)
    if not who:
        raise IOError("Couldn't load matlab file " + self.filename)
    else:
        ml = matlab.loadmat(self.filename, chars_as_strings=True)
        ml = {a: b for a, b in ml.items() if isinstance(b, np.ndarray)}

        # X is the biggest numeric array
        numarrays = []
        for name, con in ml.items():
            if issubclass(con.dtype.type, numbers.Number):
                numarrays.append((name, reduce(lambda x, y: x*y, con.shape, 1)))
        X = None
        if numarrays:
            nameX = max(numarrays, key=lambda x: x[1])[0]
            X = ml.pop(nameX)

        # find an array with compatible shapes
        attributes = []
        if X is not None:
            nameattributes = None
            for name, con in ml.items():
                if con.shape in [(X.shape[1],), (1, X.shape[1])]:
                    nameattributes = name
                    break
            attributenames = ml.pop(nameattributes).ravel() if nameattributes else range(X.shape[1])
            attributenames = [str(a).strip() for a in attributenames]  # strip because of numpy char array
            attributes = [ContinuousVariable.make(a) for a in attributenames]

        metas = []
        metaattributes = []

        sizemetas = None
        if X is None:
            counts = defaultdict(list)
            for name, con in ml.items():
                counts[len(con)].append(name)
            if counts:
                sizemetas = max(counts.keys(), key=lambda x: len(counts[x]))
        else:
            sizemetas = len(X)
        if sizemetas:
            for name, con in ml.items():
                if len(con) == sizemetas:
                    metas.append(name)

        metadata = []
        for m in sorted(metas):
            f = ml[m]
            metaattributes.append(StringVariable.make(m))
            f.resize(sizemetas, 1)
            metadata.append(f)

        metadata = np.hstack(tuple(metadata))

        domain = Domain(attributes, metas=metaattributes)
        if X is None:
            X = np.zeros((sizemetas, 0))

        return Orange.data.Table.from_numpy(domain, X, Y=None, metas=metadata)