We extracted the following 50 code examples from open source Python projects to illustrate how to use heapq.nlargest().
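Before the project examples, here is a minimal standalone sketch of the basic call, heapq.nlargest(n, iterable, key=None). The scores dictionary and variable names below are illustrative only and do not come from any of the projects listed.

import heapq

scores = {'alice': 91, 'bob': 78, 'carol': 85, 'dave': 62}

# The two largest values, in descending order.
print(heapq.nlargest(2, scores.values()))          # [91, 85]

# The two keys ranked by their score; equivalent to
# sorted(scores, key=scores.get, reverse=True)[:2]
print(heapq.nlargest(2, scores, key=scores.get))   # ['alice', 'carol']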
def extractFeatures(self, article, n, customStopWords=None):
    # pass in article as a tuple (text, title)
    text = article[0]    # extract the text
    title = article[1]   # extract the title
    sentences = sent_tokenize(text)  # split text into sentences
    word_sent = [word_tokenize(s.lower()) for s in sentences]  # split sentences into words
    self._freq = self._compute_frequencies(word_sent, customStopWords)  # calculate word freq using member func created above
    if n < 0:
        # how many features (words) to return - a -ve number means
        # no feature (word) selection, just return all features
        return nlargest(len(self._freq.keys()), self._freq, key=self._freq.get)
    else:
        # here we say if the calling func has asked for a subset
        # then return only the 'n' largest features, i.e. the
        # most important words (important == frequent, fewer stopwords)
        return nlargest(n, self._freq, key=self._freq.get)
def summarize(self, article, n):
    text = article[0]
    title = article[1]
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sentence in enumerate(word_sent):
        for word in sentence:
            if word in self._freq:
                ranking[i] += self._freq[word]
    sentences_index = nlargest(n, ranking, key=ranking.get)
    return [sentences[j] for j in sentences_index]

##############################################################################
# TEST
def create_ranking2(edge_weight, k, adj, num):
    sink = len(adj)
    heaps = [[] for i in xrange(sink + 1)]
    heaps[0] = [(0, [])]
    for current in xrange(sink):
        for child in adj[current]:
            for length, path in heaps[current]:
                new_path = list(path)
                new_path.append(current)
                # this can be done better using heapreplace
                ew = edge_weight[0, num[(current, child)]]
                heapq.heappush(heaps[child], (length + ew, new_path))
                heaps[child] = heapq.nlargest(k, heaps[child])
    # TODO what with equal length paths?
    # result: heaps[sink]
    return [(length, tuple(zip(nodes, nodes[1:] + [sink])))
            for length, nodes in heaps[sink]]
def top(self, num, key=None):
    """
    Get the top N elements from an RDD.

    .. note:: This method should only be used if the resulting array
        is expected to be small, as all the data is loaded into the
        driver's memory.

    .. note:: It returns the list sorted in descending order.

    >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
    [12]
    >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
    [6, 5]
    >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)
    [4, 3, 2]
    """
    def topIterator(iterator):
        yield heapq.nlargest(num, iterator, key=key)

    def merge(a, b):
        return heapq.nlargest(num, a + b, key=key)

    return self.mapPartitions(topIterator).reduce(merge)
def nth_largest(n, iter_list):
    """``O(nlogn)`` time if ``n`` is median. Better if largest or smallest.

    Notes
    -----
    Adopted and/or modified from reference(s):
    FogleBird on stackoverflow.com/questions/1034846/
    """
    length = len(iter_list)
    if n >= length:
        return heapq.nlargest(length, iter_list)[-1]
    return heapq.nlargest(n, iter_list)[-1]


# OS utilities
def get_scored_matches(word: str, possibilities: List[str],
                       n: int = 3, cutoff: float = 0.6) -> List[Tuple[float, str]]:
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not (0.0 <= cutoff <= 1.0):
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    result = []
    s: SequenceMatcher = SequenceMatcher()
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        if (s.real_quick_ratio() >= cutoff and
                s.quick_ratio() >= cutoff and
                s.ratio() >= cutoff):
            result.append((s.ratio(), x))

    # Move the best scorers to the head of the list
    result = heapq.nlargest(n, result)

    # Return the best n matches together with their scores
    return result
def build_dictionary(sentences, size):
    """
    Create dictionary containing most frequent words in the sentences

    :param sentences: sequence of sentence that contains words
        Caution: the sequence might be exhausted after calling this function!
    :param size: size of dictionary you want
    :return: dictionary that maps word to index (starting from 1)
    """
    dictionary = defaultdict(int)
    for sentence in sentences:
        for token in sentence:
            dictionary[token] += 1
    frequent_pairs = nlargest(size, dictionary.items(), key=itemgetter(1))
    words, frequencies = zip(*frequent_pairs)
    result = {word: index + 1 for index, word in enumerate(words)}
    return result
def keyphrases(self, N=20, fileids=None, categories=None):
    """
    Returns the top N keyphrases grouped by document id.

    TODO: this currently ignores fileids/categories.
    """
    if not self.tfidfs or not self.lexicon or not self.fileids:
        raise ValueError("Must call the score method first!")

    for idx, doc in enumerate(self.tfidfs):
        fileid = self.fileids[idx]

        # Get the top N terms by TF-IDF score
        scores = [
            (self.lexicon[wid], score)
            for wid, score in heapq.nlargest(N, doc, key=itemgetter(1))
        ]

        yield fileid, scores
def correlate_library(image, library, n_largest):
    """Correlates all simulated diffraction templates in a DiffractionLibrary
    with a particular experimental diffraction pattern (image) stored as a
    numpy array.
    """
    i = 0
    out_arr = np.zeros((n_largest * len(library), 5))
    for key in library.keys():
        if n_largest:
            pass
        else:
            n_largest = len(library[key])
        correlations = dict()
        for orientation, diffraction_pattern in library[key].items():
            correlation = correlate(image, diffraction_pattern)
            correlations[orientation] = correlation
        res = nlargest(n_largest, correlations.items(), key=itemgetter(1))
        for j in np.arange(n_largest):
            out_arr[j + i * n_largest][0] = i
            out_arr[j + i * n_largest][1] = res[j][0][0]
            out_arr[j + i * n_largest][2] = res[j][0][1]
            out_arr[j + i * n_largest][3] = res[j][0][2]
            out_arr[j + i * n_largest][4] = res[j][1]
        i = i + 1
    return out_arr
def determine_intent(self, utterance, num_results=1):
    """
    Given an utterance, provide a valid intent.

    :param utterance: an ascii or unicode string representing natural language speech
    :param num_results: a maximum number of results to be returned.
    :return: A generator that yields dictionaries.
    """
    intents = []
    for domain in self.domains:
        gen = self.domains[domain].determine_intent(utterance=utterance,
                                                    num_results=1)
        for intent in gen:
            intents.append(intent)

    # keep only the num_results intents with the highest confidence
    intents = heapq.nlargest(
        num_results, intents, key=lambda intent: intent['confidence'])
    for intent in intents:
        yield intent
def nbest_centrality(G, metric, n=10, attr="centrality", **kwargs):
    # Compute the centrality scores for each vertex
    scores = metric(G, **kwargs)

    # Set the score as a property on each node
    nx.set_node_attributes(G, attr, scores)

    # Filter scores (do not include in book)
    ntypes = nx.get_node_attributes(G, 'type')
    phrases = [
        item for item in scores.items()
        if ntypes.get(item[0], None) == "keyphrase"
    ]

    # Find the top n scores and print them along with their index
    topn = heapq.nlargest(n, phrases, key=itemgetter(1))
    for idx, item in enumerate(topn):
        print("{}. {}: {:0.4f}".format(idx + 1, *item))

    return G
async def newusers(self, ctx, *, count=5):
    """Tells you the newest members of the server.

    This is useful to check if any suspicious members have joined.

    The minimum is 3 members. If no number is given I'll show the last 5 members.
    """
    human_delta = time.human_timedelta
    count = max(count, 3)
    members = heapq.nlargest(count, ctx.guild.members, key=attrgetter('joined_at'))

    names = map(str, members)
    values = (
        (f'**Joined:** {human_delta(member.joined_at)}\n'
         f'**Created:** {human_delta(member.created_at)}\n{"-" * 40}')
        for member in members
    )
    entries = zip(names, values)

    title = f'The {formats.pluralize(**{"newest members": len(members)})}'
    pages = EmbedFieldPages(ctx, entries, lines_per_page=5, colour=0x00FF00, title=title)
    await pages.interact()
def largest_export_versions(n):
    """Creates a filter that keeps the largest n export versions.

    Args:
        n: number of versions to keep.

    Returns:
        A filter function that keeps the n largest paths.
    """
    def keep(paths):
        heap = []
        for idx, path in enumerate(paths):
            if path.export_version is not None:
                heapq.heappush(heap, (path.export_version, idx))
        keepers = [paths[i] for _, i in heapq.nlargest(n, heap)]
        return sorted(keepers)

    return keep
def estimate(self, u, i):
    if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
        raise PredictionImpossible('User and/or item is unknown.')

    x, y = self.switch(u, i)

    neighbors = [(self.sim[x, x2], r) for (x2, r) in self.yr[y]]
    k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])

    # compute weighted average
    sum_sim = sum_ratings = actual_k = 0
    for (sim, r) in k_neighbors:
        if sim > 0:
            sum_sim += sim
            sum_ratings += sim * r
            actual_k += 1

    if actual_k < self.min_k:
        raise PredictionImpossible('Not enough neighbors.')

    est = sum_ratings / sum_sim

    details = {'actual_k': actual_k}
    return est, details
def GetLeastNumbers(self, tinput, k):
    import heapq
    if tinput is None or len(tinput) < k or len(tinput) <= 0 or k <= 0:
        return []
    output = []
    for number in tinput:
        if len(output) < k:
            output.append(number)
        else:
            # Alternative: maintain the candidates with nsmallest
            # output = heapq.nsmallest(k, output)
            # if number >= output[-1]:
            #     continue
            # else:
            #     output[-1] = number
            # Here: keep the k smallest numbers seen so far, largest first
            output = heapq.nlargest(k, output)
            if number >= output[0]:
                continue
            else:
                output[0] = number
    return output[::-1]  # reverse so the result is in ascending order
    # return output
def findIDcnt(countours):
    # collect the bounding-rectangle width of every contour
    widths = []
    for idx, cnt in enumerate(countours):
        x, y, width, height = cv2.boundingRect(cnt)
        widths.insert(idx, width)

    # take the three largest widths
    IDList = heapq.nlargest(3, widths)

    # map those widths back to their contours
    IDcnts = []
    for idx, item in enumerate(IDList):
        index = widths.index(item)
        IDcnts.insert(idx, countours[index])
    # print IDcnts
    return IDcnts
def make_submit(self, model, submit_file):
    data = self.eval_sets().values()[0]
    target_lines = list()
    answers = np.asarray([[idx] for idx in self.entity.keys()])
    for i, d in enumerate(data):
        num_candidate = len(self.entity)
        index_entities = xrange(num_candidate)
        terms = d.split('\t')
        subjects = np.asarray([[terms[0]]] * num_candidate)
        relations = np.asarray([[terms[1]]] * num_candidate)
        sims = model.predict([subjects, relations, answers],
                             batch_size=num_candidate).flatten()
        print(i)
        r = rankdata(sims, method='ordinal')
        index_candidates = nlargest(200, index_entities, key=lambda j: r[j])
        one_line = ' '.join([str(index_candidate) for index_candidate in index_candidates])
        target_lines.append(one_line + '\n')
    submit_file.writelines(target_lines)
def make_submit_rt(self, model, submit_file):
    data = self.eval_sets_rt().values()[0]
    target_lines = list()
    answers = np.asarray([[idx] for idx in self.entity.keys()])
    for i, d in enumerate(data):
        num_candidate = len(self.entity)
        index_entities = xrange(num_candidate)
        terms = d.split('\t')
        relations = np.asarray([[terms[0]]] * num_candidate)
        objects = np.asarray([[terms[1]]] * num_candidate)
        sims = model.predict_rt([answers, relations, objects],
                                batch_size=num_candidate).flatten()
        print(i)
        r = rankdata(sims, method='ordinal')
        index_candidates = nlargest(200, index_entities, key=lambda j: r[j])
        one_line = ' '.join([str(index_candidate) for index_candidate in index_candidates])
        target_lines.append(one_line + '\n')
    submit_file.writelines(target_lines)
def extractBests(query, choices, processor=default_processor, scorer=default_scorer,
                 score_cutoff=0, limit=5):
    """Get a list of the best matches to a collection of choices.

    Convenience function for getting the choices with best scores.

    Args:
        query: A string to match against
        choices: A list or dictionary of choices, suitable for use with extract().
        processor: Optional function for transforming choices before matching.
            See extract().
        scorer: Scoring function for extract().
        score_cutoff: Optional argument for score threshold. No matches with
            a score less than this number will be returned. Defaults to 0.
        limit: Optional maximum for the number of elements returned. Defaults to 5.

    Returns:
        A list of (match, score) tuples.
    """
    best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
    return heapq.nlargest(limit, best_list, key=lambda i: i[1]) if limit is not None else \
        sorted(best_list, key=lambda i: i[1], reverse=True)
def top(self, num, key=None):
    """
    Get the top N elements from an RDD.

    Note: It returns the list sorted in descending order.

    >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
    [12]
    >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
    [6, 5]
    >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)
    [4, 3, 2]
    """
    def topIterator(iterator):
        yield heapq.nlargest(num, iterator, key=key)

    def merge(a, b):
        return heapq.nlargest(num, a + b, key=key)

    return self.mapPartitions(topIterator).reduce(merge)
def _choose_vacant_home_or_vacant_lot(self):
    """Choose a vacant home to move into or a vacant lot to build on.

    Currently, a person scores all the vacant homes/lots in town and then
    selects one of the top three.

    TODO: Probabilistically select from all homes/lots using the scores to
    derive likelihoods of selecting each.
    """
    home_and_lot_scores = self._rate_all_vacant_homes_and_vacant_lots()
    if len(home_and_lot_scores) >= 3:
        # Pick from top three
        top_three_choices = heapq.nlargest(3, home_and_lot_scores,
                                           key=home_and_lot_scores.get)
        if random.random() < 0.6:
            choice = top_three_choices[0]
        elif random.random() < 0.9:
            choice = top_three_choices[1]
        else:
            choice = top_three_choices[2]
    elif home_and_lot_scores:
        choice = list(home_and_lot_scores)[0]
    else:
        choice = None
    return choice
def _init_acquire_currently_occupied_lot(self):
    """If there are no vacant lots in town, acquire a lot and demolish the home
    currently on it."""
    lot_scores = self._rate_all_occupied_lots()
    if len(lot_scores) >= 3:
        # Pick from top three
        top_three_choices = heapq.nlargest(3, lot_scores, key=lot_scores.get)
        if random.random() < 0.6:
            choice = top_three_choices[0]
        elif random.random() < 0.9:
            choice = top_three_choices[1]
        else:
            choice = top_three_choices[2]
    elif lot_scores:
        choice = max(lot_scores)
    else:
        raise Exception("A company attempted to secure an *occupied* lot in town "
                        "but somehow could not.")
    return choice
def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5):
    """Get a list of the best matches to a collection of choices.

    Convenience function for getting the choices with best scores.

    Args:
        query: A string to match against
        choices: A list or dictionary of choices, suitable for use with extract().
        processor: Optional function for transforming choices before matching.
            See extract().
        scorer: Scoring function for extract().
        score_cutoff: Optional argument for score threshold. No matches with
            a score less than this number will be returned. Defaults to 0.
        limit: Optional maximum for the number of elements returned. Defaults to 5.

    Returns:
        A list of (match, score) tuples.
    """
    best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
    return heapq.nlargest(limit, best_list, key=lambda i: i[1]) if limit is not None else \
        sorted(best_list, key=lambda i: i[1], reverse=True)
def most_common(self, n=None):
    '''List the n most common elements and their counts from the most
    common to the least.  If n is None, then list all element counts.

    >>> Counter('abcdeabcdabcaba').most_common(3)
    [('a', 5), ('b', 4), ('c', 3)]

    '''
    # Emulate Bag.sortedByCount from Smalltalk
    if n is None:
        return sorted(self.items(), key=_itemgetter(1), reverse=True)
    return _heapq.nlargest(n, self.items(), key=_itemgetter(1))
def most_common(self, n=None):
    '''List the n most common elements and their counts from the most
    common to the least.  If n is None, then list all element counts.

    >>> Counter('abcdeabcdabcaba').most_common(3)
    [('a', 5), ('b', 4), ('c', 3)]

    '''
    # Emulate Bag.sortedByCount from Smalltalk
    if n is None:
        return sorted(self.iteritems(), key=_itemgetter(1), reverse=True)
    return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1))
def global_search(cls, text, limit, menu='ir.ui.menu'):
    """
    Search on models for text including menu
    Returns a list of tuple (ratio, model, model_name, id, name, icon)
    The size of the list is limited to limit
    """
    pool = Pool()
    ModelAccess = pool.get('ir.model.access')

    if not limit > 0:
        raise ValueError('limit must be > 0: %r' % (limit,))

    models = cls.search(['OR',
            ('global_search_p', '=', True),
            ('model', '=', menu),
            ])
    access = ModelAccess.get_access([m.model for m in models])
    s = StringMatcher()
    if isinstance(text, str):
        text = text.decode('utf-8')
    s.set_seq2(text)

    def generate():
        for model in models:
            if not access[model.model]['read']:
                continue
            Model = pool.get(model.model)
            if not hasattr(Model, 'search_global'):
                continue
            for record, name, icon in Model.search_global(text):
                if isinstance(name, str):
                    name = name.decode('utf-8')
                s.set_seq1(name)
                yield (s.ratio(), model.model, model.rec_name,
                       record.id, name, icon)
    return heapq.nlargest(int(limit), generate())
def closest(self, w, n=10):
    """
    Assumes the vectors have been normalized.
    """
    scores = self.m.dot(self.represent(w))
    return heapq.nlargest(n, zip(scores, self.iw))
def closest_contexts(self, w, n=10):
    scores = self.ec.m.dot(self.ew.represent(w))
    pairs = zip(scores, self.ec.iw)[1:]
    return heapq.nlargest(n, pairs)
def closest_contexts(self, w, n=10):
    """
    Assumes the vectors have been normalized.
    """
    scores = self.represent(w)
    return heapq.nlargest(n, zip(scores.data, [self.ic[i] for i in scores.indices]))
def closest(self, w, n=10):
    """
    Assumes the vectors have been normalized.
    """
    scores = self.m.dot(self.represent(w).T).T.tocsr()
    return heapq.nlargest(n, zip(scores.data, [self.iw[i] for i in scores.indices]))
def closest(self, w, n=10):
    """
    Assumes the vectors have been normalized.
    """
    if self.oov(w):
        return []
    scores = self.m.dot(self.represent(w).T).T.tocsr()
    return heapq.nlargest(n, zip(scores.data, [self.iw[i] for i in scores.indices]))
def closest_first_order(self, w, n=10):
    if self.oov(w):
        return []
    scores = self.m[self.wi[w], :]
    return heapq.nlargest(n, zip(scores.data, [self.iw[i] for i in scores.indices]))
def most_frequent_terms(self, fieldname, number=5, prefix=''):
    """Returns the top 'number' most frequent terms in the given field as a
    list of (frequency, text) tuples.
    """
    gen = ((terminfo.weight(), text) for text, terminfo
           in self.iter_prefix(fieldname, prefix))
    return nlargest(number, gen)
def most_distinctive_terms(self, fieldname, number=5, prefix=''):
    """Returns the top 'number' terms with the highest `tf*idf` scores as
    a list of (score, text) tuples.
    """
    N = float(self.doc_count())
    gen = ((terminfo.weight() * log(N / terminfo.doc_frequency()), text)
           for text, terminfo in self.iter_prefix(fieldname, prefix))
    return nlargest(number, gen)
def top_fragments(fragments, count, scorer, order, minscore=1):
    scored_fragments = ((scorer(f), f) for f in fragments)
    scored_fragments = nlargest(count, scored_fragments)
    best_fragments = [sf for score, sf in scored_fragments if score >= minscore]
    best_fragments.sort(key=order)
    return best_fragments
def create_ranking3(edge_weight, k, adj, num):
    sink = len(adj)
    EMPTY = -2
    ROOT = -1
    MIN_LENGTH = float('-inf')
    # heaps = [[(0, EMPTY, 0) for j in range(k)] for i in xrange(sink + 1)]
    heaps = [[(MIN_LENGTH, EMPTY, 0) for j in range(k + 1)] for i in xrange(sink + 1)]
    heaps[0][0] = (0, ROOT, 0)

    # forward
    for current in xrange(sink):
        new_rank = 0
        for length, parent, rank in heaps[current]:
            if parent != EMPTY:
                for child in adj[current]:
                    ew = edge_weight[0, num[(current, child)]]
                    new_length = length + ew
                    # heapq.heapreplace(heaps[child], (new_length, current, new_rank))
                    heapq.heappush(heaps[child], (new_length, current, new_rank))
                    heaps[child] = heapq.nlargest(k, heaps[child])
                new_rank += 1

    # backward
    ranking = []
    for rank in xrange(k):
        path = []
        current = sink
        current_rank = rank
        while current != ROOT:
            path.append(current)
            _, current, current_rank = heaps[current][current_rank]
        length, _, _ = heaps[sink][rank]
        path = list(reversed(path))
        path = tuple(zip(path[:-1], path[1:]))
        ranking.append((length, path))
    return ranking
def S_diff(lst):
    '''Given a list of int or float, calculate S_diff and S_point'''
    S_avg = sum(lst) / len(lst)
    S_dist = [i - S_avg for i in lst]  # distance to average
    S_cum = []                         # list of cumulative sums
    S_cum.append(0)
    for i in range(0, len(S_dist)):
        S_cum.append(S_cum[i] + S_dist[i])
    # return the index of the maximum cumulative sum and the maximum difference
    return [nlargest(1, range(0, len(S_cum)), key=lambda i: S_cum[i]),
            (max(S_cum) - min(S_cum))]
def similar_to_vec(self, v, N=10):
    sims = self._vecs.dot(v)
    sims = heapq.nlargest(N, zip(sims, self._vocab, self._vecs))
    return sims
def most_similar(self, word, N=10):
    w = self._vocab.index(word)
    sims = self._vecs.dot(self._vecs[w])
    sims = heapq.nlargest(N, zip(sims, self._vocab))
    return sims
def analogy(self, pos1, neg1, pos2, N=10, mult=True):
    wvecs, vocab = self._vecs, self._vocab
    p1 = vocab.index(pos1)
    p2 = vocab.index(pos2)
    n1 = vocab.index(neg1)
    if mult:
        p1, p2, n1 = [(1 + wvecs.dot(wvecs[i])) / 2 for i in (p1, p2, n1)]
        if N == 1:
            return max(((v, w) for v, w in izip((p1 * p2 / n1), vocab)
                        if w not in [pos1, pos2, neg1]))
        return heapq.nlargest(N, ((v, w) for v, w in izip((p1 * p2 / n1), vocab)
                                  if w not in [pos1, pos2, neg1]))
    else:
        p1, p2, n1 = [(wvecs.dot(wvecs[i])) for i in (p1, p2, n1)]
        if N == 1:
            return max(((v, w) for v, w in izip((p1 + p2 - n1), vocab)
                        if w not in [pos1, pos2, neg1]))
        return heapq.nlargest(N, ((v, w) for v, w in izip((p1 + p2 - n1), vocab)
                                  if w not in [pos1, pos2, neg1]))
def run(self):
    top_10 = nlargest(10, self._input_iterator())

    with self.output().open('w') as out_file:
        for streams, artist in top_10:
            out_line = '\t'.join([
                str(self.date_interval.date_a),
                str(self.date_interval.date_b),
                artist,
                str(streams)
            ])
            out_file.write((out_line + '\n'))
def vec_to_str(subvec, max_n):
    sub_list_sorted = heapq.nlargest(max_n, subvec, key=lambda x: x[1])
    sub_strs = [' '.join([word, wf2ws(weight)]) for word, weight in sub_list_sorted]
    return '\t'.join(sub_strs)