We extracted the following 50 code examples from open-source Python projects to illustrate how to use collections.Counter().
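Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the core Counter operations the examples rely on: counting an iterable, updating counts, most_common(), and counter arithmetic.

from collections import Counter

# Count hashable items directly from an iterable.
words = "the quick brown fox jumps over the lazy dog the end".split()
counter = Counter(words)            # Counter({'the': 3, 'quick': 1, ...})

# Missing keys return 0 instead of raising KeyError.
assert counter["missing"] == 0

# Add more observations in place.
counter.update(["fox", "fox", "dog"])

# The most frequent items, as (item, count) pairs.
print(counter.most_common(2))       # [('the', 3), ('fox', 3)]

# Counters support arithmetic and set-like intersection.
other = Counter({"the": 1, "cat": 2})
print(counter + other)              # sums counts per key
print(counter & other)              # minimum of each count (multiset intersection)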
def keywords(self, num=5):
    words_only = self.strip_tags(self.content, strip_punctuation=True)
    words = words_only.split()
    counter = collections.Counter(words)
    common = counter.most_common()
    keywords = []
    INSIGNIFICANT_WORDS = ('should', 'which', 'therefore')
    for word in common:
        lower_word = word[0].lower()
        if len(lower_word) > 4 and lower_word not in INSIGNIFICANT_WORDS:
            keywords.append(lower_word)
        if len(keywords) >= num:
            break
    return ", ".join(keywords)
def overlap_score(q1, q2):
    """
    q1, q2 are preprocessed sentences (strings)
    >>> overlap_score("a b", "a")
    0.6666666666666666
    """
    c1 = Counter(q1.split())
    c2 = Counter(q2.split())
    c1c2 = c1 + c2
    both = set(c1.keys())
    both = both.intersection(c2.keys())
    bothscore = float(sum(c1c2[x] for x in both))
    mplusn = float(sum(c1c2.values()))
    score = bothscore / mplusn
    return score
def overlap_score(q1, q2):
    """
    >>> overlap_score("fun", "real fun")
    0.6666666666666666
    >>> overlap_score(" ", " ")
    0
    """
    q1count = Counter(q1.split())
    q2count = Counter(q2.split())
    both = set(q1count.keys())
    both = both.intersection(q2count.keys())
    combined = q1count + q2count
    mplusn = float(sum(combined.values()))
    overlap = float(sum(combined[x] for x in both))
    try:
        return overlap / mplusn
    except ZeroDivisionError:
        return 0
def vote(df, columns_name, value):
    label_data = df.loc[df[columns_name] == value, 'label'].values
    return Counter(label_data).most_common()[0][0]
def update_xpos(self, force=False):
    if self.must_update_xpos or force:
        try:
            # TODO: we should check the current mode instead. ============
            sel = self.view.sel()[0]
            pos = sel.b
            if not sel.empty():
                if sel.a < sel.b:
                    pos -= 1
            # ============================================================
            r = sublime.Region(self.view.line(pos).a, pos)
            counter = Counter(self.view.substr(r))
            tab_size = self.view.settings().get('tab_size')
            xpos = (self.view.rowcol(pos)[1] +
                    ((counter['\t'] * tab_size) - counter['\t']))
        except Exception as e:
            nvim.console_message(e)
            _logger.exception('error setting xpos; default to 0')
            self.xpos = 0
            return
        else:
            self.xpos = xpos
def main(args):
    if args.minimum_frequency is None:
        minimum_frequency = max((len(args.tables) + 1) // 2, 2)
    else:
        minimum_frequency = args.minimum_frequency
    logger.info('Minimum frequency set to %s', minimum_frequency)

    # Read in tables
    tables = []
    for path in args.tables:
        table = pd.read_csv(path, sep='\t')
        table = table[table.database_diff >= args.minimum_db_diff]
        table = table.dropna()
        tables.append(table)
        if len(table) == 0:
            logger.warn('Table read from %r is empty after filtering out sequences with database diff >= %s.',
                        path, args.minimum_db_diff)

    # Count V sequence occurrences
    counter = Counter()
    for table in tables:
        counter.update(set(table.consensus))

    # Find most frequent occurrences and print result
    print('count', 'gene', 'database_diff', 'sequence', 'names', sep='\t')
    for sequence, frequency in counter.most_common():
        if frequency < minimum_frequency:
            break
        names = []
        gene = None
        for table in tables:
            matching_rows = table[table.consensus == sequence]
            if matching_rows.empty:
                continue
            names.extend(matching_rows.name)
            if gene is None:
                row = matching_rows.iloc[0]
                gene = row.gene
                database_diff = row.database_diff
                #shm = row['V_SHM']
        print(frequency, gene, database_diff, sequence, *names, sep='\t')
def main(args):
    if args.minimum_frequency is None:
        # args.table is a list of file names
        minimum_frequency = max((len(args.table) + 1) // 2, 2)
    else:
        minimum_frequency = args.minimum_frequency
    logger.info('Minimum frequency set to %s', minimum_frequency)

    # Read in tables
    tables = []
    for path in args.table:
        table = read_table(path)
        table = table.loc[:, ['V_gene', 'V_SHM', 'V_nt', 'name']]
        tables.append(table)

    # Count V sequence occurrences
    counter = Counter()
    for table in tables:
        counter.update(set(table.V_nt))

    # Find most frequent occurrences and print result
    print('Frequency', 'Gene', '%SHM', 'Sequence', sep='\t')
    for sequence, frequency in counter.most_common():
        if frequency < minimum_frequency:
            break
        names = []
        gene = None
        for table in tables:
            matching_rows = table[table.V_nt == sequence]
            if matching_rows.empty:
                continue
            names.extend(matching_rows.name)
            if gene is None:
                row = matching_rows.iloc[0]
                gene = row['V_gene']
                shm = row['V_SHM']
        print(frequency, gene, shm, sequence, *names, sep='\t')
def __init__(self, dictionary=None, **kwargs):
    '''
    :param dictionary: custom dictionary to count against.
        if None, calculate dictionary from dataset
    '''
    self.dictionary = dictionary
    accepted_types = [
        pd.Series, list, np.array, tuple
    ]

    def bag_of_words_transform_function(corpus):
        counter = Counter(corpus)
        for el in self.dictionary:
            if counter.get(el) is None:
                counter[el] = 0
        return counter

    super(BagOfWordsTransformer, self).__init__(
        data_types=accepted_types,
        columns=None,
        transform_function=bag_of_words_transform_function)
def assertDifferentObjects(self, *objs):
    id_counts = Counter(map(id, objs))
    ((most_common_id, count),) = id_counts.most_common(1)
    if count > 1:
        dupe = [o for o in objs if id(o) == most_common_id][0]
        self.fail("%s appeared %d times in %s" % (dupe, count, objs))
def calc_n_types(self) -> int:
    """Calculate the number of types of input text

    Returns:
        int: the number of types of input text
    """
    surfaces = []
    for sentence in self.sentences:
        juman_result = self.juman.analysis(sentence)
        surfaces += [mrph.midasi for mrph in juman_result.mrph_list()]
    word_type_counter = Counter(surfaces)
    return len(word_type_counter)
def calc_rs_modality(self) -> Dict[str, float]:
    modality_counter = Counter()
    for i, s in enumerate(self.sentences):
        chunks = []
        for bnst in self.knp.parse(s).bnst_list():
            chunk = Chunk(chunk_id=bnst.bnst_id,
                          link=bnst.parent,
                          description=bnst.fstring)
            chunks.append(chunk)

        s = "".join([chunk.description for chunk in chunks])
        ms = set(re.findall("<モダリティ-(.+?)>", s))
        modality_counter += Counter(ms)

    n = len(self.sentences)
    return dict([(k, float(c) / n)
                 for k, c in modality_counter.items()])
def GetDuplicateColumnNames(
        self, columns: sql_query_column_model.SQLColumnModel) -> [str]:
    """Find out if the query has duplicate column names and if an alias is needed.

    Args:
        columns (sql_query_column_model.SQLColumnModel): all columns parsed
            from the cursor

    Returns:
        [str]: a list of all duplicate column names; if it is empty, the
            column names are distinct
    """
    single_column_name_list = [column.sql_column for column in columns]
    duplicate_list = [
        column for column, count
        in collections.Counter(single_column_name_list).items() if count > 1]
    return sorted(duplicate_list)
def _build_vocab(self, filename):
    counts = Counter()
    with tf.gfile.GFile(filename, "r") as f:
        #for line in f:
        #    words = line.replace("\n", " ").split()
        #    counts += Counter(words)
        while True:
            chunk = f.read(int(500000000 / 2))
            if not chunk:
                break
            counts += Counter(chunk.replace("\n", " ").split())
    sorted_pairs = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
    self.word_to_id = {e[0]: (i + 3) for (i, e) in enumerate(sorted_pairs)}
    self.word_to_id[EOS] = IEOS
    self.word_to_id[BOS] = IBOS
    self.word_to_id[PAD] = IPAD
def print_params(self, cgs):
    """
    cgs : list of computational graph names
    """
    for name, cg in cgs.iteritems():
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info(
            "Parameter shapes for computation graph[{}]".format(name))
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info(
            "Total number of parameters for computation graph[{}]: {}"
            .format(name, len(shapes)))

        logger.info(
            "Parameter names for computation graph[{}]: ".format(name))
        for item in cg.parameters:
            logger.info(
                "    {:15}: {}".format(item.get_value().shape, item.name))
        logger.info(
            "Total number of parameters for computation graph[{}]: {}"
            .format(name, len(cg.parameters)))
def get_manuscript_stats(text, citation_df):
    """
    Compute manuscript statistics.
    """
    stats = collections.OrderedDict()

    # Number of distinct references by type
    ref_counts = (
        citation_df
        .standard_citation
        .drop_duplicates()
        .map(lambda x: x.split(':')[0])
        .pipe(collections.Counter)
    )
    ref_counts['total'] = sum(ref_counts.values())
    stats['reference_counts'] = ref_counts
    stats['word_count'] = len(text.split())
    logging.info(f"Generated manuscript stats:\n{json.dumps(stats, indent=2)}")
    return stats
def subset_glyphs(self, s):
    table = self.table.Baseline
    if table.Format in (1, 3):
        baselines = {glyph: table.BaselineValues.get(glyph, table.DefaultBaseline)
                     for glyph in s.glyphs}
        if len(baselines) > 0:
            mostCommon, _cnt = Counter(baselines.values()).most_common(1)[0]
            table.DefaultBaseline = mostCommon
            baselines = {glyph: b for glyph, b in baselines.items()
                         if b != mostCommon}
        if len(baselines) > 0:
            table.BaselineValues = baselines
        else:
            table.Format = {1: 0, 3: 2}[table.Format]
            del table.BaselineValues
    return True
def subset_glyphs(self, s):
    prop = self.table.GlyphProperties
    if prop.Format == 0:
        return prop.DefaultProperties != 0
    elif prop.Format == 1:
        prop.Properties = {g: prop.Properties.get(g, prop.DefaultProperties)
                           for g in s.glyphs}
        mostCommon, _cnt = Counter(prop.Properties.values()).most_common(1)[0]
        prop.DefaultProperties = mostCommon
        # renamed the comprehension variable to avoid shadowing 'prop'
        prop.Properties = {g: v for g, v in prop.Properties.items()
                           if v != mostCommon}
        if len(prop.Properties) == 0:
            del prop.Properties
            prop.Format = 0
            return prop.DefaultProperties != 0
        return True
    else:
        assert False, "unknown 'prop' format %s" % prop.Format
def build_vocab(train_data, test_data):
    counter = collections.Counter()
    for stories, questions, answers in [train_data, test_data]:
        for story in stories:
            for sent in story:
                for word in nltk.word_tokenize(sent):
                    counter[word.lower()] += 1
        for question in questions:
            for word in nltk.word_tokenize(question):
                counter[word.lower()] += 1
        for answer in answers:
            for word in nltk.word_tokenize(answer):
                counter[word.lower()] += 1
    # no OOV here because there are not too many words in dataset
    word2idx = {w: (i + 1) for i, (w, _) in enumerate(counter.most_common())}
    word2idx["PAD"] = 0
    idx2word = {v: k for k, v in word2idx.items()}
    return word2idx, idx2word
def kmer_freq(ref_str, k):
    """
    Walk through sequence and return k-mer counts plus a pseudocount of 1.
    """
    ref_str = ref_str.upper()
    kmers = []
    for seq in product("ATGC", repeat=k):
        kmers.append("".join(seq))

    kmer_counts = Counter()
    for j in range(len(ref_str) - (k - 1)):
        motif = ref_str[j:j + k]
        kmer_counts[motif] += 1

    # Combine forward and reverse complement motifs into one count
    combined_kmer = Counter()
    for kmer in kmers:
        kmer_rc = rev_comp_motif(kmer)
        if not combined_kmer.get(kmer_rc):
            combined_kmer[kmer] = kmer_counts[kmer] + kmer_counts[kmer_rc] + 1

    return combined_kmer
def kmer_freq(mode, ref_str, strand, opts):
    ref_str = ref_str.upper()
    if strand == 1:
        ref_str = ref_str[::-1]
    k = opts.comp_kmer
    kmers = []
    for seq in product("ATGC", repeat=k):
        kmers.append("".join(seq))

    kmer_counts = Counter()
    for j in range(len(ref_str) - (k - 1)):
        motif = ref_str[j:j + k]
        kmer_counts[motif] += 1

    # Combine forward and reverse complement motifs into one count
    combined_kmer = Counter()
    for kmer in kmers:
        kmer_rc = motif_tools.rev_comp_motif(kmer)
        if not combined_kmer.get(kmer_rc):
            combined_kmer[kmer] = kmer_counts[kmer] + kmer_counts[kmer_rc] + 1

    return combined_kmer
def get_class_weights2(y, smooth_factor=0):
    """
    Returns the normalized weights for each class based on the frequencies of the samples
    :param smooth_factor: factor that smooths extremely uneven weights
    :param y: list of true labels (the labels must be hashable)
    :return: dictionary with the weight for each class
    """
    counter = Counter(y)

    if smooth_factor > 0:
        p = max(counter.values()) * smooth_factor
        for k in counter.keys():
            counter[k] += p

    majority = max(counter.values())

    return {cls: float(majority / count) for cls, count in counter.items()}
def path_clean(path):
    rel_ents = path.split(' -> ')
    relations = []
    entities = []
    for idx, item in enumerate(rel_ents):
        if idx % 2 == 0:
            relations.append(item)
        else:
            entities.append(item)

    entity_stats = Counter(entities).items()
    duplicate_ents = [item for item in entity_stats if item[1] != 1]
    duplicate_ents.sort(key=lambda x: x[1], reverse=True)
    for item in duplicate_ents:
        ent = item[0]
        ent_idx = [i for i, x in enumerate(rel_ents) if x == ent]
        if len(ent_idx) != 0:
            min_idx = min(ent_idx)
            max_idx = max(ent_idx)
            if min_idx != max_idx:
                rel_ents = rel_ents[:min_idx] + rel_ents[max_idx:]
    return ' -> '.join(rel_ents)
def main(cli_args):
    if len(config["targets"]) == 0:
        exit("No target found; maybe you need to specify a Dactyl config file?")
    issues = check_all_pages(target=cli_args.target)
    if issues:
        num_issues = sum(len(p[1]) for p in issues)
        print("Found %d issues:" % num_issues)
        for pagename, issuelist in issues:
            print("Page: %s" % pagename)
            c = collections.Counter(issuelist)
            for i, count_i in c.items():
                if i[0] == "Unplain Phrase":
                    print("   Discouraged phrase: %s (%d instances); suggest '%s' instead." %
                          (i[1], count_i, config["disallowed_phrases"][i[1].lower()]))
                elif i[0] == "Unplain Word":
                    print("   Discouraged word: %s (%d instances); suggest '%s' instead." %
                          (i[1], count_i, config["disallowed_words"][i[1].lower()]))
                else:
                    print("   %s: %s (%d instances)" % (i[0], i[1], count_i))
        exit(1)
    else:
        print("Style check passed with flying colors!")
        exit(0)
def get_nb_caption_per_img(n, selected_captions):
    """
    Get image IDs from audio caption file names that were selected by their speakers.
    Choose images that have n captions per image.
    ----------
    n : int, desired number of captions per image
    selected_captions : list of string, caption file names selected by their speakers
    """
    counter_nb_caption = Counter()
    for cap in selected_captions:
        # get image id
        ImgID = cap.split('_')[0]
        # add a count
        counter_nb_caption[ImgID] += 1
    # choose img_id that have a count of n
    d = dict((k, v) for k, v in counter_nb_caption.items() if v == n)
    ImgID_selected = d.keys()
    return ImgID_selected
def _f1_score(pred, answers):
    """Compute the F1 score."""
    def _score(g_tokens, a_tokens):
        common = Counter(g_tokens) & Counter(a_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = 1. * num_same / len(g_tokens)
        recall = 1. * num_same / len(a_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    if pred is None or answers is None:
        return 0
    g_tokens = _normalize_answer(pred).split()
    scores = [_score(g_tokens, _normalize_answer(a).split()) for a in answers]
    return max(scores)
def test2():
    patient_data_paths = utils_lung.get_patient_data_paths(pathfinder.DATA_PATH)
    print len(patient_data_paths)

    pixel_spacings_xy = []
    n_slices = []
    for k, p in enumerate(patient_data_paths):
        pid = utils_lung.extract_pid_dir(p)
        sid2data, sid2metadata = utils_lung.get_patient_data(p)
        mtd = sid2metadata.itervalues().next()
        assert mtd['PixelSpacing'][0] == mtd['PixelSpacing'][1]
        pixel_spacings_xy.append(mtd['PixelSpacing'][0])
        n_slices.append(len(sid2metadata))
        print pid, pixel_spacings_xy[-1], n_slices[-1]

    print 'nslices', np.max(n_slices), np.min(n_slices), np.mean(n_slices)

    counts = collections.Counter(pixel_spacings_xy)
    new_list = sorted(pixel_spacings_xy, key=counts.get, reverse=True)
    print 'spacing', new_list
def retrieval_perlabel(X_train, Y_train, X_test, Y_test, fractions=[0.01, 0.5, 1.0]):
    X_train = unitmatrix(X_train)  # normalize
    X_test = unitmatrix(X_test)
    score = X_test.dot(X_train.T)

    precisions = defaultdict(dict)
    label_counter = Counter(Y_test.tolist())
    for idx in range(len(X_test)):
        retrieval_idx = score[idx].argsort()[::-1]
        for fr in fractions:
            ntop = int(fr * len(X_train))
            pr = float(len([i for i in retrieval_idx[:ntop]
                            if Y_train[i] == Y_test[idx]])) / ntop
            try:
                precisions[fr][Y_test[idx]] += pr
            except:
                precisions[fr][Y_test[idx]] = pr

    new_pr = {}
    for fr, val in precisions.iteritems():
        avg_pr = 0.
        for label, pr in val.iteritems():
            avg_pr += pr / label_counter[label]
        new_pr[fr] = avg_pr / len(label_counter)

    return sorted(new_pr.items(), key=lambda d: d[0])
def cross_sentence(event_lemma_dict):
    """
    function to create all possible pairs between event mentions in a file
    :param event_lemma_dict: dictionary of event lemmas in file
    :return: counter dictionary of event pairs in a file
    """
    full_event_file = []
    pairs_circumstantial_corpus = Counter([])
    for k, v in event_lemma_dict.items():
        full_event_file.append(k)

    event_pairs_full = list(product(full_event_file, repeat=2))
    for i in event_pairs_full:
        pairs_circumstantial_corpus.update([i])

    return pairs_circumstantial_corpus
def print_grouping(attributes, grouping, top):
    """
    Print computed groups.

    :param attributes: list of grouped attributes
    :type: list(str)
    :param grouping: counter for each combination of attributes' values
    :type: Counter
    :type top: int
    """
    total = sum(grouping.values())
    table = Table(attributes + ['count', '%'])
    table.add_rows(total, grouping.most_common(top))
    print '\n' + table.by_count()
    print 'Total:', total
def __init__(self):
    self.handlers = {
        0x001: self._power,
        0x186: self._text,
        0x185: self._textparam,
        0x061: self._exttemp,
        0x005: self._tpms,
        #0x18e: self._textparam,
        0x026: self._fuel,
        0x053: self._gpsdate,
        0x055: self._gps,
    }
    self.counter = Counter()
    self.locations = []
    self.fuel = [0, 0]
def fill_histogram(self, idf, columns):
    """Fill input histogram with column(s) of input dataframe

    :param idf: input data frame used for filling histogram
    :param list columns: histogram column(s)
    """
    name = ':'.join(columns)
    if name not in self._counts:
        # create an (empty) value counts dict
        self._counts[name] = Counter()
    # value_counts() is faster than groupby().size(), but only works for series (1d).
    # else use groupby() for multi-dimensions
    g = idf.groupby(by=columns).size() if len(columns) > 1 else idf[columns[0]].value_counts()
    counts = Counter(g.to_dict())
    # remove specific keys from histogram before merging, if so requested
    counts = self.drop_requested_keys(name, counts)
    self._counts[name].update(counts)
def test_bin_edges(self):
    # constructor
    cnt = Counter()
    for i in range(10):
        cnt[i * 2] = i
    vc = ValueCounts(key='x', counts=cnt)
    bin_specs = {'bin_width': 1, 'bin_offset': 0}
    h = Histogram(vc, variable='x', bin_specs=bin_specs)

    # uniform
    bin_edges = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                 11, 12, 13, 14, 15, 16, 17, 18, 19]
    self.assertListEqual(h.get_uniform_bin_edges(), bin_edges)

    # truncated uniform bin edges
    truncated_bin_edges = [5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
    self.assertListEqual(h.truncated_bin_edges([5.5, 12.5]), truncated_bin_edges)

    h_bin_edges = h.bin_edges()
    self.assertIsInstance(h_bin_edges, np.ndarray)
    self.assertListEqual(h_bin_edges.tolist(), bin_edges)
def test_bin_centers(self):
    # constructor
    cnt = Counter()
    for i in range(10):
        cnt[i * 2] = i
    vc = ValueCounts(key='x', counts=cnt)
    bin_specs = {'bin_width': 1, 'bin_offset': 0}
    h = Histogram(vc, variable='x', bin_specs=bin_specs)

    bin_centers = [0.5, 2.5, 4.5, 6.5, 8.5, 10.5, 12.5, 14.5, 16.5, 18.5]
    h_bin_centers = h.bin_centers()
    self.assertIsInstance(h_bin_centers, np.ndarray)
    self.assertListEqual(h_bin_centers.tolist(), bin_centers)
def test_bin_entries(self):
    # constructor
    cnt = Counter()
    for i in range(10):
        cnt[i * 2] = i
    vc = ValueCounts(key='x', counts=cnt)
    bin_specs = {'bin_width': 1, 'bin_offset': 0}
    h = Histogram(vc, variable='x', bin_specs=bin_specs)

    bin_entries = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    h_bin_entries = h.bin_entries()
    self.assertIsInstance(h_bin_entries, np.ndarray)
    self.assertListEqual(h_bin_entries.tolist(), bin_entries)
def test_bin_labels(self):
    # constructor
    cnt = Counter()
    for i in range(10):
        cnt[i * 2] = i
    vc = ValueCounts(key='x', counts=cnt)
    bin_specs = {'bin_width': 1, 'bin_offset': 0}
    h = Histogram(vc, variable='x', bin_specs=bin_specs)

    bin_labels = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
    h_bin_labels = h.bin_labels()
    self.assertIsInstance(h_bin_labels, np.ndarray)
    self.assertListEqual(h_bin_labels.tolist(), bin_labels)
def build_vocabulary(words, max_size):
    vocab_instances = 0
    unique_counts = Counter(words)
    d = dict(unique_counts.most_common(cfg.vocabulary_size - 2))
    vocabulary = OrderedDict(sorted(d.items(), key=lambda t: t[1], reverse=True))

    # start at 2 to leave room for padding & unknown
    pb = Progress_bar(len(d) - 1)
    for i, (key, value) in enumerate(vocabulary.items(), start=2):
        vocab_instances += value
        vocabulary[key] = i
        pb.tick()
    vocabulary[cfg.padding_char] = 0
    vocabulary[cfg.placeholder_char] = 1

    # reverse the vocabulary (for reverse lookup)
    rev_vocabulary = {v: k for k, v in vocabulary.items()}
    vocab = (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)
    return vocab
def main(): args = docopt(""" Usage: counts2pmi.py <counts> """) counts_path = args['<counts>'] words = Counter() contexts = Counter() with open(counts_path) as f: for line in f: count, word, context = line.strip().split() count = int(count) words[word] += count contexts[context] += count words = sorted(words.items(), key=lambda (x, y): y, reverse=True) contexts = sorted(contexts.items(), key=lambda (x, y): y, reverse=True) save_count_vocabulary(counts_path + '.words.vocab', words) save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
def build_frequency_file(dtatcfdir, freq_file, MIN_FREQ, join_sign):
    """
    Builds file with all lemma + POS pairs above certain frequency threshold.
    :param dtatcfdir: path to directory with dta tcf files
    :param freq_file: path to frequency file
    :param MIN_FREQ: frequency threshold
    :param join_sign: sign to join lemma + first char of POS
    """
    # build frequency file from lemmas
    outputpath = freq_file
    print 'Building frequency file to ' + outputpath + "..."
    lemma_count = Counter(build_lemma_list(dtatcfdir, join_sign))
    frequent_lemmas = filter(lambda x: lemma_count[x] >= MIN_FREQ, lemma_count)
    with open(outputpath, 'w') as f_out:
        for lemma in frequent_lemmas:
            print >> f_out, lemma.encode('utf-8')
def _feature_most_common(self, results):
    """
    Find the most common country name in ES/Geonames results

    Parameters
    ----------
    results: dict
        output of `query_geonames`

    Returns
    -------
    most_common: str
        ISO code of most common country, or empty string if none
    """
    try:
        country_count = Counter([i['country_code3'] for i in results['hits']['hits']])
        most_common = country_count.most_common()[0][0]
        return most_common
    except IndexError:
        return ""
    except TypeError:
        return ""
def MP(candidate, references, n):
    """
    calculate modified precision
    """
    counts = Counter(ngrams(candidate, n))
    if not counts:
        return 0

    max_counts = {}
    for reference in references:
        reference_counts = Counter(ngrams(reference, n))
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

    clipped_counts = dict((ngram, min(count, max_counts[ngram]))
                          for ngram, count in counts.items())

    return sum(clipped_counts.values()) / sum(counts.values())
def overlap_score(q1, q2):
    """
    >>> overlap_score("a b c", "a b")
    0.8
    >>> overlap_score(" ", " ")
    0
    """
    c1 = Counter(q1.split())
    c2 = Counter(q2.split())

    numerator = 0
    for word in c1:
        if word in c2:
            numerator += c1[word]
    for word in c2:
        if word in c1:
            numerator += c2[word]

    m = sum(c1.values())
    n = sum(c2.values())
    try:
        score = numerator / (m + n)
    except ZeroDivisionError:
        score = 0
    return score
def get_category_stats(self):
    """Get a count of CheckState results for each category of checks.
    Ignore collection counts to avoid duplications"""
    flat_results = self.get_flattened_results()
    categories = list(set([x.category for x in flat_results]))
    metrics = {}
    for category in categories:
        metrics[category] = collections.Counter([
            x.status for x in filter(
                lambda y: len(y.subchecks) == 0 and y.category == category,
                flat_results
            )
        ])
    return metrics
def check_list_field_for_row(
        self, row=None, field_name=None, expected_list=None):
    found_list = getattr(row, field_name)
    self.assertEqual(Counter(expected_list), Counter(found_list))
def convert_uasts(self, file_uast_generator):
    for file_uast in file_uast_generator:
        print("-" * 20 + " " + str(file_uast.filepath))
        id_cnt = Counter()
        self.collect_id_cnt(file_uast.response.uast, id_cnt)
        print(id_cnt)
def fetch_all_transitions(self, language, ngram_length):
    """ Generate a dict of counts for transitions for all n-grams in the language word list """
    wordlist = os.path.join(os.path.dirname(__file__),
                            "wordlists/{0}.txt".format(language))
    if not os.path.exists(wordlist):
        raise SystemError("Language '{0}' does not exist".format(language))

    all_grams = []
    with codecs.open(wordlist, 'r', encoding='utf-8') as f:
        for line in f:
            words = line.strip('\n').lower().split()
            ngrams = reduce(lambda x, y: x + y,
                            map(lambda word: self.find_ngrams(word, ngram_length), words))
            all_grams += ngrams

    return dict(Counter(all_grams))
def _build_vocab(self, file_path, vocab_path):
    counter = Counter(self._read_text(file_path).split())
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))

    self.vocab = dict(zip(words, range(len(words))))
    save_pkl(vocab_path, self.vocab)
def log_profiling_stats():
    logger.info('-----------------------------------------------------------')
    logger.info('Series:')
    for name, series in sorted(SERIES.items()):
        logger.info('  {}: {}'.format(name, ' '.join(map(str, series))))
    logger.info('-----------------------------------------------------------')
    logger.info('Histograms:')
    for name, histogram in sorted(HISTOGRAMS.items()):
        logger.info('{: >10s} {}'.format('Count', name))
        for value, count in sorted(histogram.items()):
            logger.info('{: >10d} {}'.format(count, value))
    logger.info('-----------------------------------------------------------')
    logger.info('Counters:')
    logger.info('{: >10s} {}'.format('Count', 'Counter'))
    for name, count in sorted(COUNTERS.items()):
        logger.info('{: >10d} {}'.format(count, name))
    logger.info('-----------------------------------------------------------')
    logger.info('Timers:')
    times = [(t.elapsed, t.count, f) for (f, t) in TIMERS.items()]
    times.sort(reverse=True, key=lambda x: x[0])
    logger.info('{: >10} {: >10} {}'.format('Seconds', 'Calls', 'Function'))
    for time, count, name in times:
        logger.info('{: >10.3f} {: >10} {}'.format(time, count, name))
def _guess_cdr3_start(group):
    """
    Return a guess for the CDR3 start within sequences in the given group
    """
    return Counter(group.V_CDR3_start).most_common()[0][0]
def calc_rs_pos(self) -> Dict[str, float]:
    """Calculate the ratio of each pos of words in input text

    Returns:
        float: the ratio of each pos of words in input text
    """
    pos = []
    # TODO: It may take a long time when the number of sentences is large
    for sentence in self.sentences:
        juman_result = self.juman.analysis(sentence)
        pos += [mrph.hinsi for mrph in juman_result.mrph_list()]
    pos_counter = Counter(pos)
    total = sum(pos_counter.values())
    return {name: float(num) / total for name, num in pos_counter.items()}
def __init__(self, **kwargs):
    Metric.__init__(self, **kwargs)
    self.d = collections.Counter()