def generate_collocations(tokens, window_size=2, freq_filter=1, top_n=5):
    '''
    Given list of tokens, return collocations.

    tokens      -- sequence of word tokens passed to
                   BigramCollocationFinder.from_words.
    window_size -- window_size forwarded to from_words (default 2).
    freq_filter -- minimum frequency forwarded to apply_freq_filter
                   (default 1).
    top_n       -- how many bigrams to request from finder.nbest
                   (default 5).

    Returns the result of finder.nbest() ranked by likelihood ratio.
    Words shorter than 3 characters and English stop words (compared
    case-insensitively) are filtered out before ranking.
    '''
    # A frozenset gives constant-time membership tests; the filter below is
    # evaluated for every word the finder has seen.
    ignored_words = frozenset(nltk.corpus.stopwords.words('english'))
    bigram_measures = nltk.collocations.BigramAssocMeasures()

    # Best results with window_size, freq_filter of: (2,1) (2,2) (5,1)
    finder = BigramCollocationFinder.from_words(tokens, window_size=window_size)
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in ignored_words)
    finder.apply_freq_filter(freq_filter)

    return finder.nbest(bigram_measures.likelihood_ratio, top_n)
def extract_bigrams(self, text):
    """
    Extract bigram phrases from *text*.

    The text is cleaned with self.remove_return_lines_and_quotes, passed
    through a Porter stemmer, lower-cased and tokenized. Tokens that are
    stop words, listed punctuation, or shorter than 3 characters are
    dropped. Bigrams occurring at least twice are ranked by PMI (top 1000),
    each is joined into a "word1 word2" string and POS-tagged as a single
    token; phrases whose tag is in the excluded set are discarded.

    :param text: raw input text
    :return: list of "word1 word2" strings
    """
    text = self.remove_return_lines_and_quotes(text)

    # English stop words plus punctuation tokens, held in a set so the
    # per-token membership test below is constant time.
    more_stop_words = ['(', ')', "'s", ',', ':', '<', '>', '.', '-', '&',
                       '*', '...']
    stop = set(stopwords.words('english'))
    stop.update(more_stop_words)

    # NOTE(review): the stemmer is applied to the whole text string, not to
    # individual tokens. Kept as-is to preserve current output -- confirm
    # whether per-token stemming was intended.
    st = PorterStemmer()
    stemmed_text = st.stem(text)

    tokens = [
        tok for tok in nltk.word_tokenize(stemmed_text.lower())
        if len(tok) > 2 and tok not in stop
    ]

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(2)  # keep bigrams seen at least twice
    top_bigrams = finder.nbest(bigram_measures.pmi, 1000)

    excluded_tags = frozenset(['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP',
                               'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$'])

    bigrams = []
    for pair in top_bigrams:
        phrase = " ".join(pair)
        # Each phrase is tagged on its own, as a one-element token list;
        # tagging all phrases in one call could change the assigned tags.
        tagged_phrase, tag = nltk.pos_tag([phrase])[0]
        if tag not in excluded_tags:
            bigrams.append(tagged_phrase)
    return bigrams
def get_bigram_likelihood(statements, freq_filter=3, nbest=200):
    """
    Scores the bi-grams of a group of documents by likelihood ratio

    :param statements: list of strings
    :param freq_filter: minimum # of appearances for a bi-gram to be kept
    :param nbest: accepted for backward compatibility; NOT applied to the
        result -- every bi-gram that survives the filters is returned.
        NOTE(review): confirm whether the result should be cut to nbest.
    :return: output of the finder's score_ngrams() for the likelihood-ratio
        measure (per the NLTK docs: (bi-gram, score) pairs, best first)
    """
    print('Generating word list...')

    # Build the tokenizer once; \w+ keeps only runs of word characters,
    # which drops punctuation and other non-words.
    tokenizer = RegexpTokenizer(r'\w+')

    # tokenize sentences into one flat word list
    words = list()
    for statement in statements:
        words.extend(tokenizer.tokenize(statement))

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bigram_finder = BigramCollocationFinder.from_words(words)

    # only bi-grams that appear n+ times
    bigram_finder.apply_freq_filter(freq_filter)

    # TODO: use custom stop words
    # Load the stop words once, as a set, instead of re-reading the corpus
    # inside the filter for every candidate word.
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    bigram_finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in stop_words)

    return bigram_finder.score_ngrams(bigram_measures.likelihood_ratio)
def display_collocations(articles):
    '''
    Given list of article tuples (title, text, url), generates and PRINTS
    collocations (no return).

    Articles with an empty title, and articles for which
    generate_collocations raises TypeError, are skipped. After each article
    the user is prompted: an empty reply moves on to the next article, any
    other input exits the program with status 0.
    '''
    def clear_screen():
        # 'cls' on Windows, 'clear' everywhere else.
        os.system('cls' if os.name == 'nt' else 'clear')

    total = len(articles)
    for article_number, article in enumerate(articles, 1):
        title, text, url = article
        if title == '':
            continue

        clear_screen()
        # Only the collocation step is guarded, so a TypeError raised
        # anywhere else is not silently swallowed.
        try:
            colls = generate_collocations(text)
        except TypeError:
            continue

        print("Article {}/{}".format(article_number, total))
        print('---------------')
        print("{}\n".format(title))
        print("Link: {}\n".format(url))
        print("Topics:")
        print("; ".join(
            "{} {}".format(word1, word2) for word1, word2 in colls))
        print('---------------\n')
        print("Press ENTER for next article or any key to exit.")

        # raw_input is the Python 2 builtin this code relies on.
        user_input = raw_input("> ")
        if user_input:
            # Same effect as exit(0), without depending on the site module.
            raise SystemExit(0)

    clear_screen()