The following 27 code examples, extracted from open-source Python projects, illustrate how to use nltk.bigrams().
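Before the project code, a quick illustration of the call itself: nltk.bigrams() accepts any sequence of tokens and lazily yields each pair of adjacent items, so the result is usually wrapped in list() before it is indexed or reused.

import nltk

tokens = "the quick brown fox".split()
# nltk.bigrams() returns a generator of adjacent token pairs
print(list(nltk.bigrams(tokens)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]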
def tokenize(text):
    # text = NB.remove_punctuation(text)
    try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
    except:
        text = text.encode('ascii', 'replace').strip().lower()
    # split punctuation into separate tokens, but don't split the single
    # quote inside words like "don't"
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
    biword = [b for b in nltk.bigrams(word)]
    triword = [t for t in nltk.trigrams(word)]
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # triword
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence
    in the document of each of the tokens in `bigrams`. This extractor function
    only considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False), ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
    return features

#////////////////////////////////////////////////////////////
#{ Helper Functions
#////////////////////////////////////////////////////////////
def createbigramvocabulary(reviewfile, vocabfile):
    createvocabulary(reviewfile, vocabfile)
    finput = open(reviewfile, "r")
    foutput = open(vocabfile, "a")
    all_bigrams = []
    for line in finput:
        tokenized_line = []
        tokenized_line.append('*')
        tokenized_line.extend(word_tokenize(line[1:]))
        tokenized_line.append('$')
        bgrms = bigrams(tokenized_line)
        all_bigrams.extend(bgrms)
    c = Counter(all_bigrams)
    for b in c:
        if b[0] != "+" and b[0] != "-" and c[b] >= 3:
            foutput.write(b[0] + " " + b[1] + "\n")
    finput.close()
    foutput.close()
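The example above pads each line with '*' and '$' boundary markers, counts bigrams across the whole file, and keeps only those seen at least three times. A self-contained sketch of that count-and-threshold pattern (str.split() stands in for word_tokenize so it runs without the punkt download; the corpus and the cutoff of 2 are made up for illustration):

from collections import Counter
from nltk import bigrams

lines = ["this movie was great", "this movie was awful"]
all_bigrams = []
for line in lines:
    # pad with the same start/end markers as the example above
    padded = ['*'] + line.split() + ['$']
    all_bigrams.extend(bigrams(padded))
counts = Counter(all_bigrams)
# keep bigrams seen at least twice (the example's cutoff is 3)
print([b for b, n in counts.items() if n >= 2])
# [('*', 'this'), ('this', 'movie'), ('movie', 'was')]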
def posTrigramsScore(trigrams, category, pos_tags_trigrams, labels):
    # keep pos-tag trigrams of the specific category
    trigrams_category = subList(pos_tags_trigrams, labels, category)
    # initialize dictionary
    d = {}
    # calculate score for every trigram
    for trigram in trigrams:
        d[trigram] = score(trigram, category, trigrams_category, pos_tags_trigrams)
    return d

#calculate bigram's f1 score
def __init__(self, lexicon):
    # initialize two dictionaries (unigrams and bigrams)
    self.d_unigrams = {}
    self.d_bigrams = {}
    # select which lexicon to load
    if lexicon == 0:
        self.loadHashtagLexicon1()
    elif lexicon == 1:
        self.loadHashtagLexicon2()
    elif lexicon == 2:
        self.loadMaxDiffTwitterLexicon()
    elif lexicon == 3:
        self.loadSentiment140Lexicon1()
    elif lexicon == 4:
        self.loadSentiment140Lexicon2()
    elif lexicon == 5:
        self.loadEmotionLexicon()
    else:
        print("Lexicon unavailable, please load another one")

#HashtagSentimentAffLexNegLex
def loadUnigrams(self, path, reverse=False):
    f = open(path)
    for line in f.readlines():
        line = line.decode('utf8')
        key = line.split("\t")[0]
        value = line.split("\t")[1]
        if reverse:
            self.d_unigrams[value] = float(key)
        else:
            self.d_unigrams[key] = float(value)
    f.close()

#load bigrams lexicon
def score(self, tokens):
    total = 0.0
    # score for unigrams
    for token in tokens:
        total += self.d_unigrams.get(token, 0.0)
    # score for bigrams, if bigrams exist
    if len(self.d_bigrams) > 0:
        # distinct bigrams of the message (duplicates removed)
        bigrams_list = Counter(list(bigrams(tokens))).keys()
        for bigram in bigrams_list:
            total += self.d_bigrams.get(bigram, 0.0)
    return total

#compute the number of tokens (words) that appear in the lexicon
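Together, loadUnigrams and score implement a simple additive lexicon: every token contributes its unigram weight, and every distinct bigram of the message contributes its bigram weight. A standalone sketch of the same scoring rule, with made-up lexicon weights:

from collections import Counter
from nltk import bigrams

# hypothetical lexicon weights, for illustration only
d_unigrams = {"good": 1.2, "not": -0.5}
d_bigrams = {("not", "good"): -2.0}

def lexicon_score(tokens):
    total = sum(d_unigrams.get(t, 0.0) for t in tokens)
    # each distinct bigram counts once, as in the class above
    for bg in Counter(bigrams(tokens)):
        total += d_bigrams.get(bg, 0.0)
    return total

print(lexicon_score("this is not good".split()))  # -1.3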
def getTrigramsSet(pos_bigrams):
    s = set()
    for x in pos_bigrams:
        for bigram in x:
            s.add(bigram)
    return list(s)

#calculate bigrams of every item of the list l
def getBigrams(l):
    b = []
    for x in l:
        b.append(list(bigrams(x)))
    return b

#calculate trigrams of every item of the list l
def posBigramsScore(bigrams, category, pos_tags_bigrams, labels):
    # keep pos-tag bigrams of the specific category
    bigrams_category = subList(pos_tags_bigrams, labels, category)
    # initialize dictionary
    d = {}
    # calculate score for every bigram
    for bigram in bigrams:
        d[bigram] = score(bigram, category, bigrams_category, pos_tags_bigrams)
    return d

#calculate pos trigram score
def loadHashtagLexicon2(self):
    folder = "NRC-Hashtag-Sentiment-Lexicon-v0.1/"
    file1 = "unigrams-pmilexicon.txt"
    file2 = "bigrams-pmilexicon.txt"
    # clear previous dictionaries
    self.clearDictionaries()
    # load unigrams
    self.loadUnigrams(NRCLexicon.directory + folder + file1)
    # load bigrams
    self.loadBigrams(NRCLexicon.directory + folder + file2)

#MaxDiff-Twitter-Lexicon
def loadMaxDiffTwitterLexicon(self):
    folder = "MaxDiff-Twitter-Lexicon/"
    file1 = "Maxdiff-Twitter-Lexicon_-1to1.txt"
    # clear previous dictionaries
    self.clearDictionaries()
    # load unigrams - reverse=True due to the .txt file format
    self.loadUnigrams(NRCLexicon.directory + folder + file1, True)
    # this lexicon has no bigrams, so d_bigrams remains empty

#Sentiment140AffLexNegLex
def loadSentiment140Lexicon1(self):
    folder = "Sentiment140AffLexNegLex/"
    file1 = "S140-AFFLEX-NEGLEX-unigrams.txt"
    file2 = "S140-AFFLEX-NEGLEX-bigrams.txt"
    # clear previous dictionaries
    self.clearDictionaries()
    # load unigrams
    self.loadUnigrams(NRCLexicon.directory + folder + file1)
    # load bigrams
    self.loadBigrams(NRCLexicon.directory + folder + file2)

#Sentiment140-Lexicon-v0.1
def loadSentiment140Lexicon2(self):
    folder = "Sentiment140-Lexicon-v0.1/"
    file1 = "unigrams-pmilexicon.txt"
    file2 = "bigrams-pmilexicon.txt"
    # clear previous dictionaries
    self.clearDictionaries()
    # load unigrams
    self.loadUnigrams(NRCLexicon.directory + folder + file1)
    # load bigrams
    self.loadBigrams(NRCLexicon.directory + folder + file2)

#NRC-Emotion-Lexicon-v0.92
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two-word tokens.
    Punctuation is considered as a separate token.'''
    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    for bigram in bigrams:
        output(sep.join(bigram))
def bigram_predict(testSet, PP, PN, positive_probabilities, negative_probabilities,
                   unseen_pos_prob, unseen_neg_prob):
    predicted_class = []
    for review in testSet:
        negative_probab = math.log10(PN)
        positive_probab = math.log10(PP)
        review_words = ['*']
        review_words.extend(word_tokenize(review))
        review_words.append('$')
        review_bigrams = bigrams(review_words)
        for bigram in review_bigrams:
            w = bigram[0] + " " + bigram[1]
            if w in negative_probabilities and w in positive_probabilities:
                # bigram was seen in training for both classes
                negative_probab += math.log10(negative_probabilities[w])
                positive_probab += math.log10(positive_probabilities[w])
            elif bigram[0] in negative_probabilities and bigram[0] in positive_probabilities:
                # back off to the probability of the bigram's first word
                negative_probab += math.log10(negative_probabilities[bigram[0]])
                positive_probab += math.log10(positive_probabilities[bigram[0]])
            else:
                # completely unseen: fall back to the unseen-event probability
                negative_probab += math.log10(unseen_neg_prob)
                positive_probab += math.log10(unseen_pos_prob)
        if negative_probab > positive_probab:
            result = '-'
        else:
            result = '+'
        predicted_class.append(result)
    return predicted_class
def get_valid_bigram_words(self, words):
    _words = []
    for i in nltk.bigrams(words):
        if (len(i[0]) >= self.min_len) and (len(i[1]) >= self.min_len):
            if (not self.exclude_stopwords) or ((i[0] not in config.STOP_WORDS) and (i[1] not in config.STOP_WORDS)):
                if (not self.skip_digit) or ((len(re.findall(re.compile(r"\d+"), i[0])) == 0) and (len(re.findall(re.compile(r"\d+"), i[1])) == 0)):
                    _words.append(" ".join(i))
    return _words
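A functional rendering of the same filter, with the instance attributes inlined as parameters; the stop-word set and the example defaults are hypothetical stand-ins for the class's configuration:

import re
import nltk

STOP_WORDS = {"the", "a", "of"}  # stand-in for config.STOP_WORDS

def valid_bigrams(words, min_len=2, exclude_stopwords=True, skip_digit=True):
    out = []
    for w1, w2 in nltk.bigrams(words):
        if len(w1) < min_len or len(w2) < min_len:
            continue  # either word too short
        if exclude_stopwords and (w1 in STOP_WORDS or w2 in STOP_WORDS):
            continue  # drop bigrams touching a stop word
        if skip_digit and (re.search(r"\d", w1) or re.search(r"\d", w2)):
            continue  # drop bigrams containing digits
        out.append(w1 + " " + w2)
    return out

print(valid_bigrams("the price of item42 rose sharply".split()))
# ['rose sharply']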
def Markov_generate_unigram(seed):
    seed = ''.join(seed)
    counter = 1
    next_word_list = []
    # collect every word that follows the seed word in the corpus
    for i in data:
        if seed == i:
            next_word_list.append(data[counter])
        counter += 1
    if len(next_word_list) == 0:
        # seed never occurs in the corpus; fall back to a default bigram
        return nltk.bigrams(["you", "are"])
    cfdist = nltk.FreqDist(next_word_list)
    next_word = cfdist.max()
    return nltk.bigrams([seed, next_word])
def Markov_generate_bigrams(tuples):
    counter = 1
    index_list = []
    data_bigrams = nltk.bigrams(data)
    # collect every word that follows the given bigram in the corpus
    for i in data_bigrams:
        if tuples == i:
            index_list.append(data[counter + 1])
        counter += 1
    return index_list
def calc_cfd(doc):
    # Calculate conditional frequency distribution of bigrams
    words = [w for w, t in Mecab().pos(doc)]
    bigrams = nltk.bigrams(words)
    return nltk.ConditionalFreqDist(bigrams)
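Mecab here is presumably the Korean morphological analyzer exposed by KoNLPy; the pattern itself is tokenizer-agnostic. The resulting ConditionalFreqDist maps each word to a frequency distribution over the words observed to follow it, as this sketch with a plain whitespace tokenizer shows:

import nltk

words = "to be or not to be".split()
cfd = nltk.ConditionalFreqDist(nltk.bigrams(words))
print(cfd["to"].max())  # 'be' - the most frequent successor of "to"
print(dict(cfd["or"]))  # {'not': 1}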
def converse(raw_sentence):
    words_in_sent = raw_sentence.split()
    if len(words_in_sent) > 1:
        bigrams = nltk.bigrams(words_in_sent)
    else:
        bigrams = Markov_generate_unigram(words_in_sent)
    text_len = 20
    generated_lines = []
    for tuples in bigrams:
        line = []
        line.append(''.join(tuples[0]).title() + " ")
        line.append(''.join(tuples[1]) + " ")
        for i in range(text_len):
            next_words = Markov_generate_bigrams(tuples)
            if not next_words:
                break
            cfdist = nltk.FreqDist(next_words)
            next_word = cfdist.max()
            line.append(next_word + " ")
            new_tuple = (tuples[1], next_word)
            del tuples
            tuples = new_tuple
        generated_lines.append(line)
    longest_line = ''
    for line in generated_lines:
        stri = ''.join(line)
        if "." in stri:
            truncate_char = "."
        elif "?" in stri:
            truncate_char = "?"
        elif "!" in stri:
            truncate_char = "!"
        try:
            # truncate at the first sentence-ending punctuation, if any
            stri = stri[:stri.index(truncate_char)]
        except:
            pass
        if len(line) > len(longest_line):
            longest_line = stri.strip() + "."
    return longest_line
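The Markov generation helpers above rescan the corpus list for every generated word. A common alternative is to precompute the successor table once with ConditionalFreqDist (as calc_cfd does) and then walk it greedily; a minimal sketch with a made-up corpus:

import nltk

data = "you are what you eat because you are hungry".split()
cfd = nltk.ConditionalFreqDist(nltk.bigrams(data))

def generate(word, length=8):
    out = [word]
    for _ in range(length):
        if word not in cfd:  # dead end: no observed successor
            break
        word = cfd[word].max()  # greedy: pick the most frequent successor
        out.append(word)
    return " ".join(out)

print(generate("you"))
# you are what you are what you are what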