The following 24 code examples, extracted from open-source Python projects, illustrate how to use nltk.ngrams().
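For reference, nltk.ngrams(sequence, n) accepts any iterable of tokens and returns a generator of n-length tuples, so it works the same way on word lists, POS-tag lists, or raw character strings. Below is a minimal sketch of the basic call (the sample sentence is illustrative only; the 'punkt' tokenizer models must be downloaded once via nltk.download('punkt')):

import nltk

# Word-tokenize a sample sentence, then build word bigrams from the tokens.
tokens = nltk.word_tokenize("New York is a city with a huge population.")
bigrams = list(nltk.ngrams(tokens, 2))
print(bigrams)  # [('New', 'York'), ('York', 'is'), ('is', 'a'), ...]
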
def ngram_list(n, word_list, stop_word_list=None):
    """
    Generate ngrams with width n, excluding those that are entirely formed of stop words

    Args:
        n (int): i.e. 1, 2, 3...
        word_list (list of str): list of words
        stop_word_list (list of str, Optional): list of words that should be excluded
            while obtaining the list of ngrams

    Returns:
        list of str: List of ngrams formed from the given word list, except for those
            that have all their tokens in the stop words list
    """
    stop_word_set = set(stop_word_list) if stop_word_list else set()
    all_ngrams = nltk.ngrams(word_list, n)
    ngram_list = []
    for ngram in all_ngrams:
        lowered_ngram_tokens = map(lambda token: token.lower(), ngram)
        if any(token not in stop_word_set for token in lowered_ngram_tokens):
            ngram_list.append(' '.join(ngram))
    return ngram_list

def get(self, person_id):
    n = 2
    occurs = []
    grams_arr = []
    sixgrams = ngrams(str_read.split(), n)
    for grams in sixgrams:
        # print(str(grams))
        x = NGram.compare('{}'.format(person_id), str(grams))
        occurs.append(x)
        grams_arr.append(str(grams))
    main_fields = {'occurs': fields.String, "word": fields.String}
    datas = {'occurs': "{}".format(max(occurs) * 1000),
             'word': "{}".format(grams_arr[occurs.index(max(occurs))])}
    x = marshal(datas, main_fields)
    # json.dumps(marshal(datas, main_fields))
    return x

def get(self, person_id):
    n = 2
    occurs = []
    grams_arr = []
    sixgrams = ngrams(str_read.split(), n)
    for grams in sixgrams:
        # print(str(grams))
        x = NGram.compare('{}'.format(person_id.decode('latin-1')), str(grams))
        occurs.append(x)
        grams_arr.append(str(grams))
    main_fields = {'occurs': fields.String, "word": fields.String}
    datas = {'occurs': "{}".format(max(occurs) * 1000),
             'word': "{}".format(grams_arr[occurs.index(max(occurs))])}
    x = marshal(datas, main_fields)
    # json.dumps(marshal(datas, main_fields))
    return x

def extract_ngrams2(sentences, stemmer, language, N=2):
    '''
    Parameter Arguments:
    sentences: list of sentences ['New York is a city.', 'It has a huge population.']
    N: Length of the n-grams e.g. 1, 2

    return: a list of n-grams
    [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'),
     ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    ngrams_list = []
    for sent in sentences:
        sent = re.sub(r'[-](,?\s)', '\\1', sent)  # case where "magister-" has to be handled
        ngram_items = list(ngrams(sent2stokens(sent, stemmer, language), N))
        for i, ngram in enumerate(ngram_items):
            ngram_str = ' '.join(ngram)
            ngrams_list.append(ngram_str)
    return ngrams_list

def extract_nuggets(sentences, nugget_type, language):
    '''
    Parameter Arguments:
    sentences: list of sentences ['New York is a city.', 'It has a huge population.']

    return: a list of noun phrases, events, named_entities
    [('new', 'york'), ('york', 'is'), ('a', 'city'),
     ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    nugget_list = []
    for sent in sentences:
        if nugget_type == 'n-grams':
            nugget_items = list(ngrams(sent2stokens(sent, language), 2))
        if nugget_type == 'NP':
            nugget_items = get_phrases(sent, 'NP')
        if nugget_type == 'Phrases':
            nugget_items = get_phrases(sent, 'Phrases')
        if nugget_type == 'NE':
            nugget_items = get_phrases(sent, 'NE')
        for nugget in nugget_items:
            nugget_list.append(' '.join(nugget))
    return nugget_list

def add_sentences(self, sentences):
    """
    @type sentences: list[Sentence]
    """
    counter = self.counter
    G = self.G
    for sent in sentences:
        counter.update(ngrams(sent.tokens, self.N))
        G.add_nodes_from(sent.tokens)

    updated_edges = []
    for v in counter.elements():
        s = v[0]
        t = v[1]
        c = counter[v]
        updated_edges.append((s, t, c))
    G.add_weighted_edges_from(updated_edges)

def words2ngrams(sep, num, tokens):
    '''Convert word tokens into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, num))
    write_csv(ngrams, str(sep))

def text2ngrams(sep, num, text):
    '''Tokenize plain text into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    try:
        tokens = nltk.word_tokenize(content)
        ngrams = list(nltk.ngrams(tokens, num))
        write_csv(ngrams, str(sep))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)

def __init__(self, body, author='Anonymous'):
    # accumulators
    hashtags = []

    # Now process cleaned up text with NLTK
    words = []
    bigrams = []
    trigrams = []
    quadgrams = []
    sentences = []

    words = word_tokenize(body)
    sentences.extend(sent_tokenize(body))
    # Strip whitespace from each sentence
    sentences = [sentence.strip() for sentence in sentences]
    bigrams = ngrams(words, 2)
    trigrams = ngrams(words, 3)
    quadgrams = ngrams(words, 4)

    self.body = body
    self.words = words
    self.bigrams = bigrams
    self.trigrams = trigrams
    self.quadgrams = quadgrams
    self.sentences = sentences
    self.hashtags = hashtags
    self.author = author
    # TODO: Create "hashtags" from arbitrary number of rarest words

def build_ngrams(tokens, low, high):
    LOGGER.debug("Building ngrams from %d to %d" % (low, high))
    assert low <= high
    assert low > 0

    grams = {}
    for n in range(low, high + 1):
        grams[n] = [g for g in ngrams(tokens, n)]
    return grams

def build_pos_ngrams(tagged, low, high):
    LOGGER.debug("Building POS ngrams from %d to %d" % (low, high))
    assert low <= high
    assert low > 0

    pos_tokens = []
    pos_words = defaultdict(list)
    for word, pos in tagged:
        pos_tokens.append(pos)
        pos_words[pos].append(word)

    grams = {}
    for n in range(low, high + 1):
        grams[n] = [g for g in ngrams(pos_tokens, n)]
    return grams, pos_words

def ngrams_extract(string):
    if random.random() < SAMPLE_RATE:
        print('[*]', string)
    l = list
    grams = l(ngrams(string, 2)) + l(ngrams(string, 3)) + l(ngrams(string, 4)) + l(ngrams(string, 5))
    SIZE = 1024
    vec = zeros((SIZE,))
    for t in grams:
        vec[hash(t) % SIZE] += 1
    return log(vec + 1.0)

def get_word_ngrams(sequence, n=3):
    tokens = tokenize(sequence)
    return [' '.join(ngram) for ngram in ngrams(tokens, n)]

def gen_training_features(self, bodies_fpath, stances_fpath):
    print('Generating training features')
    self._train_bodies, self._train_stances = self._read(bodies_fpath, stances_fpath, True)

    print('Generating ngrams')
    ng_start = time.time()
    self._train_unigrams = self._gen_ngrams(1, self._train_bodies, self._train_stances)
    ng_end = time.time()
    print('ngrams generation time: ', (ng_end - ng_start), 'seconds')

    print('Generating jaccard similarities')
    js_start = time.time()
    self.train_avg_sims, self.train_max_sims = self._gen_jaccard_sims(
        self._train_bodies, self._train_stances)
    js_end = time.time()
    print('jaccard similarity generation time: ', (js_end - js_start), 'seconds')

    for i in range(len(self._train_stances)):
        labeled_feature = ({'unigrams': self._train_unigrams[i],
                            'avg_sims': self.train_avg_sims[i],
                            'max_sims': self.train_max_sims[i]},
                           self._train_stances[i]['Stance'])
        self._labeled_feature_set.append(labeled_feature)

def _get_ngrams(self, text, n):
    tokens = nltk.word_tokenize(text)
    tokens = [token.lower() for token in tokens if len(token) > 1]
    return nltk.ngrams(tokens, n)

def _get_ngrams(self, text, n):
    tokens = nltk.word_tokenize(text)
    tokens = [token.lower() for token in tokens if len(token) > 1]
    ngram_list = list(nltk.ngrams(tokens, n))
    return ngram_list

def naive_bayes(analysis):
    tags = []
    words = []
    deps_cc = []
    for sen in analysis["sentences"]:
        tags += sen['pos']
        words += sen['tokens']
        deps_cc += sen["deps_cc"]

    norm = normalize_title(tags, words)

    f1 = []
    current = (list(nltk.ngrams(norm.split(), 1)) + list(nltk.ngrams(norm.split(), 2))
               + list(nltk.ngrams(norm.split(), 3)))
    ngram_list = [' '.join(list(g)) for g in current]
    for pos in common_grams:
        if pos in ngram_list:
            f1.append(1)
        else:
            f1.append(0)
    f1 = numpy.array(f1).reshape(1, len(f1))

    # pos ngrams
    f2 = []
    current_pos = (list(nltk.ngrams(tags, 1)) + list(nltk.ngrams(tags, 2))
                   + list(nltk.ngrams(tags, 3)))
    ngram_list = [' '.join(list(g)) for g in current_pos]
    for pos in common_pos_grams:
        if pos in ngram_list:
            f2.append(1)
        else:
            f2.append(0)
    f2 = numpy.array(f2).reshape(1, len(f2))
    # print(f2.shape)

    # syntactic ngrams
    f3 = []
    current_sngrams = (list(syntactic_n_gram(deps_cc, 1)) + list(syntactic_n_gram(deps_cc, 2))
                       + list(syntactic_n_gram(deps_cc, 3)))
    ngram_list = [' '.join(list(g)) for g in current_sngrams]
    for pos in common_sn_grams:
        if pos in ngram_list:
            f3.append(1)
        else:
            f3.append(0)
    f3 = numpy.array(f3).reshape(1, len(f3))

    return [clf1.predict(f1)[0], clf2.predict(f2)[0], clf3.predict(f3)[0]]

def n_gram_analysis_simple(infile, gram, stop):
    ngram = dict()
    f = open(infile, "r")
    # f2 = codecs.open(outfile, "w+", "utf-8")
    for l in f:
        x = nltk.ngrams(l.split(), gram)
        for w in x:
            # if stop:
            #     if w not in stops:
            #         if w in ngram:
            #             ngram[w] += 1
            #         else:
            #             ngram[w] = 1
            if w in ngram:
                ngram[w] += 1
            else:
                ngram[w] = 1

    p = list(ngram.items())
    p.sort(key=lambda x: -x[1])
    print(len(p))
    for x in p[:10]:
        sen = ' '.join(x[0])
        cnt = int(x[1])
        if cnt == 0:
            cnt = 1
        print(sen, cnt)

def getNGrams(raw_string, gram_nb):
    xgrams = ngrams(raw_string.split(), gram_nb)
    return xgrams

def get(self, param_word):
    status = False
    n = 2
    occurs = []
    grams_arr = []
    words = []
    for key in r_server.scan_iter():
        words.append(key)
    # sixgrams = ngrams(str_read.split(), n)
    for keys in words:
        # print(str(grams))
        x = NGram.compare('{}'.format(param_word.decode('latin-1')), str(keys))
        occurs.append(x)
        grams_arr.append(str(keys))
    for key in r_server.scan_iter():
        if key == param_word:
            status = True
    if status is True:
        main_fields_true = {"word": fields.String, "status": fields.Boolean}
        datas_true = {'word': "{}".format(param_word), 'status': status}
        x_true = marshal(datas_true, main_fields_true)
        return x_true
    else:
        main_fields_false = {'occurs': fields.String, "word": fields.String,
                             "freq": fields.String, "status": fields.Boolean}
        datas_false = {'occurs': "{}".format(max(occurs) * 1000),
                       'word': "{}".format(grams_arr[occurs.index(max(occurs))]),
                       'freq': r_server.get(param_word), 'status': status}
        x_false = marshal(datas_false, main_fields_false)
        return x_false
    # json.dumps(marshal(datas, main_fields))
    # if datas["status"] == True:
    #     return datas["word"]
    # else:

def extract_ngrams(sentences, stoplist, stemmer, language, n=2):
    """Extract the ngrams of words from the input sentences.

    Args:
        n (int): the number of words for ngrams, defaults to 2
    """
    concepts = []
    for i, sentence in enumerate(sentences):

        # for each ngram of words
        tokens = sent2tokens(sentence, language)
        for j in range(len(tokens) - (n - 1)):

            # initialize ngram container
            ngram = []

            # for each token of the ngram
            for k in range(j, j + n):
                ngram.append(tokens[k].lower())

            # do not consider ngrams containing punctuation marks
            marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)]
            if len(marks) > 0:
                continue

            # do not consider ngrams composed of only stopwords
            stops = [t for t in ngram if t in stoplist]
            if len(stops) == len(ngram):
                continue

            # stem the ngram
            ngram = [stemmer.stem(t) for t in ngram]

            # add the ngram to the concepts
            concepts.append(' '.join(ngram))
    return concepts

def prune_ngrams(ngrams, stoplist, N=2):
    pruned_list = []
    for ngram in ngrams:
        items = ngram.split(' ')
        i = 0
        for item in items:
            if item in stoplist:
                i += 1
        if i < N:
            pruned_list.append(ngram)
    return pruned_list

def get_tech(text):
    """Get all technologies from the top 1000 tags on StackOverflow.
    """
    sentences = sent_tokenize(text)
    techs = set()
    for s in sentences:
        tokens = word_tokenize(s)
        techs |= set(tag for tag in tags if tag in tokens)

        bigrams = ['-'.join(ngram) for ngram in ngrams(tokens, 2)]
        techs |= set(tag for tag in tags if tag in bigrams)

        trigrams = ['-'.join(ngram) for ngram in ngrams(tokens, 3)]
        techs |= set(tag for tag in tags if tag in trigrams)
    return list(techs)