We extracted the following 50 code examples from open-source Python projects to illustrate how to use nltk.corpus.
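Before the project-specific examples, here is a minimal sketch of the nltk.corpus API itself, assuming the relevant corpora have already been fetched with nltk.download():

import nltk
from nltk.corpus import brown, stopwords

# One-time downloads (uncomment if the corpora are not installed yet):
# nltk.download('brown'); nltk.download('stopwords')

print(brown.words()[:10])               # tokens from the Brown corpus
print(brown.categories()[:5])           # Brown corpus categories
print(stopwords.words('english')[:10])  # English stopword list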
def createData():
    spwords = [unidecode(a.lower()) for a in set(nltk.corpus.cess_esp.words()) if len(a) > 3]
    enwords = [a.lower() for a in set(nltk.corpus.brown.words()) if len(a) > 3]
    jpwords = [unidecode(a) for a in jeita.words() if (len(unidecode(a)) and unidecode(a)[0].islower())]
    jpwords = [a for a in set(jpwords) if len(a) > 3]

    # minLen = min(len(enwords), len(spwords), len(jpwords))
    featuresets = \
        [(createTupleDict(w, numChars), 'English') for w in enwords] + \
        [(createTupleDict(w, numChars), 'Spanish') for w in spwords] + \
        [(createTupleDict(w, numChars), 'Japanese') for w in jpwords]

    random.shuffle(featuresets)
    l = int(len(featuresets) * 0.8)
    training_set = featuresets[:l]
    testing_set = featuresets[l:]
    return (training_set, testing_set)
def most_frequent_Brown_Corpus_words():
    import nltk
    import nltk.corpus

    words = []
    for word in nltk.corpus.brown.words():
        if word not in [",", ".", "``", "''", ";", "?", "--", ")", "(", ":", "!"]:
            words.append(word.lower())
    frequencies_words = nltk.FreqDist(words).most_common()
    words_most_frequent = [word[0] for word in frequencies_words]
    return words_most_frequent
def brown_data():
    """Return the first text_length tokens of the Brown corpus, tagged in pyrata format."""
    tokens = brown.words()
    tokens = tokens[:text_length]
    pos_tags = nltk.pos_tag(tokens)
    return [{'raw': w, 'pos': p} for (w, p) in pos_tags]

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
def get_word_clouds(tweets, users, words_n=50, lang='english'):
    default_stopwords = set(nltk.corpus.stopwords.words(lang))
    stopwords_file = '../data/stopwords.txt'
    custom_stopwords = set(open(stopwords_file, 'r').read().splitlines())
    all_stopwords = default_stopwords | custom_stopwords

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=list(all_stopwords))
    X = vectorizer.fit_transform(tweets)
    terms = vectorizer.get_feature_names()

    word_cloud_per_person = {}
    for doc in range(len(tweets)):
        feature_index = X[doc, :].nonzero()[1]
        tfidf_scores = zip(feature_index, [X[doc, x] for x in feature_index])
        doc_terms = []
        for word, score in [(terms[i], score) for (i, score) in tfidf_scores]:
            doc_terms.append((word, score))
        important_terms = [(word, score) for word, score in sorted(doc_terms, key=lambda x: x[1], reverse=True)][:words_n]
        word_cloud_per_person[users[doc]] = important_terms
    return word_cloud_per_person
def get_corpus_of_most_active_users(n_users=5):
    tweets = []
    texts = []
    with open(DATASET_PATH) as f:
        for line in f:
            tweets.append(json.loads(line)['user']['screen_name'])
            texts.append((json.loads(line)['user']['screen_name'], json.loads(line)['text']))
    users = nltk.FreqDist(tweets).most_common(n_users)

    dict = {}
    for user, tweet in texts:
        if user in dict:
            dict[user] = " ".join([dict[user], tweet])
        else:
            dict[user] = tweet

    corpus = [dict[name] for name, _ in users]
    user_names = [name for name, _ in users]
    return corpus, user_names
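The two functions above look like they belong to the same project and chain naturally: the second builds one document per active user, and the first turns those documents into per-user TF-IDF term lists. A hedged usage sketch, assuming both are defined together with their imports:

corpus, user_names = get_corpus_of_most_active_users(n_users=5)
word_clouds = get_word_clouds(corpus, user_names, words_n=50, lang='english')
for user, terms in word_clouds.items():
    print(user, terms[:5])  # top-scoring TF-IDF terms for each user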
def cut_low_freq(self, corpus, threshold=1):
    new_vocas = []
    new_docfreq = []
    self.vocas_id = dict()
    conv_map = dict()
    for id, term in enumerate(self.vocas):
        freq = self.docfreq[id]
        if freq > threshold:
            new_id = len(new_vocas)
            self.vocas_id[term] = new_id
            new_vocas.append(term)
            new_docfreq.append(freq)
            conv_map[id] = new_id
    self.vocas = new_vocas
    self.docfreq = new_docfreq
    return np.array([self.conv(doc, conv_map) for doc in corpus])
def closure(self, rel, depth=-1):
    """Return the transitive closure of source under the rel
    relationship, breadth-first

        >>> from nltk.corpus import wordnet as wn
        >>> dog = wn.synset('dog.n.01')
        >>> hyp = lambda s: s.hypernyms()
        >>> list(dog.closure(hyp))
        [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
        Synset('carnivore.n.01'), Synset('animal.n.01'),
        Synset('placental.n.01'), Synset('organism.n.01'),
        Synset('mammal.n.01'), Synset('living_thing.n.01'),
        Synset('vertebrate.n.01'), Synset('whole.n.02'),
        Synset('chordate.n.01'), Synset('object.n.01'),
        Synset('physical_entity.n.01'), Synset('entity.n.01')]
    """
    from nltk.util import breadth_first
    synset_offsets = []
    for synset in breadth_first(self, rel, depth):
        if synset._offset != self._offset:
            if synset._offset not in synset_offsets:
                synset_offsets.append(synset._offset)
                yield synset
def res_similarity(self, other, ic, verbose=False):
    """
    Resnik Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float score denoting the similarity of the two ``Synset``
        objects. Synsets whose LCS is the root node of the taxonomy will
        have a score of 0 (e.g. N['dog'][0] and N['table'][0]).
    """
    ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
    return lcs_ic
def lin_similarity(self, other, ic, verbose=False):
    """
    Lin Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node) and that of the two input Synsets. The relationship is
    given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float score denoting the similarity of the two ``Synset``
        objects, in the range 0 to 1.
    """
    ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
    return (2.0 * lcs_ic) / (ic1 + ic2)
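The two methods above are from NLTK's WordNet Synset implementation. A minimal usage sketch, assuming the wordnet and wordnet_ic corpora have been downloaded via nltk.download():

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

# Load an information content dictionary built from the Brown corpus.
brown_ic = wordnet_ic.ic('ic-brown.dat')

dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')

# Resnik similarity: IC of the least common subsumer.
print(dog.res_similarity(cat, brown_ic))
# Lin similarity: 2 * IC(lcs) / (IC(s1) + IC(s2)), in the range 0 to 1.
print(dog.lin_similarity(cat, brown_ic))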
def ieer_headlines():
    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)

#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
#############################################
def conllesp():
    from nltk.corpus import conll2002

    de = """ .* ( de/SP| del/SP ) """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [rel for doc in conll2002.chunked_sents('esp.train')
            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)]
    for r in rels[:10]:
        print(clause(r, relsym='DE'))
    print()
def create_corpus(fileids, max_length=None):
    """
    Creates a corpus from fileids
    Removes stopwords and punctuation
    Returns a list of strings
    """
    sw = set(stopwords.words("english"))
    tokenizer = nltk.tokenize.RegexpTokenizer(r"[A-Za-z]+")

    corpus = []
    for doc in fileids:
        words = (w.lower() for w in tokenizer.tokenize(reuters.raw(doc)))
        words = [w for w in words if w not in sw]
        if max_length:
            words = words[:max_length]
        corpus.append(" ".join(words))
    return corpus
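A plausible call for the function above, assuming its module already imports nltk, stopwords, and reuters, and that the Reuters corpus is installed (the 'grain' category is just an illustrative choice):

from nltk.corpus import reuters

docs = create_corpus(reuters.fileids(categories='grain'), max_length=200)
print(len(docs))      # number of documents in the cleaned corpus
print(docs[0][:80])   # first 80 characters of the first cleaned document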
def get_pos_tagger(self):
    from nltk.corpus import brown
    regexp_tagger = RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
        ])
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

    return main_tagger
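The snippet above builds a backoff chain: each n-gram tagger defers to the next simpler tagger when it has not seen the context, ending at a regex fallback. A minimal standalone sketch of the same idea outside the class context (the tagger classes come from nltk.tag, which the original snippet is assumed to import elsewhere):

from nltk.corpus import brown
from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger

regexp_tagger = RegexpTagger([(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*', 'NN')])
train = brown.tagged_sents(categories='news')

# Each n-gram tagger falls back to the next simpler tagger for unseen contexts.
unigram = UnigramTagger(train, backoff=regexp_tagger)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

print(trigram.tag(['The', 'cat', 'sat', 'on', 'the', 'mat']))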
def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        print "I SHOULD BE PRINTED ONLY ONCE"
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if not brown_word_counter.has_key(word):
                    brown_word_counter[word] = 0
                brown_word_counter[word] = brown_word_counter[word] + 1
                N = N + 1

    lookup_word = lookup_word.lower()
    n = 0 if not brown_word_counter.has_key(lookup_word) else brown_word_counter[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if not word in brown_freqs:
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1

    lookup_word = lookup_word.lower()
    n = 0 if not lookup_word in brown_freqs else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
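Several near-identical info_content variants appear in this collection; all rely on module-level globals (N, a Brown word-count dictionary, and imports of math and brown) that are not shown. A self-contained sketch under that assumption:

import math
from nltk.corpus import brown

brown_freqs = {}  # word -> count in the Brown corpus
N = 0             # total number of tokens counted so far

def info_content(lookup_word):
    """Laplace-smoothed information content of lookup_word, in [0, 1]."""
    global N
    if N == 0:
        # Build the frequency table once, on first call.
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                brown_freqs[word] = brown_freqs.get(word, 0) + 1
                N += 1
    n = brown_freqs.get(lookup_word.lower(), 0)
    # Frequent words get low information content, rare words high.
    return 1.0 - (math.log(n + 1) / math.log(N + 1))

print(info_content('the'))       # frequent word -> low score
print(info_content('aardvark'))  # rare word -> close to 1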
def normalize_corpus(corpus, lemmatize=True, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
    return normalized_corpus
def normalize_corpus(corpus, lemmatize=True, only_text_chars=False, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
    return normalized_corpus
def preprocess(content):
    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
    words_set = []
    for twitter in content:
        words_set += (word_tokenizer.tokenize(twitter['twitter_content']))
    words_set = list(set(words_set))

    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    # only need the alphabetic word
    formartted_twitter_words_set = []
    for word in words_set:
        if (word.isalpha() != False) and (word not in non_words) and (word not in stop_words):
            formartted_twitter_words_set.append(lemmatizer.lemmatize(word))

    nltk_words_set = list(set(nltk.corpus.words.words()))

    # training whole set
    training_set = formartted_twitter_words_set + nltk_words_set
    return training_set
def store_synset_primarySense(word):
    result = {}
    check_item = sort_orderedDict(primary_sense(word.lower()))
    if len(check_item) == 1:
        if wn.lemma_from_key(check_item.keys()[0]).synset().pos() == 'n' or wn.lemma_from_key(check_item.keys()[0]).synset().pos() == 'v':
            result[word] = wn.lemma_from_key(check_item.keys()[0]).synset()
    elif len(check_item) > 1:
        for index in range(len(check_item.keys())):
            try:
                if wn.lemma_from_key(check_item.keys()[index]).synset().pos() == 'n' or wn.lemma_from_key(check_item.keys()[index]).synset().pos() == 'v':
                    result[word] = wn.lemma_from_key(check_item.keys()[index]).synset()
                    continue
            except nltk.corpus.reader.wordnet.WordNetError:
                continue
            else:
                pass
    else:
        return 0
    return result

# use the lemmatizer defined in the previous workshop
def documents(self, fold=None, train=False, test=False):
    """
    A generator of documents being streamed from disk. Each document is
    a list of paragraphs, which are a list of sentences, which in turn
    is a list of tuples of (token, tag) pairs. All preprocessing is done
    by NLTK and the CorpusReader object this object wraps.

    If a fold is specified (should be an integer between 0 and folds),
    then the loader will return documents from that fold. Further, train
    or test must be specified to split the fold correctly. This method
    allows us to maintain the generator properties of document reads.
    """
    for fileid in self.fileids(fold, train, test):
        yield list(self.corpus.tagged(fileids=fileid))

##########################################################################
## Normalize Transformer
##########################################################################
def swoogle(query, termbool):
    extraselectors = []
    if termbool is True:
        conceptSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+query.lower()+'&pos=NN&N=100&sim_type=concept&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+query.lower())
    else:
        conceptSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+query.lower()+'&pos=NN&N=30&sim_type=concept&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+query.lower())
    #relationSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+sys.argv[1]+'&pos=NN&N=100&sim_type=relation&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+sys.argv[1])

    conceptSoup = BeautifulSoup(conceptSim.text)
    conceptTextArea = conceptSoup.findAll("textarea")
    conceptText = conceptTextArea[0].contents[0]

    lines = conceptText.split(",")
    for line in lines:
        line = line.strip()
        parts = line.split("_")
        extraselectors.append(parts[0])
    return extraselectors
def add_more_sentences(self, corpuspath):
    """
    Load sentences with relations from another corpus
    :param corpuspath: corpus path
    :return:
    """
    nsentences = 0
    for did in self.documents:
        nsentences += len(self.documents[did].sentences)
    print "base corpus has {} sentences".format(nsentences)

    corpus2 = pickle.load(open(corpuspath, 'rb'))
    nsentences = 0
    for did in corpus2.documents:
        if did in self.documents:
            print "repeated did:", did
        else:
            self.documents[did] = corpus2.documents[did]
            nsentences += len(corpus2.documents[did].sentences)
            #for sentence in corpus2.documents[did].sentences:
            #    if any([len(e.targets) > 1 for e in sentence.entities.elist["goldstandard"]]):
            #        print "found sentence with relations:", sentence.sid
            #    if len(sentence.entities.elist["goldstandard"]) > 1:
            #        self.documents[sentence.sid] = Document(sentence.text, sentences=[sentence])
    print "added {} sentences".format(nsentences)
    self.save("corpora/Thaliana/seedev-extended.pickle")
def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if not brown_freqs.has_key(word):
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1

    lookup_word = lookup_word.lower()
    n = 0 if not brown_freqs.has_key(lookup_word) else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
def createPopularWords(combined, lowerBound, upperBound):
    allWords = []
    for message in combined:
        for word in message[0]:
            allWords.append(word)

    allWords = nltk.FreqDist(allWords)

    # grab the top several thousand words, ignoring the lowerBound most popular
    # grabbing more words leads to more accurate predictions, at the cost of both memory and compute time
    # ignoring the x most popular words is an easy method for handling stop words that are specific to this dataset, rather than just the English language overall
    popularWords = []
    wordsToUse = allWords.most_common(upperBound)[lowerBound:upperBound]
    for pair in wordsToUse:
        popularWords.append(pair[0])

    return popularWords

# extract features from a single document in a consistent manner for all documents in a corpus
# simply returns whether a given word in popularWords is included in the document
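The trailing comment describes a feature-extraction helper that is not included in the snippet. A minimal sketch of what such a function might look like (the name extractFeatures and its exact signature are assumptions):

def extractFeatures(document, popularWords):
    # One boolean feature per popular word: is the word present in this document?
    documentWords = set(document)
    return {word: (word in documentWords) for word in popularWords}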