We extracted the following 50 code examples from open-source Python projects to illustrate how to use nltk.corpus.
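Before the project-specific examples, here is a minimal sketch of the nltk.corpus API itself, assuming the relevant corpora have already been fetched with nltk.download():

import nltk
from nltk.corpus import brown, stopwords

# One-time downloads (uncomment if the corpora are not installed yet):
# nltk.download('brown'); nltk.download('stopwords')

print(brown.words()[:10])               # tokens from the Brown corpus
print(brown.categories()[:5])           # Brown corpus categories
print(stopwords.words('english')[:10])  # English stopword list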
def createData():
    spwords = [unidecode(a.lower()) for a in set(nltk.corpus.cess_esp.words()) if len(a) > 3]
    enwords = [a.lower() for a in set(nltk.corpus.brown.words()) if len(a) > 3]
    jpwords = [unidecode(a) for a in jeita.words() if (len(unidecode(a)) and unidecode(a)[0].islower())]
    jpwords = [a for a in set(jpwords) if len(a) > 3]

    # minLen = min(len(enwords), len(spwords), len(jpwords))
    featuresets = \
        [(createTupleDict(w, numChars), 'English') for w in enwords] + \
        [(createTupleDict(w, numChars), 'Spanish') for w in spwords] + \
        [(createTupleDict(w, numChars), 'Japanese') for w in jpwords]

    random.shuffle(featuresets)
    l = int(len(featuresets) * 0.8)
    training_set = featuresets[:l]
    testing_set = featuresets[l:]
    return (training_set, testing_set)
def most_frequent_Brown_Corpus_words():
    import nltk
    import nltk.corpus

    words = []
    for word in nltk.corpus.brown.words():
        if word not in [",", ".", "``", "''", ";", "?", "--", ")", "(", ":", "!"]:
            words.append(word.lower())
    frequencies_words = nltk.FreqDist(words).most_common()
    words_most_frequent = [word[0] for word in frequencies_words]
    return words_most_frequent
def brown_data():
    """Return the first text_length tokens of the Brown corpus, tagged in pyrata format."""
    tokens = brown.words()
    tokens = tokens[:text_length]
    pos_tags = nltk.pos_tag(tokens)
    return [{'raw': w, 'pos': p} for (w, p) in pos_tags]

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
def get_word_clouds(tweets, users, words_n=50, lang='english'):
    default_stopwords = set(nltk.corpus.stopwords.words(lang))
    stopwords_file = '../data/stopwords.txt'
    custom_stopwords = set(open(stopwords_file, 'r').read().splitlines())
    all_stopwords = default_stopwords | custom_stopwords

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=list(all_stopwords))
    X = vectorizer.fit_transform(tweets)
    terms = vectorizer.get_feature_names()

    word_cloud_per_person = {}
    for doc in range(len(tweets)):
        feature_index = X[doc, :].nonzero()[1]
        tfidf_scores = zip(feature_index, [X[doc, x] for x in feature_index])
        doc_terms = []
        for word, score in [(terms[i], score) for (i, score) in tfidf_scores]:
            doc_terms.append((word, score))
        important_terms = [(word, score) for word, score in sorted(doc_terms, key=lambda x: x[1], reverse=True)][:words_n]
        word_cloud_per_person[users[doc]] = important_terms
    return word_cloud_per_person
def get_corpus_of_most_active_users(n_users=5):
    tweets = []
    texts = []
    with open(DATASET_PATH) as f:
        for line in f:
            tweets.append(json.loads(line)['user']['screen_name'])
            texts.append((json.loads(line)['user']['screen_name'], json.loads(line)['text']))
    users = nltk.FreqDist(tweets).most_common(n_users)

    dict = {}
    for user, tweet in texts:
        if user in dict:
            dict[user] = " ".join([dict[user], tweet])
        else:
            dict[user] = tweet

    corpus = [dict[name] for name, _ in users]
    user_names = [name for name, _ in users]
    return corpus, user_names
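The two functions above look like they belong to the same project and chain naturally: the second builds one document per active user, and the first turns those documents into per-user TF-IDF term lists. A hedged usage sketch, assuming both are defined together with their imports:

corpus, user_names = get_corpus_of_most_active_users(n_users=5)
word_clouds = get_word_clouds(corpus, user_names, words_n=50, lang='english')
for user, terms in word_clouds.items():
    print(user, terms[:5])  # top-scoring TF-IDF terms for each user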
def cut_low_freq(self, corpus, threshold=1):
    new_vocas = []
    new_docfreq = []
    self.vocas_id = dict()
    conv_map = dict()
    for id, term in enumerate(self.vocas):
        freq = self.docfreq[id]
        if freq > threshold:
            new_id = len(new_vocas)
            self.vocas_id[term] = new_id
            new_vocas.append(term)
            new_docfreq.append(freq)
            conv_map[id] = new_id
    self.vocas = new_vocas
    self.docfreq = new_docfreq
    return np.array([self.conv(doc, conv_map) for doc in corpus])
def closure(self, rel, depth=-1):
    """Return the transitive closure of source under the rel
    relationship, breadth-first

        >>> from nltk.corpus import wordnet as wn
        >>> dog = wn.synset('dog.n.01')
        >>> hyp = lambda s: s.hypernyms()
        >>> list(dog.closure(hyp))
        [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
        Synset('carnivore.n.01'), Synset('animal.n.01'),
        Synset('placental.n.01'), Synset('organism.n.01'),
        Synset('mammal.n.01'), Synset('living_thing.n.01'),
        Synset('vertebrate.n.01'), Synset('whole.n.02'),
        Synset('chordate.n.01'), Synset('object.n.01'),
        Synset('physical_entity.n.01'), Synset('entity.n.01')]
    """
    from nltk.util import breadth_first
    synset_offsets = []
    for synset in breadth_first(self, rel, depth):
        if synset._offset != self._offset:
            if synset._offset not in synset_offsets:
                synset_offsets.append(synset._offset)
                yield synset
def res_similarity(self, other, ic, verbose=False):
    """
    Resnik Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float score denoting the similarity of the two ``Synset``
        objects. Synsets whose LCS is the root node of the taxonomy will
        have a score of 0 (e.g. N['dog'][0] and N['table'][0]).
    """
    ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
    return lcs_ic
def lin_similarity(self, other, ic, verbose=False):
    """
    Lin Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node) and that of the two input Synsets. The relationship is
    given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float score denoting the similarity of the two ``Synset``
        objects, in the range 0 to 1.
    """
    ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
    return (2.0 * lcs_ic) / (ic1 + ic2)
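The two methods above are from NLTK's WordNet Synset implementation. A minimal usage sketch, assuming the wordnet and wordnet_ic corpora have been downloaded via nltk.download():

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

# Load an information content dictionary built from the Brown corpus.
brown_ic = wordnet_ic.ic('ic-brown.dat')

dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')

# Resnik similarity: IC of the least common subsumer.
print(dog.res_similarity(cat, brown_ic))
# Lin similarity: 2 * IC(lcs) / (IC(s1) + IC(s2)), in the range 0 to 1.
print(dog.lin_similarity(cat, brown_ic))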
def ieer_headlines():
    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)

#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
#############################################
def conllesp():
    from nltk.corpus import conll2002

    de = """ .* ( de/SP| del/SP ) """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [rel for doc in conll2002.chunked_sents('esp.train')
            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)]
    for r in rels[:10]:
        print(clause(r, relsym='DE'))
    print()
def create_corpus(fileids, max_length=None):
    """
    Creates a corpus from fileids
    Removes stopwords and punctuation
    Returns a list of strings
    """
    sw = set(stopwords.words("english"))
    tokenizer = nltk.tokenize.RegexpTokenizer(r"[A-Za-z]+")

    corpus = []
    for doc in fileids:
        words = (w.lower() for w in tokenizer.tokenize(reuters.raw(doc)))
        words = [w for w in words if w not in sw]
        if max_length:
            words = words[:max_length]
        corpus.append(" ".join(words))
    return corpus
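A plausible call for the function above, assuming its module already imports nltk, stopwords, and reuters, and that the Reuters corpus is installed (the 'grain' category is just an illustrative choice):

from nltk.corpus import reuters

docs = create_corpus(reuters.fileids(categories='grain'), max_length=200)
print(len(docs))      # number of documents in the cleaned corpus
print(docs[0][:80])   # first 80 characters of the first cleaned document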
def get_pos_tagger(self):
    from nltk.corpus import brown
    regexp_tagger = RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
        ])
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

    return main_tagger
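The snippet above builds a backoff chain: each n-gram tagger defers to the next simpler tagger when it has not seen the context, ending at a regex fallback. A minimal standalone sketch of the same idea outside the class context (the tagger classes come from nltk.tag, which the original snippet is assumed to import elsewhere):

from nltk.corpus import brown
from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger

regexp_tagger = RegexpTagger([(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*', 'NN')])
train = brown.tagged_sents(categories='news')

# Each n-gram tagger falls back to the next simpler tagger for unseen contexts.
unigram = UnigramTagger(train, backoff=regexp_tagger)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

print(trigram.tag(['The', 'cat', 'sat', 'on', 'the', 'mat']))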
def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        print "I SHOULD BE PRINTED ONLY ONCE"
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if not brown_word_counter.has_key(word):
                    brown_word_counter[word] = 0
                brown_word_counter[word] = brown_word_counter[word] + 1
                N = N + 1

    lookup_word = lookup_word.lower()
    n = 0 if not brown_word_counter.has_key(lookup_word) else brown_word_counter[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if not word in brown_freqs:
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1

    lookup_word = lookup_word.lower()
    n = 0 if not lookup_word in brown_freqs else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
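Several near-identical info_content variants appear in this collection; all rely on module-level globals (N, a Brown word-count dictionary, and imports of math and brown) that are not shown. A self-contained sketch under that assumption:

import math
from nltk.corpus import brown

brown_freqs = {}  # word -> count in the Brown corpus
N = 0             # total number of tokens counted so far

def info_content(lookup_word):
    """Laplace-smoothed information content of lookup_word, in [0, 1]."""
    global N
    if N == 0:
        # Build the frequency table once, on first call.
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                brown_freqs[word] = brown_freqs.get(word, 0) + 1
                N += 1
    n = brown_freqs.get(lookup_word.lower(), 0)
    # Frequent words get low information content, rare words high.
    return 1.0 - (math.log(n + 1) / math.log(N + 1))

print(info_content('the'))       # frequent word -> low score
print(info_content('aardvark'))  # rare word -> close to 1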
def normalize_corpus(corpus, lemmatize=True, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
    return normalized_corpus
def normalize_corpus(corpus, lemmatize=True, only_text_chars=False, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
    return normalized_corpus
def preprocess(content):
    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
    words_set = []
    for twitter in content:
        words_set += (word_tokenizer.tokenize(twitter['twitter_content']))
    words_set = list(set(words_set))

    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    # only need the alphabetic word
    formartted_twitter_words_set = []
    for word in words_set:
        if (word.isalpha() != False) and (word not in non_words) and (word not in stop_words):
            formartted_twitter_words_set.append(lemmatizer.lemmatize(word))

    nltk_words_set = list(set(nltk.corpus.words.words()))

    # training whole set
    training_set = formartted_twitter_words_set + nltk_words_set
    return training_set
def store_synset_primarySense(word):
    result = {}
    check_item = sort_orderedDict(primary_sense(word.lower()))
    if len(check_item) == 1:
        if wn.lemma_from_key(check_item.keys()[0]).synset().pos() == 'n' or wn.lemma_from_key(check_item.keys()[0]).synset().pos() == 'v':
            result[word] = wn.lemma_from_key(check_item.keys()[0]).synset()
    elif len(check_item) > 1:
        for index in range(len(check_item.keys())):
            try:
                if wn.lemma_from_key(check_item.keys()[index]).synset().pos() == 'n' or wn.lemma_from_key(check_item.keys()[index]).synset().pos() == 'v':
                    result[word] = wn.lemma_from_key(check_item.keys()[index]).synset()
                    continue
            except nltk.corpus.reader.wordnet.WordNetError:
                continue
            else:
                pass
    else:
        return 0
    return result

# use the lemmatizer defined in the previous workshop
def documents(self, fold=None, train=False, test=False):
    """
    A generator of documents being streamed from disk. Each document is
    a list of paragraphs, which are a list of sentences, which in turn
    is a list of tuples of (token, tag) pairs. All preprocessing is done
    by NLTK and the CorpusReader object this object wraps.

    If a fold is specified (should be an integer between 0 and folds),
    then the loader will return documents from that fold. Further, train
    or test must be specified to split the fold correctly. This method
    allows us to maintain the generator properties of document reads.
    """
    for fileid in self.fileids(fold, train, test):
        yield list(self.corpus.tagged(fileids=fileid))

##########################################################################
## Normalize Transformer
##########################################################################
def swoogle(query, termbool):
    extraselectors = []
    if termbool is True:
        conceptSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+query.lower()+'&pos=NN&N=100&sim_type=concept&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+query.lower())
    else:
        conceptSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+query.lower()+'&pos=NN&N=30&sim_type=concept&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+query.lower())
    #relationSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+sys.argv[1]+'&pos=NN&N=100&sim_type=relation&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+sys.argv[1])

    conceptSoup = BeautifulSoup(conceptSim.text)
    conceptTextArea = conceptSoup.findAll("textarea")
    conceptText = conceptTextArea[0].contents[0]

    lines = conceptText.split(",")
    for line in lines:
        line = line.strip()
        parts = line.split("_")
        extraselectors.append(parts[0])
    return extraselectors
def add_more_sentences(self, corpuspath):
    """
    Load sentences with relations from another corpus
    :param corpuspath: corpus path
    :return:
    """
    nsentences = 0
    for did in self.documents:
        nsentences += len(self.documents[did].sentences)
    print "base corpus has {} sentences".format(nsentences)

    corpus2 = pickle.load(open(corpuspath, 'rb'))
    nsentences = 0
    for did in corpus2.documents:
        if did in self.documents:
            print "repeated did:", did
        else:
            self.documents[did] = corpus2.documents[did]
            nsentences += len(corpus2.documents[did].sentences)
            #for sentence in corpus2.documents[did].sentences:
            #    if any([len(e.targets) > 1 for e in sentence.entities.elist["goldstandard"]]):
            #        print "found sentence with relations:", sentence.sid
            #    if len(sentence.entities.elist["goldstandard"]) > 1:
            #        self.documents[sentence.sid] = Document(sentence.text, sentences=[sentence])
    print "added {} sentences".format(nsentences)
    self.save("corpora/Thaliana/seedev-extended.pickle")
def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if not brown_freqs.has_key(word):
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1

    lookup_word = lookup_word.lower()
    n = 0 if not brown_freqs.has_key(lookup_word) else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
def createPopularWords(combined, lowerBound, upperBound):
    allWords = []
    for message in combined:
        for word in message[0]:
            allWords.append(word)

    allWords = nltk.FreqDist(allWords)

    # grab the top several thousand words, ignoring the lowerBound most popular
    # grabbing more words leads to more accurate predictions, at the cost of both memory and compute time
    # ignoring the x most popular words is an easy method for handling stop words that are specific to this dataset, rather than just the English language overall
    popularWords = []
    wordsToUse = allWords.most_common(upperBound)[lowerBound:upperBound]
    for pair in wordsToUse:
        popularWords.append(pair[0])

    return popularWords

# extract features from a single document in a consistent manner for all documents in a corpus
# simply returns whether a given word in popularWords is included in the document
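The trailing comment describes a feature-extraction helper that is not included in the snippet. A minimal sketch of what such a function might look like (the name extractFeatures and its exact signature are assumptions):

def extractFeatures(document, popularWords):
    # One boolean feature per popular word: is the word present in this document?
    documentWords = set(document)
    return {word: (word in documentWords) for word in popularWords}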