Python nltk.stem 模块,PorterStemmer() 实例源码

我们从Python开源项目中,提取了以下33个代码示例,用于说明如何使用nltk.stem.PorterStemmer()。

项目:Price-Comparator    作者:Thejas-1    | 项目源码 | 文件源码
def __repr__(self):
        """Return the constant representation string '<PorterStemmer>'."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
项目:100knock2016    作者:tmu-nlp    | 项目源码 | 文件源码
def mk_feature():
    """Build a stem-frequency feature dict from "sentiment.txt".

    Each line is "<label> <word> <word> ...": the first token is the label
    and is skipped; remaining tokens are Porter-stemmed and counted,
    excluding English stopwords and punctuation.  Stems that occur fewer
    than 5 times are pruned before returning.

    Returns:
        defaultdict(int-like): stem -> frequency (only stems with count >= 5).
    """
    d = defaultdict(lambda: 0)
    stoplist = stopwords.words("english") + [",", ".", "!", "?", ";", ":", "\n", "\t", "(", ")", " ", ""]
    stemmer = stem.PorterStemmer()

    # `with` ensures the file is closed (original leaked the handle).
    with open("sentiment.txt", "r") as fin:
        for line in fin:
            # token 0 is the sentiment label -- only the words are features
            for item in line.strip("\n").split(" ")[1:]:
                item = stemmer.stem(item)
                if item not in stoplist:
                    d[item] += 1

    # prune rare stems (frequency < 5); collect keys first so we do not
    # mutate the dict while iterating it
    for key in [k for k, v in d.items() if v < 5]:
        del d[key]
    return d
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def __repr__(self):
        """Return the constant representation string '<PorterStemmer>'."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
项目:neighborhood_mood_aws    作者:jarrellmark    | 项目源码 | 文件源码
def __repr__(self):
        """Return the constant representation string '<PorterStemmer>'."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
项目:hate-to-hugs    作者:sdoran35    | 项目源码 | 文件源码
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    from nltk.corpus import treebank
    from nltk import stem

    porter = stem.PorterStemmer()

    orig = []
    stemmed = []
    for fileid in treebank.fileids()[:3]:
        for word, tag in treebank.tagged_words(fileid):
            orig.append(word)
            stemmed.append(porter.stem(word))

    def _wrap(words):
        # Join the tokens and word-wrap the text at (up to) 70 columns.
        return re.sub(r"(.{,70})\s", r'\1\n', ' '.join(words) + ' ').rstrip()

    original = _wrap(orig)
    results = _wrap(stemmed)

    # Print a banner, the raw text, a second banner, then the stemmed text.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)
项目:100knock2017    作者:tmu-nlp    | 项目源码 | 文件源码
def make_feature(f, flag):
    """Build a word-stem feature counter from the module-level ``file_name``.

    Each line is "<label> TAB <text>".  With flag == 0 every stem counts +1;
    with flag == 1 the increment is signed by the integer label (+1 / -1).

    NOTE(review): the ``f`` parameter is effectively unused -- the original
    shadowed it with the file handle and always read ``file_name``.  Kept
    unchanged for interface compatibility.
    """
    stemmer = stem.PorterStemmer()  # hoisted: one stemmer, not one per word
    feature = defaultdict(int)
    with open(file_name) as fh:
        for i, line in enumerate(fh):
            print(i)  # progress indicator (kept from original)
            y, x = line.split('\t')
            y = int(y)
            for word in x.split():
                if not stop_word_check(word):
                    stemmed = stemmer.stem(word)
                    if flag == 0:
                        feature[stemmed] += 1
                    elif flag == 1:
                        if y == 1:
                            feature[stemmed] += 1
                        elif y == -1:
                            feature[stemmed] -= 1
    return feature
项目:FancyWord    作者:EastonLee    | 项目源码 | 文件源码
def __repr__(self):
        """Return the constant representation string '<PorterStemmer>'."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
项目:beepboop    作者:nicolehe    | 项目源码 | 文件源码
def __repr__(self):
        """Return the constant representation string '<PorterStemmer>'."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
项目:kind2anki    作者:prz3m    | 项目源码 | 文件源码
def __repr__(self):
        """Return the constant representation string '<PorterStemmer>'."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
项目:but_sentiment    作者:MixedEmotions    | 项目源码 | 文件源码
def __repr__(self):
        """Return the constant representation string '<PorterStemmer>'."""
        return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
项目:allennlp    作者:allenai    | 项目源码 | 文件源码
def __init__(self):
        # Build one NLTK PorterStemmer up front and reuse it for all calls.
        self.stemmer = NltkPorterStemmer()
项目:Price-Comparator    作者:Thejas-1    | 项目源码 | 文件源码
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # FIX: treebank.files() was deprecated and later removed from NLTK;
    # fileids() is the supported accessor (matches the other demo in file).
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    def _wrap(text):
        # Word-wrap *text* at (up to) 70 columns.
        return re.sub(r"(.{,70})\s", r'\1\n', text + ' ').rstrip()

    results = _wrap(' '.join(stemmed))
    original = _wrap(' '.join(orig))

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
项目:LyricsMoodClassifier    作者:valeriaalampi    | 项目源码 | 文件源码
def simple_stemming(tokens):
    """Porter-stem every token in *tokens* and return the stems as a list."""
    porter = PorterStemmer()
    result = []
    for token in tokens:
        result.append(porter.stem(token))
    return result
项目:100knock2016    作者:tmu-nlp    | 项目源码 | 文件源码
def feature_making(sentence):
    """Porter-stem each word of *sentence* and return the stems that are
    not stopwords (per the module-level ``stop`` predicate).

    Note: the stop check intentionally runs on the *stemmed* form, as in
    the original.
    """
    stemming = stem.PorterStemmer()
    ans_list = []
    for word in sentence:
        stemmed = stemming.stem(word)
        if not stop(stemmed):  # idiom fix: was `stop(word) == False`
            ans_list.append(stemmed)
    return ans_list
项目:100knock2016    作者:tmu-nlp    | 项目源码 | 文件源码
def extract_feature(sentence):
    """Count Porter-stemmed tokens of *sentence*, skipping stopwords."""
    stemmer = stem.PorterStemmer()
    counts = defaultdict(lambda: 0)
    for token in sentence.split():
        if include_stopword(token):
            continue
        counts[stemmer.stem(token)] += 1
    return counts
项目:100knock2016    作者:tmu-nlp    | 项目源码 | 文件源码
def getFeature(word_list):
    """Return a plain dict of Lancaster-stem frequencies for *word_list*,
    excluding stopwords (per the module-level ``isStopWords``)."""
    lancaster = stem.LancasterStemmer()
    counts = defaultdict(lambda: 0)
    for token in word_list:
        if isStopWords(token):
            continue
        counts[lancaster.stem(token)] += 1
    return dict(counts)
项目:100knock2016    作者:tmu-nlp    | 项目源码 | 文件源码
def mk_label(line, d, thre):
    """Score *line* against the weight dict *d* and label it.

    Each word is Porter-stemmed and its weight summed; the sum is squashed
    through a sigmoid to get p(positive).  Returns
    "<first-token>\t+1\t<p>" when p > *thre*, else "<first-token>\t-1\t<p>".
    """
    import math
    from nltk import stem
    stemmer = stem.PorterStemmer()

    score = 0
    for item in line.strip("\n").split(" "):
        score += d[stemmer.stem(item)]

    # Numerically stable sigmoid: the original computed
    # exp(score)/(1+exp(score)), which raises OverflowError for large
    # positive scores.  Branch on the sign so exp() is always of a
    # non-positive argument.
    if score >= 0:
        p_pos = 1 / (1 + math.exp(-score))
    else:
        e = math.exp(score)
        p_pos = e / (1 + e)

    label = "+1" if p_pos > thre else "-1"
    return line.split(" ")[0] + "\t" + label + "\t" + str(p_pos)
项目:PySummarizer    作者:musikalkemist    | 项目源码 | 文件源码
def _preprocess(self, text):
        """ Return a list of lists. Each list is a preprocessed sentence of 
            text in bag-of-words format."""
        porter = PorterStemmer()
        # Split into sentences and remember them on the instance.
        self._sents = sent_tokenize(text)
        # Lowercase and tokenize each sentence.
        tokenized = [word_tokenize(sentence.lower()) for sentence in self._sents]
        # Drop stopwords and stem what remains.
        stops = self._stopwords
        return [[porter.stem(token) for token in sentence if token not in stops]
                for sentence in tokenized]
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # FIX: treebank.files() was deprecated and later removed from NLTK;
    # fileids() is the supported accessor (matches the other demo in file).
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    def _wrap(text):
        # Word-wrap *text* at (up to) 70 columns.
        return re.sub(r"(.{,70})\s", r'\1\n', text + ' ').rstrip()

    results = _wrap(' '.join(stemmed))
    original = _wrap(' '.join(orig))

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
项目:tRECS    作者:TeeOhh    | 项目源码 | 文件源码
def stemmer(text):
    '''Description: This function takes in the string of descriptions and return string with all words stemmed
       Parameters: String of descriptions
       Output: String with all words stemmed (ex. "meeting" and "meetings" to "meeting")'''
    # Renamed the local so it no longer shadows this function's own name.
    # NOTE(review): `unicode` makes this Python-2-only code; kept as-is.
    porter = PorterStemmer()
    words = unicode(str(text), 'utf-8').split(" ")
    return " ".join(str(porter.stem(word)) for word in words)
项目:neighborhood_mood_aws    作者:jarrellmark    | 项目源码 | 文件源码
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # FIX: treebank.files() was deprecated and later removed from NLTK;
    # fileids() is the supported accessor (matches the other demo in file).
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    def _wrap(text):
        # Word-wrap *text* at (up to) 70 columns.
        return re.sub(r"(.{,70})\s", r'\1\n', text + ' ').rstrip()

    results = _wrap(' '.join(stemmed))
    original = _wrap(' '.join(orig))

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
项目:hate-to-hugs    作者:sdoran35    | 项目源码 | 文件源码
def __init__(self, mode=NLTK_EXTENSIONS):
        """Initialise the stemmer in one of the three supported modes.

        Raises ValueError for any *mode* outside the supported set.
        """
        valid_modes = (
            self.NLTK_EXTENSIONS,
            self.MARTIN_EXTENSIONS,
            self.ORIGINAL_ALGORITHM,
        )
        if mode not in valid_modes:
            raise ValueError(
                "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
                "PorterStemmer.MARTIN_EXTENSIONS, or "
                "PorterStemmer.ORIGINAL_ALGORITHM"
            )

        self.mode = mode

        if self.mode == self.NLTK_EXTENSIONS:
            # This is a table of irregular forms. It is quite short,
            # but still reflects the errors actually drawn to Martin
            # Porter's attention over a 20 year period!
            irregular_forms = {
                "sky":      ["sky", "skies"],
                "die":      ["dying"],
                "lie":      ["lying"],
                "tie":      ["tying"],
                "news":     ["news"],
                "inning":   ["innings", "inning"],
                "outing":   ["outings", "outing"],
                "canning":  ["cannings", "canning"],
                "howe":     ["howe"],
                "proceed":  ["proceed"],
                "exceed":   ["exceed"],
                "succeed":  ["succeed"],
            }

            # Invert the table: each surface form maps to its fixed stem.
            self.pool = {}
            for stem_form, variants in irregular_forms.items():
                for variant in variants:
                    self.pool[variant] = stem_form

        self.vowels = frozenset('aeiou')
项目:hate-to-hugs    作者:sdoran35    | 项目源码 | 文件源码
def __repr__(self):
        """Return the constant representation string '<PorterStemmer>'."""
        return '<PorterStemmer>'
项目:100knock2017    作者:tmu-nlp    | 项目源码 | 文件源码
def feature(sentence):
    """Return the Porter stems of the tokens of *sentence* that are not
    stopwords (per the module-level ``stopwords`` predicate).

    Note: as in the original, the stop check runs on the *stemmed* form.
    """
    features = []
    stemmer = PorterStemmer()
    for word in sentence:
        stem_word = stemmer.stem(word)
        if not stopwords(stem_word):  # idiom fix: was `== False`
            features.append(stem_word)
    return features
项目:100knock2017    作者:tmu-nlp    | 项目源码 | 文件源码
def make_sentence_feature(sen, ids):
    """Return a bag-of-words count vector for sentence *sen*.

    *ids* maps a stem to its index in the vector; stems not present in
    *ids* are ignored.
    """
    # Hoisted: the original built a new PorterStemmer for every word.
    stemmer = stem.PorterStemmer()
    sentence_feature = [0] * len(ids)
    for word in sen.split():
        stemmed = stemmer.stem(word)
        if stemmed in ids:
            sentence_feature[ids[stemmed]] += 1
    return sentence_feature
项目:100knock2017    作者:tmu-nlp    | 项目源码 | 文件源码
def stemming_porter():
    """Yield (word, porter_stem) pairs for each word from separate_word()."""
    # Hoisted: the original constructed a new stemmer on every iteration.
    stemmer = stem.PorterStemmer()
    for word in separate_word():
        yield (word, stemmer.stem(word))
项目:FancyWord    作者:EastonLee    | 项目源码 | 文件源码
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # FIX: treebank.files() was deprecated and later removed from NLTK;
    # fileids() is the supported accessor (matches the other demo in file).
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    def _wrap(text):
        # Word-wrap *text* at (up to) 70 columns.
        return re.sub(r"(.{,70})\s", r'\1\n', text + ' ').rstrip()

    results = _wrap(' '.join(stemmed))
    original = _wrap(' '.join(orig))

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
项目:beepboop    作者:nicolehe    | 项目源码 | 文件源码
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # FIX: treebank.files() was deprecated and later removed from NLTK;
    # fileids() is the supported accessor (matches the other demo in file).
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    def _wrap(text):
        # Word-wrap *text* at (up to) 70 columns.
        return re.sub(r"(.{,70})\s", r'\1\n', text + ' ').rstrip()

    results = _wrap(' '.join(stemmed))
    original = _wrap(' '.join(orig))

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
项目:teem-tag    作者:P2Pvalue    | 项目源码 | 文件源码
def __init__(self, stemmer=None):
        '''
        @param stemmer: an object or module with a 'stem' method (defaults to
                        nltk.stem.PorterStemmer)

        @returns: a new L{Stemmer} object
        '''
        # Docstring fixed: it previously claimed the default was
        # stemming.porter2, but the code has always used NLTK's PorterStemmer.
        if not stemmer:
            # Lazy import: only pay for NLTK when no stemmer was supplied.
            from nltk.stem import PorterStemmer
            stemmer = PorterStemmer()
        self.stemmer = stemmer
项目:kind2anki    作者:prz3m    | 项目源码 | 文件源码
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # FIX: treebank.files() was deprecated and later removed from NLTK;
    # fileids() is the supported accessor (matches the other demo in file).
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    def _wrap(text):
        # Word-wrap *text* at (up to) 70 columns.
        return re.sub(r"(.{,70})\s", r'\1\n', text + ' ').rstrip()

    results = _wrap(' '.join(stemmed))
    original = _wrap(' '.join(orig))

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
项目:but_sentiment    作者:MixedEmotions    | 项目源码 | 文件源码
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    # FIX: treebank.files() was deprecated and later removed from NLTK;
    # fileids() is the supported accessor (matches the other demo in file).
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    def _wrap(text):
        # Word-wrap *text* at (up to) 70 columns.
        return re.sub(r"(.{,70})\s", r'\1\n', text + ' ').rstrip()

    results = _wrap(' '.join(stemmed))
    original = _wrap(' '.join(orig))

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)

##--NLTK--
项目:Statistics-for-Machine-Learning    作者:PacktPublishing    | 项目源码 | 文件源码
def preprocessing(text):
    """Normalize *text* into a single space-joined string of processed tokens.

    Pipeline: strip punctuation -> sentence/word tokenize -> lowercase ->
    drop stopwords and tokens shorter than 3 chars -> Porter-stem ->
    POS-tag -> lemmatize (verbs as verbs, everything else as nouns).
    """
    # Replace punctuation with spaces, then collapse runs of whitespace.
    no_punct = "".join(" " if ch in string.punctuation else ch for ch in text)
    text2 = " ".join(no_punct.split())

    tokens = []
    for sent in nltk.sent_tokenize(text2):
        tokens.extend(nltk.word_tokenize(sent))
    tokens = [token.lower() for token in tokens]

    stopwds = stopwords.words('english')
    # Drop stopwords, then anything shorter than 3 characters.
    tokens = [token for token in tokens if token not in stopwds and len(token) >= 3]

    porter = PorterStemmer()
    tokens = [porter.stem(token) for token in tokens]

    tagged_corpus = pos_tag(tokens)

    noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        # Verbs lemmatize as verbs; nouns and anything else as nouns.
        if tag in verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        return lemmatizer.lemmatize(token, 'n')

    return " ".join(prat_lemmatize(token, tag) for token, tag in tagged_corpus)
项目:TextAsGraphClassification    作者:NightmareNyx    | 项目源码 | 文件源码
def clean_terms(terms, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    """Optionally filter and normalize a list of tokens.

    Each step runs only when its argument is not None:
      stopwords -- a collection; tokens contained in it are removed.
      only_N_J  -- keep only tokens whose POS tag is in the module-level
                   ``tags`` (comment said "nouns and verbs"; the name
                   suggests nouns/adjectives -- TODO confirm against ``tags``).
      lemmatize -- WordNet-lemmatize every token.
      stem      -- Porter-stem every token.

    Returns the (possibly new) list of tokens.
    """
    if stopwords is not None:
        terms = [t for t in terms if t not in stopwords]
    if only_N_J is not None:
        tagged = nltk.pos_tag(terms)
        terms = [t for t, pos in tagged if pos in tags]
    if lemmatize is not None:
        lem = WordNetLemmatizer()
        terms = [lem.lemmatize(t) for t in terms]
    if stem is not None:
        # FIX: the original rebound the ``stem`` flag parameter to the
        # stemmer instance; use a separate local name instead.
        stemmer = PorterStemmer()
        terms = [stemmer.stem(t) for t in terms]
    return terms