Python nltk module: stem() code examples

The following code examples, extracted from open-source Python projects, illustrate how to use nltk.stem().

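Before the project snippets, here is a minimal, self-contained sketch of the nltk.stem interfaces most of them rely on (PorterStemmer, SnowballStemmer, LancasterStemmer and the WordNetLemmatizer); the sample words are arbitrary.

import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer

# nltk.download('wordnet')  # the lemmatizer needs the WordNet corpus; download it once if missing

porter = PorterStemmer()
snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

for word in ['meetings', 'running', 'studies']:
    print(word,
          porter.stem(word),           # rule-based Porter stemmer, e.g. 'meetings' -> 'meet'
          snowball.stem(word),         # Snowball ('Porter2') stemmer, language-aware
          lancaster.stem(word),        # Lancaster stemmer, the most aggressive of the three
          lemmatizer.lemmatize(word))  # WordNet lemmatizer returns a dictionary form, e.g. 'studies' -> 'study'
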
Project: tRECS    Author: TeeOhh    | Project source | File source
def master_clean(df, column, html, email, punc, non_ascii, stopwords, number, remove_nonenglish, stemorlem):
    if punc:
        df[column] = df[column].apply(remove_punc).to_frame()
    if html:
        df[column] = df[column].apply(remove_html).to_frame()
    if email:
        df[column] = df[column].apply(remove_email).to_frame()
    if non_ascii:
        df[column] = df[column].apply(remove_non_ascii).to_frame()
    if stopwords:
        df[column] = df[column].apply(remove_stop).to_frame()
    if number:
        df[column] = df[column].apply(remove_numbers).to_frame()
    if remove_nonenglish:
        # nonenglish() is assumed to be a module-level helper analogous to remove_html(), remove_stop(), etc.
        df[column] = df[column].apply(nonenglish).to_frame()
    if stemorlem == 'stem':
        df[column] = df[column].apply(stemmer).to_frame()
    elif stemorlem == 'lem':
        df[column] = df[column].apply(lemmatizer).to_frame()

    return df
Project: goal    Author: victorskl    | Project source | File source
def preprocess(content):
    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()

    words_set = []
    for twitter in content:
        words_set += (word_tokenizer.tokenize(twitter['twitter_content']))
    words_set = list(set(words_set))

    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    # keep only alphabetic words that are neither punctuation nor stop words
    formatted_twitter_words_set = []
    for word in words_set:
        if word.isalpha() and (word not in non_words) and (word not in stop_words):
            formatted_twitter_words_set.append(lemmatizer.lemmatize(word))

    nltk_words_set = list(set(nltk.corpus.words.words()))
    # the full training set: tweet words plus the NLTK word corpus
    training_set = formatted_twitter_words_set + nltk_words_set
    return training_set
Project: disaster-mitigation    Author: varun-manjunath    | Project source | File source
def resource_similarity_score_via_word_net_1(need_res_set,offer_tweet_list):
    if len(need_res_set)==0:
        return 0
    value=0
    offer_res_list=[]
    for i in offer_tweet_list:
        for j in i.split():
            if stemmer.stem(j.lower()) not in out_stem_list:
                offer_res_list.append(stemmer.stem(j.lower()))

    for word in need_res_set:
        temp= get_similarity_score_1(word,offer_res_list)
        if temp > 0.6:
            value=value+temp

    return value/len(need_res_set)
Project: teem-tag    Author: P2Pvalue    | Project source | File source
def __init__(self, string, stem=None, rating=1.0, proper=False,
                 terminal=False):
        '''
        @param string:   the actual representation of the tag
        @param stem:     the internal (usually stemmed) representation;
                         tags with the same stem are regarded as equal
        @param rating:   a measure of the tag's relevance in the interval [0,1]
        @param proper:   whether the tag is a proper noun
        @param terminal: set to True if the tag is at the end of a phrase
                         (or anyway it cannot be logically merged to the
                         following one)

        @returns: a new L{Tag} object
        '''

        self.string  = string
        self.stem = stem or string
        self.rating = rating
        self.proper = proper
        self.terminal = terminal
Project: teem-tag    Author: P2Pvalue    | Project source | File source
def __init__(self, tail, head=None):
        '''
        @param tail: the L{Tag} object to append to the first part (head)
        @param head: the (possibly absent) L{MultiTag} to be extended

        @returns: a new L{MultiTag} object
        '''

        if not head:
            Tag.__init__(self, tail.string, tail.stem, tail.rating,
                         tail.proper, tail.terminal)
            self.size = 1
            self.subratings = [self.rating]
        else:
            self.string = ' '.join([head.string, tail.string])
            self.stem = ' '.join([head.stem, tail.stem])
            self.size = head.size + 1

            self.proper = (head.proper and tail.proper)
            self.terminal = tail.terminal

            self.subratings = head.subratings + [tail.rating]
            self.rating = self.combined_rating()
Project: PYSHA    Author: shafaypro    | Project source | File source
def respond(sentences):
    tokenized_sentence = sent_tokenize(sentences)
    stop_words = set(stopwords.words("english"))  # Getting the stop words from the Local DB
    if len(tokenized_sentence) > 1:  # more than one sentence in the tokenized text

        # for sentence in tokenized_sentence:
        #     words = word_tokenize(sentence)  # Each word is tokenized
        pos_tagged = parts_of_speechtag(sentences)
        print(tuple(pos_tagged))
        # filtered_words = [w for w in words if w not in stop_words]  # remove the stop words
        # portStemer_object = PorterStemmer()
        # filtered_steam_words = [portStemer_object.stem(w) for w in filtered_words]
        # return filtered_steam_words
    else:
        pos_tagged = parts_of_speechtag(sentences)
        print(type(pos_tagged))
        # words = word_tokenize(sentences)
        # filtered_words = [w for w in words if w not in stop_words]
        # portStemer_object = PorterStemmer()
        # filtered_steam_words = [portStemer_object.stem(w) for w in filtered_words]
        #return filtered_steam_words
Project: search_relevance    Author: rmanak    | Project source | File source
def stem(self,word,pos=u'n'):
        return self.lemmatize(word,pos)


########  Wrapper for all  of the popular stemmers ###########
Project: search_relevance    Author: rmanak    | Project source | File source
def __init__(self,stemmer_type):
        self.stemmer_type = stemmer_type
        if (self.stemmer_type == 'porter'):
            self.stemmer = nltk.stem.PorterStemmer()
        elif (self.stemmer_type == 'snowball'):
            self.stemmer = nltk.stem.SnowballStemmer('english')
        elif (self.stemmer_type == 'lemmatize'):
            self.stemmer = WordNetStemmer()
        else:
            raise NameError("'"+stemmer_type +"'" + " not supported")



######## Simple wordreplacer object using a dictionary  ############
Project: search_relevance    Author: rmanak    | Project source | File source
def normalize(self, text):
        return [self.stemmer.stem(token) 
                for token in self.tokenizer.tokenize(text.lower()) 
                if token not in self.stop_words]

######### defining a default normalizer ##########
Project: search_relevance    Author: rmanak    | Project source | File source
def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))



########## Stemmer + CountVectorizer wrapper #############
Project: search_relevance    Author: rmanak    | Project source | File source
def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))


########## Defaults TF-IDF & Count Vectorizers ########


#======== TF-IDF Vectorizer =========#
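
The two build_analyzer overrides above assume TfidfVectorizer/CountVectorizer subclasses and a module-level stemmer object. A minimal, self-contained sketch of the same pattern, with an illustrative class name (StemmedTfidfVectorizer) and sample documents that are not from the project:

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = nltk.stem.SnowballStemmer('english')

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        # reuse the parent analyzer (tokenization, lowercasing, stop-word removal, n-grams) ...
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        # ... and stem every token it yields
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

vectorizer = StemmedTfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(["meetings were held", "a meeting is held"])
print(sorted(vectorizer.vocabulary_))  # stemmed vocabulary, e.g. ['held', 'meet']
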
Project: PPRE    Author: MaoYuwei    | Project source | File source
def Stem(self):
        # stem each POS-tagged signature line from pos_signature.txt and write it to stem_signature.txt
        fin = open('../file/pos_signature.txt', 'r')
        fout = open('../file/stem_signature.txt', 'w+')
        while True:
            line = fin.readline()
            if line:
                if '***' in line:
                    fout.write(line)
                elif '---------' in line:
                    fout.write(line)
                else:
                    num, line = line.split(':', 1)
                    line = self.RemSingleWord(line)  # remove single words
                    line = self.CleanStopWords(line)  # remove stop words
                    line = self.CleanLines(line)  # clean the line
                    line = line.split()
                    word_list = []
                    s = nltk.stem.SnowballStemmer('english')
                    for w in line:
                        w = s.stem(w)
                        word_list.append(w)
                    line = ' '.join(word_list)
                    fout.write(num + ':' + line + '\n')
            else:
                break
        fin.close()
        fout.close()
Project: MOQA    Author: pprakhar30    | Project source | File source
def __init__(self, itemId, questionType, answerType, question, answer, V, WordIDMap):

        self.itemId         = itemId
        self.questionType   = questionType
        self.answerType     = answerType
        self.question       = question
        self.answer         = answer
        self.Question       = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(question) if stemmer.stem(word) in WordIDMap]
        self.Answer         = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(answer) if stemmer.stem(word) in WordIDMap]
        self.qFeature       = {}
        self.aFeature       = {}
        self.create_QAFeature()
Project: MOQA    Author: pprakhar30    | Project source | File source
def __init__(self, itemId, Review, V, WordIDMap, ReviewObj):

        self.itemId     = itemId
        self.sent   = Review
        self.rObj   = ReviewObj
        self.Sent   = [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(Review) if stemmer.stem(word) in WordIDMap]
        self.sFeature   = {}
Project: texta    Author: texta-tk    | Project source | File source
def get_lemma_sentences(sentences):
    lemma_sentences = []
    for s in sentences:
        words = [w for w in nltk.word_tokenize(s) if w]
        w_s = [stemmer.stem(w) for w in words]
        l_s = ' '.join(w_s)
        lemma_sentences.append(l_s)
    return lemma_sentences
Project: hoot    Author: CatalystOfNostalgia    | Project source | File source
def tokenizeDocument(document):
    # remove punctuation (otherwise we have a bunch of empty tokens at the end)
    translate_table = dict((ord(char), " ") for char in string.punctuation)
    document = document.translate(translate_table)
    # tokenize
    tokenized_doc = nltk.word_tokenize(document)
    # stem
    snowball = stem.snowball.EnglishStemmer()
    tokenized_doc = [snowball.stem(word) for word in tokenized_doc]
    # remove stop words
    tokenized_doc = [word for word in tokenized_doc if word not in stopwords.words('english')]
    return tokenized_doc

# given the dictionary, return an array of all the tokenized comments
Project: tRECS    Author: TeeOhh    | Project source | File source
def stemmer(text):
    '''Description: takes in a string of descriptions and returns the string with all words stemmed.
    Parameters: string of descriptions
    Output: string with all words stemmed (e.g. "meeting" and "meetings" reduce to the same stem)'''
    stemmer = PorterStemmer()
    lis = unicode(str(text), 'utf-8').split(" ")
    stemmed_words = [str(stemmer.stem(word)) for word in lis]

    return " ".join(stemmed_words)
Project: QuestionAnswerNLP    Author: debjyoti385    | Project source | File source
def extract_keywords(text):
    tokens = [i.lower() for i in nltk.word_tokenize(text) if i not in stop_words ]
    pos_tagged_tokens = nltk.pos_tag(tokens)
    result=[]
    for token in pos_tagged_tokens:
        # print token
        if token[1] in  POS_KEYS:
            result.append(token[0])

    return [ ps.stem(w) for w in result]
Project: QuestionAnswerNLP    Author: debjyoti385    | Project source | File source
def getKeywords(question):
  tagged = nltk.tag.pos_tag(question)
  tagged = [pair for pair in tagged if pair[1] in key_POS and pair[0].lower() not in aux]
  return {ps.stem(tag[0]) for tag in tagged}

# Given a question, return a list of each sentence in the article
# with a score attached to it
Project: QuestionAnswerNLP    Author: debjyoti385    | Project source | File source
def score(question, sentence):
    score = 0
    sentence = map(ps.stem, sentence)
    keywords = getKeywords(question)
    question = map(ps.stem, question)
    score += proximity(keywords, sentence)
    question_ngrams = count_ngrams(question, MAX_NGRAMS, True)
    sentence_ngrams = count_ngrams(sentence, MAX_NGRAMS, True)
    precision, recall = bleu_score(question_ngrams, len(question), sentence_ngrams, len(sentence), 5)
    f1 = (2*precision*recall)/(precision+recall)
    score += 2*f1
    return score

# Finds the shortest window in the target sentence
# in which all keywords appear, and assigns a score.
Project: negation-detection    Author: gkotsis    | Project source | File source
def _stem_(s):
    from nltk.stem.lancaster import LancasterStemmer
    rs = LancasterStemmer()
    rs = rs.stem(s)
    return rs
Project: negation-detection    Author: gkotsis    | Project source | File source
def _lemma_(token):

    if isinstance(token, str):
        return _stem_(token)
    if isinstance(token, unicode):
        return _stem_(token)
    from nltk.corpus import wordnet

    def get_wordnet_pos(treebank_tag):

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    p = get_wordnet_pos(token.pos()[0][1])
    if p!=wordnet.VERB:
        return _stem_(token[0])
    rs = wordnet_lemmatizer.lemmatize(token[0], pos=p)
    return rs
Project: fake_news    Author: bmassman    | Project source | File source
def stem_text(text):
    from nltk.stem import LancasterStemmer
    ls = LancasterStemmer()
    tokens = tokenize_text(text)
    filtered_tokens = [ls.stem(token) for token in tokens]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
Project: graph-based-semi-supervised-learning    Author: deerishi    | Project source | File source
def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]
Project: kaggle-quora-solution-8th    Author: qqgeogor    | Project source | File source
def stem_str(sen):
    sen = text.re.sub('[^a-zA-Z0-9]', ' ', sen)
    sen = nltk.word_tokenize(sen.lower())
    sen = map(snowball_stemmer.stem, sen)
    sen = map(wordnet_lemmatizer.lemmatize, sen)
    return (' '.join(sen)).lower()
Project: disaster-mitigation    Author: varun-manjunath    | Project source | File source
def get_list_1(need_tweet_list):
    need_res_set=[]
    for i in need_tweet_list:
        for j in i.split():
            if stemmer.stem(j.lower()) not in out_stem_list:
                need_res_set.append(j.lower())
    return list(set(need_res_set))
Project: disaster-mitigation    Author: varun-manjunath    | Project source | File source
def get_list_2(need_tweet_list):
    need_res_set=[]
    for i in need_tweet_list:
        for j in i.split():
            if stem2.stem(j.lower()) not in lanc_stem_list:
                need_res_set.append(j.lower())
    return list(set(need_res_set))
Project: disaster-mitigation    Author: varun-manjunath    | Project source | File source
def get_set_1(need_tweet_list):
    need_res_set=set()
    for i in need_tweet_list:
        for j in i.split():
            if stemmer.stem(j.lower()) not in out_stem_list:
                need_res_set.add(stemmer.stem(j.lower()))
    return need_res_set
Project: disaster-mitigation    Author: varun-manjunath    | Project source | File source
def resource_similarity_score_via_exact_word_match_1(need_res_set,offer_tweet_list):
    if len(need_res_set)==0:
        return 0

    offer_res_set=set()
    for i in offer_tweet_list:
        for j in i.split():
            if j not in out_stem_list:
                offer_res_set.add(stemmer.stem(j.lower()))

    return(len(offer_res_set&need_res_set)/len(need_res_set))
Project: teem-tag    Author: P2Pvalue    | Project source | File source
def __eq__(self, other):
        return self.stem == other.stem
Project: teem-tag    Author: P2Pvalue    | Project source | File source
def __hash__(self):
        return hash(self.stem)
Project: teem-tag    Author: P2Pvalue    | Project source | File source
def __init__(self, stemmer=None):
        '''
        @param stemmer: an object or module with a 'stem' method (defaults to
                        nltk.stem.PorterStemmer)

        @returns: a new L{Stemmer} object
        '''

        if not stemmer:
            from nltk.stem import PorterStemmer
            stemmer = PorterStemmer()
        self.stemmer = stemmer
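
Since the constructor only requires an object exposing a stem method, any NLTK stemmer can be swapped in. A minimal usage sketch, relying only on the constructor shown above (the variable name is illustrative):

from nltk.stem import SnowballStemmer

# pass an explicit stemmer; omitting the argument falls back to nltk's PorterStemmer()
snowball_tag_stemmer = Stemmer(stemmer=SnowballStemmer('english'))
print(snowball_tag_stemmer.stemmer.stem('running'))  # -> 'run'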
Project: teem-tag    Author: P2Pvalue    | Project source | File source
def rate_tags(self, tags):
        '''
        @param tags: a list of tags to be assigned a rating
        '''

        term_count = collections.Counter(tags)

        for t in tags:
            # rating of a single tag is term frequency * weight
            t.rating = term_count[t] / len(tags) * self.weights.get(t.stem, 1.0)
Project: Twitter-Sentiment    Author: igorbpf    | Project source | File source
def review_to_words(review):

    if isinstance(review, float):
        review = str(review).encode("utf-8")
    letters_only = re.sub(r"\W+", " ", review, flags=re.UNICODE)

    words = letters_only.lower().split()
    #nltk.data.path.append('./nltk_data/')
    #stops = set(nltk.corpus.stopwords.words("portuguese"))
    meaningful_words = words #[w for w in words if not w in stops]
    #stemmer = RSLPStemmer()
    meaningful_stemmed = meaningful_words #[stemmer.stem(w) for w in meaningful_words]
    return(" ".join(meaningful_stemmed))
Project: Statistics-for-Machine-Learning    Author: PacktPublishing    | Project source | File source
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word)>=3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)    

    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')

    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text
Project: TextAsGraphClassification    Author: NightmareNyx    | Project source | File source
def clean_terms(terms, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    if stopwords is not None:
        terms = [t for t in terms if t not in stopwords]
    if only_N_J is not None:  # keep only nouns and adjectives
        tagged = nltk.pos_tag(terms)
        terms = [t for t, pos in tagged if pos in tags]
    if lemmatize is not None:
        lem = WordNetLemmatizer()
        terms = [lem.lemmatize(t) for t in terms]
    if stem is not None:
        stem = PorterStemmer()
        terms = [stem.stem(t) for t in terms]
    return terms
Project: TextAsGraphClassification    Author: NightmareNyx    | Project source | File source
def extract_terms_from_file(file_location, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    with open(file_location, 'r', encoding='iso-8859-1') as doc:
        terms = []
        for line in doc:
            terms.extend(re.compile('\w+').findall(line.lower()))

        # terms = re.compile('\w+').findall(doc
        #                                   .read()
        #                                   .replace('\n', '')
        #                                   .lower())
        return clean_terms(terms, stopwords, lemmatize, stem, only_N_J)
Project: TextAsGraphClassification    Author: NightmareNyx    | Project source | File source
def extract_terms_from_sentence(sentence, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    terms = re.compile('\w+').findall(sentence.lower())
    return clean_terms(terms, stopwords, lemmatize, stem, only_N_J)
Project: NLP_question_answering_system_project    Author: Roshrini    | Project source | File source
def addToSentenceScore(question, sentence):

    score = 0

    questionSet = set()
    for item in question.split():
        questionSet.add(morpher.stem(item.replace("?","")))

    sentenceSet = set()
    for item in sentence.split():
        sentenceSet.add(morpher.stem(item.replace("?","")))

    jaccard = float(len(questionSet.intersection(sentenceSet))) / float(len(questionSet.union(sentenceSet)))

    common = ' '.join(sentenceSet.intersection(questionSet))
    tagCommon = nltk.pos_tag(nltk.word_tokenize(common))
    if tagCommon:
        for item in tagCommon:
            if 'VB' in item[1]:
                score += 6
            else:
                score += 3

    # Add sentence and score to a hashmap
    sentenceScore[sentence] = score + (jaccard * 10)
    return score

# PARSER TO TOKENIZE, REMOVE STOP WORDS, MORPHOLOGY, ADD TO SET
Project: NLP_question_answering_system_project    Author: Roshrini    | Project source | File source
def parser(line):
    tokLine = nltk.word_tokenize(line)
    keywords = list(set(tokLine) - set(stopwords))
    lineSet = set()
    for item in keywords:
        lineSet.add(morpher.stem(item.replace("?", "")))
    return lineSet


# WORD MATCH
Project: NLP_question_answering_system_project    Author: Roshrini    | Project source | File source
def worMatch(question, sentence):

    score = 0

    questionSet = set()
    for item in question.split():
        questionSet.add(morpher.stem(item.replace("?","")))

    sentenceSet = set()
    for item in sentence.split():
        sentenceSet.add(morpher.stem(item.replace("?","")))

    jaccard = float(len(questionSet.intersection(sentenceSet))) / float(len(questionSet.union(sentenceSet)))

    common = ' '.join(sentenceSet & questionSet)
    tagCommon = nltk.pos_tag(nltk.word_tokenize(common))
    if tagCommon:
        for item in tagCommon:
            if 'VB' in item[1]:
                score += 6
            else:
                score += 3

    return score + (jaccard * 10)



# GET INPUT FILE NAME
Project: PYSHA    Author: shafaypro    | Project source | File source
    def steam_words(self, word):
        ps_obj = PorterStemmer()  # create the Porter stemmer
        steamed_word = ps_obj.stem(word)
        return steamed_word  # return the stemmed word to the caller

    # Natural language: displaying sentences.
Project: PPRE    Author: MaoYuwei    | Project source | File source
def CleanReVerb(self):
        fin_seed = open('../file/seed_ReVerb.txt', 'r')
        fout_seed = open('../file/seed_ReVerb_clean.txt', 'w+')
        fin_signature = open('../file/signature_ReVerb.txt', 'r')
        fout_signature = open('../file/signature_ReVerb_clean.txt', 'w+')
        while True:
            line = fin_seed.readline()
            if line:
                if '***' in line:
                    fout_seed.write(line)
                else:
                    mark, line = line.split(':', 1)
                    line = self.CleanStopWords(line)  # remove stop words
                    # stem each word
                    line = line.split()
                    word_list = []
                    s = nltk.stem.SnowballStemmer('english')
                    for w in line:
                        w = s.stem(w)
                        word_list.append(w)
                    if len(word_list) > 0:
                        line = ' '.join(word_list)
                        fout_seed.write(mark + ':' + line + '\n')
            else:
                break
        fin_seed.close()
        fout_seed.close()
        while True:
            line = fin_signature.readline()
            if line:
                if '***' in line:
                    fout_signature.write(line)
                else:
                    mark, line = line.split(':', 1)
                    line = self.CleanStopWords(line)  # remove stop words
                    # stem each word
                    line = line.split()
                    word_list = []
                    s = nltk.stem.SnowballStemmer('english')
                    for w in line:
                        w = s.stem(w)
                        word_list.append(w)
                    if len(word_list) > 0:
                        line = ' '.join(word_list)
                        fout_signature.write(mark + ':' + line + '\n')
            else:
                break
        fin_signature.close()
        fout_signature.close()
Project: teem-tag    Author: P2Pvalue    | Project source | File source
def __call__(self, tags):
        '''
        @param tags: a list of (preferably stemmed) tags

        @returns: a list of unique (multi)tags sorted by relevance
        '''
        # print tags
        self.rate_tags(tags)
        multitags = self.create_multitags(tags)

        # keep most frequent version of each tag
        clusters = collections.defaultdict(collections.Counter)
        proper = collections.defaultdict(int)
        ratings = collections.defaultdict(float)

        for t in multitags:
            clusters[t][t.string] += 1
            if t.proper:
                proper[t] += 1
                ratings[t] = max(ratings[t], t.rating)


        term_count = collections.Counter(multitags)

        for t, cnt in term_count.iteritems():
            t.string = clusters[t].most_common(1)[0][0]
            proper_freq = proper[t] / cnt
            if proper_freq >= 0.5:
                t.proper = True
                t.rating = ratings[t]

        # purge duplicates, one-character tags and stopwords
        unique_tags = set(t for t in term_count
                          if len(t.string) > 1 and t.rating > 0.0)
        # remove redundant tags
        for t, cnt in term_count.iteritems():
            words = t.stem.split()
            for l in xrange(1, len(words)):
                for i in xrange(len(words) - l + 1):
                    s = Tag(' '.join(words[i:i + l]))
                    relative_freq = cnt / term_count[s]
                    if ((relative_freq == 1.0 and t.proper) or
                        (relative_freq >= 0.5 and t.rating > 0.0)):
                        unique_tags.discard(s)
                    else:
                        unique_tags.discard(t)

        # print unique_tags
        return sorted(unique_tags)
Project: TextAsGraphClassification    Author: NightmareNyx    | Project source | File source
def docs_to_networkx(dataset, cats, window_size=2, vocabulary_creation=True):
    ds = './datasets/%s/' % dataset
    Gs = []
    labels = []
    type_ = 2
    vocab_creation = vocabulary_creation
    words = []  # for vocabulary

    for doc in os.listdir(ds):
        if 'train.txt' in doc:
            type_ = 1

    if type_ == 1:
        if os.path.exists(ds + "vocab.txt"):  # assumed vocabulary location under the dataset directory
            vocab_creation = False
        with open(ds + '/train.txt', 'r', encoding='iso-8859-1') as doc:
            dc = 1
            for line in doc:
                label = line[0]
                labels.append(label)
                terms = extract_terms_from_sentence(line[1:],
                                                    stopwords=stopwords.words('english'),
                                                    lemmatize=True,
                                                    stem=True,
                                                    only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=label + '_' + str(dc))
                # G = nx.convert_node_labels_to_integers(G, first_label=1, label_attribute='label')
                nx.set_node_attributes(G, name='label', values=dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                dc += 1
    else:
        if os.path.exists(ds + "vocab.txt"):  # assumed vocabulary location under the dataset directory
            vocab_creation = False
        for cat in cats.keys():
            for doc in os.listdir(ds + cat):
                terms = extract_terms_from_file(ds + cat + '/' + doc,
                                                stopwords=stopwords.words('english'),
                                                lemmatize=True,
                                                stem=True,
                                                only_N_J=True)
                if vocab_creation:
                    words.extend(terms)
                graph = terms_to_graph(terms, window_size)
                G = graph_to_networkx(graph, name=cat + doc.split('.')[0])
                # G = nx.convert_node_labels_to_integers(G, first_label=1, label_attribute='label')
                nx.set_node_attributes(G, name='label', values=dict(zip(G.nodes(), G.nodes())))
                Gs.append(G)
                labels.append(cats[cat])

    if vocab_creation:
        vocab = dict(Counter(words))
        create_vocabulary_file(ds + "vocab.txt", vocab)  # assumed vocabulary path

    return Gs, labels


# needs fix or discard