The following 50 code examples, extracted from open-source Python projects, illustrate how to use nltk.tokenize.word_tokenize().
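Before the extracted examples, here is a minimal usage sketch (not taken from any of the projects below); it assumes the NLTK "punkt" tokenizer data has already been downloaded, for example via nltk.download('punkt').

from nltk.tokenize import word_tokenize

# Minimal sketch: split one sentence into word-level tokens.
# Assumes the 'punkt' models are available locally (nltk.download('punkt')).
tokens = word_tokenize("NLTK makes tokenization easy, doesn't it?")
print(tokens)
# expected: ['NLTK', 'makes', 'tokenization', 'easy', ',', 'does', "n't", 'it', '?']

The examples that follow show the same call embedded in real preprocessing, summarization, and feature-extraction code.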
def keyword_extractor(data):
    try:
        #np_extractor = NPExtractor(words_wo_stopwords(strip_tags(data)))
        #result = np_extractor.extract()
        text = words_wo_stopwords(strip_tags(data))
        #TODO this is duplicated job, should be improved
        words = word_tokenize(strip_tags(text))
        taggged = pos_tag(words)
        cleaned = filter_insignificant(taggged)
        text = " ".join(cleaned)
        wc = WordCloudMod().generate(text)
        result = list(wc.keys())[:10]
    except Exception as err:
        print(colored.red("At keywords extraction {}".format(err)))
        result = []
    return result

# TODO definitely can be better if we knew where content is
def create_batch(self, sentence_li):
    """Create a batch for a list of sentences."""
    embeddings_batch = []
    for sen in sentence_li:
        embeddings = []
        sent_toks = sent_tokenize(sen)
        word_toks = [word_tokenize(el) for el in sent_toks]
        tokens = [val for sublist in word_toks for val in sublist]
        tokens = [el for el in tokens if el != '']
        for tok in tokens:
            embeddings.append(self.embdict.tok2emb.get(tok))
        if len(tokens) < self.max_sequence_length:
            pads = [np.zeros(self.embedding_dim) for _ in range(self.max_sequence_length - len(tokens))]
            embeddings = pads + embeddings
        else:
            embeddings = embeddings[-self.max_sequence_length:]
        embeddings = np.asarray(embeddings)
        embeddings_batch.append(embeddings)
    embeddings_batch = np.asarray(embeddings_batch)
    return embeddings_batch
def extractFeatures(self, article, n, customStopWords=None):
    # pass in article as a tuple (text, title)
    text = article[0]   # extract the text
    title = article[1]  # extract the title
    sentences = sent_tokenize(text)  # split text into sentences
    word_sent = [word_tokenize(a.lower()) for a in sentences]  # split sentences into words
    self._freq = self._compute_frequencies(word_sent, customStopWords)  # calculate word freq using member func created above
    if n < 0:
        # how many features (words) to return - a -ve number means
        # no feature (word) selection, just return all features
        return nlargest(len(self._freq.keys()), self._freq, key=self._freq.get)
    else:
        # here we say if the calling func has asked for a subset
        # then return only the 'n' largest features, i.e. the
        # most important words (important == frequent, less stopwords)
        return nlargest(n, self._freq, key=self._freq.get)
def summarize(self, article, n):
    text = article[0]
    # article[1] is the title; only the body text is summarized here
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sentence in enumerate(word_sent):
        for word in sentence:
            if word in self._freq:
                ranking[i] += self._freq[word]
    sentences_index = nlargest(n, ranking, key=ranking.get)
    return [sentences[j] for j in sentences_index]

##############################################################################
# TEST
def similarity(c1, c2):
    '''stop words are words like "it" and "the", that have no massive impact on the sentence'''
    stop_words = list(stopwords.words("english"))

    # Removes stop words in both sentences
    c1_cleaned = [x for x in word_tokenize(c1) if x not in stop_words]
    c2_cleaned = [x for x in word_tokenize(c2) if x not in stop_words]

    c1_words = Counter(dedupe(c1_cleaned))
    c2_words = Counter(dedupe(c2_cleaned))
    total_words = c1_words + c2_words

    similarity_between_words = 0
    for key, val in total_words.items():
        # Looks at whether the two articles share a word
        if total_words[key] > 1:
            similarity_between_words += 1

    return similarity_between_words / (log(len(c1_words)) + log(len(c2_words)))
def process_line(line):
    tokens = word_tokenize(line)
    output_tokens = []
    for token in tokens:
        if token in INS_PUNCTS:
            output_tokens.append(INS_PUNCTS[token])
        elif token in EOS_PUNCTS:
            output_tokens.append(EOS_PUNCTS[token])
        elif is_number(token):
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())
    return untokenize(" ".join(output_tokens) + " ")
def check_sent(s):
    count = 0
    for r in s:
        #words = word_tokenize(r)
        #for w in words:
        for w in r:
            if type(w) != str:
                print(w)
                count += 1
                continue
            if w in inv_words or w in oov_words_in_train:
                continue
            if w not in word2vec:
                count += 1
                oov_words_in_train.add(w)
            else:
                inv_words[w] = word2vec.vocab[w].index
    return count
def preprocess_questions(examples, nlp='nltk'):
    if nlp == 'nltk':
        from nltk.tokenize import word_tokenize
    print('Example of generated tokens after preprocessing some questions:')
    for i, ex in enumerate(examples):
        s = ex['question']
        if nlp == 'nltk':
            ex['question_words'] = word_tokenize(str(s).lower())
        elif nlp == 'mcb':
            ex['question_words'] = tokenize_mcb(s)
        else:
            ex['question_words'] = tokenize(s)
        if i < 10:
            print(ex['question_words'])
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" % (i, len(examples), i*100.0/len(examples)))
            sys.stdout.flush()
    return examples
def summarize(self, text, n):
    """
    Return a list of n sentences which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sent):
        for w in sent:
            if w in self._freq:
                ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]
def load_jacana(fname, regexen):
    samples = []
    with open(fname, 'rt') as inp:
        for line in inp:
            line = line.strip()
            if line.startswith('<Q> '):
                qorig = line[len('<Q> '):]
                q = word_tokenize(qorig)
            else:
                l = line.split(' ')
                label = int(l[0])
                kwweight = float(l[1])
                aboutkwweight = float(l[2])
                text = word_tokenize(' '.join(l[3:]))
                toklabels = regex_overlap(text, regexen[qorig])
                samples.append({'qtext': ' '.join(q), 'label': label,
                                'atext': ' '.join(text),
                                'kwweight': kwweight, 'aboutkwweight': aboutkwweight,
                                'toklabels': ' '.join([str(0+tl) for tl in toklabels])})
    return samples
def load_sts(dsfile, skip_unlabeled=True):
    """ load a dataset in the sts tsv format """
    s0 = []
    s1 = []
    labels = []
    with codecs.open(dsfile, encoding='utf8') as f:
        for line in f:
            line = line.rstrip()
            label, s0x, s1x = line.split('\t')
            if label == '':
                if skip_unlabeled:
                    continue
                else:
                    labels.append(-1.)
            else:
                labels.append(float(label))
            s0.append(word_tokenize(s0x))
            s1.append(word_tokenize(s1x))
    return (s0, s1, np.array(labels))
def load_quora(dsfile):
    """ load a dataset in the quora csv format """
    s0 = []
    s1 = []
    labels = []
    with open(dsfile, encoding='utf8') as csvfile:
        f = csv.reader(csvfile)
        firstline = True
        for line in f:
            if firstline:
                firstline = False
                continue
            s0x = line[3]
            s1x = line[4]
            label = line[5]
            labels.append(float(label))
            s0.append(word_tokenize(s0x))
            s1.append(word_tokenize(s1x))
    return (s0, s1, np.array(labels))
def make_word_feature(df, embeddings):
    # use embeddings to vectorize merchant description
    # currently using averaging to combine words in merchant
    # there are other options: http://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence
    merchants = df.merchant.tolist()
    veclen = len(embeddings['food'])
    word_feature = np.zeros((len(merchants), veclen))
    for idx, merchant in enumerate(merchants):
        num_known = 0
        try:
            words = tokenize.word_tokenize(merchant)
            words = [word.lower() for word in words]
            for word in words:
                wordvec = embeddings[word]
                word_feature[idx, :] += wordvec
                num_known += 1
        except:
            pass
        word_feature[idx, :] = word_feature[idx, :] / float(max(num_known, 1))
    return word_feature
def predict(testSet, PP, PN, positive_probabilities, negative_probabilities, unseen_pos_prob, unseen_neg_prob):
    predicted_class = []
    for review in testSet:
        negative_probab = math.log10(PN)
        positive_probab = math.log10(PP)
        review_words = word_tokenize(review)
        for w in review_words:
            if w in negative_probabilities:
                negative_probab = negative_probab + math.log10(negative_probabilities[w])
            else:
                negative_probab = negative_probab + math.log10(unseen_neg_prob)
            if w in positive_probabilities:
                positive_probab = positive_probab + math.log10(positive_probabilities[w])
            else:
                positive_probab = positive_probab + math.log10(unseen_pos_prob)
        if negative_probab > positive_probab:
            result = '-'
        else:
            result = '+'
        predicted_class.append(result)
    return predicted_class
def create_vocab(self, dataset_path, vocab_path, max_vocab_size):
    print("generating vocab from dataset at {}".format(dataset_path))
    all_words = []
    for dataset in ["snli_1.0_train.jsonl", "snli_1.0_dev.jsonl", "snli_1.0_test.jsonl"]:
        for line in open(os.path.join(dataset_path, dataset), "r").readlines():
            data = json.loads(line)
            all_words += word_tokenize(data["sentence1"].lower())
            all_words += word_tokenize(data["sentence2"].lower())
    counter = Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    words = ["PAD"] + ["UNK"] + list(words)
    word_to_id = dict(zip(words[:max_vocab_size], range(max_vocab_size)))
    with open(vocab_path, "w") as file:
        for word, id in word_to_id.items():
            file.write("{}\t{}\n".format(word, id))
    print("vocab of size {} written to {}, with PAD token == 0, UNK token == 1".format(max_vocab_size, vocab_path))
def getFreqWords(directoryPath):
    files = getListOfFilesInDir(directoryPath, "*")  # get list of files in directory
    allWords = []
    count = 0
    if MAX_FILES_PER_CLASS > 0 and MAX_FILES_PER_CLASS < len(files):
        files = random.sample(files, MAX_FILES_PER_CLASS)
    for ifile, fi in enumerate(files):  # for each file in current class:
        with open(fi) as f:
            content = f.read()
            words = word_tokenize(content.decode('utf-8'))
            words = [w.lower() for w in words if w.lower() not in stop]
            words = list(set(words))
            allWords += words
        count += 1
    #print allWords
    C = Counter(allWords)
    C = sorted(C.items(), key=itemgetter(1), reverse=True)
    for c in C:
        if c[1] > 0.05 * float(count):
            print c[0], c[1] / float(count)
def prepro_question(imgs, params):
    # preprocess all the question
    print 'example processed tokens:'
    for i, img in enumerate(imgs):
        s = img['question']
        if params['token_method'] == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10:
            print txt
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" % (i, len(imgs), i*100.0/len(imgs)))
            sys.stdout.flush()
    return imgs
def create_lexicon(pos, neg):
    lexicon = []
    for fi in [pos, neg]:
        with open(fi, 'r') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l)
                lexicon += list(all_words)

    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    """
    This is done in the tutorial. Seems like a brute force method of removing stopwords.
    TODO: Use NLTK stopwords to remove stop words?
    """
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    return l2
def sample_handling(sample, lexicon, classification):
    featureset = []
    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    return featureset
def _avgrank_corp(inp_dir, hdv_vocab, num=5000):
    cnt, vocab = Counter(), []  # Counter for all words in the corpus
    for (root, dirs, files) in os.walk(inp_dir):
        files = [f for f in files if not f[0] == '.']
        for f in files:
            filepath = os.path.join(root, f)
            with codecs.open(filepath, 'r', encoding="utf-8") as f:
                tok_txt = word_tokenize(f.read())
                for word in tok_txt:
                    cnt[word] += 1
    for word in hdv_vocab:
        if word in cnt.keys():
            del cnt[word]
    for word in cnt.most_common(num):
        try:
            vocab.append(str(word[0]))
        except:
            continue
    return vocab
def create_lexicon(pos, neg):
    lexicon = []
    for fi in [pos, neg]:
        with io.open(fi, 'r', encoding='utf-8') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l.lower())
                lexicon += list(all_words)

    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    return l2
def sample_handling(sample, lexicon, classification):
    featureset = []
    with io.open(sample, 'r', encoding='utf-8') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    return featureset
def custom_tokenizer(sentence, delimiters=['|', ','], remove_puncs=True, get_unique=False):
    # tokens = re.split('(\W)', sentence)
    for delimiter in delimiters:
        sentence = re.sub(re.escape(delimiter), " " + delimiter + " ", sentence)
    tokens = word_tokenize(sentence)

    # Remove duplicates
    if get_unique:
        tokens = list(set(tokens))

    if remove_puncs:
        tokens = [token for token in tokens if not ((len(token.strip()) == 1) and bool(re.search("[^a-zA-Z0-9]", token)))]

    tokens = [token for token in tokens if (not bool(re.search("\s", token)) and token != '')]

    # Remove duplicates
    if get_unique:
        tokens = list(set(tokens))

    return tokens
def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        scaped_tok = re.escape(tok)
        m = re.search(scaped_tok, tail)
        start, end = m.span()
        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the rest
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
def prepro_question(imgs, params):
    # preprocess all the question
    print 'example processed tokens:'
    for i, img in enumerate(imgs):
        s = img['question']
        if params['token_method'] == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10:
            print txt
        if i % 100 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" % (i, len(imgs), i*100.0/len(imgs)))
            sys.stdout.flush()
    return imgs
def extract_chunks(sent, chunkGram=r"""Chunk: {<JJ|NN.*>*<NNP>+<JJ|NN.*|IN>*<NN.*>}"""):
    try:
        tagged = pos_tag(word_tokenize(sent))
        # Maybe actually better if possessives aren't included.
        # At least one Proper Noun (NNP) should be included in the noun chunk. Also a single NNP is
        # probably not enough information to identify a data source
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        chunks = []
        for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
            chunk = ""
            for leave in subtree.leaves():
                chunk += leave[0] + ' '
            chunks.append(chunk.strip())
        return chunked, chunks
    except Exception as e:
        print(str(e))
def train_model(documents, labels, sample_size=.3, verbose=True):
    if verbose:
        print('starting to generate training data...', end='', flush=True)
    labeled_feature_set = list()
    for n, doc in enumerate(documents):
        feature = word_tokenize(' '.join(doc))
        label = labels[n]
        resampled = resample(feature, label, sample_size)
        labeled_feature_set += resampled
    if verbose:
        print('done', flush=True)
        print('training model...this may take a few minutes.', flush=True, end='')
    trained_model = NaiveBayesClassifier.train(iter(labeled_feature_set))
    if verbose:
        print('done', flush=True)
    return trained_model
def create_lexicon(fin):
    lexicon = []
    with open(fin, 'r', buffering=100000, encoding='latin-1') as f:
        try:
            counter = 1
            content = ''
            for line in f:
                counter += 1
                if (counter/2500.0).is_integer():
                    tweet = line.split(':::')[1]
                    content += ' ' + tweet
                    words = word_tokenize(content)
                    words = [lemmatizer.lemmatize(i) for i in words]
                    lexicon = list(set(lexicon + words))
                    print(counter, len(lexicon))
        except Exception as e:
            print(str(e))

    with open('lexicon.pickle', 'wb') as f:
        pickle.dump(lexicon, f)
def convert_to_vec(fin, fout, lexicon_pickle):
    with open(lexicon_pickle, 'rb') as f:
        lexicon = pickle.load(f)
    outfile = open(fout, 'a')
    with open(fin, buffering=20000, encoding='latin-1') as f:
        counter = 0
        for line in f:
            counter += 1
            label = line.split(':::')[0]
            tweet = line.split(':::')[1]
            current_words = word_tokenize(tweet.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            outline = str(features) + '::' + str(label) + '\n'
            outfile.write(outline)
        print(counter)
def sample_handling(sample, lexicon, classification):
    featureset = []
    # [1 0] pos sentiment, [0 1] negative sentiment
    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            #print(features)
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())  # like the example discussed earlier
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
            #print(featureset)
    return featureset
def runprops_data(self, docs):
    new_docs = []
    for doc_name, doc in docs:
        print 'Processing:', doc_name
        doc_new = []
        doc = self.props_exception(doc_name, doc)
        for index, sent in enumerate(doc):
            doc_new.append(' '.join(word_tokenize(sent)))
            print index+1, doc_new[index]
        triples = []
        for i, sent in enumerate(doc_new):
            try:
                tmp_triples = self.props_parser.extract_triples([sent])
                triples.append(tmp_triples)
            except:
                print('Error: failed for line %s' % (sent))
                continue
        parse_sents = create_trees(triples, doc_new)
        sents = []
        new_docs.append((doc_name, parse_sents))
    return new_docs
def wordMatch(question, line, storyPOS_dict):
    wordsInAQuestion = word_tokenize(question)
    rootsInAQuestion = set()
    for word in wordsInAQuestion:
        root = lancaster_stemmer.stem(word)
        rootsInAQuestion.add(root)
    if line in storyPOS_dict:
        verbmatch_score = 0
        rootmatch_score = 0
        scoreOfALine = {}
        for (word, tag) in storyPOS_dict[line]:
            if 'V' in tag:
                verb_root = lancaster_stemmer.stem(word)
                if verb_root in rootsInAQuestion:
                    verbmatch_score = verbmatch_score + 6
            else:
                word_root = lancaster_stemmer.stem(word)
                if word_root in rootsInAQuestion:
                    rootmatch_score = rootmatch_score + 3
        scoreOfALine[line] = rootmatch_score + verbmatch_score
        return rootmatch_score + verbmatch_score
def preprocess_sentence(sentence):
    """
    Preprocesses a sentence, turning it all to lowercase and tokenizing it into words.

    :param sentence: the sentence to pre-process.
    :return: the sentence, as a list of words, all in lowercase
    """
    sentence = sentence.lower()
    return word_tokenize(sentence)
def create_paper_dictionaries(filename="", readin=True, paper=None):
    """
    Creates the metadata data structures for a specific paper required to compute the extra
    features which are appended to the sentence vector.

    :param filename: the filename only, not the path, for the paper to create dictionaries for.
    :return: a tuple of the metadata data structures for the paper.
    """
    if readin and filename != "":
        # Read the paper in as a dictionary, keys are sections and values are the section text
        paper = read_in_paper(filename)

    # Extract paper keyphrases
    keyphrases = set(filter(None, " ".join(paper["KEYPHRASES"].lower().split("\n")).split(" ")))

    # Get the paper's vocab
    full_paper = " ".join([val for _, val in paper.iteritems()]).lower()
    paper_words = word_tokenize(full_paper)
    vocab = set(paper_words)

    # Create a bag of words for the paper
    paper_bag_of_words = defaultdict(int)
    for word in paper_words:
        paper_bag_of_words[word] += 1

    # Get the title words
    title_words = set([x.lower() for x in word_tokenize(paper["MAIN-TITLE"]) if x not in STOPWORDS])

    return keyphrases, vocab, paper_bag_of_words, title_words
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
def get_latitude(self, user_input):
    """
    Returns the latitude extracted from the input.
    """
    from nltk import tokenize
    for token in tokenize.word_tokenize(user_input):
        if 'latitude=' in token:
            return re.sub('latitude=', '', token)
    return ''
def get_longitude(self, user_input):
    """
    Returns the longitude extracted from the input.
    """
    from nltk import tokenize
    for token in tokenize.word_tokenize(user_input):
        if 'longitude=' in token:
            return re.sub('longitude=', '', token)
    return ''
def split_ingr(x):
    wnl = WordNetLemmatizer()
    cleanlist = []
    lst = x.strip('[]').split(',')
    cleanlist = [' '.join(wnl.lemmatize(word.lower()) for word in word_tokenize(re.sub('[^a-zA-Z]', ' ', item))) for item in lst]
    return cleanlist

# remove low-information words from ingredients, could use more
def add_items(self, sentence_li):
    """Add new items to the tok2emb dictionary from a given text."""
    for sen in sentence_li:
        sent_toks = sent_tokenize(sen)
        word_toks = [word_tokenize(el) for el in sent_toks]
        tokens = [val for sublist in word_toks for val in sublist]
        tokens = [el for el in tokens if el != '']
        for tok in tokens:
            if self.tok2emb.get(tok) is None:
                self.tok2emb[tok] = self.fasttext_model[tok]
def sentiment(request):
    open_file = open("wordfeature5k.pickle", "rb")
    word_features = pickle.load(open_file)
    open_file.close()

    def find_features(document):
        words = word_tokenize(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return features

    open_file = open("naivebayesclassifier.pickle", "rb")
    classifier = pickle.load(open_file)
    open_file.close()

    sentence = request.POST['sentence']
    result = classifier.classify(find_features(sentence))
    if result == "positive":
        return render(request, "home/index.html", {"sentence": sentence, "positive": "positive"})
    elif result == "negative":
        return render(request, "home/index.html", {"sentence": sentence, "negative": "negative"})
def word_seg_en(docs):
    docs = [word_tokenize(sent) for sent in tqdm(docs)]  # show the progress of word segmentation with tqdm
    '''docs_seg = []
    print('docs size', len(docs))
    for i in tqdm(range(len(docs))):
        docs_seg.append(word_tokenize(docs[i]))'''
    return docs
def get_word_dict(self, sentences, tokenize=True):
    # create vocab of words
    word_dict = {}
    if tokenize:
        from nltk.tokenize import word_tokenize
    sentences = [s.split() if not tokenize else word_tokenize(s) for s in sentences]
    for sent in sentences:
        for word in sent:
            if word not in word_dict:
                word_dict[word] = ''
    word_dict['<s>'] = ''
    word_dict['</s>'] = ''
    return word_dict
def visualize(self, sent, tokenize=True):
    if tokenize:
        from nltk.tokenize import word_tokenize

    sent = sent.split() if not tokenize else word_tokenize(sent)
    sent = [['<s>'] + [word for word in sent if word in self.word_vec] + ['</s>']]

    if ' '.join(sent[0]) == '<s> </s>':
        import warnings
        warnings.warn('No words in "{0}" have glove vectors. \
                       Replacing by "<s> </s>"..'.format(sent))
    batch = Variable(self.get_batch(sent), volatile=True)

    if self.use_cuda:
        batch = batch.cuda()
    output = self.enc_lstm(batch)[0]
    output, idxs = torch.max(output, 0)
    # output, idxs = output.squeeze(), idxs.squeeze()
    idxs = idxs.data.cpu().numpy()
    argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]

    # visualize model
    import matplotlib.pyplot as plt
    x = range(len(sent[0]))
    y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
    plt.xticks(x, sent[0], rotation=45)
    plt.bar(x, y)
    plt.ylabel('%')
    plt.title('Visualisation of words importance')
    plt.show()

    return output, idxs
def extractRawFrequencies(self, article):
    # this method is similar to the one above but returns
    # the raw frequencies (all word counts)
    text = article[0]
    # article[1] is the title
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    freq = defaultdict(int)
    for s in word_sent:
        for word in s:
            if word not in self._stopwords:
                freq[word] += 1
    return freq
def split_words(self, sentence: str) -> List[Token]:
    # Import is here because it's slow, and by default unnecessary.
    from nltk.tokenize import word_tokenize
    return [Token(t) for t in word_tokenize(sentence.lower())]