The following six code examples, extracted from open-source Python projects, illustrate how to use keras.preprocessing.text.text_to_word_sequence().
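Before the examples, a minimal self-contained sketch of the function itself (the sample sentence is made up): by default text_to_word_sequence() lowercases the input, strips punctuation, and splits on whitespace, returning a plain Python list of tokens.

from keras.preprocessing.text import text_to_word_sequence

# Default behaviour: lowercase, strip punctuation, split on whitespace.
words = text_to_word_sequence('The quick brown Fox, jumped!')
print(words)  # ['the', 'quick', 'brown', 'fox', 'jumped']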
Example 1

def _handle_rare_words(self, captions):
    if self._rare_words_handling == 'nothing':
        return captions
    elif self._rare_words_handling == 'discard':
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(captions)
        new_captions = []
        for caption in captions:
            words = text_to_word_sequence(caption)
            # Keep only words that occur at least self._words_min_occur times
            new_words = [w for w in words
                         if tokenizer.word_counts.get(w, 0) >= self._words_min_occur]
            new_captions.append(' '.join(new_words))
        return new_captions

    raise NotImplementedError('rare_words_handling={} is not implemented '
                              'yet!'.format(self._rare_words_handling))
Example 2

def get_sequences(raw_file, word_count):
    # Tokenize each line of raw_file and accumulate word frequencies in word_count.
    raw_sequences = []
    input_file = open(raw_file)
    for line in input_file:
        word_seq = text.text_to_word_sequence(line)
        raw_sequences.append(word_seq)
        for w in word_seq:
            if w in word_count:
                word_count[w] += 1
            else:
                word_count[w] = 1
    input_file.close()
    return raw_sequences, word_count


# index starts from 1
Example 3

def get_sequences(raw_file, word_count):
    # As in Example 2, but each line is a tab-separated (label, sentence) pair.
    label_list = []
    raw_sequences = []
    input_file = open(raw_file)
    for line in input_file:
        line_parts = line.strip().split('\t')
        label = line_parts[0]
        label_list.append(label)
        sentence = line_parts[1]
        word_seq = text.text_to_word_sequence(sentence)
        raw_sequences.append(word_seq)
        for w in word_seq:
            if w in word_count:
                word_count[w] += 1
            else:
                word_count[w] = 1
    input_file.close()
    return label_list, raw_sequences, word_count


# index starts from 1
Example 4

def normalize_captions(self, captions_txt):
    captions_txt = self._add_eos(captions_txt)
    word_sequences = map(text_to_word_sequence, captions_txt)
    result = map(' '.join, word_sequences)
    return result
Example 5

def get_text_sequences(raw_file, word_count):
    label_list = []
    raw_sequences = []
    input_file = open(raw_file)
    for line in input_file:
        line_parts = line.strip().split('\t')
        label = line_parts[0]
        label_list.append(label)
        sentence = line_parts[1]
        word_seq = text.text_to_word_sequence(sentence)
        raw_sequences.append(word_seq)
        for w in word_seq:
            if w in word_count:
                word_count[w] += 1
            else:
                word_count[w] = 1
    input_file.close()
    return label_list, raw_sequences


# def insert_to_global(word_count, num_words, global_word_count):
#     sorted_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
#     for (word, count) in sorted_word_count[:num_words]:
#         if word in global_word_count:
#             global_word_count[word] += count
#         else:
#             global_word_count[word] = count
Example 6

def get_encoded_vector(list_of_words, new_string):
    # PorterStemmer and WordNetLemmatizer are presumably NLTK's
    # (from nltk.stem import PorterStemmer, WordNetLemmatizer).
    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()

    # Make sure the special markers are part of the vocabulary.
    if 'START_SEQ' not in list_of_words:
        list_of_words.append('START_SEQ')
    if 'UNKNOWN_WORDS' not in list_of_words:
        list_of_words.append('UNKNOWN_WORDS')
    if 'END_SEQ' not in list_of_words:
        list_of_words.append('END_SEQ')

    tokens = text_to_word_sequence(new_string, lower=True, split=" ")

    # Stem and lemmatize the tokens
    token_stemmed = []
    for token in tokens:
        try:
            token_stemmed.append(porter.stem(lmtz.lemmatize(token)))
        except Exception:
            token_stemmed.append(token)
    tokens = list(token_stemmed)

    # Map each token to its vocabulary index, falling back to UNKNOWN_WORDS.
    out = []
    all_unknown_words = True
    for token in tokens:
        if token in list_of_words:
            all_unknown_words = False
            out.append(list_of_words.index(token))
        else:
            out.append(list_of_words.index('UNKNOWN_WORDS'))
    if all_unknown_words:
        print('Sentence not recognised:', new_string)
    out = [list_of_words.index('START_SEQ')] + out + [list_of_words.index('END_SEQ')]
    return out
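For illustration, a hypothetical call to Example 6's get_encoded_vector; the vocabulary and sentence below are made up, and NLTK's WordNet data must be available for the lemmatizer.

vocab = ['the', 'cat', 'sat', 'on', 'mat']
encoded = get_encoded_vector(vocab, 'The cat sat on the mat!')
# encoded is a list of indices into vocab, bracketed by the START_SEQ and END_SEQ
# indices, with any out-of-vocabulary token mapped to the UNKNOWN_WORDS index.
print(encoded)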