The following 50 code examples, extracted from open-source Python projects, illustrate how to use gensim.corpora.Dictionary().
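Before the extracted examples, here is a minimal sketch of the core Dictionary workflow they all rely on (the toy documents and the file name 'example.dict' are illustrative only):

from gensim import corpora

texts = [["human", "machine", "interface"],
         ["survey", "of", "user", "opinion"],
         ["graph", "of", "trees"]]

dictionary = corpora.Dictionary(texts)                 # assign an integer id to every token
dictionary.filter_extremes(no_below=1, no_above=0.8)   # optionally prune rare/common tokens
print(dictionary.token2id)                             # e.g. {'human': 0, 'interface': 1, ...}

corpus = [dictionary.doc2bow(text) for text in texts]  # sparse (token_id, count) vectors per document
dictionary.save('example.dict')                        # persist the mapping for later reuse
dictionary = corpora.Dictionary.load('example.dict')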
def getTextConfidence(self, text):
    if self.typeOfSim == 'jaccard':
        intend_confidenceList = []
        for i in self.know_words:
            intend_confidenceList.append(jaccard_compare(text, i))
        if len(self.know_words) > 0:
            return max(intend_confidenceList)
        else:
            return 0
    elif self.typeOfSim == 'gensim':
        try:
            from gensim import corpora, models, similarities
        except Exception as e:
            print(e)
        dictionary = corpora.Dictionary(self.know_words_remove_stopwords)
        corpus = [dictionary.doc2bow(text) for text in self.know_words_remove_stopwords]
        lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
        new_doc = text
        vec_bow = dictionary.doc2bow(new_doc.lower().split())
        vec_lsi = lsi[vec_bow]
        index = similarities.MatrixSimilarity(lsi[corpus])
        sims = index[vec_lsi]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        most_sim = sims[0]
        return most_sim[1]
def get_similarity(query, ans_list):
    s_lenth = len(ans_list)
    Corp = ans_list
    # build the dictionary from the answer corpus
    dictionary = corpora.Dictionary(Corp)
    # convert each answer to a bag-of-words vector
    corpus = [dictionary.doc2bow(text) for text in Corp]
    lsi = models.LsiModel(corpus)
    corpus_lsi = lsi[corpus]
    vec_bow = dictionary.doc2bow(query)
    vec_lsi = lsi[vec_bow]
    index = similarities.MatrixSimilarity(corpus_lsi)
    sims = index[vec_lsi]
    similarity = list(sims)
    # print(similarity)
    end_lenth = len(similarity)
    if s_lenth != end_lenth:
        print('bug')
    return similarity
def getCorpus():
    documents = []
    txtNames = glob.glob("original/*.txt")
    for fileName in txtNames:
        fp = open(fileName)
        buf = fp.readline()
        documents.append(buf)
    stoplist = set('for a of the and to in at'.split())
    texts = [[word for word in document.translate(string.maketrans("", ""), string.punctuation).lower().split()
              if word not in stoplist]
             for document in documents]
    # Actually dictionary and corpus are of no use here
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=10, no_above=0.7, keep_n=50000)
    dictionary.save('tmp/imdb.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('tmp/imdb.mm', corpus)
    return texts
def corpus2bow(self, tokenized_corpus=default_documents):
    """returns (vocab, corpus_in_bow)

    Convert the tokenized corpus into bag-of-words (BOW) form.

    Arguments:
        tokenized_corpus -- the corpus after tokenization

    Return:
        vocab -- {'human': 0, ... 'minors': 11}
        corpus_in_bow -- [[(0, 1), (1, 1), (2, 1)]...]
    """
    # build the token -> id mapping
    dictionary = corpora.Dictionary(tokenized_corpus)
    vocab = dictionary.token2id
    # each document as a list of (token_id, count) pairs
    corpus_in_bow = [dictionary.doc2bow(text) for text in tokenized_corpus]
    return (vocab, corpus_in_bow)
def build_id2word(self, fname=None, save_to=None):
    # read words.csv file
    if not fname:
        fname = self.words_fname or click.prompt('words file')
    fname = self.__dest(fname)
    assert os.path.isfile(fname), 'No such file: %s' % fname
    if save_to:
        self.id2word_fname = self.__dest(save_to)
    else:
        self.id2word_fname = LdaUtils.change_ext(fname, 'id2word')
    # if there is no id2word file or the user wants to rebuild, build .id2word
    if not os.path.isfile(self.id2word_fname) or click.confirm('There alread is id2word. Do you want to rebuild?'):
        print 'start building id2word'
        start = time()
        id2word = corpora.Dictionary(LdaUtils.filter_words(LdaUtils.iter_csv(fname, -1).split()))
        id2word.save(self.id2word_fname)  # save
        print 'building id2word takes: %s' % LdaUtils.human_readable_time(time() - start)
    self.id2word = corpora.Dictionary.load(self.id2word_fname)
    return self.id2word
def get_tfidf(documents):
    # compute tf-idf vectors with gensim
    documents = [[word for word in document.text.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    ds = []
    for doc in corpus_tfidf:
        d = [0] * n_items
        for index, value in doc:
            d[index] = value
        ds.append(d)
    return ds
def fit(self, documents):
    '''
    parameters:
        documents: list of strings, each represents a document
    '''
    # tokens, dictionary, corpus for LDA
    self.tokens = self.preProcessCorpus(documents)
    self.dictionary = corpora.Dictionary(self.tokens)
    self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]
    self.lda = self.getLDA(dictionary=self.dictionary, corpus=self.corpus,
                           num_topics=self.num_topics, random_state=self.random_state)
    self.num_dominant_topics = min(10, self.num_topics)
    self.dominant_topic_ids = self.getDominantTopics(self.corpus, self.lda,
                                                     self.num_dominant_topics)
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T
    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)
    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'Logistic:'
    print classification_report(self.testLabel, pred_labels)
    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'SVM:'
    print classification_report(self.testLabel, pred_labels)
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    model = models.TfidfModel(corpus)
    corpus = [text for text in model[corpus]]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T
    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)
    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'Logistic:'
    print classification_report(self.testLabel, pred_labels)
    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'SVM:'
    print classification_report(self.testLabel, pred_labels)
def transfer_corpus(sents):
    words_dict = invert_dict(corpora.Dictionary.load('words.dict'))
    max_length = 40
    sentence = numpy.zeros(shape=(len(sents), max_length), dtype=numpy.int32)
    label = numpy.zeros(shape=(len(sents), max_length), dtype=numpy.int32)
    lengths = []
    for i in range(len(sents)):
        current_sent = sents[i]
        words = []
        labels = []
        lengths.append(len(current_sent))
        for item in current_sent:
            words.append(words_dict[item[0]])
            labels.append(label_str[item[1]])
        sentence[i] = numpy.asarray(words + (max_length - len(current_sent)) * [28782], dtype=numpy.float32)
        label[i] = numpy.asarray(labels + (max_length - len(current_sent)) * [8], dtype=numpy.float32)
    return sentence, label, numpy.asarray(lengths, dtype=numpy.int32)

# train = train_ + valid_ = 16551
# test = test = 3327
def build_dictionary(generator, min_freq=5):
    dictionary_path = os.path.join(DATA_PATH, DICT_NAME)
    if os.path.exists(dictionary_path) and os.path.isfile(dictionary_path):
        print("Delete dictionary and rebuild")
        os.remove(dictionary_path)
    dictionary = corpora.Dictionary(c + u for c, u in generator)
    # drop token ids whose document frequency is below min_freq
    filter_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < min_freq]
    dictionary.filter_tokens(filter_ids)
    dictionary.compactify()
    dictionary.add_documents([_START_VOCAB])
    pickle.dump(dictionary, open(dictionary_path, 'wb'))
    print("SAVE dictionary to %s" % (dictionary_path))
    return dictionary
def corpus2dict15(corpusfiles, lowercase=True):
    """
    From a given corpus, create a gensim dictionary for mapping words to ints,
    important: WMT15 data is already tokenized!
    """
    corpus = list()
    corpus.append(["PADDING"])  # has word index 0
    corpus.append(["UNKNOWN"])  # has word index 1
    for cf in corpusfiles:
        if cf is not None:  # source can be none
            # just for huge lookuptable that contains all words from pretraining
            # if lowercase:
            #     corpus.extend([l.lower().split() for l in codecs.open(cf, "r", "utf8").readlines()])
            # else:
            #     corpus.extend([l.split() for l in codecs.open(cf, "r", "utf8").readlines()])
            corpus.extend([l.split() for l in codecs.open(cf, "r", "utf8").readlines()])
    wordDictionary = corpora.Dictionary(corpus)
    # print "... build word dictionary with vocabulary size =", len(wordDictionary)
    return wordDictionary
def train_lda_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dictionary,
                          iterations=1000, num_topics=total_topics)
    return lda
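Once a model like the one above is trained, its topics can be inspected directly. A minimal sketch, assuming a tokenizable document list named my_documents (a placeholder) and the helper defined above:

lda = train_lda_model_gensim(my_documents, total_topics=2)      # my_documents is hypothetical
for topic_id, topic in lda.print_topics(num_topics=2, num_words=5):
    print(topic_id, topic)   # each topic is a weighted word mix, e.g. '0.042*"price" + 0.031*"room" + ...'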
def reduce_dict(weibo_test):
    dictionary = None
    if not os.path.exists(path_tmp):
        os.makedirs(path_tmp)
    # build the dictionary only if it does not exist yet
    if not os.path.exists(path_dictionary):
        dictionary = corpora.Dictionary()
        files = os_path.LoadFiles(path_doc_root)
        for i, msg in enumerate(files):
            catg = msg[0]
            file = msg[1]
            file = convert_doc_to_wordlist(file, cut_all=False)
            dictionary.add_documents([file])
        # remove tokens that appear in fewer than 5 documents
        small_freq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < 5]
        dictionary.filter_tokens(small_freq_ids)
        dictionary.compactify()
        dictionary.save(path_dictionary)
    svm_tfidf.reduce_tfidf(dictionary, weibo_test)
def createDictionary(extraLabel=""):
    # TODO in the report note the optimization done on the dict - it was ~700 000 words, now ~90 000
    dic = Dictionary()
    d = corpora.Dictionary(dic)
    d.filter_extremes(no_below=10, no_above=0.6, keep_n=None)
    d.compactify()
    # add the visual terms as words in the vocabulary too
    d.add_documents([get_visual_terms_labels(config)])
    extraLabel = extraLabel + "_" + config.dictionary_label
    fName = 'data/dics/%s_%s.dict' % (pretty_current_time(), extraLabel)
    d.save(fName + '.bin')
    d.save_as_text(fName + '.txt')
    setLastDictFileName(fName + '.bin')
    logger.info('Dict created and saved to %s. Size: %i' % (fName, len(d)))
    return d
def generate_training_data(self, options):
    """
    set self.dictionary, self.lable_types and generate train_x(y) and test_x(y)
    """
    input_table = InputTable(options['threashold'])
    (training, test) = input_table.fetch_data(options['ratio_test'], options['seed'])
    word_vecs_train = self.convert_to_word_vecs(training)
    topic_vecs_train = self.convert_to_topic_vecs(training)
    word_vecs_test = self.convert_to_word_vecs(test)
    topic_vecs_test = self.convert_to_topic_vecs(test)
    # use dictionary and topic_types of training set
    dictionary = corpora.Dictionary(word_vecs_train)
    all_topics = list(set(topic_vecs_train))
    x_train = self.adjust_x_format(dictionary, word_vecs_train)
    y_train = self.adjust_y_format(all_topics, topic_vecs_train)
    x_test = self.adjust_x_format(dictionary, word_vecs_test)
    y_test = self.adjust_y_format(all_topics, topic_vecs_test)
    return (x_train, y_train, x_test, y_test, dictionary, all_topics)
def create_vocabulary(input_stream, vocab_size, sentence_to_tokens_fn=None):
    t0 = time.time()
    print(" [*] Creating a new vocabulary...")
    if not sentence_to_tokens_fn:
        sentence_to_tokens_fn = default_sentence_to_tokens
    docs = []
    lines = []
    for line in input_stream:
        rline = line.strip()
        tokens = sentence_to_tokens_fn(rline)
        if '##########' not in tokens and len(rline) > 0:
            lines += [token.lower() for token in tokens if token.lower() not in cachedStopWords]
        elif '##########' in tokens:
            docs.append(lines)
            lines = []
    limit = np.abs(vocab_size - 4)
    vocab = corpora.Dictionary(docs)
    vocab.filter_extremes(no_below=1, no_above=0.7, keep_n=limit)
    print(" [*] Tokenize : %.4fs" % (time.time() - t0))
    return vocab
def get_similarity(query, ans_list):
    s_lenth = len(ans_list)
    Corp = ans_list
    # build the dictionary from the answer corpus
    dictionary = corpora.Dictionary(Corp)
    # convert each answer to a bag-of-words vector
    corpus = [dictionary.doc2bow(text) for text in Corp]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    vec_bow = dictionary.doc2bow(query)
    vec_tfidf = tfidf[vec_bow]
    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[vec_tfidf]
    similarity = list(sims)
    # print(similarity)
    end_lenth = len(similarity)
    if s_lenth != end_lenth:
        print('bug')
    return similarity
def load_dict(self, infile):
    self.dict = corpora.Dictionary.load(infile)
def save_corpus(self, corpusfile, dictfile):
    dictionary = corpora.Dictionary(self.lines)
    corpus = [dictionary.doc2bow(line) for line in self.lines]
    dictionary.save(dictfile)
    corpora.MmCorpus.serialize(corpusfile, corpus)
def create_dictionary(self):
    """
    Utility method to generate gensim-style Dictionary directly from
    the corpus and vocabulary data.
    """
    dictionary = Dictionary()

    # replace dfs with defaultdict to avoid downstream KeyErrors
    # uci vocabularies may contain terms that are not used in the document data
    dictionary.dfs = defaultdict(int)

    dictionary.id2token = self.id2word
    dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))
    dictionary.num_docs = self.num_docs
    dictionary.num_nnz = self.num_nnz

    for docno, doc in enumerate(self):
        if docno % 10000 == 0:
            logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))
        for word, count in doc:
            dictionary.dfs[word] += 1
            dictionary.num_pos += count

    return dictionary
def buildTokenDictionary(self):
    """
    Build the token-to-id dictionary from self.segResponses
    """
    self.tokenDictionary = corpora.Dictionary(self.segResponses)
    logging.info("Token dictionary built: %s" % str(self.tokenDictionary))
def getWordFreq(lib_texts):
    from gensim import corpora, models, similarities
    dictionary = corpora.Dictionary(lib_texts)
    corpus = [dictionary.doc2bow(text) for text in lib_texts]
    return corpus
def train_by_lsi(lib_texts):
    """
    Train an LSI model on the library texts.
    """
    from gensim import corpora, models, similarities

    # enable gensim logging if needed
    # import logging
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(lib_texts)
    # doc2bow(): convert a collection of words into a sparse list of (word_id, word_frequency) tuples
    corpus = [dictionary.doc2bow(text) for text in lib_texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # build an LSI model with 10 topics
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    # index is a gensim.similarities.docsim.MatrixSimilarity instance
    index = similarities.MatrixSimilarity(lsi[corpus])

    return (index, dictionary, lsi)
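The (index, dictionary, lsi) tuple returned above can then score a new query against the indexed texts, in the same spirit as the other similarity examples on this page. A minimal sketch with an illustrative query string:

index, dictionary, lsi = train_by_lsi(lib_texts)         # lib_texts: the tokenized library texts
query = "how to train a topic model"                     # illustrative query
vec_bow = dictionary.doc2bow(query.lower().split())      # map the query into the same id space
vec_lsi = lsi[vec_bow]                                   # project the query into LSI space
sims = index[vec_lsi]                                    # cosine similarity against every indexed text
best = sorted(enumerate(sims), key=lambda item: -item[1])[0]
print(best)                                              # (document position, similarity score)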
def query_tag(id2word, model, split_word):
    # id2word = corpora.Dictionary.load(path+'.id2word')
    # model = LdaMulticore.load(path+'.lda')
    bow = id2word.doc2bow(split_word)
    if len(bow) == 0:
        return None
    gamma, _ = model.inference([bow])
    topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution
    # [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)]
    return topic_dist
def main(argv):
    cli_parser = make_cli_parser()
    opts, args = cli_parser.parse_args(argv)
    if len(args) != 2:
        cli_parser.error("Please provide an input/output file")
    if not os.path.isfile(args[1] + '.lda'):
        if os.path.isfile(args[1] + '.bow2mm') and os.path.isfile(args[1] + '.id2word'):
            id2word = corpora.Dictionary.load(args[1] + '.id2word')
        else:
            id2word = corpora.Dictionary(iter_file(args[0], opts.numlines))
            # ignore words that appear in less than 5 documents or more than 20% documents
            # when we do filtering, some vector becomes empty! it generates a huge problem!!
            # id2word.filter_extremes(no_below=5, no_above=0.2, keep_n=None)
            # save dictionary
            id2word.save(args[1] + '.id2word')
            # save doc2bow vector
            corpora.MmCorpus.serialize(args[1] + '.bow2mm', iter_doc2bow(args[0], opts.numlines, id2word))
        mm_corpus = corpora.MmCorpus(args[1] + '.bow2mm')
        model = LdaMulticore(mm_corpus, id2word=id2word, num_topics=opts.numtopics,
                             workers=opts.numprocs, passes=opts.numepochs)
        model.save(args[1] + '.lda')
    infile = open(args[0])
    outfile = open(args[1] + '.csv', "w")
    out_csvfile = csv.writer(outfile, delimiter=',')
    in_csvfile = csv.reader(infile, delimiter=',')
    for row in in_csvfile:
        if row[0] == 0:
            break
        processed_post = preprocess(row[3]).split()
        if len(processed_post) == 0:  # skip 0~2 word documents (quite useless)
            continue
        result_list = row[1:3]
        result_list.extend(query_tag(id2word, model, processed_post))
        out_csvfile.writerow(result_list)
    infile.close()
    outfile.close()
    # print query_tag(id2word, model, "Hello über, world is awesome!")
def load_model_and_dictionary(self):
    self.tfidf_model = models.TfidfModel.load('../../temp_results/tfidf_model')
    self.dictionary = corpora.Dictionary.load('../../temp_results/tfidf_dictionary')
    print("Dictionary & Model Loaded Successfully")
def get_tfidf(documents):
    # compute tf-idf vectors with gensim
    documents = [[word for word in document.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    return corpus_tfidf
def load_dict_corpus_all_review():
    '''
    return the gensim dict & corpus on the whole review corpus
    :return: dict & corpus
    '''
    if not (os.path.isfile(DICT_PATH) and os.path.isfile(CORPUS_PATH)):
        generate_dict_corpus_all_review()
    print('Reading dict & corpus')
    dict = corpora.Dictionary.load(DICT_PATH)
    corpus = corpora.MmCorpus(CORPUS_PATH)
    print('Reading completed')
    return corpus, dict
def generateDictionary(self):
    dictionary = corpora.Dictionary(self.wordProvider)
    stop_ids = []
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()
    self.dictionary = dictionary
    return self.dictionary
def create_dictionary(texts):
    dictionary = corpora.Dictionary(texts)
    return dictionary
def get_topics_from_text(line):
    doc_complete = line.split('.')
    doc_clean = [clean_txt_to_clean_words(doc).split() for doc in doc_complete]
    # ignore if length of docs for topic analysis is less than 3
    doc_clean_empty = True
    all_topics = []
    for doc in doc_clean:
        if len(doc) > 0:
            doc_clean_empty = False
    if len(doc_clean) >= 1 and doc_clean_empty == False:
        dictionary = corpora.Dictionary(doc_clean)
        doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
        Lda = gensim.models.ldamodel.LdaModel
        num_topics = 3
        ldamodel = Lda(doc_term_matrix, num_topics=num_topics, id2word=dictionary, passes=25)
        # print '\n\n', doc_complete
        # print '\n', doc_clean, '\n'
        # print ldamodel.print_topics(num_topics=5, num_words=2), '\n\n'
        for i in range(0, num_topics):
            topic = ldamodel.get_topic_terms(i, topn=2)
            topic_list = []
            for word in topic:
                word_name = dictionary.get(word[0])
                if len(word_name) > 1:
                    topic_list.append(word_name)
            topic_list.sort()
            topic_name = " ".join(topic_list)
            add = False
            for ch in topic_name:
                # ignore numerical topics
                if ch in r"[abcdefghijklmnopqrstuvwxyz]":
                    add = True
            if add:
                if topic_name not in all_topics:
                    all_topics.append(str(topic_name))
    return all_topics
def generate_dic():
    train_sents = load_corpus('CoNLL-2003/train.txt')
    valid_sents = load_corpus('CoNLL-2003/valid.txt')
    test_sents = load_corpus('CoNLL-2003/test.txt')

    train_ = [get_sent(sent) for sent in train_sents]
    print("train size: " + str(len(train_sents)))
    valid_ = [get_sent(sent) for sent in valid_sents]
    print("valid size: " + str(len(valid_sents)))
    test_ = [get_sent(sent) for sent in test_sents]
    print("test size: " + str(len(test_sents)))

    all_ = train_ + valid_ + test_
    lengths = [len(text) for text in all_]
    print("all data: " + str(len(lengths)))
    print_lengths(lengths)

    dic_words = corpora.Dictionary(all_)
    dic_words.save('words.dict')
    print(len(dic_words))

    # label
    train_.clear()
    valid_.clear()
    test_.clear()
    train_ = [get_label(sent) for sent in train_sents]
    valid_ = [get_label(sent) for sent in valid_sents]
    test_ = [get_label(sent) for sent in test_sents]
    all_ = train_ + valid_ + test_
    dic_labels = corpora.Dictionary(all_)
    for key, value in dic_labels.items():
        print(value)
    print(len(dic_labels))
def corpus2dict(corpusfiles):
    """
    From a given corpus, create a gensim dictionary for mapping words to ints
    """
    corpus = list()
    corpus.append(["PADDING"])  # has word index 0
    corpus.append(["UNKNOWN"])  # has word index 1
    for cf in corpusfiles:
        # print "INFO: corpus = %s" % (corpusfiles)
        if cf is not None:  # source can be none
            corpus.extend(preprocess(codecs.open(cf, "r", "utf8").readlines()))
    wordDictionary = corpora.Dictionary(corpus)
    return wordDictionary
def __init__(self, input=None, topicDict=None, opinionDict=None,
             testSplit=None, file_dict=None, topicLines=[0], opinionLines=[1]):
    if not file_dict is None:
        logger.info('initialize CPT Corpus with file_dict: {} perspectives'
                    .format(len(file_dict)))
        self.perspectives = [Perspective(file_dict=file_dict.get(str(p)),
                                         topicLines=topicLines,
                                         opinionLines=opinionLines)
                             for p in range(len(file_dict))]
    else:
        logger.info('initialize CPT Corpus with {} perspectives'
                    .format(len(input)))
        input.sort()
        self.perspectives = [Perspective(input=glob.glob('{}/*.txt'.format(d)),
                                         testSplit=testSplit,
                                         topicLines=topicLines,
                                         opinionLines=opinionLines)
                             for d in input]
        self.input = input

    if isinstance(topicDict, str) or isinstance(topicDict, unicode):
        self.load_dictionaries(topicDict=topicDict)
    elif isinstance(topicDict, corpora.Dictionary):
        self.topicDictionary = topicDict

    if isinstance(opinionDict, str) or isinstance(opinionDict, unicode):
        self.load_dictionaries(opinionDict=opinionDict)
    elif isinstance(opinionDict, corpora.Dictionary):
        self.opinionDictionary = opinionDict

    if not topicDict or not opinionDict:
        self._create_corpus_wide_dictionaries()

    self.testSplit = testSplit
    self.nPerspectives = len(self.perspectives)
def load_dictionaries(self, topicDict=None, opinionDict=None):
    if topicDict:
        self.topicDictionary = corpora.Dictionary.load(topicDict)
        logger.info('topic dictionary {}'.format(self.topicDictionary))
    if opinionDict:
        self.opinionDictionary = corpora.Dictionary.load(opinionDict)
        logger.info('opinion dictionary {}'.format(self.opinionDictionary))
def get_tfidf_weighted_keyphrases(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}', top_n=10):
    valid_chunks = get_chunks(sentences, grammar=grammar)
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    weighted_phrases = {dictionary.get(id): round(value, 3)
                        for doc in corpus_tfidf
                        for id, value in doc}
    weighted_phrases = sorted(weighted_phrases.items(), key=itemgetter(1), reverse=True)
    return weighted_phrases[:top_n]
def train_lsi_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=total_topics)
    return lsi
def build_dictionary(hotel_files, extra_stopwords=None):
    stream_of_words = words_stream(hotel_files, extra_stopwords)
    dictionary = corpora.Dictionary(stream_of_words)
    dictionary.save(DictionaryFile)  # store the dictionary, for future reference
    print "==================== Dictionary Generated and Saved ===================="
def __init__(self, hotel_files, extra_stopwords=None):
    self._dictionary = corpora.Dictionary.load(DictionaryFile)
    self._hotel_files = hotel_files
def lsi_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_tfidf = corpora.MmCorpus(TfidfFile)
    N_TOPICS = 300
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LSI MODEL IS BUILT ================="
    lsi_model.save(LsiModelFile)
    save_topics(lsi_model, LsiTopicsFile)
def lda_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_bow = corpora.MmCorpus(BowFile)
    N_TOPICS = 100
    model = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LDA MODEL IS BUILT ================="
    model.save(LdaModelFile)
    save_topics(model, LdaTopicsFile)
def load_corpus(data_file):
    texts = load_texts(data_file)

    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, dictionary
def load_corpus(data_file):
    texts = load_texts(data_file)

    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpus = [[token[0] for token in text] for text in corpus]
    return corpus, dictionary
def tfidf():
    if not TFIDF:
        return
    doc1 = u'Andrew likes Diet Pepsi.'
    doc2 = u'Andrew knows the muffin man.'
    doc3 = u'Andrew lives near the muffin man on Shirley Lane.'
    corpus = map(sip.noun_phrases, [doc1, doc2, doc3])
    dictionary = corpora.Dictionary(corpus)
    bows = [dictionary.doc2bow(tokens) for tokens in corpus]
    return models.TfidfModel(bows, id2word=dictionary)
def LSI_fit(self, data):
    '''
    Fits an LSI model and returns it with associated dictionary
    '''
    texts = [[tag for tag in sent] for sent in self.get_pos(data)]
    dictionary = corpora.Dictionary(texts)
    texts = map(dictionary.doc2bow, texts)
    lsi = models.LsiModel(texts, id2word=dictionary,
                          num_topics=self.num_topics)
    return dictionary, lsi
def train(self, corpus, passes=1):
    """Updates dictionary and model given a corpus.

    Args:
        corpus: list of str, the documents to tokenize.
    """
    if self.dictionary is not None or self.model is not None:
        x = raw_input('You are about to overwrite an existing '
                      'model file (%s). Are you sure? [y/N] '
                      % self.model_file)
        if x[0] != 'y':
            raise RuntimeError('You chose not to overwrite the '
                               'existing model and dictionary.')

    # Tokenizes the corpus.
    documents = [self.tokenize(document) for document in corpus]

    # Builds a dictionary from the existing documents.
    self.dictionary = corpora.Dictionary(documents)

    # Dumps the dictionary to a pickled file to use later.
    pickle.dump(self.dictionary, open(self.dictionary_file, 'wb'))

    # Converts the corpus to tokens.
    corpus_bow = [self.dictionary.doc2bow(doc) for doc in documents]

    # Trains the LDA model.
    self.model = models.LdaModel(corpus_bow, passes=passes,
                                 id2word=self.dictionary,
                                 num_topics=self.num_topics)

    # Saves the model to use later.
    self.model.save(self.model_file)

    # Flag to remember that training has taken place.
    self._trained = True