def load(cls, save_dir='./'):
    """
    Load the corpus from a save directory.

    Every file name is appended directly to `save_dir` (plain string
    concatenation), so `save_dir` must end with a path separator.

    Files read: 'tag-tables.pickle', 'titles.pickle',
    'documents.tfidf_model', 'documents_tfidf.mm', 'documents.dict',
    'files.pickle' and 'doc_line_nums.pickle'.

    Returns a `KeySearch` built from the loaded objects.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; only
    # point this at save directories you trust.
    def _unpickle(name):
        # Use a context manager so the handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(save_dir + name, 'rb') as handle:
            return pickle.load(handle)

    # The tag tables are stored as one (tagsToDocs, docsToTags) pair.
    tables = _unpickle('tag-tables.pickle')
    tagsToDocs = tables[0]
    docsToTags = tables[1]

    titles = _unpickle('titles.pickle')

    tfidf_model = TfidfModel.load(fname=save_dir + 'documents.tfidf_model')
    corpus_tfidf = corpora.MmCorpus(save_dir + 'documents_tfidf.mm')
    dictionary = corpora.Dictionary.load(fname=save_dir + 'documents.dict')

    files = _unpickle('files.pickle')
    doc_line_nums = _unpickle('doc_line_nums.pickle')

    # NOTE(review): this always builds a KeySearch, not `cls` -- confirm
    # whether subclasses are expected to be honoured here.
    ksearch = KeySearch(dictionary, tfidf_model, corpus_tfidf, titles,
                        tagsToDocs, docsToTags, files, doc_line_nums)
    return ksearch
def build_corpus(self, fname=None, save_to=None):
    """
    Build (or reuse) the serialized corpus for a sentences file.

    fname:   sentences file; prompted for interactively when falsy. The
             value is passed through `self.__dest` before use.
    save_to: optional corpus destination (also passed through
             `self.__dest`); otherwise the corpus name is derived from
             `fname` via `LdaUtils.change_ext(fname, 'corpus')`.

    Stores the corpus path on `self.corpus_fname`, the loaded MmCorpus on
    `self.corpus`, and returns the latter. Raises AssertionError when the
    sentences file does not exist.
    """
    # Resolve the sentences file, asking the user if none was given.
    fname = self.__dest(fname or click.prompt('sentences file'))
    assert os.path.isfile(fname), 'No such file: %s' % fname

    # Decide where the corpus lives.
    self.corpus_fname = (
        self.__dest(save_to) if save_to
        else LdaUtils.change_ext(fname, 'corpus')
    )

    # Build when the corpus is missing; if it exists, only on confirmation.
    corpus_missing = not os.path.isfile(self.corpus_fname)
    if corpus_missing or click.confirm('There already is corpus. Do you want to rebuild?'):
        print('start building corpus')
        started_at = time()
        bow_stream = self.__iter_doc2bow(LdaUtils.iter_csv(fname, -1).split())
        corpora.MmCorpus.serialize(self.corpus_fname, bow_stream)
        elapsed = time() - started_at
        print('building corpus takes: %s' % LdaUtils.human_readable_time(elapsed))

    self.corpus = corpora.MmCorpus(self.corpus_fname)
    return self.corpus
def getIndex(lsipath='./lsi/', NUM_TOPIC=300):
    """
    Build and save the similarity index for the stored LSI model.

    lsipath:   prefix prepended to 'viva.mm', 'viva.lsi' and 'viva.index'
               (plain concatenation, so it should end with a separator).
    NUM_TOPIC: forwarded as `num_features` to the similarity index.

    Returns nothing; the index is written to `lsipath + 'viva.index'`.
    """
    # Bag-of-words corpus in Matrix Market format.
    mm_corpus = corpora.MmCorpus(lsipath + 'viva.mm')
    print('mm loaded')

    # Previously trained LSI model.
    lsi_model = models.lsimodel.LsiModel.load(lsipath + 'viva.lsi')
    print('lsi model loaded')

    # The same path serves as the index's output prefix and as the file
    # the index object itself is saved to.
    index_path = lsipath + 'viva.index'
    sim_index = similarities.docsim.Similarity(
        index_path, lsi_model[mm_corpus], num_features=NUM_TOPIC)
    sim_index.save(index_path)
    print('index saved')
def load_corpus(self, infile):
    """Open the Matrix Market corpus at `infile` and keep it on `self.corpus`."""
    mm_corpus = corpora.MmCorpus(infile)
    self.corpus = mm_corpus
def save_corpus(self, corpusfile, dictfile):
    """
    Build a dictionary and bag-of-words corpus from `self.lines` and save both.

    corpusfile: destination of the corpus (Matrix Market format).
    dictfile:   destination of the dictionary.
    """
    vocabulary = corpora.Dictionary(self.lines)
    # One bag-of-words vector per entry of self.lines.
    bow_vectors = []
    for tokens in self.lines:
        bow_vectors.append(vocabulary.doc2bow(tokens))

    vocabulary.save(dictfile)
    corpora.MmCorpus.serialize(corpusfile, bow_vectors)
def save(self, save_dir='./'):
    """
    Write out the built corpus to a save directory.

    Every file name is appended directly to `save_dir` (plain string
    concatenation), so `save_dir` must end with a path separator.

    Files written: 'tag-tables.pickle', 'titles.pickle',
    'documents.tfidf_model', 'documents_tfidf.mm', 'documents.dict',
    'files.pickle' and 'doc_line_nums.pickle'.
    """
    def _dump(obj, name):
        # Close (and therefore flush) each pickle file explicitly rather
        # than relying on the garbage collector to do it.
        with open(save_dir + name, 'wb') as handle:
            pickle.dump(obj, handle)

    # Store the tag tables as a single (tagsToDocs, docsToTags) pair.
    _dump((self.tagsToDocs, self.docsToTags), 'tag-tables.pickle')

    # Store the document titles.
    _dump(self.titles, 'titles.pickle')

    # Write out the tfidf model.
    self.tfidf_model.save(save_dir + 'documents.tfidf_model')

    # Write out the tfidf corpus.
    corpora.MmCorpus.serialize(save_dir + 'documents_tfidf.mm',
                               self.corpus_tfidf)

    # Write out the dictionary.
    self.dictionary.save(save_dir + 'documents.dict')

    # Save the filenames.
    _dump(self.files, 'files.pickle')

    # Save the file ID and line numbers for each document.
    _dump(self.doc_line_nums, 'doc_line_nums.pickle')

    # Objects that are not saved:
    #  - stop_list - You don't need to filter stop words for new input
    #                text, they simply aren't found in the dictionary.
    #  - frequency - This preliminary word count object is only used for
    #                removing infrequent words. Final word counts are in
    #                the `dictionary` object.
def gensim(self):
    """
    Train and return an LSI model with `self.dimensions` topics.

    The corpus and dictionary are read from fixed paths under
    '../lda/lda_sources/'.

    See:
      https://radimrehurek.com/gensim/dist_lsi.html
      https://radimrehurek.com/gensim/models/lsimodel.html
    """
    bow_corpus = corpora.MmCorpus('../lda/lda_sources/documents_corpus.mm')
    vocabulary = corpora.Dictionary.load(
        '../lda/lda_sources/documents_dictionary.dict')
    return models.LsiModel(
        bow_corpus,
        id2word=vocabulary,
        num_topics=self.dimensions,
    )
def test_textcorpus(self):
    """Make sure TextCorpus can be serialized to disk."""
    # Build the corpus from the bundled sample file.
    original = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

    # Serializing must produce a file on disk.
    tmp_path = get_tmpfile('test_textcorpus.mm')
    corpora.MmCorpus.save_corpus(tmp_path, original)
    self.assertTrue(os.path.exists(tmp_path))

    # Reading it back must yield the same documents.
    reloaded = corpora.MmCorpus(tmp_path)
    self.assertEqual(list(original), list(reloaded))
def set_corpus(self, corpus):
    """
    Set `self.corpus` from a path or an already-loaded MmCorpus.

    A `str` is treated as the path of a Matrix Market file and loaded;
    an `MmCorpus` instance is stored as-is. Any other type is silently
    ignored and `self.corpus` is left untouched.
    """
    if isinstance(corpus, str):
        # Path given: load it from disk.
        corpus = corpora.MmCorpus(corpus)
    elif not isinstance(corpus, corpora.MmCorpus):
        # Unsupported type: leave the current corpus alone.
        return
    self.corpus = corpus
def set_data(self, corpus):
    """
    Set `self.data` from a path or an already-loaded MmCorpus.

    A `str` is treated as the path of a Matrix Market file and loaded;
    an `MmCorpus` instance is stored as-is. Any other type is silently
    ignored and `self.data` is left untouched.
    """
    # Turn a path into a corpus; pass anything else through unchanged.
    candidate = corpora.MmCorpus(corpus) if isinstance(corpus, str) else corpus
    if isinstance(candidate, corpora.MmCorpus):
        self.data = candidate
def main(argv):
    """
    Train (or reuse) an LDA model and tag every row of the input CSV.

    Expects exactly two positional arguments after option parsing:
    the input CSV path and an output prefix. Artifacts are stored next
    to the prefix:

      <prefix>.id2word  dictionary
      <prefix>.bow2mm   bag-of-words corpus (Matrix Market)
      <prefix>.lda      trained LdaMulticore model
      <prefix>.csv      output: columns 1-2 of each input row followed by
                        the values returned by `query_tag`

    If <prefix>.lda already exists the model and dictionary are loaded
    from disk instead of being retrained.
    """
    cli_parser = make_cli_parser()
    opts, args = cli_parser.parse_args(argv)
    if len(args) != 2:
        cli_parser.error("Please provide an input/output file")

    in_path = args[0]
    out_prefix = args[1]
    lda_path = out_prefix + '.lda'
    dict_path = out_prefix + '.id2word'
    bow_path = out_prefix + '.bow2mm'

    if os.path.isfile(lda_path):
        # A trained model is already on disk. Previously neither `id2word`
        # nor `model` was bound on this path, so the tagging loop below
        # failed with NameError; reload both instead.
        id2word = corpora.Dictionary.load(dict_path)
        model = LdaMulticore.load(lda_path)
    else:
        if os.path.isfile(bow_path) and os.path.isfile(dict_path):
            id2word = corpora.Dictionary.load(dict_path)
        else:
            id2word = corpora.Dictionary(iter_file(in_path, opts.numlines))
            # filter_extremes() is deliberately not applied: filtering
            # leaves some document vectors empty, which caused problems.
            id2word.save(dict_path)
            # Save the doc2bow vectors.
            corpora.MmCorpus.serialize(
                bow_path, iter_doc2bow(in_path, opts.numlines, id2word))

        mm_corpus = corpora.MmCorpus(bow_path)
        model = LdaMulticore(mm_corpus, id2word=id2word,
                             num_topics=opts.numtopics,
                             workers=opts.numprocs,
                             passes=opts.numepochs)
        model.save(lda_path)

    # Context managers guarantee both files are closed even if a row
    # fails to process.
    with open(in_path) as infile, open(out_prefix + '.csv', "w") as outfile:
        in_csvfile = csv.reader(infile, delimiter=',')
        out_csvfile = csv.writer(outfile, delimiter=',')
        for row in in_csvfile:
            # NOTE(review): csv.reader yields strings, so comparing with
            # the integer 0 looks like it can never match -- confirm the
            # intended sentinel (perhaps '0') before changing it.
            if row[0] == 0:
                break
            processed_post = preprocess(row[3]).split()
            if not processed_post:
                # Nothing left after preprocessing; skip the row.
                continue
            result_list = row[1:3]
            result_list.extend(query_tag(id2word, model, processed_post))
            out_csvfile.writerow(result_list)
def load_dict_corpus_all_review():
    '''
    Load the gensim dictionary and corpus for the whole review corpus.

    If either DICT_PATH or CORPUS_PATH is missing, both are regenerated
    first via generate_dict_corpus_all_review().

    :return: (corpus, dictionary) -- note the order: corpus first.
    '''
    if not os.path.isfile(DICT_PATH) or not os.path.isfile(CORPUS_PATH):
        generate_dict_corpus_all_review()

    print('Reading dict & corpus')
    dictionary = corpora.Dictionary.load(DICT_PATH)
    corpus = corpora.MmCorpus(CORPUS_PATH)
    print('Reading complicated')
    return corpus, dictionary
def save_bow(hotel_files, extra_stopwords=None):
    """
    Serialize the bag-of-words corpus for `hotel_files` to `BowFile`.

    hotel_files:     forwarded to `Corpus`.
    extra_stopwords: optional, forwarded to `Corpus` as its second argument.
    """
    bow_stream = Corpus(hotel_files, extra_stopwords)
    corpora.MmCorpus.serialize(BowFile, bow_stream)
    print("==================== BOW data Generated and Saved ====================")
def save_tfidf():
    """
    Convert the stored bag-of-words corpus to TF-IDF and save it.

    Reads `BowFile`, fits a TfidfModel on it, and serializes the
    transformed corpus to `TfidfFile`.
    """
    bow = corpora.MmCorpus(BowFile)
    weighting = models.TfidfModel(bow)
    # Applying the model to the corpus yields the TF-IDF weighted corpus.
    corpora.MmCorpus.serialize(TfidfFile, weighting[bow])
    print("==================== TF-IDF data Generated and Saved ====================")
def lsi_model_topics():
    """
    Train a 300-topic LSI model on the stored TF-IDF corpus.

    Loads the dictionary from `DictionaryFile` and the corpus from
    `TfidfFile`, saves the model to `LsiModelFile`, and hands the model to
    `save_topics` together with `LsiTopicsFile`.
    """
    N_TOPICS = 300
    vocabulary = corpora.Dictionary.load(DictionaryFile)
    tfidf_corpus = corpora.MmCorpus(TfidfFile)

    lsi = models.LsiModel(tfidf_corpus, id2word=vocabulary,
                          num_topics=N_TOPICS)
    print("================= LSI MODEL IS BUILT =================")

    lsi.save(LsiModelFile)
    save_topics(lsi, LsiTopicsFile)
def lda_model_topics():
    """
    Train a 100-topic LDA model on the stored bag-of-words corpus.

    Loads the dictionary from `DictionaryFile` and the corpus from
    `BowFile`, saves the model to `LdaModelFile`, and hands the model to
    `save_topics` together with `LdaTopicsFile`.
    """
    N_TOPICS = 100
    vocabulary = corpora.Dictionary.load(DictionaryFile)
    bow_corpus = corpora.MmCorpus(BowFile)

    lda = models.LdaModel(bow_corpus, id2word=vocabulary,
                          num_topics=N_TOPICS)
    print("================= LDA MODEL IS BUILT =================")

    lda.save(LdaModelFile)
    save_topics(lda, LdaTopicsFile)
def evaluate_cb_recommender():
    """
    Evaluate the content-based recommender for several values of K.

    For each K in np.arange(5, 20, 3) an EvaluationRunner is executed and
    the recommender's coverage is computed. One line per K is appended to
    '<timestamp>-cb-k.csv' with the columns:

        rak, pak, mae, K, user_coverage, movie_coverage
    """
    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = '{}-cb-k.csv'.format(timestr)

    lda_path = './lda/'
    # NOTE(review): neither object is used below. The loads are kept so
    # that missing LDA artifacts still fail up front -- confirm whether
    # they can be dropped. (A stray trailing comma used to turn `corpus`
    # into a 1-tuple.)
    corpus = corpora.MmCorpus(lda_path + 'corpus.mm')
    index = similarities.MatrixSimilarity.load(lda_path + 'index.lda')

    with open(file_name, 'a', 1) as logfile:
        # The header now lists exactly the six values written per row;
        # it previously named ten columns.
        logfile.write("rak, pak, mae, K, user_coverage, movie_coverage\n")

        for K in np.arange(5, 20, 3):
            recommender = ContentBasedRecs()
            er = EvaluationRunner(0, None, recommender, K)
            result = er.calculate(1, 5, number_test_users=-1)

            user_coverage, movie_coverage = \
                RecommenderCoverage(recommender).calculate_coverage()

            pak = result['pak']
            mae = result['mae']
            rak = result['rak']
            logfile.write("{}, {}, {}, {}, {}, {}\n".format(
                rak, pak, mae, K, user_coverage, movie_coverage))
            logfile.flush()
def getLsiModel(lsipath='./lsi/', num_topics=300):
    """
    Train an LSI model on the TF-IDF weighted corpus and save it.

    lsipath:    prefix prepended to 'viva.dict', 'viva.mm' and 'viva.lsi'
                (plain concatenation, so it should end with a separator).
    num_topics: number of LSI topics to train.

    Returns the trained model, or None if training raised. If training
    succeeded but saving raised, the unsaved model is still returned.
    Failures are logged with a traceback rather than propagated.
    """
    # Load the dictionary.
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    print('??????')

    # Load the bag-of-words corpus.
    corpus = corpora.MmCorpus(lsipath + 'viva.mm')
    print('mm load')

    # Weight the corpus with TF-IDF and time the step.
    t31 = time.time()
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    t32 = time.time()
    # Single-argument print keeps the output identical on Python 2 and 3.
    print('tfidf_corpus time =  %s' % (t32 - t31))

    lsi = None
    try:
        lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary,
                                num_topics=num_topics, chunksize=60000,
                                power_iters=2, onepass=True)
        lsi.save(lsipath + 'viva.lsi')
        print('lsi??????')
    except (SystemExit, KeyboardInterrupt):
        # Never swallow an explicit exit or Ctrl-C.
        raise
    except Exception:
        # Replaces the Python-2-only `except Exception, e` form; the bound
        # exception was unused because exc_info=True logs the traceback.
        logging.error('Failed to lsi train', exc_info=True)
    return lsi
def main(text_dir):
    """
    Train LDA models over a range of topic counts and plot their coherence.

    text_dir is currently unused: the corpus and dictionary are read from
    fixed simple-wiki paths.

    Outputs (in the working directory):
      coherence_simple_wiki   tab-separated table indexed by num_topics
      coherence_simple_wiki   figure written by plt.savefig (matplotlib
                              appends its default extension)
    """
    # list(...) is required on Python 3, where range objects cannot be
    # concatenated with `+`.
    topics = (list(range(10, 101, 10))
              + list(range(120, 201, 20))
              + list(range(250, 451, 50)))

    corpus = MmCorpus('../twitter_LDA_topic_modeling/simple-wiki.mm')
    dictionary = Dictionary.load('../twitter_LDA_topic_modeling/simple-wiki.dict')

    print('Building LDA models')
    lda_models = [models.LdaMulticore(corpus=corpus, id2word=dictionary,
                                      num_topics=i, passes=5)
                  for i in tqdm(topics)]

    print('Generating coherence models')
    # Rebuild token lists from the bag-of-words corpus.
    texts = [[dictionary[word_id] for word_id, freq in doc] for doc in corpus]

    pool = multiprocessing.Pool(max(1, multiprocessing.cpu_count() - 1))
    try:
        func = partial(build_coherence_models, corpus=corpus,
                       dictionary=dictionary, texts=texts)
        coherence_models = pool.map(func, lda_models)
    finally:
        # Release the workers even if a coherence computation fails.
        pool.close()
        pool.join()

    d = defaultdict(list)
    print('Generating output data')
    for data in tqdm(coherence_models):
        d['num_topics'].append(data['num_topics'])
        d['u_mass'].append(data['u_mass'])
        d['c_v'].append(data['c_v'])
        d['c_uci'].append(data['c_uci'])
        d['c_npmi'].append(data['c_npmi'])

    df = pd.DataFrame(d)
    df = df.set_index('num_topics')
    df.to_csv('coherence_simple_wiki', sep='\t')

    # NOTE(review): this first plot appears to land on its own figure,
    # which is not the one saved below -- confirm whether it is needed.
    df.plot(xticks=df.index, style=['bs-', 'yo-', 'r^-', 'gx-'])

    # Draw the four measures one by one onto a shared axis.
    ax1 = df.plot(xticks=df.index, style='bs-', grid=True, y='u_mass')
    ax2 = df.plot(xticks=df.index, style='yo-', grid=True, y='c_v', ax=ax1)
    ax3 = df.plot(xticks=df.index, style='r^-', grid=True, y='c_npmi', ax=ax2)
    df.plot(xticks=df.index, style='gx-', grid=True, y='c_uci', ax=ax3)

    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.17),
               fancybox=True, shadow=True, ncol=4, fontsize=9)
    plt.subplots_adjust(bottom=0.2)
    plt.xticks(df.index, rotation=45, ha='right', fontsize=8)
    plt.savefig('coherence_simple_wiki')
    plt.close()
def generate_dict_corpus_all_review():
    '''
    Generate the gensim dictionary and corpus for the whole review corpus.

    Reads FULL_YELP_REVIEW_PATH line by line (one JSON object per line,
    using its "text" field), lower-cases and tokenizes the text, drops
    empty tokens and stop words, then writes:

      DICT_PATH      dictionary (after filter_extremes(no_below=5))
      DICT_TXT_PATH  the same dictionary as text
      CORPUS_PATH    bag-of-words corpus in Matrix Market format

    :return: None
    '''
    print('Generating new dict and corpus on all Yelp reviews')
    stoplist = load_stopword(STOPWORD_PATH)
    texts = []

    # The context manager closes the review file even if a line fails
    # to parse.
    with open(FULL_YELP_REVIEW_PATH, 'r') as review_file:
        for count, line in enumerate(review_file, 1):
            # Progress indicator.
            if count % 10000 == 0:
                print(count)

            json_review = json.loads(line.strip())
            # json.loads already returns decoded text, so no further
            # .decode('utf-8') is applied: on Python 2 that call raised
            # for non-ASCII reviews, and on Python 3 str has no .decode.
            text = json_review.get("text").lower()

            # Tokenize on whitespace, stripping non-word characters and
            # digits from every token.
            tokens = [re.sub(r'\W+|\d+', '', word) for word in text.split()]
            # Remove tokens that became empty and stop words.
            tokens = [token for token in tokens
                      if token.strip() != '' and token not in stoplist]
            texts.append(tokens)

    print('Corpus preprocessing and counting complished!')

    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=5)
    dictionary.save(DICT_PATH)  # store the dictionary, for future reference
    dictionary.save_as_text(DICT_TXT_PATH)

    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus)  # store to disk, for later use
    print('Generating dict and corpus complished!')
def reduce_lsi(dictionary, corpus_tfidf, weibo_test):
    """
    Project the per-category TF-IDF corpora into LSI space, then hand over
    to `svm_module.reduce_module`.

    dictionary:   gensim dictionary; loaded from `path_dictionary` when falsy.
    corpus_tfidf: mapping category -> TF-IDF corpus (it is read through
                  .keys()/.get()); rebuilt from the '.mm' files under
                  `path_tmp_tfidf` when falsy. Entries are popped from
                  this mapping as each category is converted.
    weibo_test:   passed through unchanged to `svm_module.reduce_module`.

    The conversion only runs when `path_tmp_lsi` does not exist yet; in
    that case the directory is created, the LSI model is pickled to
    `path_tmp_lsimodel`, and one '<category>.mm' LSI corpus is written per
    category. Otherwise `corpus_lsi` and `lsi_model` are forwarded as None.
    """
    corpus_lsi = None
    lsi_model = None

    # Stage: convert the TF-IDF vectors to LSI vectors.
    if not os.path.exists(path_tmp_lsi):
        print('=== ?????lsi??????????lsi?? ===')
        if not dictionary:
            dictionary = corpora.Dictionary.load(path_dictionary)

        if not corpus_tfidf:
            # No in-memory TF-IDF corpora: read them back from disk.
            print('--- ????tfidf??????????? ---')

            # Category names are the part of each file name before the
            # first '.', de-duplicated in listing order.
            catg_list = []
            for fname in os.listdir(path_tmp_tfidf):
                t = fname.split('.')[0]
                if t not in catg_list:
                    catg_list.append(t)

            # Load one corpus per category.
            corpus_tfidf = {}
            for catg in catg_list:
                path = '{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg)
                corpus_tfidf[catg] = corpora.MmCorpus(path)
            print('--- tfidf????????????lsi?? ---')

        # Train the LSI model on all categories combined.
        os.makedirs(path_tmp_lsi)
        corpus_tfidf_total = []
        catgs = list(corpus_tfidf.keys())
        for catg in catgs:
            tmp = corpus_tfidf.get(catg)
            corpus_tfidf_total += tmp
        lsi_model = models.LsiModel(corpus=corpus_tfidf_total,
                                    id2word=dictionary, num_topics=50)

        # Persist the model; the context manager closes the file even if
        # pickling fails.
        with open(path_tmp_lsimodel, 'wb') as lsi_file:
            pkl.dump(lsi_model, lsi_file)
        del corpus_tfidf_total  # free the combined corpus before converting
        print('--- lsi?????? ---')

        # Build the LSI corpus per category, dropping each TF-IDF corpus
        # as soon as it has been converted.
        corpus_lsi = {}
        for catg in catgs:
            corpu = [lsi_model[doc] for doc in corpus_tfidf.get(catg)]
            corpus_lsi[catg] = corpu
            corpus_tfidf.pop(catg)
            corpora.MmCorpus.serialize(
                '{f}{s}{c}.mm'.format(f=path_tmp_lsi, s=os.sep, c=catg),
                corpu, id2word=dictionary)
        print('=== lsi?????? ===')
    else:
        print('=== ???lsi???????????? ===')

    svm_module.reduce_module(dictionary, corpus_lsi, lsi_model, weibo_test)