Python gensim.corpora module: MmCorpus() example source code

From open-source Python projects, we extracted the following 23 code examples that illustrate how to use gensim.corpora.MmCorpus().
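
Before the project examples, here is a minimal sketch of the typical round trip (the file name and toy documents are placeholders, not taken from any project below): build bag-of-words vectors with a Dictionary, stream them to disk in Matrix Market format with MmCorpus.serialize(), and read them back lazily with MmCorpus().

from gensim import corpora

# Toy documents (placeholders), already tokenized.
texts = [['human', 'computer', 'interaction'],
         ['graph', 'trees', 'computer']]

dictionary = corpora.Dictionary(texts)              # token -> id mapping
bow = [dictionary.doc2bow(text) for text in texts]  # bag-of-words vectors

# Stream the vectors to disk in Matrix Market format.
corpora.MmCorpus.serialize('example.mm', bow)

# Load them back; documents are streamed from disk, not held in memory.
corpus = corpora.MmCorpus('example.mm')
for doc in corpus:
    print(doc)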

Project: wiki-sim-search | Author: chrisjmccormick
def load(cls, save_dir='./'):
        """
        Load the corpus from a save directory.
        """
        tables = pickle.load(open(save_dir + 'tag-tables.pickle', 'rb'))
        tagsToDocs = tables[0]
        docsToTags = tables[1]        
        titles = pickle.load(open(save_dir + 'titles.pickle', 'rb'))
        tfidf_model = TfidfModel.load(fname=save_dir + 'documents.tfidf_model')
        corpus_tfidf = corpora.MmCorpus(save_dir + 'documents_tfidf.mm')
        dictionary = corpora.Dictionary.load(fname=save_dir + 'documents.dict')
        files = pickle.load(open(save_dir + 'files.pickle', 'rb'))
        doc_line_nums = pickle.load(open(save_dir + 'doc_line_nums.pickle', 'rb'))

        ksearch = KeySearch(dictionary, tfidf_model, 
                            corpus_tfidf, titles, tagsToDocs,
                            docsToTags, files, doc_line_nums) 

        return ksearch
Project: Sentences-analysis | Author: sungminoh
def build_corpus(self, fname=None, save_to=None):
        # read sentences file
        if not fname:
            fname = click.prompt('sentences file')
        fname = self.__dest(fname)
        assert os.path.isfile(fname), 'No such file: %s' % fname
        if save_to:
            self.corpus_fname = self.__dest(save_to)
        else:
            self.corpus_fname = LdaUtils.change_ext(fname, 'corpus')
        # if there is no corpus file or the user wants to rebuild, build .corpus
        if not os.path.isfile(self.corpus_fname) or click.confirm('A corpus file already exists. Do you want to rebuild it?'):
            print('start building corpus')
            start = time()
            corpora.MmCorpus.serialize(self.corpus_fname, self.__iter_doc2bow(LdaUtils.iter_csv(fname, -1).split()))  # save
            print('building corpus takes: %s' % LdaUtils.human_readable_time(time() - start))
        self.corpus = corpora.MmCorpus(self.corpus_fname)
        return self.corpus
Project: simsearch | Author: chrisjmccormick
def load(cls, save_dir='./'):
        """
        Load the corpus from a save directory.
        """
        tables = pickle.load(open(save_dir + 'tag-tables.pickle', 'rb'))
        tagsToDocs = tables[0]
        docsToTags = tables[1]        
        titles = pickle.load(open(save_dir + 'titles.pickle', 'rb'))
        tfidf_model = TfidfModel.load(fname=save_dir + 'documents.tfidf_model')
        corpus_tfidf = corpora.MmCorpus(save_dir + 'documents_tfidf.mm')
        dictionary = corpora.Dictionary.load(fname=save_dir + 'documents.dict')
        files = pickle.load(open(save_dir + 'files.pickle', 'rb'))
        doc_line_nums = pickle.load(open(save_dir + 'doc_line_nums.pickle', 'rb'))

        ksearch = KeySearch(dictionary, tfidf_model, 
                            corpus_tfidf, titles, tagsToDocs,
                            docsToTags, files, doc_line_nums) 

        return ksearch
Project: recommended_system | Author: wac81
def getIndex(lsipath='./lsi/', NUM_TOPIC=300):
    # Load the corpus.
    corpus = corpora.MmCorpus(lsipath + 'viva.mm')
    print('mm loaded')

    # Load the LSI model.
    # lsi = models.LsiModel.load(lsipath + 'viva.lsi')
    # baobao change 1 line
    lsi = models.lsimodel.LsiModel.load(lsipath + 'viva.lsi')
    print('lsi model loaded')

    # Build and save the similarity index.
    # index = similarities.MatrixSimilarity(lsi[corpus])
    # index.save(lsipath + 'viva.index')

    # baobao changed 1 line
    index = similarities.docsim.Similarity(lsipath + 'viva.index', lsi[corpus], num_features=NUM_TOPIC)
    index.save(lsipath + 'viva.index')
    print('index saved')
Project: quoll | Author: LanguageMachines
def load_corpus(self, infile):
        self.corpus = corpora.MmCorpus(infile)
Project: quoll | Author: LanguageMachines
def save_corpus(self, corpusfile, dictfile):
        dictionary = corpora.Dictionary(self.lines)
        corpus = [dictionary.doc2bow(line) for line in self.lines]
        dictionary.save(dictfile)
        corpora.MmCorpus.serialize(corpusfile, corpus)
Project: wiki-sim-search | Author: chrisjmccormick
def save(self, save_dir='./'):
        """
        Write out the built corpus to a save directory.
        """
        # Store the tag tables.
        pickle.dump((self.tagsToDocs, self.docsToTags), open(save_dir + 'tag-tables.pickle', 'wb'))

        # Store the document titles.
        pickle.dump(self.titles, open(save_dir + 'titles.pickle', 'wb'))

        # Write out the tfidf model.
        self.tfidf_model.save(save_dir + 'documents.tfidf_model')

        # Write out the tfidf corpus.
        corpora.MmCorpus.serialize(save_dir + 'documents_tfidf.mm', self.corpus_tfidf)  

        # Write out the dictionary.
        self.dictionary.save(save_dir + 'documents.dict')

        # Save the filenames.
        pickle.dump(self.files, open(save_dir + 'files.pickle', 'wb'))

        # Save the file ID and line numbers for each document.
        pickle.dump(self.doc_line_nums, open(save_dir + 'doc_line_nums.pickle', 'wb'))

        # Objects that are not saved:
        #  - stop_list - You don't need to filter stop words for new input
        #                text, they simply aren't found in the dictionary.
        #  - frequency - This preliminary word count object is only used for
        #                removing infrequent words. Final word counts are in
        #                the `dictionary` object.
Project: nlp-lt | Author: minven
def gensim(self):
        # https://radimrehurek.com/gensim/dist_lsi.html
        # https://radimrehurek.com/gensim/models/lsimodel.html
        corpus = corpora.MmCorpus('../lda/lda_sources/documents_corpus.mm')        
        id2word = corpora.Dictionary.load('../lda/lda_sources/documents_dictionary.dict')
        lsi = models.LsiModel(corpus, id2word=id2word, num_topics=self.dimensions)
        return lsi
Project: topical_word_embeddings | Author: thunlp
def test_textcorpus(self):
        """Make sure TextCorpus can be serialized to disk. """
        # construct corpus from file
        miislita = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # make sure serializing works
        ftmp = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(ftmp, miislita)
        self.assertTrue(os.path.exists(ftmp))

        # make sure deserializing gives the same result
        miislita2 = corpora.MmCorpus(ftmp)
        self.assertEqual(list(miislita), list(miislita2))
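
A note on the API used in this test: MmCorpus.save_corpus() writes only the .mm matrix file, while MmCorpus.serialize() (inherited from IndexedCorpus) also writes an index file next to it, so individual documents can later be accessed by position. A short sketch, reusing ftmp and miislita from the test above:

corpora.MmCorpus.serialize(ftmp, miislita)  # writes ftmp plus ftmp.index
mm = corpora.MmCorpus(ftmp)
first_doc = mm[0]  # random access requires the index written by serialize()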
Project: Sentences-analysis | Author: sungminoh
def set_corpus(self, corpus):
        if isinstance(corpus, str):
            self.corpus = corpora.MmCorpus(corpus)
        elif isinstance(corpus, corpora.MmCorpus):
            self.corpus = corpus
Project: Sentences-analysis | Author: sungminoh
def set_data(self, corpus):
        if isinstance(corpus, str):
            self.data = corpora.MmCorpus(corpus)
        elif isinstance(corpus, corpora.MmCorpus):
            self.data = corpus
Project: Sentences-analysis | Author: sungminoh
def main(argv):
  cli_parser = make_cli_parser()
  opts, args = cli_parser.parse_args(argv)
  if len(args) != 2:
    cli_parser.error("Please provide an input/output file")

  if not os.path.isfile(args[1]+'.lda'):
    if os.path.isfile(args[1]+'.bow2mm') and os.path.isfile(args[1]+'.id2word'):
      id2word = corpora.Dictionary.load(args[1]+'.id2word')
    else :
      id2word = corpora.Dictionary(iter_file(args[0], opts.numlines))
      # ignore words that appear in less than 5 documents or more than 20% documents
      # when we do filtering, some vector becomes empty! it generates a huge problem!!
      # id2word.filter_extremes(no_below=5, no_above=0.2, keep_n=None)
      # save dictionary
      id2word.save(args[1]+'.id2word')
      # save doc2bow vector
      corpora.MmCorpus.serialize(args[1]+'.bow2mm', iter_doc2bow(args[0], opts.numlines, id2word))
    mm_corpus = corpora.MmCorpus(args[1]+'.bow2mm')
    model = LdaMulticore(mm_corpus, id2word=id2word, num_topics=opts.numtopics, workers=opts.numprocs, passes=opts.numepochs)
    model.save(args[1]+'.lda')
  else:
    # The model was trained before: reload it together with its dictionary.
    id2word = corpora.Dictionary.load(args[1]+'.id2word')
    model = LdaMulticore.load(args[1]+'.lda')

  infile = open(args[0])
  outfile = open(args[1]+'.csv', "w")
  out_csvfile = csv.writer(outfile, delimiter=',')
  in_csvfile = csv.reader(infile, delimiter=',')
  for row in in_csvfile:
    if row[0] == '0':  # csv.reader yields strings, not ints
      break
    processed_post = preprocess(row[3]).split()
    if len(processed_post) == 0: # skip empty documents (quite useless)
      continue
    result_list = row[1:3]
    result_list.extend(query_tag(id2word, model, processed_post))
    out_csvfile.writerow(result_list)
  infile.close()
  outfile.close()

  # print(query_tag(id2word, model, "Hello über, world is awesome!"))
Project: YelpDataChallenge | Author: fujunswufe
def load_dict_corpus_all_review():
    '''
    Return the gensim dictionary & corpus built on the whole review corpus.
    :return: corpus, dictionary
    '''
    if not (os.path.isfile(DICT_PATH) and os.path.isfile(CORPUS_PATH)):
        generate_dict_corpus_all_review()
    print('Reading dict & corpus')
    dictionary = corpora.Dictionary.load(DICT_PATH)
    corpus = corpora.MmCorpus(CORPUS_PATH)
    print('Reading completed')
    return corpus, dictionary
Project: OpinionMining728 | Author: stasi009
def save_bow(hotel_files,extra_stopwords=None):
    corpus = Corpus(hotel_files,extra_stopwords)
    corpora.MmCorpus.serialize(BowFile, corpus)
    print("==================== BOW data Generated and Saved ====================")
Project: OpinionMining728 | Author: stasi009
def save_tfidf():
    corpus_bow = corpora.MmCorpus(BowFile)
    tfidf_model = models.TfidfModel(corpus_bow)

    corpus_tfidf = tfidf_model[corpus_bow]
    corpora.MmCorpus.serialize(TfidfFile, corpus_tfidf)

    print("==================== TF-IDF data Generated and Saved ====================")
Project: OpinionMining728 | Author: stasi009
def lsi_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_tfidf = corpora.MmCorpus(TfidfFile)

    N_TOPICS = 300
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LSI MODEL IS BUILT ================="

    lsi_model.save(LsiModelFile)
    save_topics(lsi_model,LsiTopicsFile)
Project: OpinionMining728 | Author: stasi009
def lda_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_bow = corpora.MmCorpus(BowFile)

    N_TOPICS = 100
    model = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LDA MODEL IS BUILT ================="

    model.save(LdaModelFile)
    save_topics(model,LdaTopicsFile)
Project: moviegeek | Author: practical-recommender-systems
def evaluate_cb_recommender():

    K = 20
    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = '{}-cb-k.csv'.format(timestr)

    lda_path = './lda/'
    corpus = corpora.MmCorpus(lda_path + 'corpus.mm')  # no trailing comma: a comma here would make corpus a 1-tuple
    index = similarities.MatrixSimilarity.load(lda_path + 'index.lda')

    with open(file_name, 'a', 1) as logfile:
        logfile.write("rak, pak, mae, min_overlap, min_sim, K, min_num_of_ratings, min_rank, user_coverage, "
                      "movie_coverage\n")

        for K in np.arange(5, 20, 3):
            recommender = ContentBasedRecs()

            er = EvaluationRunner(0,
                                  None,
                                  recommender,
                                  K)

            result = er.calculate(1, 5, number_test_users=-1)

            user_coverage, movie_coverage = RecommenderCoverage(recommender).calculate_coverage()

            pak = result['pak']
            mae = result['mae']
            rak = result['rak']
            logfile.write("{}, {}, {}, {}, {}, {}\n".format(rak, pak, mae, K, user_coverage, movie_coverage))
            logfile.flush()
Project: simsearch | Author: chrisjmccormick
def save(self, save_dir='./'):
        """
        Write out the built corpus to a save directory.
        """
        # Store the tag tables.
        pickle.dump((self.tagsToDocs, self.docsToTags), open(save_dir + 'tag-tables.pickle', 'wb'))

        # Store the document titles.
        pickle.dump(self.titles, open(save_dir + 'titles.pickle', 'wb'))

        # Write out the tfidf model.
        self.tfidf_model.save(save_dir + 'documents.tfidf_model')

        # Write out the tfidf corpus.
        corpora.MmCorpus.serialize(save_dir + 'documents_tfidf.mm', self.corpus_tfidf)  

        # Write out the dictionary.
        self.dictionary.save(save_dir + 'documents.dict')

        # Save the filenames.
        pickle.dump(self.files, open(save_dir + 'files.pickle', 'wb'))

        # Save the file ID and line numbers for each document.
        pickle.dump(self.doc_line_nums, open(save_dir + 'doc_line_nums.pickle', 'wb'))

        # Objects that are not saved:
        #  - stop_list - You don't need to filter stop words for new input
        #                text, they simply aren't found in the dictionary.
        #  - frequency - This preliminary word count object is only used for
        #                removing infrequent words. Final word counts are in
        #                the `dictionary` object.
Project: recommended_system | Author: wac81
def getLsiModel(lsipath='./lsi/', num_topics=300):
    # Load the dictionary.
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    print('dictionary loaded')
    # Load the corpus.
    corpus = corpora.MmCorpus(lsipath + 'viva.mm')
    print('mm loaded')

    t31 = time.time()

    # tfidf
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    t32 = time.time()
    print "tfidf_corpus time = ", t32 - t31

    # baobao change 3 lines
    # corpus = MyCorpus()
    # lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=NUM_TOPIC,power_iters=2,chunksize=50000,onepass=True,distributed=False)
    # lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics,chunksize=20000)
    lsi = None
    try:
        # Train the LSI model.
        lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, chunksize=60000, power_iters=2, onepass=True)
        lsi.save(lsipath + 'viva.lsi')
        print('lsi model saved')
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception:
        logging.error('Failed to train lsi', exc_info=True)

    return lsi
Project: twitter_LDA_topic_modeling | Author: kenneth-orton
def main(text_dir):
    topics = list(range(10, 101, 10)) + list(range(120, 201, 20)) + list(range(250, 451, 50))
    #topics = range(10, 21, 10)
    #corpus = DocCorpus(text_dir)
    #dictionary = corpus.dictionary
    corpus = MmCorpus('../twitter_LDA_topic_modeling/simple-wiki.mm')
    dictionary = Dictionary.load('../twitter_LDA_topic_modeling/simple-wiki.dict')
    print('Building LDA models')
    lda_models = [models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=i, passes=5) for i in tqdm(topics)]
    print('Generating coherence models')
    texts = [[dictionary[word_id] for word_id, freq in doc] for doc in corpus]
    pool = multiprocessing.Pool(max(1, multiprocessing.cpu_count() - 1))
    func = partial(build_coherence_models, 
                   corpus=corpus,
                   dictionary=dictionary,
                   texts=texts)
    coherence_models = pool.map(func, lda_models)
    pool.close()
#    print('Extracting data from models')
#    model_data = [extract_data(model, corpus, dictionary) for model in tqdm(lda_models)]
#    d = defaultdict(list)
#    print('Generating output data')
#    for i, data in tqdm(enumerate(model_data)):
#        d['num_topics'].append(data['num_topics'])
#        d['cao_juan_2009'].append(cao_juan_2009(data['topic_term_dists'], data['num_topics']))
#        d['arun_2010'].append(arun_2010(data['topic_term_dists'], data['doc_topic_dists'], data['doc_lengths'], data['num_topics']))
#        d['deveaud_2014'].append(deveaud_2014(data['topic_term_dists'], data['num_topics']))
#        d['u_mass_coherence'].append(data['u_mass_coherence'])
    d = defaultdict(list)
    print('Generating output data')
    for data in tqdm(coherence_models):
        d['num_topics'].append(data['num_topics'])
        d['u_mass'].append(data['u_mass'])
        d['c_v'].append(data['c_v'])
        d['c_uci'].append(data['c_uci'])
        d['c_npmi'].append(data['c_npmi'])
    df = pd.DataFrame(d)
    df = df.set_index('num_topics')
    df.to_csv('coherence_simple_wiki', sep='\t')
    df.plot(xticks=df.index, style=['bs-', 'yo-', 'r^-', 'gx-'])
    ax1 = df.plot(xticks=df.index, style='bs-', grid=True, y='u_mass')
    ax2 = df.plot(xticks=df.index, style='yo-', grid=True, y='c_v', ax=ax1)
    ax3 = df.plot(xticks=df.index, style='r^-', grid=True, y='c_npmi', ax=ax2)
    df.plot(xticks=df.index, style='gx-', grid=True, y='c_uci', ax=ax3)
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.17), fancybox=True, shadow=True, ncol=4, fontsize=9)
    plt.subplots_adjust(bottom=0.2)
    plt.xticks(df.index, rotation=45, ha='right', fontsize=8)
    plt.savefig('coherence_simple_wiki')
    plt.close()
Project: YelpDataChallenge | Author: fujunswufe
def generate_dict_corpus_all_review():
    '''
    Generate the gensim dictionary & corpus on the whole Yelp review corpus.
    :return:
    '''

    print('Generating new dict and corpus on all Yelp reviews')

    review_file = open(FULL_YELP_REVIEW_PATH, 'r')
    # output_review = open("review.json", 'w')
    # output_tip = open("tip.json", 'w')

    texts = []
    stoplist = load_stopword(STOPWORD_PATH)

    count = 0

    for line in review_file:
        count += 1
        if count % 10000 == 0:
            print(count)
        json_review = json.loads(line.strip())

        text = json_review.get("text").lower()  # json.loads already returns str in Python 3
        # tokenize and clean. Strip non-word & number chars: re.sub(r'\W+|\d+', '', word). Keep all words: r'\d+'
        tokens = [re.sub(r'\W+|\d+', '', word) for word in text.split()]
        # remove stop words and short tokens
        tokens = [token for token in tokens if ((not token.strip()=='') and (not token in stoplist))]
        # stemming, experiment shows that stemming works nothing...
        # if (stemming):
        #     stemmer = PorterStemmer()
        #     texts = [[ stemmer.stem(token) for token in text] for text in texts]
        texts.append(tokens)

    review_file.close()

    # remove words that appear only once
    # from collections import defaultdict
    # frequency = defaultdict(int)
    # for token in tokens:
    #     frequency[token] += 1
    # for text in texts:
    #     tokens = []
    #     for token in text:
    #         if (frequency[token] > 1):
    #             tokens.append(token)
    #     text = tokens
    # texts = [[token for token in text if (frequency[token] > 1)] for text in texts]

    print('Corpus preprocessing and counting completed!')

    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=5)

    dictionary.save(DICT_PATH) # store the dictionary, for future reference
    dictionary.save_as_text(DICT_TXT_PATH)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus) # store to disk, for later use
    print('Dict and corpus generation completed!')
Project: SinaWeiboSpider | Author: SuperSaiyanSSS
def reduce_lsi(dictionary, corpus_tfidf, weibo_test):
    corpus_lsi = None
    lsi_model = None
    # Dimensionality reduction: transform the tfidf corpus into lsi space.
    if not os.path.exists(path_tmp_lsi):
        print('=== no lsi directory found; building the lsi model ===')
        if not dictionary:
            dictionary = corpora.Dictionary.load(path_dictionary)
        if not corpus_tfidf:  # the tfidf corpus is not in memory; load it from disk
            print('--- tfidf corpus not in memory, loading from disk ---')
            # Collect the category names from the tfidf files.
            files = os.listdir(path_tmp_tfidf)
            catg_list = []
            for file in files:
                t = file.split('.')[0]
                if t not in catg_list:
                    catg_list.append(t)

            # Load the corpus for each category.
            corpus_tfidf = {}
            for catg in catg_list:
                path = '{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg)
                corpus = corpora.MmCorpus(path)
                corpus_tfidf[catg] = corpus
            print('--- tfidf corpus loaded, building the lsi model ---')

        # Train the lsi model.
        os.makedirs(path_tmp_lsi)
        corpus_tfidf_total = []
        catgs = list(corpus_tfidf.keys())
        for catg in catgs:
            tmp = corpus_tfidf.get(catg)
            corpus_tfidf_total += tmp
        lsi_model = models.LsiModel(corpus=corpus_tfidf_total, id2word=dictionary, num_topics=50)
        # Persist the lsi model to disk.
        lsi_file = open(path_tmp_lsimodel, 'wb')
        pkl.dump(lsi_model, lsi_file)
        lsi_file.close()
        del corpus_tfidf_total  # the lsi model is built; free the merged tfidf corpus
        print('--- lsi model built ---')

        # Build the corpus of lsi by transforming the corpus of tfidf.
        corpus_lsi = {}
        for catg in catgs:
            corpu = [lsi_model[doc] for doc in corpus_tfidf.get(catg)]
            corpus_lsi[catg] = corpu
            corpus_tfidf.pop(catg)
            corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_lsi, s=os.sep, c=catg),
                                       corpu,
                                       id2word=dictionary)
        print('=== lsi corpus built and saved ===')
    else:
        print('=== existing lsi directory found; skipping rebuild ===')

    svm_module.reduce_module(dictionary, corpus_lsi, lsi_model, weibo_test)