Python gensim.models module: TfidfModel() example source code

The following 31 code examples, extracted from open-source Python projects, illustrate how to use gensim.models.TfidfModel().
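Before the project examples, here is a minimal orientation sketch of the typical pipeline (toy documents, purely illustrative and not taken from any of the projects below): build a corpora.Dictionary, convert each document to a bag-of-words vector with doc2bow, fit a TfidfModel on that corpus, and apply the model to obtain tf-idf weighted vectors.

# minimal TfidfModel usage sketch (toy data, illustrative only)
from gensim import corpora, models

docs = [["human", "computer", "interaction"],
        ["graph", "minors", "trees", "computer"]]
dictionary = corpora.Dictionary(docs)                   # map each token to an integer id
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]  # (word_id, count) vectors
tfidf = models.TfidfModel(bow_corpus)                   # collect document frequencies / IDF weights
corpus_tfidf = tfidf[bow_corpus]                        # tf-idf weighted vectors
print(list(corpus_tfidf))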

Project: ParseLawDocuments    Author: FanhuaandLuomu    | project source | file source
def get_tfidf(documents):  # compute tf-idf vectors with gensim
    documents=[[word for word in document.text.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    ds = []
    for doc in corpus_tfidf:
        d = [0] * n_items
        for index, value in doc:
            d[index] = value
        ds.append(d)
    return ds
Project: OpinionSpam    Author: Coder-Yu    | project source | file source
def fitAndPredict(self):
        corpus = self.trainingSet+self.testSet
        dictionary = corpora.Dictionary(corpus)
        corpus = [dictionary.doc2bow(text) for text in corpus]
        model = models.TfidfModel(corpus)
        corpus = [text for text in model[corpus]]
        text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

        if PCA_Applied:
            pca = PCA(n_components=PCA_nComponents)
            text_matrix = pca.fit_transform(text_matrix)

        classifier = LogisticRegression()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print 'Logistic:'
        print classification_report(self.testLabel, pred_labels)

        classifier = SVC()
        classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
        print 'SVM:'
        print classification_report(self.testLabel, pred_labels)
Project: text-analytics-with-python    Author: dipanjanS    | project source | file source
def train_lda_model_gensim(corpus, total_topics=2):

    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf, 
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda
Project: Answer_Selection    Author: xjtushilei    | project source | file source
def get_similarity(query, ans_list):
    s_lenth = len(ans_list)
    Corp = ans_list
    # build a dictionary from the answer texts
    dictionary = corpora.Dictionary(Corp)
    # convert each answer to a bag-of-words vector
    corpus = [dictionary.doc2bow(text) for text in Corp]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    vec_bow = dictionary.doc2bow(query)
    vec_tfidf = tfidf[vec_bow]

    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[vec_tfidf]
    similarity = list(sims)
    # print(similarity)
    end_lenth = len(similarity)
    if s_lenth != end_lenth:
        print('bug')
    return similarity
Project: topical_word_embeddings    Author: thunlp    | project source | file source
def test_miislita_high_level(self):
        # construct corpus from file
        miislita = CorpusMiislita(datapath('miIslita.cor'))

        # initialize tfidf transformation and similarity index
        tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
        index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))

        # compare to query
        query = 'latent semantic indexing'
        vec_bow = miislita.dictionary.doc2bow(query.lower().split())
        vec_tfidf = tfidf[vec_bow]

        # perform a similarity query against the corpus
        sims_tfidf = index[vec_tfidf]

        # for the expected results see the article
        expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
        for i, value in enumerate(expected):
            self.assertAlmostEqual(sims_tfidf[i], value, 2)
Project: topical_word_embeddings    Author: thunlp    | project source | file source
def test_miislita_high_level(self):
        # construct corpus from file
        miislita = CorpusMiislita(datapath('miIslita.cor'))

        # initialize tfidf transformation and similarity index
        tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
        index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))

        # compare to query
        query = 'latent semantic indexing'
        vec_bow = miislita.dictionary.doc2bow(query.lower().split())
        vec_tfidf = tfidf[vec_bow]

        # perform a similarity query against the corpus
        sims_tfidf = index[vec_tfidf]

        # for the expected results see the article
        expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
        for i, value in enumerate(expected):
            self.assertAlmostEqual(sims_tfidf[i], value, 2)
Project: topical_word_embeddings    Author: thunlp    | project source | file source
def test_miislita_high_level(self):
        # construct corpus from file
        miislita = CorpusMiislita(datapath('miIslita.cor'))

        # initialize tfidf transformation and similarity index
        tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
        index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))

        # compare to query
        query = 'latent semantic indexing'
        vec_bow = miislita.dictionary.doc2bow(query.lower().split())
        vec_tfidf = tfidf[vec_bow]

        # perform a similarity query against the corpus
        sims_tfidf = index[vec_tfidf]

        # for the expected results see the article
        expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
        for i, value in enumerate(expected):
            self.assertAlmostEqual(sims_tfidf[i], value, 2)
Project: weibo_scrawler_app    Author: coolspiderghy    | project source | file source
def train_by_lsi(lib_texts):
    """
        Train an LSI model and build a similarity index
    """
    from gensim import corpora, models, similarities

    # enable logging if needed
    #import logging
    #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(lib_texts)
    corpus = [dictionary.doc2bow(text) for text in lib_texts]     # doc2bow(): convert a collection of words into a sparse vector of (word_id, word_frequency) tuples
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # train an LSI model with num_topics=10
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    index = similarities.MatrixSimilarity(lsi[corpus])     # index is a gensim.similarities.docsim.MatrixSimilarity instance

    return (index, dictionary, lsi)


#similarity query -- compute the similarity between a new document and the indexed library documents
Project: DeepNews    Author: kabrapratik28    | project source | file source
def load_model_and_dictionary(self):
        self.tfidf_model = models.TfidfModel.load('../../temp_results/tfidf_model')
        self.dictionary = corpora.Dictionary.load('../../temp_results/tfidf_dictionary')
        print ("Dictionary & Model Loaded Successfully")
Project: ParseLawDocuments    Author: FanhuaandLuomu    | project source | file source
def get_tfidf(documents):  # compute tf-idf vectors with gensim
    documents=[[word for word in document.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    return corpus_tfidf
Project: YelpDataChallenge    Author: fujunswufe    | project source | file source
def load_tfidf(corpus, dictionary):
    if not os.path.isfile(TFIDF_MODEL_PATH):
        print('Creating TF-IDF')
        tfidf = models.TfidfModel(corpus)
        print('TF-IDF created')
        tfidf.save(TFIDF_MODEL_PATH)

    print('Loading TF-IDF model')
    tfidf = models.TfidfModel.load(TFIDF_MODEL_PATH)
    return tfidf
# doc_list = get_data()
# print(len(doc_list))
Project: readmeinfo    Author: taozhijiang    | project source | file source
def do_calc_svd(self):

        print("?????%d" %(nlp_master.get_dict_len()))
        self.k_value = int(0.1*(nlp_master.get_dict_len()))
        if self.k_value < 300:
            self.k_value = 300
        if self.k_value > 1000:
            self.k_value = 1000
        print("k??%d" %(self.k_value))            

        tfidf = models.TfidfModel(list(nlp_master._id_docs.values()))
        tfidf_corpus = tfidf[list(nlp_master._id_docs.values())]

        # recommended range for num_topics is roughly 200-500
        # train the LSI model
        self.lsi = models.LsiModel(tfidf_corpus, id2word=nlp_master.dictionary, num_topics=self.k_value, chunksize=2000)

        # dump the trained model to disk
        today = datetime.date.today()
        self.dumpfile = "dumpdir/recsvd_dump.%d_%d" %(today.month, today.day)        

        with open(self.dumpfile,'wb', -1) as fp:
            dump_data = []
            dump_data.append(self._user_classifier)
            dump_data.append(self.k_value)
            dump_data.append(self.lsi)
            pickle.dump(dump_data, fp, -1)

        return


Project: tRECS    Author: TeeOhh    | project source | file source
def build_tfidf_base(self, corpus, bow_matrix):
        ## Description: Build and save objects common to TFIDF and LSA
        ## Params: Corpus, BOW matrix
        ## Returns: TF-IDF corpus and matrix

        tfidf_model = models.TfidfModel(corpus)
        tfidf_corpus = tfidf_model[corpus]
        tfidf_matrix = bow_matrix.apply(lambda x: tfidf_model[x[0]], 1)
        return tfidf_corpus, tfidf_matrix


    #MODEL OBJECTS
    #A model object consists of gensim similarity index and matrix containing transformed data
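The model-object step referenced in the comments above is not part of this excerpt; as a minimal sketch, assuming the tfidf_corpus and dictionary produced by build_tfidf_base, such an object could pair a gensim similarity index with the densified transformed data:

# hypothetical "model object" sketch: a similarity index plus the transformed data,
# assuming tfidf_corpus and dictionary from build_tfidf_base above
from gensim import similarities, matutils

index = similarities.MatrixSimilarity(tfidf_corpus, num_features=len(dictionary))
dense_matrix = matutils.corpus2dense(tfidf_corpus, num_terms=len(dictionary)).T
model_object = {"index": index, "matrix": dense_matrix}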
Project: text-analytics-with-python    Author: dipanjanS    | project source | file source
def get_tfidf_weighted_keyphrases(sentences, 
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):

    valid_chunks = get_chunks(sentences, grammar=grammar)

    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    weighted_phrases = {dictionary.get(id): round(value,3) 
                        for doc in corpus_tfidf 
                        for id, value in doc}

    weighted_phrases = sorted(weighted_phrases.items(), 
                              key=itemgetter(1), reverse=True)

    return weighted_phrases[:top_n]
Project: text-analytics-with-python    Author: dipanjanS    | project source | file source
def train_lsi_model_gensim(corpus, total_topics=2):

    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf, 
                          id2word=dictionary,
                          num_topics=total_topics)
    return lsi
Project: SinaWeiboSpider    Author: SuperSaiyanSSS    | project source | file source
def reduce_tfidf(dictionary, weibo_test):
    corpus_tfidf = None
    # # # # stage: convert the documents into tf-idf vectors
    if not os.path.exists(path_tmp_tfidf):
        print('=== tfidf directory not found, generating tf-idf vectors ===')
        # generating tf-idf requires the dictionary; if the earlier stage was skipped, read it from disk
        if not dictionary:  # dictionary not passed in, so load it from the saved file
            dictionary = corpora.Dictionary.load(path_dictionary)
        os.makedirs(path_tmp_tfidf)
        files = os_path.LoadFiles(path_doc_root)
        tfidf_model = models.TfidfModel(dictionary=dictionary)
        corpus_tfidf = {}
        for i, msg in enumerate(files):
            catg = msg[0]
            file = msg[1]
            word_list = convert_doc_to_wordlist(file, cut_all=False)
            file_bow = dictionary.doc2bow(word_list)
            file_tfidf = tfidf_model[file_bow]
            tmp = corpus_tfidf.get(catg, [])
            tmp.append(file_tfidf)
            if tmp.__len__() == 1:
                corpus_tfidf[catg] = tmp
        # serialize the tf-idf vectors of each category to disk
        catgs = list(corpus_tfidf.keys())
        for catg in catgs:
            corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg),
                                       corpus_tfidf.get(catg),
                                       id2word=dictionary
                                       )
            print('catg {c} has been transformed into tfidf vector'.format(c=catg))
        print('=== tf-idf vectors generated ===')
    else:
        print('=== existing tfidf directory detected, skipping tf-idf generation ===')

    svm_lsi.reduce_lsi(dictionary, corpus_tfidf, weibo_test)
Project: SinaWeiboSpider    Author: SuperSaiyanSSS    | project source | file source
def reduce_result(dictionary, lsi_model, predictor, weibo_test):
    # # # # stage: classify the test weibo text
    if not dictionary:
        dictionary = corpora.Dictionary.load(path_dictionary)
    if not lsi_model:
        lsi_file = open(path_tmp_lsimodel,'rb')
        lsi_model = pkl.load(lsi_file)
        lsi_file.close()
    if not predictor:
        x = open(path_tmp_predictor,'rb')
        predictor = pkl.load(x)
        x.close()
    files = os.listdir(path_tmp_lsi)
    catg_list = []
    for file in files:
        t = file.split('.')[0]
        if t not in catg_list:
            catg_list.append(t)

    demo_doc = weibo_test
    print(demo_doc)
    demo_doc = list(jieba.cut(demo_doc,cut_all=False))
    demo_bow = dictionary.doc2bow(demo_doc)
    tfidf_model = models.TfidfModel(dictionary=dictionary)
    demo_tfidf = tfidf_model[demo_bow]
    demo_lsi = lsi_model[demo_tfidf]
    data = []
    cols = []
    rows = []
    for item in demo_lsi:
        data.append(item[1])
        cols.append(item[0])
        rows.append(0)
    demo_matrix = csr_matrix((data,(rows,cols))).toarray()
    x = predictor.predict(demo_matrix)
    print('predicted category: {x}'.format(x=catg_list[x[0]]))
Project: OpinionMining728    Author: stasi009    | project source | file source
def save_tfidf():
    corpus_bow = corpora.MmCorpus(BowFile)
    tfidf_model = models.TfidfModel(corpus_bow)

    corpus_tfidf = tfidf_model[corpus_bow]
    corpora.MmCorpus.serialize(TfidfFile,corpus_tfidf)

    print "==================== TF-IDF data Generated and Saved ===================="
Project: memex-dossier-open    Author: dossier    | project source | file source
def tfidf():
    if not TFIDF:
        return
    doc1 = u'Andrew likes Diet Pepsi.'
    doc2 = u'Andrew knows the muffin man.'
    doc3 = u'Andrew lives near the muffin man on Shirley Lane.'
    corpus = map(sip.noun_phrases, [doc1, doc2, doc3])
    dictionary = corpora.Dictionary(corpus)
    bows = [dictionary.doc2bow(tokens) for tokens in corpus]
    return models.TfidfModel(bows, id2word=dictionary)
Project: lsi-document-similarity    Author: dvictor    | project source | file source
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Project: simsearch    Author: chrisjmccormick    | project source | file source
def buildCorpus(self):
        """
        Build the corpus from the documents:
            1. Remove words that only appeared once.
            2. Create the Dictionary object.
            3. Convert the documents to simple bag-of-words representation.
            4. Convert the bag-of-words vectors to tf-idf.
        """
        # Remove words that only appear once.
        self.documents = [[token for token in doc if self.frequency[token] > 1]
                          for doc in self.documents]

        # Build a dictionary from the text.
        self.dictionary = corpora.Dictionary(self.documents)

        # Map the documents to vectors.
        corpus = [self.dictionary.doc2bow(text) for text in self.documents]

        # Delete the tokenized representation of the documents--no need to
        # carry this around!
        del self.documents[:]

        # Convert the simple bag-of-words vectors to a tf-idf representation.        
        self.tfidf_model = TfidfModel(corpus)
        self.corpus_tfidf = self.tfidf_model[corpus]
Project: narrative-prediction    Author: roemmele    | project source | file source
def load_tfidf_model(self):
        print "loading tfidf from", self.tfidf_filepath
        self.tfidf_model = models.TfidfModel.load(self.tfidf_filepath, mmap='r')
Project: narrative-prediction    Author: roemmele    | project source | file source
def make_tfidf_model(self, seqs):
        self.tfidf_model = models.TfidfModel((self.lexicon.doc2bow(tokenize(seq)) for seq in seqs))
        self.tfidf_model.save(self.tfidf_filepath)
        print "saved tfidf to", self.tfidf_filepath
Project: recommended_system    Author: wac81    | project source | file source
def getLsiModel(lsipath='./lsi/', num_topics=300):
    # load the dictionary
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    print 'dictionary loaded'
    # load the corpus
    corpus = corpora.MmCorpus(lsipath +'viva.mm')
    print ('mm load')

    t31 = time.time()

    # tfidf
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    t32 = time.time()
    print "tfidf_corpus time = ", t32 - t31

    # baobao change 3 lines
    # corpus = MyCorpus()
    # lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=NUM_TOPIC,power_iters=2,chunksize=50000,onepass=True,distributed=False)
    # lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics,chunksize=20000)
    lsi = None
    try:
         lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, chunksize=60000, power_iters=2, onepass=True)  # train the LSI model
         lsi.save(lsipath + 'viva.lsi')
         print('lsi model saved')
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception, e:
        logging.error('Failed to lsi train', exc_info=True)

    return lsi
Project: idealoom    Author: conversence    | project source | file source
def tfidf_model(self):
        if self._tfidf_model is None:
            doc_count = self.post_ids_query.count()
            if doc_count < 10:
                return None
            dictionary = self.dictionary
            tfidf_model = gmodels.TfidfModel(id2word=dictionary)
            tfidf_fname = join(self.dirname, "tfidf_%d.model" % (
                self.discussion.id,))
            subcorpus = self.subcorpus
            if exists(tfidf_fname):
                tfidf_model = tfidf_model.load(tfidf_fname)
                # assumption: count implies identity.
                # Wrong in corner cases: hidden, etc.
                if tfidf_model.num_docs != doc_count:
                    unlink(tfidf_fname)
                    tfidf_model = gmodels.TfidfModel(id2word=dictionary)
            if tfidf_model.num_docs != doc_count:
                tfidf_model.initialize(subcorpus)
                tfidf_model.save(tfidf_fname)
            self._tfidf_model = tfidf_model
        return self._tfidf_model
Project: quoll    Author: LanguageMachines    | project source | file source
def tfidf_weight(self):
        self.corpus = models.TfidfModel(self.corpus, normalize=True)
Project: LDA_RecEngine    Author: easonchan1213    | project source | file source
def trainModel(self):
        '''
        Train a LDA model, inclusive of 4 steps:
        1. Parse the whole corpora into unigram token collections and document mapping (for later use)
        2. Filter tokens which are not common (no_below_this_number), and too common (no_above_fraction_of_doc)
        3. Indexing the token collections and do TF-IDF transformation
        4. Call gensim.models.LdaModel and generate topic distributions of the corpora
        '''
        print 'Start preparing unigram tokens....'      
        ## Start of preparing list of documents and tokens [[words_in_1st_doc],[words_in_2nd_doc]....], which comprise Bag-Of-Words (BOW)
        # Get document_count, tokens, and document-index mapping from the corpora
        doc_count,train_set,doc_mapping,link_mapping = self.__tokenizeWholeCorpora(path_corpora) 
        # Put the training data into gensim.corpora for later use
        dic = corpora.Dictionary(train_set) 
        denominator = len(dic)
        # Filtering infrequent words & common stopwords, thus reducing the dimension of terms (which prevents curse of dimensionality)
        dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
        nominator = len(dic)
        corpus = [dic.doc2bow(text) for text in train_set]  # transform every token into BOW
        print 'There are %i documents in the pool' % (doc_count)
        print "In the corpus there are ", denominator, " raw tokens"
        print "After filtering, in the corpus there are", nominator, "unique tokens, reduced ", (1-(nominator/denominator)),"%"
        print 'Finished preparing unigram tokens....'   
        ##END 

        print 'Start training LDA model....'
        ## Implementing TF-IDF as a vector for each document, and train LDA model on top of that
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word = dic, num_topics = self.num_topics,iterations=self.num_of_iterations,passes = self.passes)
        corpus_lda = lda[corpus_tfidf]
        # Once done training, print all the topics and related words
        print 'Finished training LDA model.......Here is the list of all topics & their most frequent words'    
        for i in range(self.num_topics):
            print 'Topic %s : ' % (str(i)) + lda.print_topic(i)
        # Exhibit perplexity of current model under specific topic hyperparameter : k. The lower the better
        print '==============================='
        print 'Model perplexity : ',lda.bound(corpus_lda),' when topic k =', str(self.num_topics)
        print '==============================='   

        return lda,doc_mapping,link_mapping,corpus
Project: CCIR    Author: xiaogang00    | project source | file source
def get_score_for_question(question_answer_word_dir,question_num,question_answer_score_label_file_dir):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index+1)%1000 == 1:
            print 'Now for line : ' + str(question_index+1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir,str(index))
        file_write_name = os.path.join(question_answer_score_label_file_dir,str(index))
        file_read = open(file_read_name,'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus=[dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf,id2word=dic,num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        list_simes = list(enumerate(sims))
        sort_sims = sorted(enumerate(sims),key=lambda item:-item[1])
        #answer_label_list = []
        for item in list_simes:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_score = str(item[1])
            file_write = open(file_write_name,'ab+')
            file_write.write(str(answer_label)+'\t'+str(answer_score)+'\n')
            file_write.close()
            #answer_label_list.append(answer_label)
        #DCG_score = calu_DCG(answer_label_list,k)
        #DCG_score_list.append(DCG_score)
    #DCG_avg = calu_avg_answer_length(DCG_score_list)
    #print 'DCG_avg : \t' + str(DCG_avg)
Project: CCIR    Author: xiaogang00    | project source | file source
def get_score_for_question(question_answer_word_dir,question_num,k):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index+1)%1000 == 1:
            print 'Now for line : ' + str(question_index+1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir,str(index))
        file_read = open(file_read_name,'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus=[dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf,id2word=dic,num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        sort_sims = sorted(enumerate(sims),key=lambda item:-item[1])
        answer_label_list = []
        for item in sort_sims:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_label_list.append(answer_label)
        DCG_score = calu_DCG(answer_label_list,k)
        DCG_score_list.append(DCG_score)
    DCG_avg = calu_avg_answer_length(DCG_score_list)
    print 'DCG_avg : \t' + str(DCG_avg)
Project: DeepNews    Author: kabrapratik28    | project source | file source
def train_tfidf_model(self,file_path='../../temp_results/corpus.txt'):
        textfile = codecs.open(file_path, "r", "utf-8")   

        print("Reading and Processing Text File")
        first_lines=[]
        for line in textfile:
            first_lines.append(line.strip())

        print ("--------Building Corpora Dictionary---------------" )
        dictionary = corpora.Dictionary(line.split('#|#')[1].split() for line in first_lines)

        #remove words that appear less than 2 times
        #twoids = [tokenid for tokenid,docfreq in iteritems(dictionary.dfs) if docfreq < 2]
        #dictionary.filter_tokens(fiveids)

        #Remove Gaps
        dictionary.compactify()
        dictionary.save_as_text('../../temp_results/tfidf_dictionary.txt',sort_by_word=False)
        dictionary.save('../../temp_results/tfidf_dictionary')
        print("Dictionary Saved")

        print ("--Now Transforming to Bag of Words Vectors on the Fly--")
        class MyCorpus(object):
            def __iter__(self):
                for line in first_lines:
                    yield dictionary.doc2bow(line.split()) 

        news_corpus  = MyCorpus()
        print("Corpus Built...Now Starting Model Training")
        tfidf_model = models.TfidfModel(news_corpus)
        tfidf_model.save('../../temp_results/tfidf_model')
        print("Model Trained & Saved")
Project: RecommendSystem    Author: dhjack    | project source | file source
def __init__(self, itemInfos):

        lastTime = time.time()
        # itemInfos : dict[(pid, description)]
        # train model
        jieba.load_userdict('./dict.txt.big.txt')
        stopWords = set([line.strip().decode("gbk").lower() for line in open("./stopWords.txt")])
        stopWords.add('\n')
        stopWords.add(' ')
        stopWords.add(u'\u2022')
        stopWords.add(u'\xa9')
        texts = []
        self.name2id = {}
        self.id2name = []
        for k, v in itemInfos.iteritems():
            seg_list = [w.lower() for w in jieba.cut(v, cut_all=False) if w.lower() not in stopWords]
            texts.append(list(seg_list))
            self.name2id[k] = len(self.id2name)
            self.id2name.append(k)

        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1

        texts = [[token for token in text if frequency[token] > 1] for text in texts]

        print  "start cast :", (time.time() - lastTime)

        lastTime = time.time()
        dictionary = corpora.Dictionary(texts)
        print  "dictionary cast :", (time.time() - lastTime)

        lastTime = time.time()
        corpus = [dictionary.doc2bow(text) for text in texts]
        print  "doc2bow cast :", (time.time() - lastTime)

        lastTime = time.time()
        tfidf = models.TfidfModel(corpus)
        print  "tfid model cast :", (time.time() - lastTime)
        lastTime = time.time()

        lastTime = time.time()
        corpus_tfidf = tfidf[corpus]
        print  "tfidf corpus cast :", (time.time() - lastTime)

        lastTime = time.time()
        self.lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100) 
        print  "lsi model cast :", (time.time() - lastTime)
        lastTime = time.time()

        #corpus_lsi = lsi[corpus_tfidf] 
        self.index = similarities.MatrixSimilarity(self.lsi[corpus]) 
        self.corpus = corpus

        self.pidName = getPidName()
        print "init finish"