The following 50 code examples, extracted from open-source Python projects, illustrate how to use gensim.corpora.Dictionary().
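Before the extracted examples, here is a minimal sketch of the core Dictionary workflow they all rely on (the toy documents and the file name 'example.dict' are illustrative only):

from gensim import corpora

texts = [["human", "machine", "interface"],
         ["survey", "of", "user", "opinion"],
         ["graph", "of", "trees"]]

dictionary = corpora.Dictionary(texts)                 # assign an integer id to every token
dictionary.filter_extremes(no_below=1, no_above=0.8)   # optionally prune rare/common tokens
print(dictionary.token2id)                             # e.g. {'human': 0, 'interface': 1, ...}

corpus = [dictionary.doc2bow(text) for text in texts]  # sparse (token_id, count) vectors per document
dictionary.save('example.dict')                        # persist the mapping for later reuse
dictionary = corpora.Dictionary.load('example.dict')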
def getTextConfidence(self, text):
    if self.typeOfSim == 'jaccard':
        intend_confidenceList = []
        for i in self.know_words:
            intend_confidenceList.append(jaccard_compare(text, i))
        if len(self.know_words) > 0:
            return max(intend_confidenceList)
        else:
            return 0
    elif self.typeOfSim == 'gensim':
        try:
            from gensim import corpora, models, similarities
        except Exception as e:
            print(e)
        dictionary = corpora.Dictionary(self.know_words_remove_stopwords)
        corpus = [dictionary.doc2bow(text) for text in self.know_words_remove_stopwords]
        lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
        new_doc = text
        vec_bow = dictionary.doc2bow(new_doc.lower().split())
        vec_lsi = lsi[vec_bow]
        index = similarities.MatrixSimilarity(lsi[corpus])
        sims = index[vec_lsi]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        most_sim = sims[0]
        return most_sim[1]
def get_similarity(query, ans_list):
    s_lenth = len(ans_list)
    Corp = ans_list
    # build the dictionary from the answer corpus
    dictionary = corpora.Dictionary(Corp)
    # convert each answer to a bag-of-words vector
    corpus = [dictionary.doc2bow(text) for text in Corp]
    lsi = models.LsiModel(corpus)
    corpus_lsi = lsi[corpus]
    vec_bow = dictionary.doc2bow(query)
    vec_lsi = lsi[vec_bow]
    index = similarities.MatrixSimilarity(corpus_lsi)
    sims = index[vec_lsi]
    similarity = list(sims)
    # print(similarity)
    end_lenth = len(similarity)
    if s_lenth != end_lenth:
        print('bug')
    return similarity
def getCorpus():
    documents = []
    txtNames = glob.glob("original/*.txt")
    for fileName in txtNames:
        fp = open(fileName)
        buf = fp.readline()
        documents.append(buf)
    stoplist = set('for a of the and to in at'.split())
    texts = [[word for word in document.translate(string.maketrans("", ""), string.punctuation).lower().split()
              if word not in stoplist]
             for document in documents]
    # Actually dictionary and corpus are of no use here
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=10, no_above=0.7, keep_n=50000)
    dictionary.save('tmp/imdb.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('tmp/imdb.mm', corpus)
    return texts
def corpus2bow(self, tokenized_corpus=default_documents):
    """returns (vocab, corpus_in_bow)

    Convert the tokenized corpus into bag-of-words (BOW) form.

    Arguments:
        tokenized_corpus -- the corpus after tokenization

    Return:
        vocab -- {'human': 0, ... 'minors': 11}
        corpus_in_bow -- [[(0, 1), (1, 1), (2, 1)]...]
    """
    # build the token -> id mapping
    dictionary = corpora.Dictionary(tokenized_corpus)
    vocab = dictionary.token2id
    # each document as a list of (token_id, count) pairs
    corpus_in_bow = [dictionary.doc2bow(text) for text in tokenized_corpus]
    return (vocab, corpus_in_bow)
def build_id2word(self, fname=None, save_to=None):
    # read words.csv file
    if not fname:
        fname = self.words_fname or click.prompt('words file')
    fname = self.__dest(fname)
    assert os.path.isfile(fname), 'No such file: %s' % fname
    if save_to:
        self.id2word_fname = self.__dest(save_to)
    else:
        self.id2word_fname = LdaUtils.change_ext(fname, 'id2word')
    # if there is no id2word file or the user wants to rebuild, build .id2word
    if not os.path.isfile(self.id2word_fname) or click.confirm('There alread is id2word. Do you want to rebuild?'):
        print 'start building id2word'
        start = time()
        id2word = corpora.Dictionary(LdaUtils.filter_words(LdaUtils.iter_csv(fname, -1).split()))
        id2word.save(self.id2word_fname)  # save
        print 'building id2word takes: %s' % LdaUtils.human_readable_time(time() - start)
    self.id2word = corpora.Dictionary.load(self.id2word_fname)
    return self.id2word
def get_tfidf(documents):
    # compute tf-idf vectors with gensim
    documents = [[word for word in document.text.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    ds = []
    for doc in corpus_tfidf:
        d = [0] * n_items
        for index, value in doc:
            d[index] = value
        ds.append(d)
    return ds
def fit(self, documents):
    '''
    parameters:
        documents: list of strings, each represents a document
    '''
    # tokens, dictionary, corpus for LDA
    self.tokens = self.preProcessCorpus(documents)
    self.dictionary = corpora.Dictionary(self.tokens)
    self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]
    self.lda = self.getLDA(dictionary=self.dictionary, corpus=self.corpus,
                           num_topics=self.num_topics, random_state=self.random_state)
    self.num_dominant_topics = min(10, self.num_topics)
    self.dominant_topic_ids = self.getDominantTopics(self.corpus, self.lda,
                                                     self.num_dominant_topics)
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T
    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)
    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'Logistic:'
    print classification_report(self.testLabel, pred_labels)
    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'SVM:'
    print classification_report(self.testLabel, pred_labels)
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    model = models.TfidfModel(corpus)
    corpus = [text for text in model[corpus]]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T
    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)
    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'Logistic:'
    print classification_report(self.testLabel, pred_labels)
    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'SVM:'
    print classification_report(self.testLabel, pred_labels)
def transfer_corpus(sents):
    words_dict = invert_dict(corpora.Dictionary.load('words.dict'))
    max_length = 40
    sentence = numpy.zeros(shape=(len(sents), max_length), dtype=numpy.int32)
    label = numpy.zeros(shape=(len(sents), max_length), dtype=numpy.int32)
    lengths = []
    for i in range(len(sents)):
        current_sent = sents[i]
        words = []
        labels = []
        lengths.append(len(current_sent))
        for item in current_sent:
            words.append(words_dict[item[0]])
            labels.append(label_str[item[1]])
        sentence[i] = numpy.asarray(words + (max_length - len(current_sent)) * [28782], dtype=numpy.float32)
        label[i] = numpy.asarray(labels + (max_length - len(current_sent)) * [8], dtype=numpy.float32)
    return sentence, label, numpy.asarray(lengths, dtype=numpy.int32)

# train = train_ + valid_ = 16551
# test = test = 3327
def build_dictionary(generator, min_freq=5):
    dictionary_path = os.path.join(DATA_PATH, DICT_NAME)
    if os.path.exists(dictionary_path) and os.path.isfile(dictionary_path):
        print("Delete dictionary and rebuild")
        os.remove(dictionary_path)
    dictionary = corpora.Dictionary(c + u for c, u in generator)
    # drop token ids whose document frequency is below min_freq
    filter_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < min_freq]
    dictionary.filter_tokens(filter_ids)
    dictionary.compactify()
    dictionary.add_documents([_START_VOCAB])
    pickle.dump(dictionary, open(dictionary_path, 'wb'))
    print("SAVE dictionary to %s" % (dictionary_path))
    return dictionary
def corpus2dict15(corpusfiles, lowercase=True):
    """
    From a given corpus, create a gensim dictionary for mapping words to ints,
    important: WMT15 data is already tokenized!
    """
    corpus = list()
    corpus.append(["PADDING"])  # has word index 0
    corpus.append(["UNKNOWN"])  # has word index 1
    for cf in corpusfiles:
        if cf is not None:  # source can be none
            # just for huge lookuptable that contains all words from pretraining
            # if lowercase:
            #     corpus.extend([l.lower().split() for l in codecs.open(cf, "r", "utf8").readlines()])
            # else:
            #     corpus.extend([l.split() for l in codecs.open(cf, "r", "utf8").readlines()])
            corpus.extend([l.split() for l in codecs.open(cf, "r", "utf8").readlines()])
    wordDictionary = corpora.Dictionary(corpus)
    # print "... build word dictionary with vocabulary size =", len(wordDictionary)
    return wordDictionary
def train_lda_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dictionary,
                          iterations=1000, num_topics=total_topics)
    return lda
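Once a model like the one above is trained, its topics can be inspected directly. A minimal sketch, assuming a tokenizable document list named my_documents (a placeholder) and the helper defined above:

lda = train_lda_model_gensim(my_documents, total_topics=2)      # my_documents is hypothetical
for topic_id, topic in lda.print_topics(num_topics=2, num_words=5):
    print(topic_id, topic)   # each topic is a weighted word mix, e.g. '0.042*"price" + 0.031*"room" + ...'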
def reduce_dict(weibo_test):
    dictionary = None
    if not os.path.exists(path_tmp):
        os.makedirs(path_tmp)
    # build the dictionary only if it does not exist yet
    if not os.path.exists(path_dictionary):
        dictionary = corpora.Dictionary()
        files = os_path.LoadFiles(path_doc_root)
        for i, msg in enumerate(files):
            catg = msg[0]
            file = msg[1]
            file = convert_doc_to_wordlist(file, cut_all=False)
            dictionary.add_documents([file])
        # remove tokens that appear in fewer than 5 documents
        small_freq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < 5]
        dictionary.filter_tokens(small_freq_ids)
        dictionary.compactify()
        dictionary.save(path_dictionary)
    svm_tfidf.reduce_tfidf(dictionary, weibo_test)
def createDictionary(extraLabel=""):
    # TODO in the report note the optimization done on the dict - it was ~700 000 words, now ~90 000
    dic = Dictionary()
    d = corpora.Dictionary(dic)
    d.filter_extremes(no_below=10, no_above=0.6, keep_n=None)
    d.compactify()
    # add the visual terms as words in the vocabulary too
    d.add_documents([get_visual_terms_labels(config)])
    extraLabel = extraLabel + "_" + config.dictionary_label
    fName = 'data/dics/%s_%s.dict' % (pretty_current_time(), extraLabel)
    d.save(fName + '.bin')
    d.save_as_text(fName + '.txt')
    setLastDictFileName(fName + '.bin')
    logger.info('Dict created and saved to %s. Size: %i' % (fName, len(d)))
    return d
def generate_training_data(self, options):
    """
    set self.dictionary, self.lable_types and generate train_x(y) and test_x(y)
    """
    input_table = InputTable(options['threashold'])
    (training, test) = input_table.fetch_data(options['ratio_test'], options['seed'])
    word_vecs_train = self.convert_to_word_vecs(training)
    topic_vecs_train = self.convert_to_topic_vecs(training)
    word_vecs_test = self.convert_to_word_vecs(test)
    topic_vecs_test = self.convert_to_topic_vecs(test)
    # use dictionary and topic_types of training set
    dictionary = corpora.Dictionary(word_vecs_train)
    all_topics = list(set(topic_vecs_train))
    x_train = self.adjust_x_format(dictionary, word_vecs_train)
    y_train = self.adjust_y_format(all_topics, topic_vecs_train)
    x_test = self.adjust_x_format(dictionary, word_vecs_test)
    y_test = self.adjust_y_format(all_topics, topic_vecs_test)
    return (x_train, y_train, x_test, y_test, dictionary, all_topics)
def create_vocabulary(input_stream, vocab_size, sentence_to_tokens_fn=None):
    t0 = time.time()
    print(" [*] Creating a new vocabulary...")
    if not sentence_to_tokens_fn:
        sentence_to_tokens_fn = default_sentence_to_tokens
    docs = []
    lines = []
    for line in input_stream:
        rline = line.strip()
        tokens = sentence_to_tokens_fn(rline)
        if '##########' not in tokens and len(rline) > 0:
            lines += [token.lower() for token in tokens if token.lower() not in cachedStopWords]
        elif '##########' in tokens:
            docs.append(lines)
            lines = []
    limit = np.abs(vocab_size - 4)
    vocab = corpora.Dictionary(docs)
    vocab.filter_extremes(no_below=1, no_above=0.7, keep_n=limit)
    print(" [*] Tokenize : %.4fs" % (time.time() - t0))
    return vocab
def get_similarity(query, ans_list):
    s_lenth = len(ans_list)
    Corp = ans_list
    # build the dictionary from the answer corpus
    dictionary = corpora.Dictionary(Corp)
    # convert each answer to a bag-of-words vector
    corpus = [dictionary.doc2bow(text) for text in Corp]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    vec_bow = dictionary.doc2bow(query)
    vec_tfidf = tfidf[vec_bow]
    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[vec_tfidf]
    similarity = list(sims)
    # print(similarity)
    end_lenth = len(similarity)
    if s_lenth != end_lenth:
        print('bug')
    return similarity
def load_dict(self, infile):
    self.dict = corpora.Dictionary.load(infile)
def save_corpus(self, corpusfile, dictfile):
    dictionary = corpora.Dictionary(self.lines)
    corpus = [dictionary.doc2bow(line) for line in self.lines]
    dictionary.save(dictfile)
    corpora.MmCorpus.serialize(corpusfile, corpus)
def create_dictionary(self):
    """
    Utility method to generate gensim-style Dictionary directly from
    the corpus and vocabulary data.
    """
    dictionary = Dictionary()

    # replace dfs with defaultdict to avoid downstream KeyErrors
    # uci vocabularies may contain terms that are not used in the document data
    dictionary.dfs = defaultdict(int)

    dictionary.id2token = self.id2word
    dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))
    dictionary.num_docs = self.num_docs
    dictionary.num_nnz = self.num_nnz

    for docno, doc in enumerate(self):
        if docno % 10000 == 0:
            logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))
        for word, count in doc:
            dictionary.dfs[word] += 1
            dictionary.num_pos += count

    return dictionary
def buildTokenDictionary(self):
    """
    Build the token-to-id dictionary from self.segResponses
    """
    self.tokenDictionary = corpora.Dictionary(self.segResponses)
    logging.info("Token dictionary built: %s" % str(self.tokenDictionary))
def getWordFreq(lib_texts):
    from gensim import corpora, models, similarities
    dictionary = corpora.Dictionary(lib_texts)
    corpus = [dictionary.doc2bow(text) for text in lib_texts]
    return corpus
def train_by_lsi(lib_texts):
    """
    Train an LSI model on the library texts.
    """
    from gensim import corpora, models, similarities

    # enable gensim logging if needed
    # import logging
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(lib_texts)
    # doc2bow(): convert a collection of words into a sparse list of (word_id, word_frequency) tuples
    corpus = [dictionary.doc2bow(text) for text in lib_texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # build an LSI model with 10 topics
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    # index is a gensim.similarities.docsim.MatrixSimilarity instance
    index = similarities.MatrixSimilarity(lsi[corpus])

    return (index, dictionary, lsi)
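The (index, dictionary, lsi) tuple returned above can then score a new query against the indexed texts, in the same spirit as the other similarity examples on this page. A minimal sketch with an illustrative query string:

index, dictionary, lsi = train_by_lsi(lib_texts)         # lib_texts: the tokenized library texts
query = "how to train a topic model"                     # illustrative query
vec_bow = dictionary.doc2bow(query.lower().split())      # map the query into the same id space
vec_lsi = lsi[vec_bow]                                   # project the query into LSI space
sims = index[vec_lsi]                                    # cosine similarity against every indexed text
best = sorted(enumerate(sims), key=lambda item: -item[1])[0]
print(best)                                              # (document position, similarity score)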
def query_tag(id2word, model, split_word):
    # id2word = corpora.Dictionary.load(path+'.id2word')
    # model = LdaMulticore.load(path+'.lda')
    bow = id2word.doc2bow(split_word)
    if len(bow) == 0:
        return None
    gamma, _ = model.inference([bow])
    topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution
    # [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)]
    return topic_dist
def main(argv):
    cli_parser = make_cli_parser()
    opts, args = cli_parser.parse_args(argv)
    if len(args) != 2:
        cli_parser.error("Please provide an input/output file")
    if not os.path.isfile(args[1] + '.lda'):
        if os.path.isfile(args[1] + '.bow2mm') and os.path.isfile(args[1] + '.id2word'):
            id2word = corpora.Dictionary.load(args[1] + '.id2word')
        else:
            id2word = corpora.Dictionary(iter_file(args[0], opts.numlines))
            # ignore words that appear in less than 5 documents or more than 20% documents
            # when we do filtering, some vector becomes empty! it generates a huge problem!!
            # id2word.filter_extremes(no_below=5, no_above=0.2, keep_n=None)
            # save dictionary
            id2word.save(args[1] + '.id2word')
            # save doc2bow vector
            corpora.MmCorpus.serialize(args[1] + '.bow2mm', iter_doc2bow(args[0], opts.numlines, id2word))
        mm_corpus = corpora.MmCorpus(args[1] + '.bow2mm')
        model = LdaMulticore(mm_corpus, id2word=id2word, num_topics=opts.numtopics,
                             workers=opts.numprocs, passes=opts.numepochs)
        model.save(args[1] + '.lda')
    infile = open(args[0])
    outfile = open(args[1] + '.csv', "w")
    out_csvfile = csv.writer(outfile, delimiter=',')
    in_csvfile = csv.reader(infile, delimiter=',')
    for row in in_csvfile:
        if row[0] == 0:
            break
        processed_post = preprocess(row[3]).split()
        if len(processed_post) == 0:  # skip 0~2 word documents (quite useless)
            continue
        result_list = row[1:3]
        result_list.extend(query_tag(id2word, model, processed_post))
        out_csvfile.writerow(result_list)
    infile.close()
    outfile.close()
    # print query_tag(id2word, model, "Hello über, world is awesome!")
def load_model_and_dictionary(self):
    self.tfidf_model = models.TfidfModel.load('../../temp_results/tfidf_model')
    self.dictionary = corpora.Dictionary.load('../../temp_results/tfidf_dictionary')
    print("Dictionary & Model Loaded Successfully")
def get_tfidf(documents):
    # compute tf-idf vectors with gensim
    documents = [[word for word in document.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    return corpus_tfidf
def load_dict_corpus_all_review():
    '''
    return the gensim dict & corpus on the whole review corpus
    :return: dict & corpus
    '''
    if not (os.path.isfile(DICT_PATH) and os.path.isfile(CORPUS_PATH)):
        generate_dict_corpus_all_review()
    print('Reading dict & corpus')
    dict = corpora.Dictionary.load(DICT_PATH)
    corpus = corpora.MmCorpus(CORPUS_PATH)
    print('Reading completed')
    return corpus, dict
def generateDictionary(self):
    dictionary = corpora.Dictionary(self.wordProvider)
    stop_ids = []
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()
    self.dictionary = dictionary
    return self.dictionary
def create_dictionary(texts):
    dictionary = corpora.Dictionary(texts)
    return dictionary
def get_topics_from_text(line):
    doc_complete = line.split('.')
    doc_clean = [clean_txt_to_clean_words(doc).split() for doc in doc_complete]
    # ignore if length of docs for topic analysis is less than 3
    doc_clean_empty = True
    all_topics = []
    for doc in doc_clean:
        if len(doc) > 0:
            doc_clean_empty = False
    if len(doc_clean) >= 1 and doc_clean_empty == False:
        dictionary = corpora.Dictionary(doc_clean)
        doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
        Lda = gensim.models.ldamodel.LdaModel
        num_topics = 3
        ldamodel = Lda(doc_term_matrix, num_topics=num_topics, id2word=dictionary, passes=25)
        # print '\n\n', doc_complete
        # print '\n', doc_clean, '\n'
        # print ldamodel.print_topics(num_topics=5, num_words=2), '\n\n'
        for i in range(0, num_topics):
            topic = ldamodel.get_topic_terms(i, topn=2)
            topic_list = []
            for word in topic:
                word_name = dictionary.get(word[0])
                if len(word_name) > 1:
                    topic_list.append(word_name)
            topic_list.sort()
            topic_name = " ".join(topic_list)
            add = False
            for ch in topic_name:
                # ignore numerical topics
                if ch in r"[abcdefghijklmnopqrstuvwxyz]":
                    add = True
            if add:
                if topic_name not in all_topics:
                    all_topics.append(str(topic_name))
    return all_topics
def generate_dic():
    train_sents = load_corpus('CoNLL-2003/train.txt')
    valid_sents = load_corpus('CoNLL-2003/valid.txt')
    test_sents = load_corpus('CoNLL-2003/test.txt')

    train_ = [get_sent(sent) for sent in train_sents]
    print("train size: " + str(len(train_sents)))
    valid_ = [get_sent(sent) for sent in valid_sents]
    print("valid size: " + str(len(valid_sents)))
    test_ = [get_sent(sent) for sent in test_sents]
    print("test size: " + str(len(test_sents)))

    all_ = train_ + valid_ + test_
    lengths = [len(text) for text in all_]
    print("all data: " + str(len(lengths)))
    print_lengths(lengths)

    dic_words = corpora.Dictionary(all_)
    dic_words.save('words.dict')
    print(len(dic_words))

    # label
    train_.clear()
    valid_.clear()
    test_.clear()
    train_ = [get_label(sent) for sent in train_sents]
    valid_ = [get_label(sent) for sent in valid_sents]
    test_ = [get_label(sent) for sent in test_sents]
    all_ = train_ + valid_ + test_
    dic_labels = corpora.Dictionary(all_)
    for key, value in dic_labels.items():
        print(value)
    print(len(dic_labels))
def corpus2dict(corpusfiles):
    """
    From a given corpus, create a gensim dictionary for mapping words to ints
    """
    corpus = list()
    corpus.append(["PADDING"])  # has word index 0
    corpus.append(["UNKNOWN"])  # has word index 1
    for cf in corpusfiles:
        # print "INFO: corpus = %s" % (corpusfiles)
        if cf is not None:  # source can be none
            corpus.extend(preprocess(codecs.open(cf, "r", "utf8").readlines()))
    wordDictionary = corpora.Dictionary(corpus)
    return wordDictionary
def __init__(self, input=None, topicDict=None, opinionDict=None,
             testSplit=None, file_dict=None, topicLines=[0], opinionLines=[1]):
    if not file_dict is None:
        logger.info('initialize CPT Corpus with file_dict: {} perspectives'
                    .format(len(file_dict)))
        self.perspectives = [Perspective(file_dict=file_dict.get(str(p)),
                                         topicLines=topicLines,
                                         opinionLines=opinionLines)
                             for p in range(len(file_dict))]
    else:
        logger.info('initialize CPT Corpus with {} perspectives'
                    .format(len(input)))
        input.sort()
        self.perspectives = [Perspective(input=glob.glob('{}/*.txt'.format(d)),
                                         testSplit=testSplit,
                                         topicLines=topicLines,
                                         opinionLines=opinionLines)
                             for d in input]
        self.input = input

    if isinstance(topicDict, str) or isinstance(topicDict, unicode):
        self.load_dictionaries(topicDict=topicDict)
    elif isinstance(topicDict, corpora.Dictionary):
        self.topicDictionary = topicDict

    if isinstance(opinionDict, str) or isinstance(opinionDict, unicode):
        self.load_dictionaries(opinionDict=opinionDict)
    elif isinstance(opinionDict, corpora.Dictionary):
        self.opinionDictionary = opinionDict

    if not topicDict or not opinionDict:
        self._create_corpus_wide_dictionaries()

    self.testSplit = testSplit
    self.nPerspectives = len(self.perspectives)
def load_dictionaries(self, topicDict=None, opinionDict=None):
    if topicDict:
        self.topicDictionary = corpora.Dictionary.load(topicDict)
        logger.info('topic dictionary {}'.format(self.topicDictionary))
    if opinionDict:
        self.opinionDictionary = corpora.Dictionary.load(opinionDict)
        logger.info('opinion dictionary {}'.format(self.opinionDictionary))
def get_tfidf_weighted_keyphrases(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}', top_n=10):
    valid_chunks = get_chunks(sentences, grammar=grammar)
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    weighted_phrases = {dictionary.get(id): round(value, 3)
                        for doc in corpus_tfidf
                        for id, value in doc}
    weighted_phrases = sorted(weighted_phrases.items(), key=itemgetter(1), reverse=True)
    return weighted_phrases[:top_n]
def train_lsi_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=total_topics)
    return lsi
def build_dictionary(hotel_files, extra_stopwords=None):
    stream_of_words = words_stream(hotel_files, extra_stopwords)
    dictionary = corpora.Dictionary(stream_of_words)
    dictionary.save(DictionaryFile)  # store the dictionary, for future reference
    print "==================== Dictionary Generated and Saved ===================="
def __init__(self, hotel_files, extra_stopwords=None):
    self._dictionary = corpora.Dictionary.load(DictionaryFile)
    self._hotel_files = hotel_files
def lsi_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_tfidf = corpora.MmCorpus(TfidfFile)
    N_TOPICS = 300
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LSI MODEL IS BUILT ================="
    lsi_model.save(LsiModelFile)
    save_topics(lsi_model, LsiTopicsFile)
def lda_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_bow = corpora.MmCorpus(BowFile)
    N_TOPICS = 100
    model = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LDA MODEL IS BUILT ================="
    model.save(LdaModelFile)
    save_topics(model, LdaTopicsFile)
def load_corpus(data_file):
    texts = load_texts(data_file)

    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, dictionary
def load_corpus(data_file):
    texts = load_texts(data_file)

    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpus = [[token[0] for token in text] for text in corpus]
    return corpus, dictionary
def tfidf():
    if not TFIDF:
        return
    doc1 = u'Andrew likes Diet Pepsi.'
    doc2 = u'Andrew knows the muffin man.'
    doc3 = u'Andrew lives near the muffin man on Shirley Lane.'
    corpus = map(sip.noun_phrases, [doc1, doc2, doc3])
    dictionary = corpora.Dictionary(corpus)
    bows = [dictionary.doc2bow(tokens) for tokens in corpus]
    return models.TfidfModel(bows, id2word=dictionary)
def LSI_fit(self, data):
    '''
    Fits an LSI model and returns it with associated dictionary
    '''
    texts = [[tag for tag in sent] for sent in self.get_pos(data)]
    dictionary = corpora.Dictionary(texts)
    texts = map(dictionary.doc2bow, texts)
    lsi = models.LsiModel(texts, id2word=dictionary,
                          num_topics=self.num_topics)
    return dictionary, lsi
def train(self, corpus, passes=1):
    """Updates dictionary and model given a corpus.

    Args:
        corpus: list of str, the documents to tokenize.
    """
    if self.dictionary is not None or self.model is not None:
        x = raw_input('You are about to overwrite an existing '
                      'model file (%s). Are you sure? [y/N] '
                      % self.model_file)
        if x[0] != 'y':
            raise RuntimeError('You chose not to overwrite the '
                               'existing model and dictionary.')

    # Tokenizes the corpus.
    documents = [self.tokenize(document) for document in corpus]

    # Builds a dictionary from the existing documents.
    self.dictionary = corpora.Dictionary(documents)

    # Dumps the dictionary to a pickled file to use later.
    pickle.dump(self.dictionary, open(self.dictionary_file, 'wb'))

    # Converts the corpus to tokens.
    corpus_bow = [self.dictionary.doc2bow(doc) for doc in documents]

    # Trains the LDA model.
    self.model = models.LdaModel(corpus_bow, passes=passes,
                                 id2word=self.dictionary,
                                 num_topics=self.num_topics)

    # Saves the model to use later.
    self.model.save(self.model_file)

    # Flag to remember that training has taken place.
    self._trained = True