Python gensim.models 模块,Phrases() 实例源码

我们从Python开源项目中,提取了以下8个代码示例,用于说明如何使用gensim.models.Phrases()。

项目:TextSummarization    作者:g-deoliveira    | 项目源码 | 文件源码
def __init__(self, num_topics=100, min_word_count=20,
                 top_most_common_words=10, min_doc_length=40,
                 max_doc_length=1000, random_state=None):
        """Configure the topic-model wrapper.

        :param num_topics: number of topics to fit
        :param min_word_count: drop words rarer than this
        :param top_most_common_words: how many of the most frequent words to track
        :param min_doc_length: shortest document length accepted
        :param max_doc_length: longest document length accepted (must exceed min)
        :param random_state: seed / RandomState forwarded to the model
        """
        # Topic-model hyper-parameters.
        self.num_topics = num_topics
        self.min_word_count = min_word_count
        self.top_most_common_words = top_most_common_words

        # Guard against a degenerate length window before storing the bounds.
        assert max_doc_length > min_doc_length, \
               "max_doc_length must be greater than min_doc_length"
        self.min_doc_length = min_doc_length
        self.max_doc_length = max_doc_length
        self.random_state = random_state

        # NLP helpers: English stop-word set plus an (untrained) bigram detector.
        self.stop_words = self.getEnglishStopWords()
        self.bigramizer = Phrases()
项目:scattertext    作者:JasonKessler    | 项目源码 | 文件源码
def add_phrases(self, corpus):
        '''
        Parameters
        ----------
        corpus: Corpus for phrase augmentation

        Returns
        -------
        New ParsedCorpus containing unigrams in corpus and new phrases
        '''
        assert isinstance(corpus, ParsedCorpus)
        # Seed with a space-delimited bigram model, then stack higher-order
        # models: each new Phrases is trained on the previous model's output.
        self.phrases = [Phrases(CorpusAdapterForGensim.get_sentences(corpus), delimiter=' ')]
        for _ in range(1, self.max_tokens_per_phrase):
            previous = self.phrases[-1]
            self.phrases.append(Phrases(previous[CorpusAdapterForGensim.get_sentences(corpus)]))
        return self
项目:Book_DeepLearning_Practice    作者:wac81    | 项目源码 | 文件源码
def load_save_word2vec_model(line_words, model_filename):
    """Load a cached Word2Vec model, or train one on ``line_words`` and save it.

    :param line_words: tokenised corpus (iterable of token lists)
    :param model_filename: path used both to look up and to save the model
    :return: a trained ``gensim.models.Word2Vec`` instance
    """
    # Training hyper-parameters.
    feature_size = 500
    content_window = 5
    freq_min_count = 3
    # threads_num = 4
    negative = 3   # negative-sampling count; 0 would fall back to hierarchical softmax
    epochs = 20    # renamed from `iter` so the builtin is not shadowed

    print("word2vec...")
    tic = time.time()
    if os.path.isfile(model_filename):
        model = models.Word2Vec.load(model_filename)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        # Merge frequent bigrams into single tokens before training.
        bigram_transformer = models.Phrases(line_words)
        model = models.Word2Vec(bigram_transformer[line_words], size=feature_size,
                                window=content_window, iter=epochs,
                                min_count=freq_min_count, negative=negative,
                                workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc-tic))
        model.save(model_filename)
        # model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    return model
项目:false-friends    作者:pln-fing-udelar    | 项目源码 | 文件源码
def train_model(in_file_name, out_file_name, use_plain_word2vec=False, size=100, phrases_n_gram=1, threads=4):
    """Train an embedding model from a sentence file and save it to disk.

    Uses the plain C word2vec pipeline (optionally preceded by word2phrase)
    when ``use_plain_word2vec`` is set; otherwise trains gensim's Word2Vec
    over ``Phrases``-merged sentences.
    """
    options = {'size': size}

    if use_plain_word2vec:
        if phrases_n_gram > 1:
            # Pre-merge n-grams with word2phrase, then train on that file.
            phrases_file_name = '{}.phrases'.format(in_file_name)
            word2vec.word2phrase(in_file_name, phrases_file_name, verbose=True)
            in_file_name = phrases_file_name
        if threads:
            options['threads'] = threads
        # noinspection PyCallingNonCallable
        word2vec.word2vec(in_file_name, out_file_name, verbose=True, **options)
        return

    sentences = LineSentence(in_file_name)
    # Each pass merges one more level of n-grams into the sentence stream.
    for _ in range(phrases_n_gram - 1):
        transformer = Phrases(sentences)
        sentences = transformer[sentences]
    if threads:
        options['workers'] = threads
    model = Word2Vec(sentences, **options)
    model.save(out_file_name)
项目:scattertext    作者:JasonKessler    | 项目源码 | 文件源码
def __init__(self, phrases, gram_size):
        '''
        Parameters
        ----------
        phrases : gensim.models.Phrases
            Trained phrase detector. (The assert below requires exactly this
            type, not a list of them.)
        gram_size : int, maximum number of words per phrase
        '''
        # Removed a leftover debug print and two no-op self-rebindings
        # (`phrases = phrases`, `gram_size = gram_size`).
        # NOTE(review): exact-type check kept to preserve behaviour;
        # isinstance() would also admit subclasses.
        assert type(phrases) == Phrases
        self.gram_size = gram_size
        self.phrases = phrases
项目:scattertext    作者:JasonKessler    | 项目源码 | 文件源码
def _scan_and_build_vocab(self):
        """Fit a bigram ``Phrases`` model on the corpus sentences, then scan
        and build the wrapped model's vocabulary over the bigram-merged
        sentence stream."""
        from gensim.models import Phrases
        # NOTE(review): get_sentences() is invoked fresh for each of the three
        # passes — presumably it yields a new iterator every call, so hoisting
        # it into a variable could exhaust a generator. Confirm before
        # refactoring this into a single call.
        bigram_transformer = Phrases(CorpusAdapterForGensim.get_sentences(self.corpus))
        self.model.scan_vocab(bigram_transformer[CorpusAdapterForGensim.get_sentences(self.corpus)])
        self.model.build_vocab(bigram_transformer[CorpusAdapterForGensim.get_sentences(self.corpus)])
项目:stance-conditional    作者:sheffieldnlp    | 项目源码 | 文件源码
def trainPhrasesModel(tweets):
    """
    Train phrases model, experimental, not used
    :param tweets: list of tokenised tweets
    :return:
    """
    print("Learning multiword expressions")
    phrase_model = Phrases(tweets)
    phrase_model.save("../out/phrase_all.model")

    # Eyeball the detector on a fixed sample sentence before returning.
    print("Sanity checking multiword expressions")
    sample = "i like donald trump , go hillary clinton , i like jesus , jesus , legalisation abortion "
    print(phrase_model[sample.split(" ")])
    return phrase_model[tweets]
项目:idealoom    作者:conversence    | 项目源码 | 文件源码
def __init__(self, lang, tokenizer=None, load=True):
        """Set up per-language NLP state, reloading persisted models when
        ``load`` is true and the files exist; otherwise start empty."""
        self.lang = lang
        self.tokenizer = tokenizer or Tokenizer(lang)
        base_dir = join(nlp_data, lang)
        dict_fname = join(base_dir, DICTIONARY_FNAME)
        phrase_fname = join(base_dir, PHRASES_FNAME)
        # Prefer a previously saved phrase model / dictionary over a fresh one.
        self.phrases = (gmodels.Phrases.load(phrase_fname)
                        if load and exists(phrase_fname)
                        else gmodels.Phrases())
        self.dictionary = (corpora.Dictionary.load(dict_fname)
                           if load and exists(dict_fname)
                           else corpora.Dictionary())