我们从Python开源项目中,提取了以下8个代码示例,用于说明如何使用gensim.models.Phrases()。
def __init__(self, num_topics=100, min_word_count=20, top_most_common_words=10,
             min_doc_length=40, max_doc_length=1000, random_state=None):
    """Configure the topic-modelling pipeline.

    Parameters
    ----------
    num_topics : int
        Number of topics to fit.
    min_word_count : int
        Minimum corpus frequency for a word to be kept.
    top_most_common_words : int
        Number of most-common words to drop as near-stopwords.
        # presumably used as a filter downstream — confirm against caller
    min_doc_length, max_doc_length : int
        Inclusive/exclusive document-length bounds; max must exceed min.
    random_state : optional
        Seed/state forwarded to the underlying model for reproducibility.

    Raises
    ------
    ValueError
        If ``max_doc_length`` is not greater than ``min_doc_length``.
    """
    # Validate with an explicit raise instead of `assert`, which is
    # silently stripped when Python runs with -O.
    if max_doc_length <= min_doc_length:
        raise ValueError("max_doc_length must be greater than min_doc_length")
    self.num_topics = num_topics
    self.min_word_count = min_word_count
    self.top_most_common_words = top_most_common_words
    self.min_doc_length = min_doc_length
    self.max_doc_length = max_doc_length
    self.random_state = random_state
    # natural language processing
    self.stop_words = self.getEnglishStopWords()
    self.bigramizer = Phrases()
def add_phrases(self, corpus):
    '''Learn multi-token phrases from ``corpus`` and store the chain of models.

    Parameters
    ----------
    corpus : ParsedCorpus
        Corpus for phrase augmentation.

    Returns
    -------
    self
        This object, with ``self.phrases`` set to a list of
        ``gensim.models.Phrases`` models of increasing phrase length.

    Raises
    ------
    TypeError
        If ``corpus`` is not a ``ParsedCorpus``.
    '''
    # Explicit raise instead of `assert`, which disappears under -O.
    if not isinstance(corpus, ParsedCorpus):
        raise TypeError('corpus must be a ParsedCorpus')
    # First model learns bigrams directly from the corpus; each subsequent
    # model re-reads the corpus through the previous model, so phrases can
    # grow by one token per pass up to max_tokens_per_phrase.
    self.phrases = [Phrases(CorpusAdapterForGensim.get_sentences(corpus), delimiter=' ')]
    for i in range(1, self.max_tokens_per_phrase):
        self.phrases.append(Phrases(self.phrases[-1][CorpusAdapterForGensim.get_sentences(corpus)]))
    return self
def load_save_word2vec_model(line_words, model_filename):
    """Load a cached word2vec model, or train one from ``line_words`` and save it.

    Parameters
    ----------
    line_words : iterable of list of str
        Tokenised sentences to train on (and to fit the bigram transformer).
    model_filename : str
        Path of the persisted model; reused if it already exists.

    Returns
    -------
    gensim.models.Word2Vec
        The loaded or freshly trained model.
    """
    # Training hyper-parameters.
    feature_size = 500
    content_window = 5
    freq_min_count = 3
    # threads_num = 4
    # negative > 0 selects negative sampling (good for frequent words);
    # 0 would fall back to hierarchical softmax (good for rare words).
    negative = 3
    num_iterations = 20  # renamed from `iter` to avoid shadowing the builtin
    print("word2vec...")
    tic = time.time()
    if os.path.isfile(model_filename):
        model = models.Word2Vec.load(model_filename)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        # Merge frequent token pairs into bigram tokens before training.
        bigram_transformer = models.Phrases(line_words)
        model = models.Word2Vec(bigram_transformer[line_words], size=feature_size,
                                window=content_window, iter=num_iterations,
                                min_count=freq_min_count, negative=negative,
                                workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc-tic))
        model.save(model_filename)
        # model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    return model
def train_model(in_file_name, out_file_name, use_plain_word2vec=False, size=100,
                phrases_n_gram=1, threads=4):
    """Train word embeddings from a text file and persist them.

    Either delegates to the plain C ``word2vec`` tooling (optionally after a
    ``word2phrase`` pass) or trains a gensim ``Word2Vec`` model, applying a
    gensim ``Phrases`` transformer once per extra n-gram level.
    """
    options = {'size': size}
    if use_plain_word2vec:
        if phrases_n_gram > 1:
            # Pre-merge frequent phrases into single tokens on disk.
            phrases_file_name = '{}.phrases'.format(in_file_name)
            word2vec.word2phrase(in_file_name, phrases_file_name, verbose=True)
            in_file_name = phrases_file_name
        if threads:
            options['threads'] = threads
        # noinspection PyCallingNonCallable
        word2vec.word2vec(in_file_name, out_file_name, verbose=True, **options)
    else:
        sentences = LineSentence(in_file_name)
        # Each pass chains one more Phrases transformer over the stream.
        for _ in range(phrases_n_gram - 1):
            sentences = Phrases(sentences)[sentences]
        if threads:
            options['workers'] = threads
        Word2Vec(sentences, **options).save(out_file_name)
def __init__(self, phrases, gram_size):
    '''Store a trained phrase model and the maximum phrase length.

    Parameters
    ----------
    phrases : gensim.models.Phrases
        A single trained phrase model (the original docstring said
        ``list[Phrases]``, but the validation accepts one model).
    gram_size : int
        Maximum number of words per phrase.

    Raises
    ------
    TypeError
        If ``phrases`` is not a ``gensim.models.Phrases`` instance.
    '''
    # Removed leftover debug print and the no-op self-assignments
    # (`phrases = phrases`, `gram_size = gram_size`) from the original.
    # isinstance + raise replaces the exact-type assert, which vanishes
    # under -O and rejected subclasses.
    if not isinstance(phrases, Phrases):
        raise TypeError('phrases must be a gensim.models.Phrases instance')
    self.gram_size = gram_size
    self.phrases = phrases
def _scan_and_build_vocab(self):
    """Fit a bigram transformer on the corpus, then scan and build the
    model vocabulary from the bigram-augmented sentence stream."""
    from gensim.models import Phrases

    def sentence_stream():
        # Fresh stream per pass — the adapter's output may be single-use.
        return CorpusAdapterForGensim.get_sentences(self.corpus)

    transformer = Phrases(sentence_stream())
    self.model.scan_vocab(transformer[sentence_stream()])
    self.model.build_vocab(transformer[sentence_stream()])
def trainPhrasesModel(tweets):
    """
    Train phrases model, experimental, not used
    :param tweets: list of tokenised tweets
    :return:
    """
    print("Learning multiword expressions")
    phrase_model = Phrases(tweets)
    phrase_model.save("../out/phrase_all.model")

    print("Sanity checking multiword expressions")
    sample = "i like donald trump , go hillary clinton , i like jesus , jesus , legalisation abortion "
    print(phrase_model[sample.split(" ")])
    return phrase_model[tweets]
def __init__(self, lang, tokenizer=None, load=True):
    """Initialise per-language NLP resources.

    When ``load`` is true and persisted files exist under the language's
    data directory, the gensim ``Phrases`` model and ``Dictionary`` are
    loaded from disk; otherwise fresh empty ones are created.
    """
    self.lang = lang
    self.tokenizer = tokenizer or Tokenizer(lang)

    lang_dir = join(nlp_data, lang)
    phrase_path = join(lang_dir, PHRASES_FNAME)
    dict_path = join(lang_dir, DICTIONARY_FNAME)

    if load and exists(phrase_path):
        self.phrases = gmodels.Phrases.load(phrase_path)
    else:
        self.phrases = gmodels.Phrases()

    if load and exists(dict_path):
        self.dictionary = corpora.Dictionary.load(dict_path)
    else:
        self.dictionary = corpora.Dictionary()