The following 9 code examples, extracted from open-source Python projects, illustrate how to use gensim.models.Doc2Vec().
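Before the project snippets, here is a minimal, self-contained sketch of the common pattern they all follow: wrap each document in a TaggedDocument, build the vocabulary, train, and save. It assumes the gensim 3.x API used throughout the examples (dimensionality passed as size, document vectors under model.docvecs; in gensim >= 4.0 these are vector_size and model.dv). The toy corpus and file name are placeholders, not from any of the projects below.

import multiprocessing
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

# Placeholder toy corpus: one TaggedDocument per document, each with a unique tag.
corpus = [["human", "machine", "interface"], ["graph", "minors", "survey"]]
documents = [TaggedDocument(words=words, tags=[i]) for i, words in enumerate(corpus)]

model = Doc2Vec(size=50, window=5, min_count=1,
                workers=multiprocessing.cpu_count())
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=10)

model.save("doc2vec.model")                         # persist the trained model
vector = model.docvecs[0]                           # trained vector for the first document
new_vec = model.infer_vector(["graph", "survey"])   # vector for an unseen document

The project examples below vary the same knobs (dm, dbow_words, negative, sample, window) and differ mainly in how they feed documents in (TaggedLineDocument, custom iterators, DataFrame wrappers) and in how they drive the training loop.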
def train_word2vec_model(df, columns):
    model_param = {
        "alpha": config.EMBEDDING_ALPHA,
        "learning_rate_decay": config.EMBEDDING_LEARNING_RATE_DECAY,
        "n_epoch": config.EMBEDDING_N_EPOCH,
        "sg": 1,
        "hs": 1,
        "min_count": config.EMBEDDING_MIN_COUNT,
        "size": config.EMBEDDING_DIM,
        "sample": 0.001,
        "window": config.EMBEDDING_WINDOW,
        "workers": config.EMBEDDING_WORKERS,
    }
    model_dir = config.WORD2VEC_MODEL_DIR
    model_name = "Homedepot-word2vec-D%d-min_count%d.model" % (
        model_param["size"], model_param["min_count"])

    word2vec = DataFrameWord2Vec(df, columns, model_param)
    word2vec.train()
    word2vec.save(model_dir, model_name)


#---------------------- Doc2Vec ----------------------
def trainDoc2Vector(sentence_count, vector_dimension):
    # train and save the model
    sentences = TaggedLineDocument('sources/splited_words.txt')
    model = Doc2Vec(sentences, size=vector_dimension, window=8, min_count=2,
                    workers=multiprocessing.cpu_count())
    model.train(sentences, total_examples=sentence_count, epochs=model.iter)
    model.save('result/doc2vec.model')

    # save vectors
    out = open('result/doc2vec.vector', mode='w+', encoding='utf-8')
    for index in range(0, sentence_count, 1):
        docvec = model.docvecs[index]
        out.write(' '.join(str(f) for f in docvec) + "\n")
    out.close()
def __init__(self, analyzer=None, matching=None, name=None, verbose=0,
             n_epochs=10, alpha=0.25, min_alpha=0.05, n_jobs=4, **kwargs):
    # self.model = model
    self.alpha = alpha
    self.min_alpha = min_alpha
    self.verbose = verbose
    self.name = "paragraph-vectors" if name is None else name

    if matching is True:
        self._matching = Matching()
    elif matching is False or matching is None:
        self._matching = None
    else:
        self._matching = Matching(**dict(matching))

    self.analyzer = analyzer
    self.model = Doc2Vec(alpha=alpha, min_alpha=alpha,
                         size=500, window=8, min_count=1,
                         sample=1e-5, workers=n_jobs, negative=20,
                         dm=0, dbow_words=1,  # words only with dm!=0?
                         dm_mean=0,           # unused when in concat mode
                         dm_concat=1,
                         dm_tag_count=1)
    self.n_epochs = n_epochs
    self._neighbors = NearestNeighbors(**kwargs)
def fit(self, docs, y):
    assert len(docs) == len(y)
    model = self.model
    n_epochs = self.n_epochs
    verbose = self.verbose
    decay = (self.alpha - self.min_alpha) / n_epochs
    X = [TaggedDocument(self.analyzer(doc), [label])
         for doc, label in zip(docs, y)]

    if verbose > 0:
        print("First 3 tagged documents:\n", X[:3])
        print("Training doc2vec model")
    # d2v = Doc2Vec()
    # d2v.build_vocab(X)
    # if self.intersect is not None:
    #     d2v.intersect_word2vec_format(self.intersect)
    model.build_vocab(X)
    for epoch in range(n_epochs):
        if verbose:
            print("Doc2Vec: Epoch {} of {}.".format(epoch + 1, n_epochs))
        model.train(X)
        model.alpha -= decay           # apply global decay
        model.min_alpha = model.alpha  # but no decay inside one epoch

    if verbose > 0:
        print("Finished.")
        print("model:", self.model)

    if self._matching:
        self._matching.fit(docs)
    else:
        # if we dont do matching, its enough to fit a nearest neighbors on
        # all centroids before query time
        dvs = np.asarray([model.docvecs[tag] for tag in y])
        self._neighbors.fit(dvs)

    self._y = y

    return self
def train_and_save_doc2vec(docs, output_file, options={}):
    print("Training model...")
    model = Doc2Vec(docs, **options)
    model.save(output_file)
def trainingNet(self, window, nDimension):
    self.nDimension = nDimension
    sentences = LabeledLineSentence(self.corpus)
    self.model = Doc2Vec(min_count=1, window=window, size=nDimension,
                         sample=1e-4, negative=5, workers=4)
    corpus = sentences.to_array()
    self.model.build_vocab(corpus)
    for epoch in range(10):
        self.model.train(sentences.sentences_perm())
def extract_instances(self, train_instances):
    sentences = []
    for idx, train_instance in enumerate(train_instances):
        sa, sb = train_instance.get_word(type='lemma', lower=True)
        sentences.append(TaggedDocument(words=sa, tags=['sa_%d' % idx]))
        sentences.append(TaggedDocument(words=sb, tags=['sb_%d' % idx]))

    model = Doc2Vec(sentences, size=25, window=3, min_count=0,
                    workers=10, iter=1000)

    features = []
    infos = []
    for idx in range(len(train_instances)):
        vec_a = model.docvecs['sa_%d' % idx]
        vec_b = model.docvecs['sb_%d' % idx]
        feature, info = vk.get_all_kernel(vec_a, vec_b)
        features.append(feature)
        infos.append([])
        # infos.append([vec_a, vec_b])

    return features, infos

# def load_instances(self, train_instances):
#     """
#     extract cosine distance from an already trained feature file
#     without modifying the feature_file;
#     this function's priority is higher than the above extract_instances
#     """
#     _features, _n_dim, _n_instance = Feature.load_feature_from_file(self.feature_file)
#     features = []
#     infos = []
#     ''' get features from train instances '''
#     for _feature in _features:
#         feature = Feature._feat_string_to_list(_feature, _n_dim)
#         features.append([feature[1]])
#         infos.append(['cosine'])
#
#     features = [Feature._feat_list_to_string(feature) for feature in features]
#
#     return features, 1, _n_instance
def __init__(self, df, columns, model_param):
    super().__init__(df, columns, model_param)
    self.model = Doc2Vec(dm=self.model_param["dm"],
                         hs=self.model_param["hs"],
                         alpha=self.model_param["alpha"],
                         min_alpha=self.model_param["alpha"],
                         min_count=self.model_param["min_count"],
                         size=self.model_param["size"],
                         sample=self.model_param["sample"],
                         window=self.model_param["window"],
                         workers=self.model_param["workers"])
def train(input_jlgz, *, size, limit, min_df, max_features):
    print('FAST_VERSION', FAST_VERSION)
    documents = Documents(input_jlgz, limit=limit)
    model = Doc2Vec(
        documents=documents,
        size=size,
        min_count=min_df,
        max_vocab_size=max_features,
        workers=multiprocessing.cpu_count(),
        sample=1e-5,
    )
    return model