The following 24 code examples, extracted from open-source Python projects, illustrate how to use gensim.models().
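Before the extracted examples, here is a minimal sketch of the basic pattern most of them follow; the toy corpus, file name, and hyperparameters are illustrative and not taken from any of the projects below.

import gensim

# Toy corpus: a list of tokenized sentences.
sentences = [['hello', 'world'], ['word', 'embeddings', 'with', 'gensim']]

# Train, save and reload a small Word2Vec model. The size/iter keyword names
# match the gensim 3.x API used in the examples below; gensim 4.x renames
# them to vector_size/epochs.
model = gensim.models.Word2Vec(sentences, size=50, window=5, min_count=1, workers=4)
model.save('toy_word2vec.model')
model = gensim.models.Word2Vec.load('toy_word2vec.model')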
def main():
    # signature()
    # sentences = TextLoader()
    # model = gensim.models.Word2Vec(sentences, workers=8)
    # model.save('word2vector.model')
    # print 'word2vec ok'
    # word2vec = Word2vec()
    # word2vec.BetweenToVec()
    # pro_pro()
    # OneHot()
    # c = cluster()
    # c.Cluster(0.7, 'one_hot_vec.txt', '4_cluster.txt')
    negtive_bet_many()
def text_to_vector(sentence_list, MAX_SENTENCE=78, model=None):
    if model is None:
        model = models.Word2Vec.load_word2vec_format(
            local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'),
            binary=True)
    X = np.zeros((MAX_SENTENCE, len(sentence_list), 300))
    capitals = np.zeros((MAX_SENTENCE, len(sentence_list), 3))
    vectorize = lambda x: model[x] if x in model else np.zeros(300)
    mask = []
    for i, sentence in enumerate(sentence_list):
        for j, word in enumerate(sentence):
            if j == MAX_SENTENCE:
                j -= 1
                break
            X[j][i] = vectorize(word)
            capitals[j][i] = cap_vector(word)
        mask.append(j + 1)
    mask = np.array(mask)
    return X, capitals, mask
def get_word(word):
    inst = re.search(r"_\(([A-Za-z0-9_]+)\)", word)
    if inst == None:
        length = len(word.split("_"))
        if length < 5:
            return True, word
    else:
        if inst.group(1) != "disambiguation":
            word2 = re.sub(r'_\(.+\)', '', word)
            if len(word2.split(" ")) < 5:
                return True, word
    return False, word

# Load the trained doc2vec and word2vec models.
def create_word2vec_model(embedding_size, input_file=TEXT_DIR):
    """
    Create the word2vec model based on the given embedding size and the corpus file.

    :param embedding_size: The embedding size
    :param input_file: The corpus file
    """
    word2vec_file = 'word2vec_' + str(embedding_size) + '.model'
    if os.path.isfile(word2vec_file):
        logging.info('The word2vec model you want to create already exists!')
    else:
        sentences = word2vec.LineSentence(input_file)
        # sg=0 means use the CBOW model (default); sg=1 means use the skip-gram model.
        model = gensim.models.Word2Vec(sentences, size=embedding_size, min_count=0, sg=0,
                                       workers=multiprocessing.cpu_count())
        model.save(word2vec_file)
def load_word2vec_matrix(vocab_size, embedding_size):
    """
    Return the word2vec model matrix.

    :param vocab_size: The vocab size of the word2vec model file
    :param embedding_size: The embedding size
    :return: The word2vec model matrix
    """
    word2vec_file = 'word2vec_' + str(embedding_size) + '.model'
    if os.path.isfile(word2vec_file):
        model = gensim.models.Word2Vec.load(word2vec_file)
        vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
        vector = np.zeros([vocab_size, embedding_size])
        for key, value in vocab.items():
            if len(key) > 0:
                vector[value] = model[key]
        return vector
    else:
        logging.info("The word2vec file doesn't exist. "
                     "Please use function <create_word2vec_model(embedding_size)> to create it!")
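A minimal sketch of how the two helpers above might be used together; the embedding size and vocab size are illustrative, and TEXT_DIR is assumed to point at the project's tokenized corpus file.

# Hypothetical usage of create_word2vec_model / load_word2vec_matrix.
embedding_size = 100
create_word2vec_model(embedding_size, input_file=TEXT_DIR)   # trains and saves word2vec_100.model
embedding_matrix = load_word2vec_matrix(vocab_size=50000, embedding_size=embedding_size)
print(embedding_matrix.shape)                                 # (50000, 100)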
def __init__(self, classes=None, classesFile=None, classesClusterPath=None,
             modelPath="dataset/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin",
             modelBinary=True):
    if not(classes) and not(classesFile):
        print "ERROR MUST LOAD CLASS FILE"
        return 202
    if not(classes):
        classes = self.load_adv_keyterms_from_file(classesFile)
    # load cluster
    if classesClusterPath:
        self.classesClusters = load_cluster_dataset(classesClusterPath)
    else:
        # process cluster from classes
        # TODO
        self.classesClusters = None
    self.model = gensim.models.Word2Vec.load_word2vec_format(modelPath, binary=modelBinary)
    self._preProcessClasses(classes)
def train_artistsong2vec_model(fout_path, input_datas=None, data_path=None,
                               min_count=5, sorted_vocab=1, window=10,
                               size=250, iter_n=50):
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    full_data = []
    for i in input_datas:
        tmp = []
        for j in i:
            tmp.append(j[0])
            tmp.append(j[1])
        full_data.append(tmp)
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(full_data, min_count=min_count, sorted_vocab=sorted_vocab,
                                      window=window, size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'
def main():
    # sentences = TextLoader()
    # model = gensim.models.Word2Vec(sentences, sg=1, min_count=5, size=50, workers=8, window=5)
    # model.save('word2vector.model')
    # print 'word2vec ok'
    #
    # pro_cluster('error.txt', 'error_cluster_word2vec.txt', 0.2)
    all_cluster()
def main():
    sentences = TextLoader()
    model = gensim.models.Word2Vec(sentences, sg=1, min_count=5, size=50, workers=8, window=2)
    model.save('word2vector.model')
    print 'word2vec ok'
    # word2vec = Word2vec()
    # word2vec.BetweenToVec()
def create_embedding_matrix(model):
    # Convert the wv word vectors into a numpy matrix that is suitable for insertion
    # into our TensorFlow and Keras models.
    embedding_matrix = np.zeros((len(model.wv.vocab), vector_dim))
    for i in range(len(model.wv.vocab)):
        embedding_vector = model.wv[model.wv.index2word[i]]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
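One common follow-up, sketched here under the assumption that a trained gensim model w2v_model and the module-level vector_dim above are available, is to drop the matrix into a frozen Keras Embedding layer.

from tensorflow import keras

embedding_matrix = create_embedding_matrix(w2v_model)
embedding_layer = keras.layers.Embedding(
    input_dim=embedding_matrix.shape[0],    # vocabulary size
    output_dim=embedding_matrix.shape[1],   # vector_dim
    weights=[embedding_matrix],             # initialize with the pretrained vectors
    trainable=False)                        # keep the pretrained vectors frozen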
def train_tag_doc(doc1):
    docs = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for i, text in enumerate(doc1):
        words = text.lower().split()
        tags = [i]
        docs.append(analyzedDocument(words, tags))
        # docs.append(gensim.models.doc2vec.TaggedDocument(words, [i]))
    return docs
def loadIndexes(self):
    if self.model or self.idfIndex:
        return
    # global model
    print "loading word2vec model"
    self.model = gensim.models.KeyedVectors.load_word2vec_format(
        "models/GoogleNews-vectors-negative300.bin", binary=True)  # C binary format
    print "done"
    # model = gensim.models.Word2Vec.load_word2vec_format("models/glove_model.txt", binary=False)  # C text format
    # global idfIndex
    print "loading idfIndex model"
    self.idfIndex = indexManager.getIndex("plainIdfIndex.txt")
    print "done"
    # return (model, idfIndex)
def test2(self):
    topWords = self.getTopTfIdfTerms("Jerusalem")
    for word in topWords:
        # print word, idfIndex[word] if word in idfIndex else 1.5
        try:
            print self.model.most_similar(positive=[word], topn=10)
        except:
            print "word not in vocabulary"
    # print model.accuracy(r"C:\Users\David\workspace\Wiki\gitWiki\questions-words.txt")
    # model = word2vec.Word2Vec(sentences)
    # model = word2vec.Word2Vec.load_word2vec_format("C:\Users\David\workspace\Wiki\gitWiki\text8-queen", binary=False)
    # model = gensim.models.Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False)
def train_song2vec_model(fout_path, input_datas=None, data_path=None,
                         min_count=5, sorted_vocab=1, window=10,
                         size=250, iter_n=50):
    """
    Train the song2vec model.

    Args:
        fout_path: path of the output model file
        input_datas: training sequences (used if given)
        data_path: path to a pickled training file (used if input_datas is empty)
        min_count:
        sorted_vocab:
        window:
        size:
        iter_n:

    Returns:

    """
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    data_process_logger.info('start training')
    random.shuffle(input_datas)
    input_datas = input_datas[:45000]
    wv_model = gensim.models.Word2Vec(input_datas, min_count=min_count, sorted_vocab=sorted_vocab,
                                      window=window, size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'
def train_artist2vec_model(fout_path, input_datas=None, data_path=None,
                           min_count=5, sorted_vocab=1, window=10,
                           size=250, iter_n=50):
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(input_datas, min_count=min_count, sorted_vocab=sorted_vocab,
                                      window=window, size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'
def load(self):
    # disambiguator params
    print('[{}] Loading <disambiguator_weights>'.format(str(datetime.now())))
    disambiguator_weights = np.load(local_ref('../storage/sentence_disambiguation/trained_weights.npy'))
    print('[{}] Loading <disambiguator_tag_counts>'.format(str(datetime.now())))
    with open(local_ref('../storage/sentence_disambiguation/brown_tag_distribution.pkl')) as fp:
        disambiguator_tag_counts = cPickle.load(fp)
    print('[{}] Loading <disambiguator_tag_order>'.format(str(datetime.now())))
    with open(local_ref('../storage/sentence_disambiguation/brown_tag_order.pkl')) as fp:
        disambiguator_tag_order = cPickle.load(fp)

    # glove embedding params
    print('[{}] Loading <embedder_weights>'.format(str(datetime.now())))
    embedder_weights = np.load(local_ref('../storage/word_embedding/glove_weights_300d.npy'))
    print('[{}] Loading <embedder_vocab>'.format(str(datetime.now())))
    with open(local_ref('../storage/word_embedding/glove_vocab_300d.pkl')) as fp:
        embedder_vocab = cPickle.load(fp)

    # part-of-speech params
    print('[{}] Loading <pos_tagger_weights>'.format(str(datetime.now())))
    pos_tagger_weights = dict(np.load(local_ref('../storage/pos_tagger/pos_trained_weights.npz')))
    print('[{}] Loading <wordvec_model>'.format(str(datetime.now())))
    wordvec_model = models.Word2Vec.load_word2vec_format(
        local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'),
        binary=True)

    # NER params
    print('[{}] Loading <ner_gen_params>'.format(str(datetime.now())))
    with open(local_ref('../storage/ner/gen_params_set.pkl')) as fp:
        ner_gen_params = cPickle.load(fp)
    print('[{}] Loading <ner_nn_params>'.format(str(datetime.now())))
    with open(local_ref('../storage/ner/nn_params_set.dill')) as fp:
        ner_nn_params = dill.load(fp)

    # stanford dep parser params
    print('[{}] Loading <dep_path_to_jar>'.format(str(datetime.now())))
    dep_path_to_jar = local_ref('../storage/dependency_parsing/stanford-parser.jar')
    print('[{}] Loading <dep_path_to_models_jar>'.format(str(datetime.now())))
    dep_path_to_models_jar = local_ref('../storage/dependency_parsing/stanford-parser-3.5.2-models.jar')

    self.bank['disambiguator_weights'] = disambiguator_weights
    self.bank['disambiguator_tag_counts'] = disambiguator_tag_counts
    self.bank['disambiguator_tag_order'] = disambiguator_tag_order
    self.bank['embedder_weights'] = embedder_weights
    self.bank['embedder_vocab'] = embedder_vocab
    self.bank['pos_tagger_weights'] = pos_tagger_weights
    self.bank['wordvec_model'] = wordvec_model
    self.bank['ner_gen_params'] = ner_gen_params
    self.bank['ner_nn_params'] = ner_nn_params
    self.bank['dep_path_to_jar'] = dep_path_to_jar
    self.bank['dep_path_to_models_jar'] = dep_path_to_models_jar
def gen_dataset(sentences, categories, max_words=78, train_test_split=True):
    '''
    Generate a dataset of (input, output) pairs where the
    input is an embedded word vector and the output is
    the category (one-hotted).

    Args
    ----
    sentences : list
        list of sentences where each sentence is a list of tokens
    categories : list
        list of per-token categories for each sentence
    max_words : integer
        maximum number of words allowed in a sentence
    train_test_split : boolean
        whether to split data into 2 sets
    '''
    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'),
        binary=True)
    vectorizer = lambda x: model[x] if x in model else np.zeros(300)
    encoder = one_hot_encoding(categories)

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, len(encoder.keys())))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {}
    param_dict['max_words'] = max_words
    param_dict['encoder'] = encoder

    for sent_i in I:
        words = sentences[sent_i]
        cats = categories[sent_i]
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))
        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, categories=cats,
                             vectorizer=vectorizer,
                             encoder=encoder,
                             max_words=max_words)
        K[sent_i] = len(words)  # keep track of num words in sentence

    if train_test_split:
        (X_train, X_test), (I_train, I_test) = util.split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]
        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict

    return (X, y, K), param_dict
def gen_dataset(sentences, max_words=78, train_test_split=True):
    '''
    Generate a dataset of (input, output) pairs where the
    input is an embedded word vector and the output is
    the embedded vector of the lemmatized word.

    Args
    ----
    sentences : list
        list of sentences where each sentence is a list of tokens
    max_words : integer
        maximum number of words allowed in a sentence
    train_test_split : boolean
        whether to split data into 2 sets
    '''
    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        '../storage/pos_tagger/GoogleNews-vectors-negative300.bin',
        binary=True)
    vectorizer = lambda x: model[x] if x in model else np.ones(300) * ZERO_EPSILON
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatizer = lambda x: wordnet_lemmatizer.lemmatize(x)

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {}
    param_dict['max_words'] = max_words

    for sent_i in I:
        words = sentences[sent_i]
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))
        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, vectorizer=vectorizer,
                             lemmatizer=lemmatizer,
                             max_words=max_words)
        K[sent_i] = len(words)  # keep track of num words in sentence

    if train_test_split:
        (X_train, X_test), (I_train, I_test) = util.split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]
        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict

    return (X, y, K), param_dict
def _expand_vocabulary(skip_thoughts_emb, skip_thoughts_vocab, word2vec):
    # Find words shared between the two vocabularies.
    print("Finding shared words")
    shared_words = [w for w in word2vec.vocab if w in skip_thoughts_vocab]

    # Select embedding vectors for shared words.
    print("Selecting embeddings for %d shared words" % len(shared_words))
    shared_st_emb = skip_thoughts_emb[[
        skip_thoughts_vocab[w] for w in shared_words]]
    shared_w2v_emb = word2vec[shared_words]

    # Train a linear regression model on the shared embedding vectors.
    print("Training linear regression model")
    model = sklearn.linear_model.LinearRegression()
    model.fit(shared_w2v_emb, shared_st_emb)

    # Create the expanded vocabulary.
    print("Creating embeddings for expanded vocabulary")
    embedding_map = collections.OrderedDict()
    print('Length of word2vec vocabulary: %d\n' % len(word2vec.vocab))
    for i, w in enumerate(word2vec.vocab):
        print('\rEmbedding %d' % (i + 1), end=' ')
        # Ignore words with underscores (spaces).
        if "_" not in w:
            w_emb = model.predict(word2vec[w].reshape(1, -1))
            embedding_map[w] = w_emb.reshape(-1)

    for w in skip_thoughts_vocab:
        embedding_map[w] = skip_thoughts_emb[skip_thoughts_vocab[w]]

    print("Created expanded vocabulary of %d words" % len(embedding_map))

    expanded_vocab = {}
    expanded_embeddings = np.zeros([len(embedding_map), paras.embedding_size])
    for i, w in enumerate(embedding_map.keys()):
        expanded_vocab[w] = i
        expanded_embeddings[i, :] = embedding_map[w]

    print('Saving expanded vocab and embeddings')
    with open(path + 'expanded_vocab.pkl', 'wb') as f:
        pkl.dump(expanded_vocab, f)
    embeddings_file = os.path.join(path, "expanded_embeddings.npy")
    np.save(embeddings_file, expanded_embeddings)

    return expanded_vocab, expanded_embeddings

# path = '../models/toronto_n5/'
def train_rnas(seq_file='utrs.fa', outfile='rnadocEmbedding25.pickle'):
    min_count = 5
    dim = 50
    window = 5
    print('dim: ' + str(dim) + ', window: ' + str(window))
    seq_dict = read_fasta_file(seq_file)
    # text = seq_dict.values()
    tris = get_6_trids()
    sentences = []
    for seq in seq_dict.values():
        seq = seq.replace('T', 'U')
        bag_sen = []
        bag_seqs = split_overlap_seq(seq)
        for new_seq in bag_seqs:
            trvec = get_4_nucleotide_composition(tris, new_seq)
            bag_sen.append(trvec)
        # for aa in range(len(text)):
        sentences.append(bag_sen)
    # pdb.set_trace()
    print(len(sentences))

    model = None
    docs = train_tag_doc(sentences)
    # model = Word2Vec(sentences, min_count=min_count, size=dim, window=window, sg=1, iter=10, batch_words=100)
    # model = gensim.models.doc2vec.Doc2Vec(docs, size=50, window=300, min_count=min_count, workers=4)
    model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=min_count, iter=50)
    model.build_vocab(docs)
    model.train(docs)
    '''vocab = list(model.vocab.keys())
    print vocab
    fw = open('rna_doc_dict', 'w')
    for val in vocab:
        fw.write(val + '\n')
    fw.close()
    #print model.syn0
    #pdb.set_trace()
    embeddingWeights = np.empty([len(vocab), dim])
    for i in range(len(vocab)):
        embeddingWeights[i,:] = model[vocab[i]]
    allWeights.append(embeddingWeights)
    '''
    # model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])
    # with open(outfile, 'w') as f:
    #     pickle.dump(model, f)

    # store the model to mmap-able files
    pdb.set_trace()
    model.save(outfile)
    # load the model back
    # model_loaded = Doc2Vec.load(outfile)
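A brief sketch of loading the saved Doc2Vec model back and inferring a vector for a new document; the file name is the default outfile above, and the 6-mer tokens are purely illustrative.

from gensim.models.doc2vec import Doc2Vec

model_loaded = Doc2Vec.load('rnadocEmbedding25.pickle')          # default outfile of train_rnas()
vec = model_loaded.infer_vector(['GCAUCG', 'CAUCGA', 'AUCGAU'])  # illustrative 6-mer tokens
print(vec.shape)                                                 # (50,) given size=50 above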
def filter_items_train_classifier_and_save_model_logreg(classifier_name, class_mapping_curr, relation_type,
                                                        train_x, train_y_txt, train_y_relation_types,
                                                        save_model_file):
    """
    Filters items by the given params, trains the classifier and saves the trained model to a file.

    Args:
        classifier_name: Name of the classifier used for saving the models
        class_mapping_curr: Class mapping used to map train_y_txt to int. Also filters items
        relation_type: 1 Explicit, 0 Non-Explicit. Filters items with this relation type only
        train_x: Train samples
        train_y_txt: Train sample classes - text class that will be filtered using the class_mapping_curr dict
        train_y_relation_types: Train type indicators of whether a sample is explicit or implicit.
            Only items with relation_type will be used for training
        save_model_file: Name of the file in which the trained model will be saved

    Returns:
        Filters items and trains the classifier
    """
    logging.info('======[%s] - filter_items_train_classifier_and_save_model_logreg======' % classifier_name)

    train_x_curr = []
    train_y_curr = []

    # Filtering items
    logging.info('Filtering %s items...' % len(train_x))
    start = time.time()
    for i in range(0, len(train_x)):
        if train_y_txt[i] in class_mapping_curr and train_y_relation_types[i] == relation_type:
            train_x_curr.append(train_x[i])
            train_y_curr.append(class_mapping_curr[train_y_txt[i]])
    end = time.time()
    logging.info("Done in %s s" % (end - start))

    # Training
    # Classifier params
    classifier_current = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                             degree=3, gamma='auto', kernel='rbf',
                             max_iter=-1, probability=False, random_state=None, shrinking=True,
                             tol=0.001, verbose=False)
    print 'Classifier:\n%s' % classifier_current

    start = time.time()
    logging.info('Training with %s items...' % len(train_x_curr))
    classifier_current.fit(train_x_curr, train_y_curr)
    end = time.time()
    logging.info("Done in %s s" % (end - start))

    # Saving the model
    pickle.dump(classifier_current, open(save_model_file, 'wb'))
    logging.info('Model saved to %s' % save_model_file)
def filter_items_train_classifier_and_save_model_logreg(classifier_name, class_mapping_curr, relation_type,
                                                        train_x, train_y_txt, train_y_relation_types,
                                                        save_model_file):
    """
    Filters items by the given params, trains the classifier and saves the trained model to a file.

    Args:
        classifier_name: Name of the classifier used for saving the models
        class_mapping_curr: Class mapping used to map train_y_txt to int. Also filters items
        relation_type: 1 Explicit, 0 Non-Explicit. Filters items with this relation type only
        train_x: Train samples
        train_y_txt: Train sample classes - text class that will be filtered using the class_mapping_curr dict
        train_y_relation_types: Train type indicators of whether a sample is explicit or implicit.
            Only items with relation_type will be used for training
        save_model_file: Name of the file in which the trained model will be saved

    Returns:
        Filters items and trains the classifier
    """
    logging.info('======[%s] - filter_items_train_classifier_and_save_model_logreg======' % classifier_name)

    train_x_curr = []
    train_y_curr = []

    # Filtering items
    logging.info('Filtering %s items...' % len(train_x))
    start = time.time()
    for i in range(0, len(train_x)):
        if train_y_txt[i] in class_mapping_curr and train_y_relation_types[i] == relation_type:
            train_x_curr.append(train_x[i])
            train_y_curr.append(class_mapping_curr[train_y_txt[i]])
    end = time.time()
    logging.info("Done in %s s" % (end - start))

    # Training
    # Classifier params
    # classifier_current = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    #                          degree=3, gamma='auto', kernel='rbf',
    #                          max_iter=-1, probability=False, random_state=None, shrinking=True,
    #                          tol=0.001, verbose=False)
    param_c = 0.1
    classifier_current = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=param_c,
                                            fit_intercept=True, intercept_scaling=1, class_weight=None,
                                            random_state=None, solver='liblinear', max_iter=100,
                                            multi_class='ovr', verbose=0, warm_start=False, n_jobs=8)
    print 'Classifier:\n%s' % classifier_current

    start = time.time()
    logging.info('Training with %s items...' % len(train_x_curr))
    classifier_current.fit(train_x_curr, train_y_curr)
    end = time.time()
    logging.info("Done in %s s" % (end - start))

    # Saving the model
    pickle.dump(classifier_current, open(save_model_file, 'wb'))
    logging.info('Model saved to %s' % save_model_file)