The following 8 code examples, extracted from open-source Python projects, illustrate how to use gensim.models.word2vec().
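Before the extracted examples, a minimal sketch of the basic workflow may help. The toy corpus and file name below are illustrative assumptions, and the snippet targets the same pre-4.0 gensim API the examples rely on (dimensionality parameter `size`, vectors looked up directly on the model).

from gensim.models import word2vec

# Toy tokenized corpus (illustrative assumption, not taken from the projects below).
sentences = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['graph', 'minors', 'trees'],
]

# Train a small model; min_count=1 keeps every word of this tiny corpus.
# gensim >= 4.0 renames `size` to `vector_size` and moves lookups to model.wv.
model = word2vec.Word2Vec(sentences, size=50, window=5, min_count=1, workers=1)

model.save('toy_word2vec')                      # persist the model to disk...
model = word2vec.Word2Vec.load('toy_word2vec')  # ...and reload it, as several examples below do

print(model['computer'])                        # the 50-dimensional vector for 'computer'
print(model.most_similar('computer', topn=3))   # nearest neighbours by cosine similarity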
def train_batch_score_cbow_xy_generator(model, scored_word_sentences):
    for scored_word_sentence in scored_word_sentences:
        # keep only in-vocabulary words that survive downsampling
        scored_word_vocabs = [[model.vocab[w], s] for [w, s] in scored_word_sentence
                              if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2 ** 32]
        for pos, scored_word in enumerate(scored_word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(scored_word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [scored_word2[0].index for pos2, scored_word2 in window_pos
                             if (scored_word2 is not None and scored_word2[0] is not None and pos2 != pos)]
            xy_gen = train_cbow_pair(model, scored_word[0], word2_indices, None, None)
            for xy in xy_gen:
                if xy is not None:
                    # append the word's score to the (x, y, weight) triple
                    yield [xy[0], xy[1], xy[2], [scored_word[1]]]
def __init__(self, fname='data/korean_word2vec', dim=300):
    self.dim = dim
    try:
        # load a previously saved model from disk
        print('Loading korean word2vec model')
        self.model = word2vec.Word2Vec.load(fname)
    except Exception:
        print(':: There is no word2vec model')
def extract_countries():
    countries_vec = {}
    vec = word2vec.Word2Vec.load("word2vec")
    for line in open("../chapter09/countries.txt", "r"):
        # multi-word country names are stored with underscores in the model
        country = line.strip().replace(" ", "_")
        if country in vec.vocab:
            countries_vec[country] = vec[country]
    return countries_vec
def __init__(self, file_name, dim=300):
    self.dim = dim
    try:
        print('Loading english word2vec model')
        self.word2vec_model = word2vec.Word2Vec.load(file_name)
    except Exception:
        print('Error while loading word2vec model')
import pickle
from functools import reduce

import numpy as np
from gensim.models import word2vec


def load_embedding(data, embedding_file, binary=True, prefix=None, file_name='embedding.pkl'):
    """
    :param data: iterable of (tokenized_sentence, label) pairs
    :param embedding_file: path to a word2vec-format vector file
    :param binary: whether embedding_file is in binary word2vec format
    :param prefix: if prefix is None, build the embedding and write it to file_name;
                   otherwise load a previously pickled embedding from prefix
    :param file_name: pickle file to write when building from scratch
    :return: (vocab_size, word_idx, embedding)
    """
    if prefix is None:
        vocab = sorted(reduce(lambda x, y: x | y, (set(sentence) for sentence, _ in data)))
        word_idx = dict((c, i) for i, c in enumerate(vocab))
        vocab_size = len(word_idx) + 1  # +1 for nil word
        # e.g. "/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin"
        # (Word2Vec.load_word2vec_format is the pre-1.0 gensim API;
        #  newer versions use KeyedVectors.load_word2vec_format instead)
        model = word2vec.Word2Vec.load_word2vec_format(embedding_file, binary=binary)
        embedding = []
        for c in vocab:  # iterate in index order so row i matches word_idx[c] == i
            if c in model:
                embedding.append(model[c])
            else:
                # the original drew from uniform(0.1, 0.1, 300), a degenerate
                # (constant) range; (-0.1, 0.1) is almost certainly what was meant
                embedding.append(np.random.uniform(-0.1, 0.1, 300))
        embedding = np.array(embedding, dtype=np.float32)
        with open(file_name, 'wb') as f:
            pickle.dump(embedding, f)
            pickle.dump(vocab_size, f)
            pickle.dump(word_idx, f)
    else:
        with open(prefix, 'rb') as f:
            embedding = pickle.load(f)
            vocab_size = pickle.load(f)
            word_idx = pickle.load(f)
    return vocab_size, word_idx, embedding
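A hedged usage sketch for load_embedding, under the assumption that data is a list of (tokenized_sentence, label) pairs and that the Google News vectors path from the comment above is available:

# Hypothetical call (data and paths are illustrative assumptions):
data = [(['hello', 'world'], 1), (['goodbye', 'cruel', 'world'], 0)]
vocab_size, word_idx, embedding = load_embedding(
    data, '/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

# A later run can skip the slow word2vec load by reading the pickle back:
vocab_size, word_idx, embedding = load_embedding(data, None, prefix='embedding.pkl')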
def train_batch_sg(model, sentences, alpha=None, work=None, sub_batch_size=256, batch_size=256):
    batch_count = 0
    sub_batch_count = 0
    train_x0 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_x1 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_y = np.zeros((batch_size, sub_batch_size), dtype='int8')
    while True:
        for sentence in sentences:
            word_vocabs = [model.vocab[w] for w in sentence
                           if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2 ** 32]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)
                for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    # don't train on the `word` itself
                    if pos2 != pos:
                        xy_gen = train_sg_pair(model, model.index2word[word.index], word2.index)
                        for xy in xy_gen:
                            if xy is not None:
                                (x0, x1, y) = xy
                                train_x0[batch_count][sub_batch_count] = x0
                                train_x1[batch_count][sub_batch_count] = x1
                                train_y[batch_count][sub_batch_count] = y
                                sub_batch_count += 1
                                if sub_batch_count >= sub_batch_size:
                                    batch_count += 1
                                    sub_batch_count = 0
                                if batch_count >= batch_size:
                                    # note: the same arrays are reused and overwritten
                                    # after each yield; consumers holding references
                                    # across batches must copy them
                                    yield {'index': train_x0, 'point': train_x1, 'code': train_y}
                                    batch_count = 0
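The generator above yields each batch as a dict keyed by input name ('index', 'point', 'code'), a layout suited to feeding a multi-input network by name. A sketch of pulling one batch, assuming a compatible model and tokenized sentences already exist:

# Hypothetical consumption (model and sentences are assumptions):
gen = train_batch_sg(model, sentences, sub_batch_size=256, batch_size=256)
batch = next(gen)
print(batch['index'].shape, batch['point'].shape, batch['code'].shape)
# -> (256, 256) (256, 256) (256, 256)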
def train_batch_cbow_xy_generator(model, sentences):
    for sentence in sentences:
        # keep only in-vocabulary words that survive downsampling
        word_vocabs = [model.vocab[w] for w in sentence
                       if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2 ** 32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos
                            if (word2 is not None and pos2 != pos)]
            xy_gen = train_cbow_pair(model, word, word2_indices, None, None)
            for xy in xy_gen:
                if xy is not None:
                    yield xy
def train_batch_score_sg(model, scored_word_sentences, score_vector_size,
                         alpha=None, work=None, sub_batch_size=256, batch_size=256):
    batch_count = 0
    sub_batch_count = 0
    train_x0 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_x1 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_y0 = np.zeros((batch_size, sub_batch_size), dtype='int8')
    train_y1 = np.zeros((batch_size, sub_batch_size, score_vector_size), dtype='float32')
    while True:
        for scored_word_sentence in scored_word_sentences:
            word_vocabs = [[model.vocab[w], s] for [w, s] in scored_word_sentence
                           if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2 ** 32]
            for pos, scored_word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                word = scored_word2word(scored_word)
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)
                for pos2, scored_word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    word2 = scored_word2word(scored_word2)
                    # don't train on the `word` itself
                    if pos2 != pos:
                        xy_gen = train_sg_pair(model, model.index2word[word.index], word2.index)
                        for xy in xy_gen:
                            if xy is not None:
                                (x0, x1, y0) = xy
                                y1 = scored_word2score(scored_word)
                                train_x0[batch_count][sub_batch_count] = x0
                                train_x1[batch_count][sub_batch_count] = x1
                                train_y0[batch_count][sub_batch_count] = y0
                                train_y1[batch_count][sub_batch_count] = y1
                                sub_batch_count += 1
                                if sub_batch_count >= sub_batch_size:
                                    batch_count += 1
                                    sub_batch_count = 0
                                if batch_count >= batch_size:
                                    yield {'index': train_x0, 'point': train_x1,
                                           'code': train_y0, 'score': train_y1}
                                    batch_count = 0