The following 40 code examples, extracted from open-source Python projects, illustrate how to use gensim.models.Word2Vec().
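Most of the examples below follow the same basic pattern, shown here as a minimal sketch. It assumes the pre-4.0 gensim API that these projects use (size, iter, window, min_count; in gensim >= 4.0 the names changed to vector_size and epochs), and the toy corpus and output file names are invented purely for illustration:

from gensim.models import Word2Vec

# A toy corpus: each sentence is a list of tokens (illustrative only).
sentences = [["human", "machine", "interface"],
             ["survey", "of", "user", "interface"],
             ["graph", "of", "trees"],
             ["graph", "minors", "survey"]]

# Train a small model; the parameter values are arbitrary choices for this sketch.
model = Word2Vec(sentences, size=50, window=5, min_count=1, workers=2)

model.save("toy_word2vec.model")                                # full model, can be trained further
model.wv.save_word2vec_format("toy_vectors.txt", binary=False)  # word vectors only
print(model.wv.most_similar("graph", topn=2))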
def run():
    ''' ???? '''
    reload(sys)
    sys.setdefaultencoding('utf8')
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp1 = r'wiki_model'
    outp2 = r'vector.txt'
    model = Word2Vec(sentences, size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)

    testData = ['??', '??', '??', '??']
    for i in testData:
        temp = model.most_similar(i)
        for j in temp:
            print '%f %s' % (j[1], j[0])
        print ''
def trainWord2Vector(sentence_count, vector_dimension, train_count):
    lines, model_out, vector_out = "sources/splited_words.txt", \
                                   "result/word2vec.model", \
                                   "result/pre_word2vec.vector"
    logging.info("??????")
    sentences = LineSentence(lines)

    # min_count=3 would drop words that appear fewer than 3 times, so they would not show up in word2vec.vector
    # workers is the number of parallel training threads, bounded by the CPU count; default is 3
    # sg selects the training algorithm (1 = skip-gram)
    model = Word2Vec(sentences, sg=1, size=vector_dimension, window=8,
                     min_count=0, workers=multiprocessing.cpu_count())

    # keep training the model for several extra passes
    for i in range(train_count):
        model.train(sentences=sentences, total_examples=sentence_count, epochs=model.iter)

    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model.save(model_out)
    model.wv.save_word2vec_format(vector_out)
def trainWord2Vector(sentence_count, vector_dimension, train_count):
    lines, model_out, vector_out = "com/com/test1/test1sources/splited_words.txt", \
                                   "com/com/test1/test1sources/word2vec.model", \
                                   "com/com/test1/test1sources/word2vec.vector"
    logging.info("??????")
    sentences = LineSentence(lines)

    # min_count=3 would drop words that appear fewer than 3 times, so they would not show up in word2vec.vector
    # workers is the number of parallel training threads, bounded by the CPU count; default is 3
    model = Word2Vec(sentences, sg=1, size=vector_dimension, window=8,
                     min_count=0, workers=multiprocessing.cpu_count())

    # keep training the model for several extra passes
    for i in range(train_count):
        model.train(sentences=sentences, total_examples=sentence_count, epochs=model.iter)

    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model.save(model_out)
    model.wv.save_word2vec_format(vector_out)
def uptrain(corpus, model_path=None, binary=True, lockf=0.0, min_count=1, size=300,
            **word2vec_params):
    wv = Word2Vec(min_count=min_count, size=size, **word2vec_params)
    print("Building vocabulary...")
    wv.build_vocab(corpus)
    print("Found %d distinct words." % len(wv.index2word))
    if model_path is not None:
        print("Intersecting with", model_path, "...")
        wv.intersect_word2vec_format(model_path, binary=binary, lockf=lockf)
        print("Intersected vectors locked with", lockf)
    total_examples = len(corpus)
    print("Training on %d documents..." % total_examples)
    wv.train(corpus, total_examples=total_examples)
    return wv
def create(basedir, num_workers=12, size=320, threshold=5):
    """
    Creates a word2vec model using the Gensim word2vec implementation.

    :param basedir: the dir from which to get the documents.
    :param num_workers: the number of workers to use for training word2vec
    :param size: the size of the resulting vectors.
    :param threshold: the frequency threshold.
    :return: the model.
    """
    logging.basicConfig(level=logging.INFO)

    sentences = SentenceIter(root=basedir)
    model = Word2Vec(sentences=sentences,
                     sg=True,
                     size=size,
                     workers=num_workers,
                     min_count=threshold,
                     window=11,
                     negative=15)
    model.save_word2vec_format("{0}-{1}.wordvecs", "{0}-{1}.vocab")
    return model
def train_save(self, list_csv):
    sentences = MySentences(list_csv)
    num_features = 256
    min_word_count = 1
    num_workers = 20
    context = 5
    epoch = 20
    sample = 1e-5
    model = Word2Vec(
        sentences,
        size=num_features,
        min_count=min_word_count,
        workers=num_workers,
        sample=sample,
        window=context,
        iter=epoch,
    )
    # model.save(model_fn)
    return model
def main(positive, negative, topn):
    """
    Train a word2vec model and return the most similar tags.

    Args:
        positive (list): list of positive tags
        negative (list): list of negative tags
        topn (int): number of top keywords in word2vec

    Returns:
        list: list of the most similar tags
    """
    with open('tags.txt') as f:
        content = f.readlines()
    sentences = [x.split() for x in content]
    model = Word2Vec(sentences, min_count=20)
    return model.most_similar(positive=positive, negative=negative, topn=topn)
def fit_embeddings(self, documents):
    """
    Train the word embeddings of the classification model, passing the same parameter
    values used for classification on to Gensim ``Word2Vec``. Similar to using a
    pre-trained model.

    :param documents:
    """
    params = self.get_params()
    del params['pre_trained']
    del params['bucket']
    # Word2Vec has no softmax
    if params['loss'] == 'softmax':
        params['loss'] = 'hs'
    LabeledWord2Vec.init_loss(LabeledWord2Vec(), params, params['loss'])
    del params['loss']
    w2v = Word2Vec(sentences=documents, **params)
    self._classifier = LabeledWord2Vec.load_from(w2v)
def learn_embeddings(self, output):
    """
    Learn embeddings by optimizing the Skipgram objective using SGD.
    """
    self._simulate_walks()  # simulate random walks
    model = Word2Vec(self._walks, size=self.dimensions, window=self.window_size,
                     min_count=0, workers=self.workers, iter=self.iter,
                     negative=25, sg=1)
    print("defined model using w2v")
    model.wv.save_word2vec_format(output, binary=True)
    print("saved model in word2vec binary format")
    return
def training_word2vec():
    sentences = []
    read_dir_path = os.path.join(defaultPath.PROJECT_DIRECTORY, sogou_classfication.data_path_jieba)
    label_dir_list = os.listdir(read_dir_path)
    for label_dir in label_dir_list:
        label_dir_path = os.path.join(read_dir_path, label_dir)
        label_file_list = os.listdir(label_dir_path)
        for label_file in label_file_list:
            with open(os.path.join(label_dir_path, label_file), 'rb') as reader:
                word_list = reader.read().decode('utf-8').replace('\n', '').replace('\r', '').strip()
                sentences.append(word_list)
    model_path = os.path.join(defaultPath.PROJECT_DIRECTORY, sogou_classfication.word2Vect_path)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    model_save_path = os.path.join(model_path, sogou_classfication.model_name)
    model = Word2Vec(sentences, max_vocab_size=None, window=8, size=256, min_count=5,
                     workers=4, iter=20)
    model.save(model_save_path)
def load_save_word2vec_model(line_words, model_filename):
    # training parameters
    feature_size = 500
    content_window = 5
    freq_min_count = 3
    # threads_num = 4
    # best practice: hierarchical softmax tends to work better for infrequent words,
    # negative sampling for frequent words
    negative = 3
    iter = 20

    print("word2vec...")
    tic = time.time()
    if os.path.isfile(model_filename):
        model = models.Word2Vec.load(model_filename)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        bigram_transformer = models.Phrases(line_words)
        model = models.Word2Vec(bigram_transformer[line_words], size=feature_size,
                                window=content_window, iter=iter, min_count=freq_min_count,
                                negative=negative, workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc - tic))
        model.save(model_filename)
        # model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    return model
def learn_embeddings(self, output, output_format='binary'):
    """
    Learn embeddings by optimizing the Skipgram objective using SGD.
    """
    self._simulate_walks()  # simulate random walks
    model = Word2Vec(self._walks, size=self.dimensions, window=self.window_size,
                     min_count=0, workers=self.workers, iter=self.iter,
                     negative=25, sg=1)
    print("defined model using w2v")
    is_binary = output_format != 'text'
    model.wv.save_word2vec_format(output, binary=is_binary)
    actual_format = 'text' if output_format == 'text' else 'binary'
    print("saved model in word2vec %s format" % actual_format)
    return
def main(lang, in_dir, out_loc, negative=5, n_workers=4, window=5, size=128,
         min_count=10, nr_iter=2):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative
    )
    nlp = spacy.load(lang, parser=False, tagger=False, entity=False)
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with io.open(text_loc, 'r', encoding='utf8') as file_:
            text = file_.read()
        total_sents += text.count('\n')
        doc = nlp(text)
        total_words += corpus.count_doc(doc)
        logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word types",
                    text_no, total_words, len(corpus.strings))
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq >= min_count:
            model.raw_vocab[nlp.vocab.strings[orth]] = freq
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)
def gen_embeddings(in_file, out_file, size=100):
    corpus = LineSentence(in_file)
    model = Word2Vec(
        sentences=corpus, size=size, alpha=0.025, window=5, min_count=5,
        max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
        sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
        trim_rule=None, sorted_vocab=1
    )
    model.save_word2vec_format(out_file, binary=False)
def fit(self, tokens):
    # get most frequent items for plotting:
    tokens = [t.lower() for t in tokens]
    self.mfi = [t for t, _ in Counter(tokens).most_common(self.nb_mfi)]

    self.sentence_iterator = SentenceIterator(tokens=tokens)
    # train embeddings:
    self.w2v_model = Word2Vec(self.sentence_iterator,
                              window=self.window,
                              min_count=self.minimum_count,
                              size=self.size,
                              workers=self.nb_workers,
                              negative=self.nb_negative)
    self.plot_mfi()
    self.most_similar()

    # build an index of the train tokens
    # which occur at least min_count times:
    self.token_idx = {'<UNK>': 0}
    for k, v in Counter(tokens).items():
        if v >= self.minimum_count:
            self.token_idx[k] = len(self.token_idx)

    # create an ordered vocab:
    self.train_token_vocab = [k for k, v in sorted(self.token_idx.items(),
                                                   key=itemgetter(1))]
    self.pretrained_embeddings = self.get_weights(self.train_token_vocab)
    return self
def zhword2vec(ifname, fmodel):
    '''Training the word2vec model
       more: http://radimrehurek.com/gensim/models/word2vec.html
    '''
    model = Word2Vec(LineSentence(ifname),
                     size=400,
                     window=5,
                     min_count=2,
                     workers=multiprocessing.cpu_count(),
                     negative=5)
    model.save(fmodel)
    # model.save_word2vec_format(fword2vec, binary=False)
def train_model(self, ofmodel, space=' '):
    if self.traincorpusfname == None or not os.path.exists(self.traincorpusfname):
        ifname = self.__pretrain_model(space)
    else:
        ifname = self.traincorpusfname
    self.logger.info('+++++++++++++++Train Model Start+++++++++++++++++\n')
    #
    # Calling Gensim 3rdparty lib, Training the word2vec model
    # more: http://radimrehurek.com/gensim/models/word2vec.html
    model = Word2Vec(LineSentence(ifname),
                     size=400,
                     window=5,
                     min_count=2,
                     workers=multiprocessing.cpu_count(),
                     negative=5)
    self.logger.info('+++++++++++++++Train Model Finished+++++++++++++++++\n')
    model.save(ofmodel)
    return (model, ofmodel)

# if __name__ == '__main__':
#     if len(sys.argv) < 3:
#         print(globals()['__doc__'] % locals())
#         sys.exit(1)
#     inp, outp = sys.argv[1:3]
#     # inp = '../../data/zhwiki-latest-pages-articles.xml.bz2', 'r'
#     # outp = '../../model/word2vec.model'
#     wiki = tWikiCorpus(inp, _lemmatize=False, _dictionary={})
#     print 'wiki'
#     wiki.getTexts(outp, space=' ')
def train_model(self, ofmodel, space=' '):
    if self.traincorpusfname == None or not os.path.exists(self.traincorpusfname):
        ifname = self.pretrain_model(space)
    else:
        ifname = self.traincorpusfname
    self.logger.info('+++++++++++++++Train Model Start+++++++++++++++++\n')
    #
    # Calling Gensim 3rdparty lib, Training the word2vec model
    # more: http://radimrehurek.com/gensim/models/word2vec.html
    model = Word2Vec(LineSentence(ifname),
                     size=400,
                     window=5,
                     min_count=2,
                     workers=multiprocessing.cpu_count(),
                     negative=5)
    self.logger.info('+++++++++++++++Train Model Finished+++++++++++++++++\n')
    model.save(ofmodel)
    return (model, ofmodel)
def train_word_2_vec(self, model_save_file_name='../../temp_results/word2vec_hindi.txt'):
    model = Word2Vec(LineSentence(self.raw_file_name), size=300,
                     workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(model_save_file_name, binary=False)
def train_and_save(sents, output_file, options={}):
    print "Training model..."
    model = Word2Vec(sents, **options)
    model.save(output_file)
def __init__(self, loss='softmax', bucket=0, **kwargs):
    """
    Exactly as the parent class `Word2Vec
    <https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec>`_.
    Some parameter values are overwritten (e.g. sg=0 because we never use skip-gram here);
    look at the code for details. Argument names must be explicit!

    `loss` = one value in {ns, hs, softmax}. If "ns" is selected, negative sampling will be
    used as the loss function, together with the parameter `negative`. With "hs" hierarchical
    softmax will be used, while with "softmax" (default) the standard softmax function is used
    (the other two are "approximations"). The `hs` argument does not exist anymore.

    `bucket` is the maximum number of hashed words, i.e., we limit the feature space to this
    number, ergo we use the hashing trick in the word vocabulary. Defaults to 0: no hashing trick.

    It basically builds two vocabularies, one for the sample words and one for the labels,
    so that the input layer is only made of words, while the output layer is only made of labels.
    **Parent class methods that are not overridden here are not tested and not safe to use**.
    """
    self.lvocab = {}  # Vocabulary of labels only
    self.index2label = []
    kwargs['sg'] = 0
    kwargs['window'] = sys.maxsize
    kwargs['sentences'] = None
    kwargs['hashfxn'] = custom_hash  # Force a consistent function across different Python versions
    self.softmax = self.init_loss(kwargs, loss)
    self.bucket = bucket
    super(LabeledWord2Vec, self).__init__(**kwargs)
def train(self, sentences, total_words=None, word_count=0,
          total_examples=None, queue_factor=2, report_delay=1.0):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only
    generator stream). For Word2Vec, each sentence must be a list of unicode strings.
    (Subclasses may accept other examples.)

    To support linear learning-rate decay from (initial) alpha to min_alpha, either
    total_examples (count of sentences) or total_words (count of raw words in sentences)
    should be provided, unless the sentences are the same as those that were used to
    initially build the vocabulary.
    """
    if self.bucket > 0:
        sentences = HashIter(sentences, self.bucket, with_labels=True)
    if (self.model_trimmed_post_training):
        raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")
    if FAST_VERSION < 0:
        import warnings
        warnings.warn("C extension not loaded for Word2Vec, training will be slow. "
                      "Install a C compiler and reinstall gensim for fast training.")
    self.neg_labels = []
    if self.negative > 0:
        # precompute negative labels optimization for pure-python training
        self.neg_labels = zeros(self.negative + 1)
        self.neg_labels[0] = 1.
    return super(LabeledWord2Vec, self).train(sentences, total_words, word_count,
                                              total_examples, queue_factor, report_delay)
def load_from(cls, other_model):
    """
    Import data and parameter values from another model

    :param other_model: A ``LabeledWord2Vec`` object, or a ``Word2Vec`` or ``KeyedVectors`` object of Gensim
    """
    softmax = getattr(other_model, 'softmax', False)
    if softmax:
        loss = 'softmax'
    elif not other_model.hs and other_model.negative:
        loss = 'ns'
    else:
        loss = 'hs'
    new_model = LabeledWord2Vec(
        loss=loss,
        negative=other_model.negative if loss == 'ns' else 0,
        size=other_model.vector_size,
        seed=other_model.seed
    )
    new_model.reset_from(other_model)
    for attr in vars(other_model):
        if hasattr(new_model, attr):
            if not isinstance(other_model, LabeledWord2Vec) and (attr == 'syn1' or attr == 'syn1neg'):
                continue
            value = getattr(other_model, attr, getattr(new_model, attr))
            if isinstance(value, KeyedVectors):
                new_model.wv.syn0 = value.syn0
                new_model.wv.syn0norm = value.syn0norm
            else:
                setattr(new_model, attr, value)
    return new_model
def load_w2v(corpus, dictionary):
    '''
    Return the trained Word2Vec model
    Train a model if model doesn't exist yet
    :param corpus:
    :param dictionary:
    :return:
    '''
    if not os.path.isfile(W2V_MODEL_PATH):
        num_features = 300    # Word vector dimensionality
        min_word_count = 5    # Minimum word count
        num_workers = 5       # Number of threads to run in parallel
        window = 5            # Context window size
        downsampling = 1e-5   # Downsample setting for frequent words
        print("Training the word2vec model!")
        sents = get_review_sentences()
        # Initialize and train the model (this will take some time)
        model = models.Word2Vec(sents, workers=num_workers,
                                size=num_features, min_count=min_word_count,
                                window=window, sample=downsampling)
        # If you don't plan to train the model any further, calling
        # init_sims will make the model much more memory-efficient.
        model.init_sims(replace=True)
        # It can be helpful to create a meaningful model name and
        # save the model for later use. You can load it later using Word2Vec.load()
        model.save(W2V_MODEL_PATH)
        tfidf = models.Word2Vec(corpus)
        print('Word2vec model created!')
    print('Loading word2vec model')
    w2v = models.Word2Vec.load(W2V_MODEL_PATH)
    print('Loading word2vec model complished!')
    return w2v
def main():
    load_sequence('/home/beki/Documents/2nd Year/BD & DM Project/retail_dataset.csv')

    # split patterns to train_patterns and test_patterns
    train_patterns = np.random.choice(patterns, np.floor(len(patterns) * 0.8))
    test_patterns = np.random.choice(patterns, np.floor(len(patterns) * 0.2))

    # Word vector representation learning
    model = Word2Vec(train_patterns, size=15, window=3, min_count=1, workers=1,
                     iter=3, sample=1e-4, negative=20)

    # Test
    test_size = float(len(test_patterns))
    hit = 0.0
    for current_pattern in test_patterns:
        if len(current_pattern) < 2:
            test_size -= 1.0
            continue
        # Reduce the current pattern in the test set by removing the last item
        last_item = current_pattern.pop()
        # Keep those items in the reduced current pattern, which are also in the model's vocabulary
        items = [it for it in current_pattern if it in model.vocab]
        if len(items) <= 2:
            test_size -= 1.0
            continue
        # Predict the most similar items to items
        prediction = model.most_similar(positive=items)
        # Check if the item that we have removed from the test, last_item, is among
        # the predicted ones.
        for predicted_item, score in prediction:
            if predicted_item == last_item:
                hit += 1.0
        # print last_item
        # print prediction
    print 'Accuracy like measure: {}'.format(hit / test_size)
def learn_embeddings():
    '''
    Learn embeddings by optimizing the Skipgram objective using SGD.
    '''
    logging.info("Initializing creation of the representations...")
    walks = LineSentence('random_walks.txt')
    model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0,
                     hs=1, sg=1, workers=args.workers, iter=args.iter)
    model.wv.save_word2vec_format(args.output)
    logging.info("Representations created.")
    return
def make_word2vec():
    data_path = tv_classfication.tv_data_path
    sentence = data_work(data_path)
    model = Word2Vec(sentence, size=256, workers=4, window=10, iter=30)
    model.save(tv_classfication.word2vec_path)
def process(args):
    # Create a graph from the training set
    nodedict = graph.records_to_graph()

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)

    # Code Written for BI Project : Author : Himangshu Ranjan Borah(hborah)
    # Call the build_deepwalk_corpus function.
    # Take and populate the arguments from the command lines.
    generated_walks = graph.build_deepwalk_corpus(G=G, num_paths=args.number_walks,
                                                  path_length=args.walk_length,
                                                  alpha=0, rand=random.Random(0))

    # Call word2vec to build the model.
    # print generated_walks
    # The structure looks like ['32173', '32168'], ['124010', '22676'], ['17792', '72925'],
    model = Word2Vec(generated_walks, size=args.representation_size,
                     window=args.window_size, min_count=0, workers=args.workers)

    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        fin.next()
        groundtruth = [line.strip().split("\t")[:3] for line in fin]  # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    pr = [predict_rating(model, nodedict, "u" + g[0], "m" + g[1]) for g in groundtruth]
    print "MSE = %f" % mean_squared_error(tr, pr)
    print "accuracy = %f" % accuracy_score(tr, pr)
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print cm
def word2vec_train(input_file, output_file):
    sentences = word2vec.LineSentence(input_file)
    model = Word2Vec(sentences, size=300, min_count=10, sg=0,
                     workers=multiprocessing.cpu_count())
    model.save(output_file)
    model.save_word2vec_format(output_file + '.vector', binary=True)
def train():
    extract_sentece()
    in_path = './Data/corpus/sentence.txt'
    out_path = './Data/embedding/word2vec.bin'
    # train the skip-gram model
    model = Word2Vec(
        sg=1,
        sentences=LineSentence(in_path),
        size=256,
        window=5,
        min_count=3,
        workers=4,
        iter=40)
    model.wv.save_word2vec_format(out_path, binary=True)
def train_word2vec(self, min_count=10, size=100, window=5, workers=3):
    self.word2vec_model = Word2Vec(Word2vecCorpus(self.corpus_file),
                                   min_count=min_count, size=size,
                                   window=window, workers=workers)
def __init__(self, df, columns, model_param):
    self.df = df
    self.columns = columns
    self.model_param = model_param
    self.model = Word2Vec(sg=self.model_param["sg"],
                          hs=self.model_param["hs"],
                          alpha=self.model_param["alpha"],
                          min_alpha=self.model_param["alpha"],
                          min_count=self.model_param["min_count"],
                          size=self.model_param["size"],
                          sample=self.model_param["sample"],
                          window=self.model_param["window"],
                          workers=self.model_param["workers"])
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # unsupervised data
    hashtag_tweets = 'tweets/hashtag_tweets.gz'
    files = [hashtag_tweets]
    sentences = MySentences(files=files)
    model = models.Word2Vec(sentences, size=100, window=5, min_count=15, workers=8,
                            sg=1, sample=1e-5, hs=1)
    model.save_word2vec_format('embeddings/hashtag_tweets_embedding', binary=False)
def main(in_dir, out_loc, task=1, size=128, window=5, min_count=10, n_workers=4,
         hs=1, nr_iter=5):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    model = Word2Vec(
        sg=task,
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        hs=1,
        iter=nr_iter
    )
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with io.open(text_loc, 'r', encoding='utf8') as file_:
            try:
                text = file_.read()
            except UnicodeDecodeError:
                print(text_loc)
        total_sents += text.count('\n')
        total_words += corpus.count_doc(text.split())
        logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word types",
                    text_no, total_words, len(corpus.strings))
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for key, string in corpus.strings.items():
        model.raw_vocab[string] = corpus.counts[key]
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    # Trims down model
    model.init_sims(replace=True)
    model.save(out_loc)
def train(self, **kargs):
    self.config.update(kargs)
    self.model = _Word2Vec(list(self.database.sentences), **self.config)
    delattr(self, "database")
def train_rnas(seq_file='utrs.fa', outfile='rnadocEmbedding25.pickle'):
    min_count = 5
    dim = 50
    window = 5
    print('dim: ' + str(dim) + ', window: ' + str(window))
    seq_dict = read_fasta_file(seq_file)
    # text = seq_dict.values()
    tris = get_6_trids()
    sentences = []
    for seq in seq_dict.values():
        seq = seq.replace('T', 'U')
        bag_sen = []
        bag_seqs = split_overlap_seq(seq)
        for new_seq in bag_seqs:
            trvec = get_4_nucleotide_composition(tris, new_seq)
            bag_sen.append(trvec)
        # for aa in range(len(text)):
        sentences.append(bag_sen)
    # pdb.set_trace()
    print(len(sentences))
    model = None
    docs = train_tag_doc(sentences)
    # model = Word2Vec(sentences, min_count=min_count, size=dim, window=window, sg=1, iter=10, batch_words=100)
    # model = gensim.models.doc2vec.Doc2Vec(docs, size=50, window=300, min_count=min_count, workers=4)
    model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=min_count, iter=50)
    model.build_vocab(docs)
    model.train(docs)
    '''vocab = list(model.vocab.keys())
    print vocab
    fw = open('rna_doc_dict', 'w')
    for val in vocab:
        fw.write(val + '\n')
    fw.close()
    # print model.syn0
    # pdb.set_trace()
    embeddingWeights = np.empty([len(vocab), dim])
    for i in range(len(vocab)):
        embeddingWeights[i, :] = model[vocab[i]]
    allWeights.append(embeddingWeights)
    '''
    # model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])
    # with open(outfile, 'w') as f:
    #     pickle.dump(model, f)
    # store the model to mmap-able files
    pdb.set_trace()
    model.save(outfile)
    # load the model back
    # model_loaded = Doc2Vec.load(outfile)
def load_w2v(self):
    """
    Load Word2Vec embeddings from P2FA files and pre-trained Word2Vec KeyedVectors text
    file and store them in the directory path mentioned in self.embedding_dir.

    :returns segment wise feature dictionary for embeddings
    :Note: Do not provide KeyedVector file in binary format
    """
    from gensim.models.keyedvectors import KeyedVectors
    from gensim.models import Word2Vec

    is_binary = True if self.embed_model_type == "binary" else False
    model = KeyedVectors.load_word2vec_format(self.embed_model_path, binary=is_binary)
    print "Word2Vec model Loaded"
    self.embed_model = model
    self.embed_length = model.vector_size

    if not self.word_dict:
        self.load_words()

    features = {}
    system("mkdir -p " + self.embedding_dir)
    for video_id, video_word_data in self.word_dict.iteritems():
        video_feats = {}
        for segment_id, segment_word_data in video_word_data.iteritems():
            video_feats[segment_id] = []
            for word_feat in segment_word_data:
                start, end, word = word_feat
                try:
                    embed = self.embed_model[word]
                except:
                    embed = np.zeros(self.embed_length)
                video_feats[segment_id].append((start, end, embed))

            fname = video_id + "_" + segment_id + ".csv"
            fpath = join(self.embedding_dir, fname)
            with open(fpath, "wb") as fh:
                # Writing each feature in csv file for segment
                for f in video_feats[segment_id]:
                    f_start = str(f[0])
                    f_end = str(f[1])
                    f_val = [str(val) for val in f[2].tolist()]
                    str2write = ",".join([f_start, f_end] + f_val)
                    str2write += "\n"
                    fh.write(str2write)
        features[video_id] = video_feats
    return features
def makeFeature(df_features):
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print ('get sentence vector')
    model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    # model = KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False)
    # model = Word2Vec(brown.sents())
    df_features['vec1'] = df_features.q1_expand.map(lambda x: getVec(x, model))
    df_features['vec2'] = df_features.q2_expand.map(lambda x: getVec(x, model))

    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print ('get six kinds of coefficient about vector')
    df_features['f_cosine'] = df_features.apply(lambda x: Cosine(x['vec1'], x['vec2']), axis=1)
    df_features['f_manhatton'] = df_features.apply(lambda x: Manhatton(x['vec1'], x['vec2']), axis=1)
    df_features['f_euclidean'] = df_features.apply(lambda x: Euclidean(x['vec1'], x['vec2']), axis=1)
    df_features['f_pearson'] = df_features.apply(lambda x: PearsonSimilar(x['vec1'], x['vec2']), axis=1)
    df_features['f_spearman'] = df_features.apply(lambda x: SpearmanSimilar(x['vec1'], x['vec2']), axis=1)
    df_features['f_kendall'] = df_features.apply(lambda x: KendallSimilar(x['vec1'], x['vec2']), axis=1)

    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print ('get 3 kinds of coefficient about from w2c 2 document')
    df_features['f_cosine_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'], Cosine, model), axis=1)
    df_features['f_euclidean_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'], Euclidean, model), axis=1)
    df_features['f_manhatton_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'], Manhatton, model), axis=1)

    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print ('get three kinds of coefficient about nouns, verb, adj')
    df_features['f_raw_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1'], x['question2']), axis=1)
    df_features['f_raw_dice'] = df_features.apply(lambda x: Dice(x['question1'], x['question2']), axis=1)
    df_features['f_raw_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1'], x['question2']), axis=1)
    df_features['f_expand_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['q1_expand'], x['q2_expand']), axis=1)
    df_features['f_expand_dice'] = df_features.apply(lambda x: Dice(x['q1_expand'], x['q2_expand']), axis=1)
    df_features['f_expand_ochiai'] = df_features.apply(lambda x: Ochiai(x['q1_expand'], x['q2_expand']), axis=1)
    df_features['f_nouns_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_nouns'], x['question2_nouns']), axis=1)
    df_features['f_nouns_dice'] = df_features.apply(lambda x: Dice(x['question1_nouns'], x['question2_nouns']), axis=1)
    df_features['f_nouns_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_nouns'], x['question2_nouns']), axis=1)
    df_features['f_verbs_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_verbs'], x['question2_verbs']), axis=1)
    df_features['f_verbs_dice'] = df_features.apply(lambda x: Dice(x['question1_verbs'], x['question2_verbs']), axis=1)
    df_features['f_verbs_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_verbs'], x['question2_verbs']), axis=1)
    df_features['f_adjs_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_adjs'], x['question2_adjs']), axis=1)
    df_features['f_adjs_dice'] = df_features.apply(lambda x: Dice(x['question1_adjs'], x['question2_adjs']), axis=1)
    df_features['f_adjs_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_adjs'], x['question2_adjs']), axis=1)

    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print ('get weighted overlap about expand')
    weights = word_weights(df_features)
    df_features['f_weighted_overlap'] = df_features.apply(lambda x: weighted_Overlap(x['q1_expand'], x['q2_expand'], weights), axis=1)

    print('all done')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features.fillna(0.0)
    return df_features
def get_word_embeddings(num_dimensions=500, cache_loc=EMBEDDINGS_FILE):
    """Generates word embeddings.

    Args:
        num_dimensions: int, number of embedding dimensions.
        cache_loc: str, where to cache the word embeddings.

    Returns:
        numpy array representing the embeddings, with shape
        (NUM_TOKENS, num_dimensions).
    """
    if os.path.exists(cache_loc):
        embeddings = np.load(cache_loc)
    else:
        class SentenceGenerator(object):
            def __iter__(self):
                iterable = itertools.islice(iterate_qa_pairs(), 1000000)
                for i, (question, answer) in enumerate(iterable, 1):
                    q, a, _, _ = tokenize(question=question, answer=answer,
                                          use_pad=False, include_rev=False)
                    yield [str(w) for w in q]
                    yield [str(w) for w in a]
                    del q, a, w
                    if i % 1000 == 0:
                        sys.stderr.write('\rprocessed %d' % i)
                        sys.stderr.flush()
                sys.stderr.write('\rprocessed %d\n' % i)
                sys.stderr.flush()

        # The default embeddings.
        embeddings = np.random.normal(size=(NUM_TOKENS, num_dimensions))

        sentences = SentenceGenerator()
        model = models.Word2Vec(sentences, size=num_dimensions)
        word_vectors = model.wv
        del model

        # Puts the Word2Vec weights into the right order.
        weights = word_vectors.syn0
        vocab = word_vectors.vocab
        for k, v in vocab.items():
            embeddings[int(k)] = weights[v.index]

        with open(cache_loc, 'wb') as f:
            np.save(f, embeddings)

    assert embeddings.shape == (NUM_TOKENS, num_dimensions)
    return embeddings
def get_global_embeddings(self, filenames, embedding_size, embedding_dir):
    """ Construct the Embedding Matrix for the sentences in filenames.

    Args:
        filenames: File names of the training files, based on which the vocab will be built.
                   This is used when there are no pretrained embeddings present; instead of
                   using random embeddings, the Word2Vec algorithm is used to train the
                   embeddings on the available dataset.
        embedding_size: Dimensions for the embedding to be used.

    Returns:
        Embedding matrix.
    """
    sentences = []

    if (os.path.exists(embedding_dir + 'vocab_len.pkl')):
        vocab_len_stored = pickle.load(open(embedding_dir + "vocab_len.pkl"))
    else:
        vocab_len_stored = 0

    if (vocab_len_stored == self.len_vocab and os.path.exists(embedding_dir + "embeddings.pkl")):
        print ("Load file")
        self.embeddings = pickle.load(open(embedding_dir + "embeddings.pkl"))
        return None

    if (os.path.exists(embedding_dir + 'embeddings') == True):
        model = KeyedVectors.load_word2vec_format(embedding_dir + 'embeddings', binary=False)
        print ("Loading pretrained embeddings")
    else:
        for file in filenames:
            with open(file, 'rb') as f:
                for lines in f:
                    words = [lines.split()]
                    sentences.extend(words)
        model = Word2Vec(sentences, size=embedding_size, min_count=0)
        model.save(embedding_dir + 'embeddings')

    self.embeddings_model = model
    return model