Python gensim module: models() example source code

We extracted the following 24 code examples from open-source Python projects to illustrate how to use gensim.models().

Project: PPRE    Author: MaoYuwei    | project source code | file source code
def main():
    #signature()

    # sentences = TextLoader()
    # model = gensim.models.Word2Vec(sentences, workers=8)
    # model.save('word2vector.model')
    # print 'word2vec ok'

    # word2vec = Word2vec()
    # word2vec.BetweenToVec()

    # pro_pro()
    # OneHot()
    #
    # c = cluster()
    # c.Cluster(0.7, 'one_hot_vec.txt', '4_cluster.txt')

    negtive_bet_many()
Project: adaware-nlp    Author: mhw32    | project source code | file source code
def text_to_vector(sentence_list, MAX_SENTENCE=78, model=None):
    if model is None:
        model = models.Word2Vec.load_word2vec_format(
            local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'), binary=True)
    X = np.zeros((MAX_SENTENCE, len(sentence_list), 300))
    capitals = np.zeros((MAX_SENTENCE, len(sentence_list), 3))
    vectorize = lambda x: model[x] if x in model else np.zeros(300)
    mask = []
    for i, sentence in enumerate(sentence_list):
        for j, word in enumerate(sentence):
            if j == MAX_SENTENCE:
                j -= 1
                break
            X[j][i] = vectorize(word)
            capitals[j][i] = cap_vector(word)
        mask.append(j + 1)
    mask = np.array(mask)
    return X, capitals, mask
Project: NETL-Automatic-Topic-Labelling-    Author: sb1992    | project source code | file source code
def get_word(word):
    inst = re.search(r"_\(([A-Za-z0-9_]+)\)", word)

    if inst is None:
        length = len(word.split("_"))
        if length < 5:
            return True, word
    else:
        if inst.group(1) != "disambiguation":
            word2 = re.sub(r'_\(.+\)', '', word)
            if len(word2.split(" ")) < 5:
                return True, word

    return False, word

# Load the trained doc2vec and word2vec models.
Project: CNN-Text-Pairs-Classification    Author: RandolphVI    | project source code | file source code
def create_word2vec_model(embedding_size, input_file=TEXT_DIR):
    """
    Create the word2vec model based on the given embedding size and the corpus file.
    :param embedding_size: The embedding size
    :param input_file: The corpus file
    """
    word2vec_file = 'word2vec_' + str(embedding_size) + '.model'

    if os.path.isfile(word2vec_file):
        logging.info('The word2vec model you want to create already exists!')
    else:
        sentences = word2vec.LineSentence(input_file)
        # sg=0 means use CBOW model(default); sg=1 means use skip-gram model.
        model = gensim.models.Word2Vec(sentences, size=embedding_size, min_count=0,
                                       sg=0, workers=multiprocessing.cpu_count())
        model.save(word2vec_file)
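For reference, the core call this helper wraps is just gensim's Word2Vec trained on a LineSentence stream. A minimal standalone sketch using the same gensim 3.x-style API; the corpus path and vector size here are illustrative only:

import multiprocessing
from gensim.models import word2vec

# corpus.txt is assumed to hold one whitespace-tokenized sentence per line.
sentences = word2vec.LineSentence('corpus.txt')
model = word2vec.Word2Vec(sentences, size=128, min_count=0, sg=0,
                          workers=multiprocessing.cpu_count())
model.save('word2vec_128.model')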
Project: CNN-Text-Pairs-Classification    Author: RandolphVI    | project source code | file source code
def load_word2vec_matrix(vocab_size, embedding_size):
    """
    Return the word2vec model matrix.
    :param vocab_size: The vocab size of the word2vec model file
    :param embedding_size: The embedding size
    :return: The word2vec model matrix
    """
    word2vec_file = 'word2vec_' + str(embedding_size) + '.model'

    if os.path.isfile(word2vec_file):
        model = gensim.models.Word2Vec.load(word2vec_file)
        vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
        vector = np.zeros([vocab_size, embedding_size])
        for key, value in vocab.items():
            if len(key) > 0:
                vector[value] = model[key]
        return vector
    else:
        logging.info("? The word2vec file doesn't exist. "
                     "Please use function <create_vocab_size(embedding_size)> to create it!")
Project: contextual-advertising-deploy    Author: andreicnica    | project source code | file source code
def __init__(self, classes=None, classesFile=None,
                 classesClusterPath=None,
                 modelPath="dataset/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin", modelBinary=True):

        if not(classes) and not(classesFile):
            print "ERROR MUST LOAD CLASS FILE"
            return 202

        if not(classes):
            classes = self.load_adv_keyterms_from_file(classesFile)

        #load cluster
        if classesClusterPath:
            self.classesClusters = load_cluster_dataset(classesClusterPath)
        else:
            #process cluster from classes
            #TODO
            self.classesClusters = None

        self.model = gensim.models.Word2Vec.load_word2vec_format(modelPath, binary=modelBinary)
        self._preProcessClasses(classes)
Project: MusicTaster    Author: JayveeHe    | project source code | file source code
def train_artistsong2vec_model(fout_path, input_datas=None, data_path=None,
                               min_count=5, sorted_vocab=1, window=10,
                               size=250,
                               iter_n=50):
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    full_data = []
    for i in input_datas:
        tmp = []
        for j in i:
            tmp.append(j[0])
            tmp.append(j[1])
        full_data.append(tmp)
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(full_data, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'
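A hedged call sketch, assuming the function above and its project imports (gensim, pickle, data_process_logger) are available; the listening sessions of (artist, song) pairs and the hyperparameters are toy values:

sessions = [[('Radiohead', 'Creep'), ('Radiohead', 'Karma Police')],
            [('Daft Punk', 'One More Time'), ('Daft Punk', 'Around the World')]]
train_artistsong2vec_model('artist_song_w2v.pkl', input_datas=sessions,
                           min_count=1, size=32, iter_n=5)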
Project: PPRE    Author: MaoYuwei    | project source code | file source code
def main():
    # sentences = TextLoader()
    # model = gensim.models.Word2Vec(sentences, sg=1, min_count=5, size=50, workers=8, window=5)
    # model.save('word2vector.model')
    # print 'word2vec ok'
    #
    # pro_cluster('error.txt', 'error_cluster_word2vec.txt', 0.2)
    all_cluster()
Project: PPRE    Author: MaoYuwei    | project source code | file source code
def main():

    sentences = TextLoader()
    model = gensim.models.Word2Vec(sentences, sg=1, min_count=5, size=50, workers=8, window=2)
    model.save('word2vector.model')
    print 'word2vec ok'

    # word2vec = Word2vec()
    # word2vec.BetweenToVec()
Project: adventures-in-ml-code    Author: adventuresinML    | project source code | file source code
def create_embedding_matrix(model):
    # convert the wv word vectors into a numpy matrix that is suitable for insertion
    # into our TensorFlow and Keras models
    embedding_matrix = np.zeros((len(model.wv.vocab), vector_dim))
    for i in range(len(model.wv.vocab)):
        embedding_vector = model.wv[model.wv.index2word[i]]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
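Per the comment above, the matrix is meant to seed an embedding layer. A hedged Keras sketch with a stand-in matrix; in practice the output of create_embedding_matrix(model) would be used, with vector_dim matching the model's vector size:

import numpy as np
from tensorflow.keras.layers import Embedding

embedding_matrix = np.zeros((10000, 300))  # stand-in for create_embedding_matrix(model)
# Frozen embedding layer seeded with the pretrained word2vec weights.
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            trainable=False)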
Project: rna_protein_binding    Author: wentaozhu    | project source code | file source code
def train_tag_doc(doc1):
    docs = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for i, text in enumerate(doc1):
        words = text.lower().split()
        tags = [i]
        docs.append(analyzedDocument(words, tags))
        #docs.append(gensim.models.doc2vec.TaggedDocument(words, [i]))
    return docs
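The commented-out line shows the equivalent built from gensim's own TaggedDocument; a hedged sketch of that variant on toy input:

from gensim.models.doc2vec import TaggedDocument

docs = [TaggedDocument(words=text.lower().split(), tags=[i])
        for i, text in enumerate(['GCAUGC CAUGCA AUGCAU', 'UUUAAA UUAAAC'])]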
Project: FunFacts    Author: DMTsurel    | project source code | file source code
def loadIndexes(self):

        if self.model or self.idfIndex:
            return 
        #global model
        print "loading word2vec model"
        self.model = gensim.models.KeyedVectors.load_word2vec_format("models/GoogleNews-vectors-negative300.bin", binary=True)  # C binary format
        print "done"
        #model = gensim.models.Word2Vec.load_word2vec_format("models/glove_model.txt", binary=False)  # C text format

        #global idfIndex
        print "loading idfIndex model"
        self.idfIndex = indexManager.getIndex("plainIdfIndex.txt")
        print "done"
        #return (model, idfIndex)
Project: FunFacts    Author: DMTsurel    | project source code | file source code
def test2(self):
        topWords = self.getTopTfIdfTerms("Jerusalem")
        for word in topWords:
            #print word, idfIndex[word] if word in idfIndex else 1.5
            try:
                print self.model.most_similar(positive=[word], topn=10)
            except:
                print "word not in vocabulary"


#print model.accuracy(r"C:\Users\David\workspace\Wiki\gitWiki\questions-words.txt")

#model = word2vec.Word2Vec(sentences)
#model = word2vec.Word2Vec.load_word2vec_format("C:\Users\David\workspace\Wiki\gitWiki\text8-queen", binary=False)
#model = gensim.models.Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False)
Project: MusicTaster    Author: JayveeHe    | project source code | file source code
def train_song2vec_model(fout_path, input_datas=None, data_path=None, min_count=5, sorted_vocab=1, window=10, size=250,
                         iter_n=50):
    """
    Train the song2vec model.
    Args:
        fout_path:
        input_datas:
        data_path:
        min_count:
        sorted_vocab:
        window:
        size:
        iter_n:

    Returns:

    """
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    data_process_logger.info('start training')
    random.shuffle(input_datas)
    input_datas = input_datas[:45000]
    wv_model = gensim.models.Word2Vec(input_datas, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'
Project: MusicTaster    Author: JayveeHe    | project source code | file source code
def train_artist2vec_model(fout_path, input_datas=None, data_path=None, min_count=5, sorted_vocab=1, window=10,
                           size=250,
                           iter_n=50):
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(input_datas, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'
Project: adaware-nlp    Author: mhw32    | project source code | file source code
def load(self):
        # disambiguator params
        print('[{}] Loading <disambiguator_weights>'.format(str(datetime.now())))
        disambiguator_weights = np.load(local_ref('../storage/sentence_disambiguation/trained_weights.npy'))
        print('[{}] Loading <disambiguator_tag_counts>'.format(str(datetime.now())))
        with open(local_ref('../storage/sentence_disambiguation/brown_tag_distribution.pkl')) as fp:
            disambiguator_tag_counts = cPickle.load(fp)
        print('[{}] Loading <disambiguator_tag_order>'.format(str(datetime.now())))
        with open(local_ref('../storage/sentence_disambiguation/brown_tag_order.pkl')) as fp:
            disambiguator_tag_order = cPickle.load(fp)

        # glove embedding params
        print('[{}] Loading <embedder_weights>'.format(str(datetime.now())))
        embedder_weights = np.load(local_ref('../storage/word_embedding/glove_weights_300d.npy'))
        print('[{}] Loading <embedder_vocab>'.format(str(datetime.now())))
        with open(local_ref('../storage/word_embedding/glove_vocab_300d.pkl')) as fp:
            embedder_vocab = cPickle.load(fp)

        # part-of-speech params
        print('[{}] Loading <pos_tagger_weights>'.format(str(datetime.now())))
        pos_tagger_weights = dict(np.load(local_ref('../storage/pos_tagger/pos_trained_weights.npz')))
        print('[{}] Loading <wordvec_model>'.format(str(datetime.now())))
        wordvec_model = models.Word2Vec.load_word2vec_format(
            local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'), binary=True)

        # NER params
        print('[{}] Loading <ner_gen_params>'.format(str(datetime.now())))
        with open(local_ref('../storage/ner/gen_params_set.pkl')) as fp:
            ner_gen_params = cPickle.load(fp)
        print('[{}] Loading <ner_nn_params>'.format(str(datetime.now())))
        with open(local_ref('../storage/ner/nn_params_set.dill')) as fp:
            ner_nn_params = dill.load(fp)

        # stanford dep parser params
        print('[{}] Loading <dep_path_to_jar>'.format(str(datetime.now())))
        dep_path_to_jar = local_ref('../storage/dependency_parsing/stanford-parser.jar')
        print('[{}] Loading <dep_path_to_models_jar>'.format(str(datetime.now())))
        dep_path_to_models_jar = local_ref('../storage/dependency_parsing/stanford-parser-3.5.2-models.jar')

        self.bank['disambiguator_weights'] = disambiguator_weights
        self.bank['disambiguator_tag_counts'] = disambiguator_tag_counts
        self.bank['disambiguator_tag_order'] = disambiguator_tag_order
        self.bank['embedder_weights'] = embedder_weights
        self.bank['embedder_vocab'] = embedder_vocab
        self.bank['pos_tagger_weights'] = pos_tagger_weights
        self.bank['wordvec_model'] = wordvec_model
        self.bank['ner_gen_params'] = ner_gen_params
        self.bank['ner_nn_params'] = ner_nn_params
        self.bank['dep_path_to_jar'] = dep_path_to_jar
        self.bank['dep_path_to_models_jar'] = dep_path_to_models_jar
Project: adaware-nlp    Author: mhw32    | project source code | file source code
def gen_dataset(sentences,
                categories,
                max_words=78,
                train_test_split=True):
    ''' Generate a dataset of (input, output) pairs where the
        input is an embedded vector and output the category (one-hotted)

        Args
        ----
        sentences : list
                    list of sentences where each sentence is list of tokens
        max_words : integer
                    maximum number of words allowed in sentence
        train_test_split : boolean
                           whether to split data into 2 sets
    '''

    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'),
        binary=True)
    vectorizer = lambda x: model[x] if x in model else np.zeros(300)
    encoder = one_hot_encoding(categories)

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, len(encoder.keys())))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {}
    param_dict['max_words'] = max_words
    param_dict['encoder'] = encoder

    for sent_i in I:
        words = sentences[sent_i]
        cats = categories[sent_i]

        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))

        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, categories=cats,
                                    vectorizer=vectorizer,
                                    encoder=encoder,
                                    max_words=max_words)

        K[sent_i] = len(words)  # keep track of num words in sentence

    if train_test_split:
        (X_train, X_test), (I_train, I_test) = util.split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]

        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    return (X, y, K), param_dict
Project: adaware-nlp    Author: mhw32    | project source code | file source code
def gen_dataset(sentences,
                max_words=78,
                train_test_split=True):
    ''' Generate a dataset of (input, output) pairs where the
        input is an embedded word vector and the output is the embedding of the lemmatized word

        Args
        ----
        sentences : list
                    list of sentences where each sentence is list of tokens
        max_words : integer
                    maximum number of words allowed in sentence
        train_test_split : boolean
                           whether to split data into 2 sets
    '''

    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        '../storage/pos_tagger/GoogleNews-vectors-negative300.bin',
        binary=True)
    vectorizer = lambda x: model[x] if x in model else np.ones(300)*ZERO_EPSILON
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatizer = lambda x:  wordnet_lemmatizer.lemmatize(x)

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {}
    param_dict['max_words'] = max_words

    for sent_i in I:
        words = sentences[sent_i]

        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))

        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, vectorizer=vectorizer,
                                    lemmatizer=lemmatizer,
                                    max_words=max_words)

        K[sent_i] = len(words)  # keep track of num words in sentence

    if train_test_split:
        (X_train, X_test), (I_train, I_test) = util.split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]

        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    return (X, y, K), param_dict
Project: thesis    Author: jonvet    | project source code | file source code
def _expand_vocabulary(skip_thoughts_emb, skip_thoughts_vocab, word2vec):

    # Find words shared between the two vocabularies.
    print("Finding shared words")
    shared_words = [w for w in word2vec.vocab if w in skip_thoughts_vocab]

    # Select embedding vectors for shared words.
    print("Selecting embeddings for %d shared words" % len(shared_words))
    shared_st_emb = skip_thoughts_emb[[
        skip_thoughts_vocab[w] for w in shared_words]]
    shared_w2v_emb = word2vec[shared_words]

    # Train a linear regression model on the shared embedding vectors.
    print("Training linear regression model")
    model = sklearn.linear_model.LinearRegression()
    model.fit(shared_w2v_emb, shared_st_emb)

    # Create the expanded vocabulary.
    print("Creating embeddings for expanded vocabulary")
    embedding_map = collections.OrderedDict()
    print('Length of word2vec vocabulary: %d\n' % len(word2vec.vocab))
    for i, w in enumerate(word2vec.vocab):
        print('\rEmbedding %d' % (i + 1), end='   ')
        # Ignore words with underscores (spaces).
        if "_" not in w:
            w_emb = model.predict(word2vec[w].reshape(1, -1))
            embedding_map[w] = w_emb.reshape(-1)

    for w in skip_thoughts_vocab:
        embedding_map[w] = skip_thoughts_emb[skip_thoughts_vocab[w]]

    print("Created expanded vocabulary of %d words", len(embedding_map))

    expanded_vocab = {}
    expanded_embeddings = np.zeros([len(embedding_map), paras.embedding_size])

    for i, w in enumerate(embedding_map.keys()):
        expanded_vocab[w] = i
        expanded_embeddings[i,:] = embedding_map[w]

    print('Saving expanded vocab and embeddings')
    with open(path + 'expanded_vocab.pkl', 'wb') as f:
        pkl.dump(expanded_vocab, f)

    embeddings_file = os.path.join(path, "expanded_embeddings.npy")
    np.save(embeddings_file, expanded_embeddings)

    return expanded_vocab, expanded_embeddings

# path = '../models/toronto_n5/'
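The heart of the expansion trick is the linear map fitted from the word2vec space into the skip-thought embedding space; a toy sketch of just that step (all shapes and data below are illustrative):

import numpy as np
import sklearn.linear_model

rng = np.random.RandomState(0)
shared_w2v_emb = rng.randn(1000, 300)   # word2vec vectors for the shared words
shared_st_emb = rng.randn(1000, 620)    # skip-thought vectors for the same words

reg = sklearn.linear_model.LinearRegression()
reg.fit(shared_w2v_emb, shared_st_emb)

# A word known only to word2vec can now be projected into the skip-thought space.
new_st_vec = reg.predict(rng.randn(1, 300)).reshape(-1)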
Project: rna_protein_binding    Author: wentaozhu    | project source code | file source code
def train_rnas(seq_file = 'utrs.fa', outfile= 'rnadocEmbedding25.pickle'):
    min_count = 5
    dim = 50
    window = 5

    print('dim: ' + str(dim) + ', window: ' + str(window))
    seq_dict = read_fasta_file(seq_file)

    #text = seq_dict.values()
    tris = get_6_trids()
    sentences = []
    for seq in seq_dict.values():
        seq = seq.replace('T', 'U')
        bag_sen = []
        bag_seqs = split_overlap_seq(seq)
        for new_seq in bag_seqs:
            trvec = get_4_nucleotide_composition(tris, new_seq)
            bag_sen.append(trvec)
        #for aa in range(len(text)):
        sentences.append(bag_sen)
    #pdb.set_trace()
    print(len(sentences))
    model = None
    docs = train_tag_doc(sentences)
    #model = Word2Vec(sentences, min_count=min_count, size=dim, window=window, sg=1, iter = 10, batch_words=100)
    #model =  gensim.models.doc2vec.Doc2Vec(docs, size = 50, window = 300, min_count = min_count, workers = 4)
    model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=min_count, iter=50)
    model.build_vocab(docs)
    model.train(docs)
    '''vocab = list(model.vocab.keys())
    print vocab
    fw = open('rna_doc_dict', 'w')
    for val in vocab:
        fw.write(val + '\n')
    fw.close()
    #print model.syn0
    #pdb.set_trace()
    embeddingWeights = np.empty([len(vocab), dim])

    for i in range(len(vocab)):
        embeddingWeights[i,:] = model[vocab[i]]  

    allWeights.append(embeddingWeights)

    '''
    #model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])
    #with open(outfile, 'w') as f:
    #    pickle.dump(model, f)
    # store the model to mmap-able files
    pdb.set_trace()
    model.save(outfile)
    # load the model back
    #model_loaded = Doc2Vec.load(outfile)
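A hedged sketch of reloading the saved model and inferring a vector for a new k-mer sentence, mirroring the commented-out lines above (old-style gensim Doc2Vec API; the tokens are illustrative):

from gensim.models.doc2vec import Doc2Vec

model_loaded = Doc2Vec.load('rnadocEmbedding25.pickle')
vec = model_loaded.infer_vector(['GCAUGC', 'CAUGCA', 'AUGCAU'])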
Project: conll16st-hd-sdp    Author: tbmihailov    | project source code | file source code
def filter_items_train_classifier_and_save_model_logreg(classifier_name, class_mapping_curr, relation_type,
                                                                    train_x, train_y_txt,
                                                                    train_y_relation_types, save_model_file):


        """
        Filters items by given params, trains the classifier and saves the word2vec_model to a file.
        Args:
            classifier_name: Name of the classifier used for saving the models
            class_mapping_curr: Class mapping to map train_y_txt to int. Filters items
            relation_type: 1 Explicit, 0 Non Explicit, Filters items with this relation type only
            train_x: Train samples
            train_y_txt: Train sample classes - Text class that will be filtered using class_mapping_curr dict
            train_y_relation_types: Train type indicators if sample is explicit or implicit.
            Only items with relation_type will be used for training
            save_model_file: Name of the file in which the word2vec_model will be saved
        Returns:
            Filters items and trains classifier
        """
        logging.info('======[%s] - filter_items_train_classifier_and_save_model_logreg======' % classifier_name)

        train_x_curr = []
        train_y_curr = []

        # Filtering items
        logging.info('Filtering %s items...' % len(train_x))
        start = time.time()
        for i in range(0, len(train_x)):
            if train_y_txt[i] in class_mapping_curr and train_y_relation_types[i] == relation_type:
                train_x_curr.append(train_x[i])
                train_y_curr.append(class_mapping_curr[train_y_txt[i]])
        end = time.time()
        logging.info("Done in %s s" % (end - start))

        # Training
        # Classifier params
        classifier_current = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                                 degree=3, gamma='auto', kernel='rbf',
                                 max_iter=-1, probability=False, random_state=None, shrinking=True,
                                 tol=0.001, verbose=False)
        print 'Classifier:\n%s' % classifier_current

        start = time.time()
        logging.info('Training with %s items...' % len(train_x_curr))
        classifier_current.fit(train_x_curr, train_y_curr)
        end = time.time()
        logging.info("Done in %s s" % (end - start))

        # Saving word2vec_model
        pickle.dump(classifier_current, open(save_model_file, 'wb'))
        logging.info('Model saved to %s' % save_model_file)
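A hedged call sketch with toy feature vectors, assuming the function above is in scope along with its imports (SVC, logging, time, pickle); the class mapping, data, and file name are illustrative:

class_mapping = {'Comparison.Contrast': 0, 'Expansion.Conjunction': 1}
train_x = [[0.1, 0.2], [0.3, 0.1], [0.0, 0.5], [0.4, 0.4]]
train_y_txt = ['Comparison.Contrast', 'Expansion.Conjunction',
               'Comparison.Contrast', 'Expansion.Conjunction']
train_y_relation_types = [1, 1, 1, 1]  # 1 = Explicit relations only

filter_items_train_classifier_and_save_model_logreg(
    'svc_explicit', class_mapping, 1,
    train_x, train_y_txt, train_y_relation_types, 'svc_explicit.model')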
Project: conll16st-hd-sdp    Author: tbmihailov    | project source code | file source code
def filter_items_train_classifier_and_save_model_logreg(classifier_name, class_mapping_curr, relation_type,
                                                                    train_x, train_y_txt,
                                                                    train_y_relation_types, save_model_file):


        """
        Filters items by given params, trains the classifier and saves the word2vec_model to a file.
        Args:
            classifier_name: Name of the classifier used for saving the models
            class_mapping_curr: Class mapping to map train_y_txt to int. Filters items
            relation_type: 1 Explicit, 0 Non Explicit, Filters items with this relation type only
            train_x: Train samples
            train_y_txt: Train sample classes - Text class that will be filtered using class_mapping_curr dict
            train_y_relation_types: Train type indicators if sample is explicit or implicit.
            Only items with relation_type will be used for training
            save_model_file: Name of the file in which the word2vec_model will be saved
        Returns:
            Filters items and trains classifier
        """
        logging.info('======[%s] - filter_items_train_classifier_and_save_model_logreg======' % classifier_name)

        train_x_curr = []
        train_y_curr = []

        # Filtering items
        logging.info('Filtering %s items...' % len(train_x))
        start = time.time()
        for i in range(0, len(train_x)):
            if train_y_txt[i] in class_mapping_curr and train_y_relation_types[i] == relation_type:
                train_x_curr.append(train_x[i])
                train_y_curr.append(class_mapping_curr[train_y_txt[i]])
        end = time.time()
        logging.info("Done in %s s" % (end - start))

        # Training
        # Classifier params
        #classifier_current = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
        #                         degree=3, gamma='auto', kernel='rbf',
        #                         max_iter=-1, probability=False, random_state=None, shrinking=True,
        #                         tol=0.001, verbose=False)

        param_c = 0.1
        classifier_current = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=param_c, fit_intercept=True,
                                                intercept_scaling=1, class_weight=None, random_state=None,
                                                solver='liblinear',
                                                max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=8)
        print 'Classifier:\n%s' % classifier_current

        start = time.time()
        logging.info('Training with %s items...' % len(train_x_curr))
        classifier_current.fit(train_x_curr, train_y_curr)
        end = time.time()
        logging.info("Done in %s s" % (end - start))

        # Saving word2vec_model
        pickle.dump(classifier_current, open(save_model_file, 'wb'))
        logging.info('Model saved to %s' % save_model_file)
Project: conll16st-hd-sdp    Author: tbmihailov    | project source code | file source code
def filter_items_train_classifier_and_save_model_logreg(classifier_name, class_mapping_curr, relation_type,
                                                                    train_x, train_y_txt,
                                                                    train_y_relation_types, save_model_file):


        """
        Filters items by given params, trains the classifier and saves the word2vec_model to a file.
        Args:
            classifier_name: Name of the classifier used for saving the models
            class_mapping_curr: Class mapping to map train_y_txt to int. Filters items
            relation_type: 1 Explicit, 0 Non Explicit, Filters items with this relation type only
            train_x: Train samples
            train_y_txt: Train sample classes - Text class that will be filtered using class_mapping_curr dict
            train_y_relation_types: Train type indicators if sample is explicit or implicit.
            Only items with relation_type will be used for training
            save_model_file: Name of the file in which the word2vec_model will be saved
        Returns:
            Filters items and trains classifier
        """
        logging.info('======[%s] - filter_items_train_classifier_and_save_model_logreg======' % classifier_name)

        train_x_curr = []
        train_y_curr = []

        # Filtering items
        logging.info('Filtering %s items...' % len(train_x))
        start = time.time()
        for i in range(0, len(train_x)):
            if train_y_txt[i] in class_mapping_curr and train_y_relation_types[i] == relation_type:
                train_x_curr.append(train_x[i])
                train_y_curr.append(class_mapping_curr[train_y_txt[i]])
        end = time.time()
        logging.info("Done in %s s" % (end - start))

        # Training
        # Classifier params
        classifier_current = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                                 degree=3, gamma='auto', kernel='rbf',
                                 max_iter=-1, probability=False, random_state=None, shrinking=True,
                                 tol=0.001, verbose=False)
        print 'Classifier:\n%s' % classifier_current

        start = time.time()
        logging.info('Training with %s items...' % len(train_x_curr))
        classifier_current.fit(train_x_curr, train_y_curr)
        end = time.time()
        logging.info("Done in %s s" % (end - start))

        # Saving word2vec_model
        pickle.dump(classifier_current, open(save_model_file, 'wb'))
        logging.info('Model saved to %s' % save_model_file)
Project: conll16st-hd-sdp    Author: tbmihailov    | project source code | file source code
def filter_items_train_classifier_and_save_model_logreg(classifier_name, class_mapping_curr, relation_type,
                                                                    train_x, train_y_txt,
                                                                    train_y_relation_types, save_model_file):


        """
        Filters items by given params, trains the classifier and saves the word2vec_model to a file.
        Args:
            classifier_name: Name of the classifier used for saving the models
            class_mapping_curr: Class mapping to map train_y_txt to int. Filters items
            relation_type: 1 Explicit, 0 Non Explicit, Filters items with this relation type only
            train_x: Train samples
            train_y_txt: Train sample classes - Text class that will be filtered using class_mapping_curr dict
            train_y_relation_types: Train type indicators if sample is explicit or implicit.
            Only items with relation_type will be used for training
            save_model_file: Name of the file in which the word2vec_model will be saved
        Returns:
            Filters items and trains classifier
        """
        logging.info('======[%s] - filter_items_train_classifier_and_save_model_logreg======' % classifier_name)

        train_x_curr = []
        train_y_curr = []

        # Filtering items
        logging.info('Filtering %s items...' % len(train_x))
        start = time.time()
        for i in range(0, len(train_x)):
            if train_y_txt[i] in class_mapping_curr and train_y_relation_types[i] == relation_type:
                train_x_curr.append(train_x[i])
                train_y_curr.append(class_mapping_curr[train_y_txt[i]])
        end = time.time()
        logging.info("Done in %s s" % (end - start))

        # Training
        # Classifier params
        classifier_current = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                                 degree=3, gamma='auto', kernel='rbf',
                                 max_iter=-1, probability=False, random_state=None, shrinking=True,
                                 tol=0.001, verbose=False)
        print 'Classifier:\n%s' % classifier_current

        start = time.time()
        logging.info('Training with %s items...' % len(train_x_curr))
        classifier_current.fit(train_x_curr, train_y_curr)
        end = time.time()
        logging.info("Done in %s s" % (end - start))

        # Saving word2vec_model
        pickle.dump(classifier_current, open(save_model_file, 'wb'))
        logging.info('Model saved to %s' % save_model_file)