Python sklearn.preprocessing module: normalize() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use sklearn.preprocessing.normalize().
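As a quick orientation before the project examples, here is a minimal sketch of normalize() itself; the input values below are made up purely for illustration.

import numpy as np
from sklearn.preprocessing import normalize

X = np.array([[3.0, 4.0],
              [1.0, 2.0]])

# Row-wise (axis=1, the default) l2 normalization: every row gets unit length.
X_l2 = normalize(X, norm='l2', axis=1)
print(np.linalg.norm(X_l2, axis=1))  # [1. 1.]

# Column-wise l1 normalization: every column sums to 1.
X_l1 = normalize(X, norm='l1', axis=0)
print(X_l1.sum(axis=0))              # [1. 1.]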

Project: visually-grounded-speech    Author: gchrupala
def test_homonym(H, sent, features, C=1.0):
    X_0 = features(matching(sent, H[0]))
    X_1 = features(matching(sent, H[1]))
    y_0 = numpy.zeros(len(X_0))
    y_1 = numpy.ones(len(X_1))
    X = normalize(numpy.vstack([X_0, X_1]), norm='l2')
    y = numpy.hstack([y_0, y_1])
    classifier = LogisticRegression(C=C)
    fold = StratifiedKFold(y, n_folds=10)
    score = []
    count = []
    for tr, te in fold:
        X_tr, X_te = X[tr], X[te]
        y_tr, y_te = y[tr], y[te]
        classifier.fit(X_tr, y_tr)
        score.append(sum(classifier.predict(X_te) == y_te))
        count.append(len(y_te))
    score = numpy.array(score, dtype='float')
    count = numpy.array(count, dtype='float')
    result = {'word1_count': len(y_0),
              'word2_count': len(y_1),
              'majority': 1.0 * max(len(y_0),len(y_1))/len(y),
              'kfold_acc': score/count }
    return result
Project: Flavor-Network    Author: lingcheng99
def make_tfidf(arr):
    '''Input: numpy array of flavor-compound counts for each recipe.
    Returns: numpy array reweighted with tf-idf.
    '''
    arr2 = arr.copy()
    N=arr2.shape[0]
    l2_rows = np.sqrt(np.sum(arr2**2, axis=1)).reshape(N, 1)
    l2_rows[l2_rows==0]=1
    arr2_norm = arr2/l2_rows

    arr2_freq = np.sum(arr2_norm>0, axis=0)
    arr2_idf = np.log(float(N+1) / (1.0 + arr2_freq)) + 1.0

    from sklearn.preprocessing import normalize
    tfidf = np.multiply(arr2_norm, arr2_idf)
    tfidf = normalize(tfidf, norm='l2', axis=1)
    print tfidf.shape
    return tfidf
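A small usage sketch (not part of the original project) for make_tfidf above; the recipe-by-compound counts are made up, and only the shape and non-negativity of the input matter.

import numpy as np

counts = np.array([[2, 0, 1, 0],
                   [0, 3, 0, 0],
                   [1, 1, 0, 4]], dtype=float)  # hypothetical counts

tfidf = make_tfidf(counts)             # rows come back l2-normalized
print(np.linalg.norm(tfidf, axis=1))   # approximately [1. 1. 1.]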
Project: Flavor-Network    Author: lingcheng99
def flavor_profile(df,ingr,comp,ingr_comp):
    sorted_ingredients = df.columns
    underscore_ingredients=[]
    for item in sorted_ingredients:
        underscore_ingredients.append(item.replace(' ','_'))

    print len(underscore_ingredients), len(sorted_ingredients)

    ingr_total = ingr_comp.join(ingr,how='right',on='# ingredient id')
    ingr_total = ingr_total.join(comp,how='right',on='compound id')

    ingr_pivot = pd.crosstab(ingr_total['ingredient name'],ingr_total['compound id'])
    ingr_flavor = ingr_pivot[ingr_pivot.index.isin(underscore_ingredients)]

    df_flavor = df.values.dot(ingr_flavor.values)
    print df.shape, df_flavor.shape

    return df_flavor

#normalize flavor matrix with tfidf method
Project: Flavor-Network    Author: lingcheng99
def make_tfidf(arr):
    '''Input: numpy array of flavor-compound counts for each recipe.
    Returns: numpy array reweighted with tf-idf.
    '''
    arr2 = arr.copy()
    N=arr2.shape[0]
    l2_rows = np.sqrt(np.sum(arr2**2, axis=1)).reshape(N, 1)
    l2_rows[l2_rows==0]=1
    arr2_norm = arr2/l2_rows

    arr2_freq = np.sum(arr2_norm>0, axis=0)
    arr2_idf = np.log(float(N+1) / (1.0 + arr2_freq)) + 1.0

    from sklearn.preprocessing import normalize
    tfidf = np.multiply(arr2_norm, arr2_idf)
    tfidf = normalize(tfidf, norm='l2', axis=1)
    print tfidf.shape
    return tfidf
Project: histwords    Author: williamleif
def __init__(self, path, words=[], dim=300, normalize=True, **kwargs):
        seen = []
        vs = {}
        for line in open(path):
            split = line.split()
            w = split[0]
            if words == [] or w in words:
                if len(split) != dim+1:
                    continue
                seen.append(w)
                vs[w] = np.array(map(float, split[1:]), dtype='float32')
        self.iw = seen
        self.wi = {w:i for i,w in enumerate(self.iw)}
        self.m = np.vstack(vs[w] for w in self.iw)
        if normalize:
            self.normalize()
Project: histwords    Author: williamleif
def get_subembed(self, word_list, normalize=False, restrict_context=True):
        """
        Gets subembedding.
        """
        w_set = set(self.iw)
        valid_w = [word for word in word_list if word in w_set]
        new_w_indices = np.array([self.wi[word] for word in valid_w])
        if restrict_context:
            c_set = set(self.ic)
            valid_c = [word for word in word_list if word in c_set]
            new_c_indices = np.array([self.ci[word] for word in valid_c])
            new_m = self.m[new_w_indices, :]
            new_m = new_m[:, new_c_indices]
        else:
            valid_c = self.ic
            new_m = self.m[new_w_indices, :]
        return Explicit(new_m, valid_w, valid_c, normalize=normalize)
Project: geomdn    Author: afshinrahimi
def get_local_words(preds, vocab, NEs=[], k=50):
    """
    given the word probabilities over many coordinates,
    first normalize the probability of each word in different
    locations to get a probability distribution, then compute
    the entropy of the word's distribution over all coordinates
    and return the words that have low entropy and are not
    named entities.
    """
    # l1-normalize the probabilities of each vocab word across locations
    normalized_preds = normalize(preds, norm='l1', axis=0)
    entropies = stats.entropy(normalized_preds)
    sorted_indices = np.argsort(entropies)
    sorted_local_words = np.array(vocab)[sorted_indices].tolist()


    filtered_local_words = []
    NEset = set(NEs)
    for word in sorted_local_words:
        if word in NEset: continue
        filtered_local_words.append(word)
    return filtered_local_words[0:k]
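The normalization step in get_local_words can be illustrated in isolation: with norm='l1' and axis=0 each column of the prediction matrix becomes a probability distribution over locations, and its entropy measures how geographically spread out the word is. The matrix below is made up.

import numpy as np
from scipy import stats
from sklearn.preprocessing import normalize

# Rows are locations, columns are vocabulary words (hypothetical values).
preds = np.array([[0.9, 0.2],
                  [0.1, 0.3],
                  [0.0, 0.5]])

p = normalize(preds, norm='l1', axis=0)  # each column sums to 1
print(stats.entropy(p))  # lower entropy = more location-specific word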
Project: Deep-subspace-clustering-networks    Author: panji1990
def post_proC(C, K, d, alpha):
    # C: coefficient matrix, K: number of clusters, d: dimension of each subspace
    C = 0.5*(C + C.T)
    r = d*K + 1
    U, S, _ = svds(C,r,v0 = np.ones(C.shape[0]))
    U = U[:,::-1]    
    S = np.sqrt(S[::-1])
    S = np.diag(S)    
    U = U.dot(S)    
    U = normalize(U, norm='l2', axis = 1)       
    Z = U.dot(U.T)
    Z = Z * (Z>0)    
    L = np.abs(Z ** alpha) 
    L = L/L.max()   
    L = 0.5 * (L + L.T)    
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack', affinity='precomputed',assign_labels='discretize')
    spectral.fit(L)
    grp = spectral.fit_predict(L) + 1
    return grp, L
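One detail worth isolating from post_proC: after row-wise l2 normalization, U.dot(U.T) contains cosine similarities, which is why it can be thresholded and reused as an affinity matrix. A toy check with random data:

import numpy as np
from sklearn.preprocessing import normalize

U = np.random.RandomState(0).randn(6, 3)  # stand-in for the truncated SVD factors
U = normalize(U, norm='l2', axis=1)
Z = U.dot(U.T)
print(np.allclose(np.diag(Z), 1.0))     # True: every row has unit norm
print(np.abs(Z).max() <= 1.0 + 1e-12)   # cosine similarities lie in [-1, 1]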
Project: Deep-subspace-clustering-networks    Author: panji1990
def post_proC(C, K, d, alpha):
    # C: coefficient matrix, K: number of clusters, d: dimension of each subspace
    C = 0.5*(C + C.T)
    r = min(d*K + 1, C.shape[0]-1)      
    U, S, _ = svds(C,r,v0 = np.ones(C.shape[0]))
    U = U[:,::-1]    
    S = np.sqrt(S[::-1])
    S = np.diag(S)    
    U = U.dot(S)    
    U = normalize(U, norm='l2', axis = 1)       
    Z = U.dot(U.T)
    Z = Z * (Z>0)    
    L = np.abs(Z ** alpha) 
    L = L/L.max()   
    L = 0.5 * (L + L.T)    
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack', affinity='precomputed',assign_labels='discretize')
    spectral.fit(L)
    grp = spectral.fit_predict(L) + 1
    return grp, L
Project: Deep-subspace-clustering-networks    Author: panji1990
def post_proC(C, K, d, alpha):
    # C: coefficient matrix, K: number of clusters, d: dimension of each subspace
    n = C.shape[0]
    C = 0.5*(C + C.T)    
    C = C - np.diag(np.diag(C)) + np.eye(n,n) # for sparse C, this step will make the algorithm more numerically stable
    r = d*K + 1     
    U, S, _ = svds(C,r,v0 = np.ones(n))
    U = U[:,::-1] 
    S = np.sqrt(S[::-1])
    S = np.diag(S)
    U = U.dot(S)
    U = normalize(U, norm='l2', axis = 1)  
    Z = U.dot(U.T)
    Z = Z * (Z>0)
    L = np.abs(Z ** alpha)
    L = L/L.max()
    L = 0.5 * (L + L.T) 
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack', affinity='precomputed', assign_labels='discretize')
    spectral.fit(L)
    grp = spectral.fit_predict(L) + 1
    return grp, L
Project: Deep-subspace-clustering-networks    Author: panji1990
def post_proC(C, K, d, alpha):
    # C: coefficient matrix, K: number of clusters, d: dimension of each subspace
    C = 0.5*(C + C.T)
    r = d*K + 1 
    U, S, _ = svds(C,r,v0 = np.ones(C.shape[0]))
    U = U[:,::-1] 
    S = np.sqrt(S[::-1])
    S = np.diag(S)
    U = U.dot(S)
    U = normalize(U, norm='l2', axis = 1)  
    Z = U.dot(U.T)
    Z = Z * (Z>0)
    L = np.abs(Z ** alpha)
    L = L/L.max()
    L = 0.5 * (L + L.T) 
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack', affinity='precomputed',assign_labels='discretize')
    spectral.fit(L)
    grp = spectral.fit_predict(L) + 1 
    return grp, L
Project: avito-contest    Author: fmilepe
def rede_neural(X, y):
    print("Iniciando treinamento da Rede Neural")

    X2 = normalize(X)

    clf = MLPClassifier(hidden_layer_sizes=(100,50), activation='tanh', algorithm='adam', alpha=1e-5,
                        learning_rate='constant',tol=1e-8,learning_rate_init=0.0002,
                        early_stopping=True,validation_fraction=0.2)

    kf = KFold(len(y),n_folds=3)
    i = 0
    for train,test in kf:
        start = time.time()
        i = i + 1
        print("Treinamento",i)

        # split the dataset into training and test sets
        #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=1)
        X_train, X_test, y_train, y_test = X2[train], X2[test], y[train], y[test]

        # fit
        clf.fit(X_train, y_train)
        print("score:",clf.score(X_test, y_test),"(",(time.time()-start)/60.0,"minutos )")
    return clf
Project: StageDP    Author: EastonWang
def vectorize(features, vocab):
    """ Transform a features list into a numeric vector
        with a given vocab

    :type features: iterable
    :param features: feature names to count

    :type vocab: dict
    :param vocab: mapping from feature name to vector index
    """
    vec = lil_matrix((1, len(vocab)))

    for feat in features:
        try:
            fidx = vocab[feat]
            vec[0, fidx] += 1.0
        except KeyError:
            pass
    # Normalization
    vec = normalize(vec)
    return vec
Project: 2016CCF_BDCI_Sougou    Author: coderSkyChen
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    if normalized == s:
        return s
    else:
        return ''.join([c for c in normalized if not unicodedata.combining(c)])
Project: 2016CCF_BDCI_Sougou    Author: coderSkyChen
def _char_wb_ngrams(self, text_document):
        """Whitespace sensitive char-n-gram tokenization.

        Tokenize text_document into a sequence of character n-grams
        excluding any whitespace (operating only inside word boundaries)"""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        min_n, max_n = self.ngram_range
        ngrams = []
        for w in text_document.split():
            w = ' ' + w + ' '
            w_len = len(w)
            for n in xrange(min_n, max_n + 1):
                offset = 0
                ngrams.append(w[offset:offset + n])
                while offset + n < w_len:
                    offset += 1
                    ngrams.append(w[offset:offset + n])
                if offset == 0:  # count a short word (w_len < n) only once
                    break
        return ngrams
Project: 2016CCF-sougou    Author: prozhuchen
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    if normalized == s:
        return s
    else:
        return ''.join([c for c in normalized if not unicodedata.combining(c)])
Project: 2016CCF-sougou    Author: prozhuchen
def _char_wb_ngrams(self, text_document):
        """Whitespace sensitive char-n-gram tokenization.

        Tokenize text_document into a sequence of character n-grams
        excluding any whitespace (operating only inside word boundaries)"""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        min_n, max_n = self.ngram_range
        ngrams = []
        for w in text_document.split():
            w = ' ' + w + ' '
            w_len = len(w)
            for n in xrange(min_n, max_n + 1):
                offset = 0
                ngrams.append(w[offset:offset + n])
                while offset + n < w_len:
                    offset += 1
                    ngrams.append(w[offset:offset + n])
                if offset == 0:  # count a short word (w_len < n) only once
                    break
        return ngrams
Project: vec4ir    Author: lgalke
def fit(self, X_raw, y=None):
        cents = self.vect.fit_transform(X_raw)
        # print("Largest singular value: {:.2f}".format(
        #     np.linalg.norm(cents, ord=2)))
        # cents = all_but_the_top(cents, 1)
        # print("Largest singular value: {:.2f}".format(
        #     np.linalg.norm(cents, ord=2)))
        # print("Renormalizing")
        # normalize(cents, copy=False)
        # print("Largest singular value: {:.2f}".format(
        #     np.linalg.norm(cents, ord=2)))
        self.centroids = cents
        print(' FIT centroids shape', self.centroids.shape)

        self._y = y
        if self.matching:
            self.matching.fit(X_raw)
        else:
            self.nn.fit(cents)
Project: FreeDiscovery    Author: FreeDiscovery
def test_lsi():

    cache_dir = check_cache()
    n_components = 2

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=n_components, alpha=1.0)
    assert lsi_res.components_.shape[0] == 5
    assert lsi_res.components_.shape[1] == fe.n_features_
    assert lsi._load_pars() is not None
    lsi._load_model()
    X_lsi = lsi._load_features()

    assert_allclose(normalize(X_lsi), X_lsi)

    lsi.list_models()
    lsi.delete()
Project: FreeDiscovery    Author: FreeDiscovery
def test_feature_extraction_tokenization(analyzer, ngram_range, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(analyzer=analyzer, ngram_range=ngram_range,
                    use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2,  np.ndarray) or scipy.sparse.issparse(res2), "not an array {}".format(res2)

    assert np.isfinite(res2.data).all()

    assert_allclose(normalize(res2).data, res2.data)  # data is l2 normalized

    fe.delete()
Project: FreeDiscovery    Author: FreeDiscovery
def test_feature_extraction_weighting(weighting,
                                      use_hashing):
    cache_dir = check_cache()

    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(weighting=weighting, use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2,  np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)

    assert np.isfinite(res2.data).all()
    assert_allclose(normalize(res2).data, res2.data)  # data is l2 normalized

    fe.delete()
Project: Msc_Multi_label_ZeroShot    Author: thomasSve
def load_pretrained():

    #glove_vec = ["glove_wiki_50","glove_wiki_150","glove_wiki_300"]
    glove_vec = ["glove_wiki_300"]
    #glove_vec = ["glove_wiki_50"]
    filename = 'glove_pretrained.h5'
    #import tensorflow as tf
    #sess = tf.InteractiveSession()

    features, words = load_h5py('glove_wiki_300',filename=root + glove_vec_fold + filename)
    filename = 'glove.h5'
    features = normalize(np.array(features), axis=1, norm='l2')
    with h5py.File(root + glove_vec_fold + filename, "w") as hf:
        hf.create_dataset(glove_vec[0], data=features)
        string_dt = h5py.special_dtype(vlen=str)
        hf.create_dataset(glove_vec[0] + "_words", data=words, dtype=string_dt)

    for vec in glove_vec:
        data, words = load_h5py(vec, filename=root + glove_vec_fold + "glove.h5")
        print(data.shape, words.shape)
        time.sleep(5)
Project: Word2Vec    Author: hashbangCoder
def testWord2Vec(testWords,weights,num_display=3):
    ##Generate inverse word mapping for easy lookup
    invWordDict = {v: k for k, v in wordDict.iteritems()}

    ## Normalize the trained weights for cosine similarity
    trainedWeights = normalize(weights,norm = 'l2', axis = 1)
    for word in testWords:
        try:
            embedding = trainedWeights[wordDict[word],:]
            prox = np.argsort(np.dot(embedding,trainedWeights.transpose())/np.linalg.norm(embedding))[-num_display:].tolist()       
            prox.reverse()
            print 'Closest word vector (by cosine similarity) for %s : '%word, [invWordDict[item] for item in prox]

        except KeyError:
            print '"%s" not found in the Trained Word Embeddings. Skipping...'%word
            pass
Project: Word2Vec    Author: hashbangCoder
def testWord2Vec(word_list,weights,num_display=3):
    ##Generate inverse word mapping for easy lookup
    invWordDict = {v: k for k, v in wordDict.iteritems()}

    ## Normalize the trained weights for cosine similarity
    trainedWeights = normalize(weights,norm = 'l2', axis = 1)
    for word in word_list:
        try:
            embedding = trainedWeights[wordDict[word],:]
            prox = np.argsort(np.dot(embedding,trainedWeights.transpose())/np.linalg.norm(embedding))[-num_display:].tolist()       
            prox.reverse()
            print 'Closest word vector (by cosine similarity) for %s : '%word, [invWordDict[item] for item in prox]

        except KeyError:
            print '"%s" not found in the Trained Word Embeddings. Skipping...'%word
            pass
Project: blcf    Author: willard-yuan
def trainingPCA(features, n_components=256, whiten=True, pca_model_name=None):
    print 'loaded features! {}'.format(features.shape)
    print np.sqrt(sum(features[0,:]**2))

    #print 'Features l2 normalization'
    #features = normalize(features)
    #print np.sqrt(sum(features[0,:]**2))

    print 'Feature PCA-whitenning'
    pca_model = PCA(n_components=n_components, whiten=whiten)
    features = pca_model.fit_transform(features)
    print np.sqrt(sum(features[0,:]**2))

    print 'Features l2 normalization'
    features = normalize(features)
    print np.sqrt(sum(features[0,:]**2))

    if pca_model_name is not None:
        print 'saving model...'
        check_path_file(pca_model_name, create_if_missing=True)
        save_obj(pca_model, pca_model_name)

    print 'done! {}'.format(pca_model_name)

    return pca_model
Project: ProGENI    Author: KnowEnG
def gen_network_matrix(num_nodes, net_df, node1, node2, weight, node2index):
    """Generates network adjacency matrix and normalizes it"""
    # Transform the first two columns of the DataFrame -- the nodes -- to their indexes
    net_df[node1] = net_df[node1].apply(lambda x: node2index[x])
    net_df[node2] = net_df[node2].apply(lambda x: node2index[x])
    # Create the sparse matrix
    network_matrix = sparse.csr_matrix((net_df[weight].values, (net_df[node1].values, net_df[node2].values)),
                                       shape=(num_nodes, num_nodes), dtype=float)
    # Make the adjacency matrix symmetric
    network_matrix = (network_matrix + network_matrix.T)
    network_matrix.setdiag(0)
    # Normalize the rows of network_matrix because we are multiplying vector by matrix (from left)
    network_matrix = normalize(network_matrix, norm='l1', axis=1)
    return(net_df, network_matrix)
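The key call in gen_network_matrix is normalize(..., norm='l1', axis=1), which turns the symmetric adjacency matrix into a row-stochastic transition matrix. A toy sketch with made-up weights:

import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize

A = sparse.csr_matrix(np.array([[0., 2., 1.],
                                [2., 0., 0.],
                                [1., 0., 0.]]))  # hypothetical adjacency

P = normalize(A, norm='l1', axis=1)  # each row now sums to 1
print(P.sum(axis=1).ravel())         # [[1. 1. 1.]]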



###############################################################################
Project: ProGENI    Author: KnowEnG
def gen_network_matrix(num_nodes, net_df, node1, node2, weight, node2index):
    """Generates network adjacency matrix and normalizes it"""
    # Transform the first two columns of the DataFrame -- the nodes -- to their indexes
    net_df[node1] = net_df[node1].apply(lambda x: node2index[x])
    net_df[node2] = net_df[node2].apply(lambda x: node2index[x])
    # Create the sparse matrix
    network_matrix = sparse.csr_matrix((net_df[weight].values, (net_df[node1].values, net_df[node2].values)),
                                       shape=(num_nodes, num_nodes), dtype=float)
    # Make the adjacency matrix symmetric
    network_matrix = (network_matrix + network_matrix.T)
    network_matrix.setdiag(0)
    # Normalize the rows of network_matrix because we are multiplying vector by matrix (from left)
    network_matrix = normalize(network_matrix, norm='l1', axis=1)
    return(net_df, network_matrix)



###############################################################################
Project: TemporalNetworkEpidemics    Author: andreaskoher
def get_Temporal_Network(edges,firstday,lastday,directed,number_of_nodes,normalized):
    # Dictionary indexed by time from 0 to lastday-firstday: time -> edge set
    time_to_edges = {t: set() for t in xrange(0, lastday-firstday+1)}
    for u,v,t in edges:
        if u != v: # ignore self loops
            time_to_edges[t - firstday].add((u,v))
            if not directed:
                time_to_edges[t - firstday].add((v,u))
    # Initialize the temporal network
    Temporal_Network = {}
    for time, edges in time_to_edges.items():
        col = [u for u,v in edges]
        row = [v for u,v in edges]
        dat = [True for i in range(len(edges))]

        Adj_Matrix = sp.csr_matrix((dat,(row,col)),
                shape=(number_of_nodes, number_of_nodes), dtype=bool)
        # Assumption: an edge u -> v gives A[v, u] = 1, so that p(t+1) = A p(t)
        if normalized:
            Adj_Matrix = normalize(Adj_Matrix.transpose(), norm='l1', axis=1, copy=False).transpose()
            Temporal_Network[time] = Adj_Matrix
        else:
            Temporal_Network[time] = Adj_Matrix
    return Temporal_Network
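The transpose trick in get_Temporal_Network makes the columns (rather than the rows) of each adjacency matrix sum to 1, so that p(t+1) = A p(t) conserves total probability. A small made-up example:

import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize

A = sparse.csr_matrix(np.array([[0., 1., 1.],
                                [1., 0., 0.],
                                [1., 1., 0.]]))  # hypothetical adjacency

A_norm = normalize(A.transpose(), norm='l1', axis=1).transpose()
print(A_norm.sum(axis=0))  # [[1. 1. 1.]]: every column is a probability distribution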
Project: WordEmbedding    Author: ziliwang
def main(test, base, align, project, r):
    outdir = os.path.join(os.getcwd(), project)
    tmp_dir = os.path.join(outdir, 'tmp.{}'.format(project))
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    print('temporary dir: {}'.format(tmp_dir))
    basedWordVectors, testedWordVectors, aligned_test, subsetTest = \
        align_vec(base, test, align, tmp_dir)
    test_cols = len(testedWordVectors)
    base_cols = len(basedWordVectors)
    print('normalizing matrix')
    baseX = preprocessing.normalize(dict_to_matrix(basedWordVectors))
    testX = preprocessing.normalize(dict_to_matrix(testedWordVectors))
    aligned_testX = preprocessing.normalize(dict_to_matrix(aligned_test))
    subtestX = preprocessing.normalize(dict_to_matrix(subsetTest))
    cca = CCA(n_components=200)
    print('computing CCA')
    cca.fit(subtestX, aligned_testX)
    ccaed_test = trans(testX, cca.x_weights_)
    ccaed_base = trans(baseX, cca.y_weights_)
    output(outdir, test, ccaed_test, testedWordVectors)
    output(outdir, base, ccaed_base, basedWordVectors)
Project: 2016_CCFsougou    Author: dhdsjy
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    if normalized == s:
        return s
    else:
        return ''.join([c for c in normalized if not unicodedata.combining(c)])
Project: 2016_CCFsougou    Author: dhdsjy
def _char_wb_ngrams(self, text_document):
        """Whitespace sensitive char-n-gram tokenization.

        Tokenize text_document into a sequence of character n-grams
        excluding any whitespace (operating only inside word boundaries)"""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        min_n, max_n = self.ngram_range
        ngrams = []
        for w in text_document.split():
            w = ' ' + w + ' '
            w_len = len(w)
            for n in xrange(min_n, max_n + 1):
                offset = 0
                ngrams.append(w[offset:offset + n])
                while offset + n < w_len:
                    offset += 1
                    ngrams.append(w[offset:offset + n])
                if offset == 0:  # count a short word (w_len < n) only once
                    break
        return ngrams
Project: 2016_CCFsougou2    Author: dhdsjy
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    if normalized == s:
        return s
    else:
        return ''.join([c for c in normalized if not unicodedata.combining(c)])
Project: 2016_CCFsougou2    Author: dhdsjy
def _char_wb_ngrams(self, text_document):
        """Whitespace sensitive char-n-gram tokenization.

        Tokenize text_document into a sequence of character n-grams
        excluding any whitespace (operating only inside word boundaries)"""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        min_n, max_n = self.ngram_range
        ngrams = []
        for w in text_document.split():
            w = ' ' + w + ' '
            w_len = len(w)
            for n in xrange(min_n, max_n + 1):
                offset = 0
                ngrams.append(w[offset:offset + n])
                while offset + n < w_len:
                    offset += 1
                    ngrams.append(w[offset:offset + n])
                if offset == 0:  # count a short word (w_len < n) only once
                    break
        return ngrams
Project: kaggle-yelp-restaurant-photo-classification    Author: u1234x1234
def pool(biz_dict, vlad_dict, mode):
    if mode == 'train':
        y_dict = read_y()
    y = np.zeros((0, 9))
    x = np.array([])
    x_vlad = np.array([])

    for key, value in sorted(biz_dict.items()):
        avg = np.array(value).sum(axis=0) / len(value)
        vlad = vlad_dict.get(key)
#        vlad = preprocessing.normalize(vlad)
#        print(vlad.shape)
#        feat = np.concatenate([avg, vlad], axis=0)
#        feat = preprocessing.Normalizer().fit_transform(feat)
#        feat = avg
        x = np.vstack((x, avg)) if x.size else avg
        x_vlad = np.vstack((x_vlad, vlad)) if x_vlad.size else vlad

        if mode == 'train':
            y = np.vstack((y, y_dict.get(key)))        
    return (x, x_vlad, y) if mode == 'train' else (x, x_vlad)
Project: wi_wacv14    Author: VChristlein
def normalizeEnc(enc, method):
    """
    normalize encoding w. global normalization scheme(s)

    parameters:
        enc: the encoding vector to normalize
        method:
            'ssr': signed square root
            'l2g': global l2 normalization
    """
    # ssr-normalization (kinda hellinger-normalization)
    if 'ssr' in method:
        enc = np.sign(enc) * np.sqrt(np.abs(enc))

    if 'l2g' in method:
        enc = preprocessing.normalize(enc)

    return enc
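A quick sketch of the two schemes handled by normalizeEnc, on a made-up encoding vector: 'ssr' dampens large components, and 'l2g' rescales the whole vector to unit length.

import numpy as np
from sklearn import preprocessing

enc = np.array([[4.0, -9.0, 0.25, -1.0]])   # hypothetical encoding (2-D, one row)

ssr = np.sign(enc) * np.sqrt(np.abs(enc))   # signed square root
l2g = preprocessing.normalize(ssr)          # global l2 normalization
print(ssr)                                  # [[ 2.  -3.   0.5 -1. ]]
print(np.linalg.norm(l2g))                  # 1.0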
Project: wi_wacv14    Author: VChristlein
def vlad(data, means, assignments, components, 
               normalize=['l2c']):
    """
    compute 'vector of locally aggregated descriptors'
    """
    def encode(k):
        uk_ = assignments[:,k].T.dot(data)        

        clustermass = assignments[:,k].sum()
        if clustermass > 0:
            uk_ -= clustermass * means[k]

        if 'l2c' in normalize:
            n = max(math.sqrt(np.sum(uk_ * uk_)), 1e-12)
            uk_ /= n

        return uk_

    uk = map(encode, range(components))

    uk = np.concatenate(uk, axis=0).reshape(1,-1)

    return uk
Project: sentence-classification    Author: bgmartins
def predict(self, X):
        """Predict the class labels for the provided data
        Parameters
        ----------
        X : scipy.sparse matrix, shape (n_test_samples, vocab_size)
            Test samples.

        Returns
        -------
        y : array of shape [n_samples]
            Class labels for each data sample.
        """
        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)
        dist = self._pairwise_wmd(sp.sparse.csr_matrix(X))
        return super(WordMoversKNN, self).predict(dist)
Project: FlapPyBio    Author: michael-iuzzolino
def feed_forward(self, X):
        X = np.asarray(X)
        for index, (matrix, b) in enumerate(zip(self.W[:-1], self.b)):

            size_output = self.topology[index+1]

            if index == 0:

                X = normalize(X[:,np.newaxis], axis=0).ravel()
                dot_ = np.dot(matrix, X)

            else:
                dot_ = np.dot(matrix, output)

            output = self._activation_(dot_ + b, size_output)

        self.output = output[0]
Project: neural_topic_models    Author: dallascard
def save_mean_representations(model, model_filename, X, labels, pred_file):
    n_items, dv = X.shape
    n_classes = model.n_classes
    n_topics = model.d_t

    # try normalizing input vectors
    test_X = normalize(np.array(X, dtype='float32'), axis=1)

    model.load_params(model_filename)

    # evaluate bound on test set
    item_mus = []
    for item in range(n_items):
        y = labels[item]

        # save the mean document representation
        r_mu = model.get_mean_doc_rep(test_X[item, :], y)
        item_mus.append(np.array(r_mu))

    # write all the test doc representations to file
    if pred_file is not None and n_topics > 1:
        np.savez_compressed(pred_file, X=np.array(item_mus), y=labels)
Project: nn-segmentation-for-lar    Author: cvdlab
def predict_image(self, test_img):
        """
        predicts classes of input image
        :param test_img: filepath to image to predict on
        :param show: displays segmentation results
        :return: segmented result
        """
        img = np.array( rgb2gray( imread( test_img ).astype( 'float' ) ).reshape( 5, 216, 160 )[-2] ) / 256

        plist = []

        # create patches from an entire slice
        img_1 = adjust_sigmoid( img ).astype( float )
        edges_1 = adjust_sigmoid( img, inv=True ).astype( float )
        edges_2 = img_1
        edges_5_n = normalize( laplace( img_1 ) )
        edges_5_n = img_as_float( img_as_ubyte( edges_5_n ) )

        plist.append( extract_patches_2d( edges_1, (23, 23) ) )
        plist.append( extract_patches_2d( edges_2, (23, 23) ) )
        plist.append( extract_patches_2d( edges_5_n, (23, 23) ) )
        patches = np.array( zip( np.array( plist[0] ), np.array( plist[1] ), np.array( plist[2] ) ) )

        # predict classes of each pixel based on model
        full_pred = self.model.predict_classes( patches )
        fp1 = full_pred.reshape( 194, 138 )
        return fp1
Project: skp_edu_docker    Author: TensorMSA
def make_drop_duplicate(self, _df_csv_read_ori, _drop_duplicate , _label):
        """ Label? ??? ??? ??? ??? ??? Row ??? ????.
        Args:
          params:
            * _preprocessing_type: ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
            * _df_csv_read_ori : pandas dataframe
            * _label
        Returns:
          Preprocessing Dataframe
        """
        if _drop_duplicate == None or _drop_duplicate == 'null' or _drop_duplicate == False:
            logging.info("No Duplicate")
            result_df =  _df_csv_read_ori
        else :
            cell_features = _df_csv_read_ori.columns.tolist()
            cell_features.remove(_label)
            result_df = _df_csv_read_ori.drop_duplicates(cell_features, keep="first")
            logging.info("duplicated row delete {0}".format(len(_df_csv_read_ori.index)-len(result_df.index)))
            temp_duplicate_filename = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + "_dup.csvbk"
            result_df.to_csv(self.data_src_path + "/backup/" + temp_duplicate_filename)
        return result_df
Project: CerebralCortex-2.0-legacy    Author: MD2Korg
def normalize(datastream: DataStream) -> DataStream:
    """

    :param datastream:
    :return:
    """
    result = DataStream.from_datastream(input_streams=[datastream])
    if datastream.data is None or len(datastream.data) == 0:
        result.data = []
        return result

    input_data = np.array([i.sample for i in datastream.data])

    data = preprocessing.normalize(input_data, axis=0)

    result.data = [DataPoint.from_tuple(start_time=v.start_time, sample=data[i])
                   for i, v in enumerate(datastream.data)]

    return result
项目:hyperband_benchmarks    作者:lishal    | 项目源码 | 文件源码
def compute_preprocessor(self,method):
        self.data={}
        if method=='none':
            self.data=self.orig_data
        elif method=='min_max':
            transform=preprocessing.MinMaxScaler()
            self.data['X_train']=transform.fit_transform(self.orig_data['X_train'])
            self.data['X_val']=transform.transform(self.orig_data['X_val'])
            self.data['X_test']=transform.transform(self.orig_data['X_test'])
        elif method=='scaled':
            self.data['X_train']=preprocessing.scale(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.scale(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.scale(self.orig_data['X_test'])
        elif method=='normalized':
            self.data['X_train']=preprocessing.normalize(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.normalize(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.normalize(self.orig_data['X_test'])
        self.data['y_train']=self.orig_data['y_train']
        self.data['y_val']=self.orig_data['y_val']
        self.data['y_test']=self.orig_data['y_test']
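Since compute_preprocessor offers both 'scaled' and 'normalized', it may help to contrast them: preprocessing.scale standardizes each feature column, while preprocessing.normalize rescales each sample row to unit norm. A made-up example:

import numpy as np
from sklearn import preprocessing

X = np.array([[1., 10.],
              [2., 20.],
              [3., 30.]])  # hypothetical training matrix

Xs = preprocessing.scale(X)       # per column: zero mean, unit variance
Xn = preprocessing.normalize(X)   # per row: unit l2 norm

print(Xs.mean(axis=0))                  # ~[0. 0.]
print(Xs.std(axis=0))                   # [1. 1.]
print(np.linalg.norm(Xn, axis=1))       # [1. 1. 1.]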
Project: hyperband_benchmarks    Author: lishal
def compute_preprocessor(self,method):
        self.data={}
        if method=='min_max':
            transform=preprocessing.MinMaxScaler()
            self.data['X_train']=transform.fit_transform(self.orig_data['X_train'])
            self.data['X_val']=transform.transform(self.orig_data['X_val'])
            self.data['X_test']=transform.transform(self.orig_data['X_test'])
        elif method=='scaled':
            self.data['X_train']=preprocessing.scale(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.scale(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.scale(self.orig_data['X_test'])
        elif method=='normalized':
            self.data['X_train']=preprocessing.normalize(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.normalize(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.normalize(self.orig_data['X_test'])
        self.data['y_train']=self.orig_data['y_train']
        self.data['y_val']=self.orig_data['y_val']
        self.data['y_test']=self.orig_data['y_test']
Project: cluster_paraphrases    Author: acocos
def get_sils_matrix(method, scores, wordlist):
    ''' See get_sims_matrix for definitions, which are the same here. The
    difference is that the resulting matrix contains distances instead of
    similarities.

    :return: 2-dimensional np.ndarray of size len(wordlist) x len(wordlist)
    '''
    if method =='direct':
        sims = get_sims_matrix(method, scores, wordlist)
        sims = preprocessing.normalize(np.matrix(sims), norm='l2')
        sils = 1-sims
    elif method == 'dict_cosine': # cosine dist of word-PPDB2.0Score matrix
        sils = np.array([[dict_cosine_dist(scores.get(i,{}),scores.get(j,{})) for j in wordlist] for i in wordlist])
    elif method == 'dict_JS': # JS divergence of word-PPDB2.0Score matrix
        sils = np.array([[dict_js_divergence(scores.get(i,{}),scores.get(j,{}))[0] for j in wordlist] for i in wordlist])
    elif method == 'vec_cosine':
        d = scores.values()[0].shape[0]
        sils = np.array([[cosine(scores.get(i,np.zeros(d)), scores.get(j,np.zeros(d))) for j in wordlist] for i in wordlist])
    else:
        sys.stderr.write('Unknown sil method: %s' % method)
        return None
    sils = np.nan_to_num(sils)
    return sils
Project: spherecluster    Author: clara-labs
def __init__(self, n_clusters=5, posterior_type='soft', force_weights=None,
                 n_init=10, n_jobs=1, max_iter=300, verbose=False,
                 init='random-class', random_state=None, tol=1e-6,
                 copy_x=True, normalize=True):
        self.n_clusters = n_clusters
        self.posterior_type = posterior_type
        self.force_weights = force_weights
        self.n_init = n_init
        self.n_jobs = n_jobs
        self.max_iter = max_iter
        self.verbose = verbose
        self.init = init
        self.random_state = random_state
        self.tol = tol
        self.copy_x = copy_x
        self.normalize = normalize

        # results from algorithm
        self.cluster_centers_ = None
        self.labels_ = None
        self.inertia_ = None
        self.weights_ = None
        self.concentrations_ = None
        self.posterior_ = None
Project: spherecluster    Author: clara-labs
def fit(self, X, y=None):
        """Compute mixture of von Mises Fisher clustering.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
        """
        if self.normalize:
            X = normalize(X)

        self._check_force_weights()
        random_state = check_random_state(self.random_state)
        X = self._check_fit_data(X)

        (self.cluster_centers_, self.labels_, self.inertia_, self.weights_,
         self.concentrations_, self.posterior_) = movMF(
                X, self.n_clusters, posterior_type=self.posterior_type,
                force_weights=self.force_weights, n_init=self.n_init,
                n_jobs=self.n_jobs, max_iter=self.max_iter,
                verbose=self.verbose, init=self.init,
                random_state=random_state,
                tol=self.tol, copy_x=self.copy_x
            )

        return self
Project: spherecluster    Author: clara-labs
def transform(self, X, y=None):
        """Transform X to a cluster-distance space.
        In the new space, each dimension is the cosine distance to the cluster
        centers.  Note that even if X is sparse, the array returned by
        `transform` will typically be dense.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to transform.

        Returns
        -------
        X_new : array, shape [n_samples, k]
            X transformed in the new space.
        """
        if self.normalize:
            X = normalize(X)

        check_is_fitted(self, 'cluster_centers_')
        X = self._check_test_data(X)
        return self._transform(X)
Project: spherecluster    Author: clara-labs
def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.
        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        Note:  Does not check that each point is on the sphere.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        if self.normalize:
            X = normalize(X)

        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        return _labels_inertia(X, self.cluster_centers_)[0]
Project: Parallel-SGD    Author: angadgill
def test_cosine_similarity():
    # Test the cosine_similarity.

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y),
                   (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)