The following 50 code examples, extracted from open source Python projects, illustrate how to use sklearn.preprocessing.normalize().
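As a quick orientation before the project examples, here is a minimal sketch (not taken from any of the projects below; the array values are purely illustrative) of the function's basic behavior: normalize() rescales each sample, row by row by default, to unit norm.

import numpy as np
from sklearn.preprocessing import normalize

# Two 3-dimensional samples (illustrative values only).
X = np.array([[3.0, 4.0, 0.0],
              [1.0, 1.0, 1.0]])

# Rescale each row to unit L2 norm; pass norm='l1' or norm='max' for other
# norms, or axis=0 to normalize columns instead of rows.
X_l2 = normalize(X, norm='l2', axis=1)

print(X_l2)                           # first row becomes [0.6, 0.8, 0.0]
print(np.linalg.norm(X_l2, axis=1))   # every row now has norm 1.0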
def test_homonym(H, sent, features, C=1.0):
    X_0 = features(matching(sent, H[0]))
    X_1 = features(matching(sent, H[1]))
    y_0 = numpy.zeros(len(X_0))
    y_1 = numpy.ones(len(X_1))
    X = normalize(numpy.vstack([X_0, X_1]), norm='l2')
    y = numpy.hstack([y_0, y_1])
    classifier = LogisticRegression(C=C)
    fold = StratifiedKFold(y, n_folds=10)
    score = []
    count = []
    for tr, te in fold:
        X_tr, X_te = X[tr], X[te]
        y_tr, y_te = y[tr], y[te]
        classifier.fit(X_tr, y_tr)
        score.append(sum(classifier.predict(X_te) == y_te))
        count.append(len(y_te))
    score = numpy.array(score, dtype='float')
    count = numpy.array(count, dtype='float')
    result = {'word1_count': len(y_0),
              'word2_count': len(y_1),
              'majority': 1.0 * max(len(y_0), len(y_1)) / len(y),
              'kfold_acc': score / count}
    return result
def make_tfidf(arr):
    '''input, numpy array with flavor counts for each recipe and compounds
    return numpy array adjusted as tfidf
    '''
    arr2 = arr.copy()
    N = arr2.shape[0]
    l2_rows = np.sqrt(np.sum(arr2**2, axis=1)).reshape(N, 1)
    l2_rows[l2_rows == 0] = 1
    arr2_norm = arr2 / l2_rows
    arr2_freq = np.sum(arr2_norm > 0, axis=0)
    arr2_idf = np.log(float(N + 1) / (1.0 + arr2_freq)) + 1.0

    from sklearn.preprocessing import normalize
    tfidf = np.multiply(arr2_norm, arr2_idf)
    tfidf = normalize(tfidf, norm='l2', axis=1)
    print tfidf.shape
    return tfidf
def flavor_profile(df, ingr, comp, ingr_comp):
    sorted_ingredients = df.columns
    underscore_ingredients = []
    for item in sorted_ingredients:
        underscore_ingredients.append(item.replace(' ', '_'))

    print len(underscore_ingredients), len(sorted_ingredients)

    ingr_total = ingr_comp.join(ingr, how='right', on='# ingredient id')
    ingr_total = ingr_total.join(comp, how='right', on='compound id')

    ingr_pivot = pd.crosstab(ingr_total['ingredient name'], ingr_total['compound id'])
    ingr_flavor = ingr_pivot[ingr_pivot.index.isin(underscore_ingredients)]

    df_flavor = df.values.dot(ingr_flavor.values)
    print df.shape, df_flavor.shape
    return df_flavor

#normalize flavor matrix with tfidf method
def __init__(self, path, words=[], dim=300, normalize=True, **kwargs):
    seen = []
    vs = {}
    for line in open(path):
        split = line.split()
        w = split[0]
        if words == [] or w in words:
            if len(split) != dim + 1:
                continue
            seen.append(w)
            vs[w] = np.array(map(float, split[1:]), dtype='float32')
    self.iw = seen
    self.wi = {w: i for i, w in enumerate(self.iw)}
    self.m = np.vstack(vs[w] for w in self.iw)
    if normalize:
        self.normalize()
def get_subembed(self, word_list, normalize=False, restrict_context=True):
    """
    Gets subembedding.
    """
    w_set = set(self.iw)
    valid_w = [word for word in word_list if word in w_set]
    new_w_indices = np.array([self.wi[word] for word in valid_w])
    if restrict_context:
        c_set = set(self.ic)
        valid_c = [word for word in word_list if word in c_set]
        new_c_indices = np.array([self.ci[word] for word in valid_c])
        new_m = self.m[new_w_indices, :]
        new_m = new_m[:, new_c_indices]
    else:
        valid_c = self.ic
        new_m = self.m[new_w_indices, :]
    return Explicit(new_m, valid_w, valid_c, normalize=normalize)
def get_local_words(preds, vocab, NEs=[], k=50):
    """
    Given the word probabilities over many coordinates, first normalize the
    probability of each word in different locations to get a probability
    distribution, then compute the entropy of the word's distribution over
    all coordinates and return the words that are low entropy and are not
    named entities.
    """
    # normalize the probabilities of each vocab word using entropy
    normalized_preds = normalize(preds, norm='l1', axis=0)
    entropies = stats.entropy(normalized_preds)
    sorted_indices = np.argsort(entropies)
    sorted_local_words = np.array(vocab)[sorted_indices].tolist()

    filtered_local_words = []
    NEset = set(NEs)
    for word in sorted_local_words:
        if word in NEset:
            continue
        filtered_local_words.append(word)
    return filtered_local_words[0:k]
def post_proC(C, K, d, alpha):
    # C: coefficient matrix, K: number of clusters, d: dimension of each subspace
    C = 0.5 * (C + C.T)
    r = d * K + 1
    U, S, _ = svds(C, r, v0=np.ones(C.shape[0]))
    U = U[:, ::-1]
    S = np.sqrt(S[::-1])
    S = np.diag(S)
    U = U.dot(S)
    U = normalize(U, norm='l2', axis=1)
    Z = U.dot(U.T)
    Z = Z * (Z > 0)
    L = np.abs(Z ** alpha)
    L = L / L.max()
    L = 0.5 * (L + L.T)
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack',
                                          affinity='precomputed',
                                          assign_labels='discretize')
    spectral.fit(L)
    grp = spectral.fit_predict(L) + 1
    return grp, L
def post_proC(C, K, d, alpha):
    # C: coefficient matrix, K: number of clusters, d: dimension of each subspace
    C = 0.5 * (C + C.T)
    r = min(d * K + 1, C.shape[0] - 1)
    U, S, _ = svds(C, r, v0=np.ones(C.shape[0]))
    U = U[:, ::-1]
    S = np.sqrt(S[::-1])
    S = np.diag(S)
    U = U.dot(S)
    U = normalize(U, norm='l2', axis=1)
    Z = U.dot(U.T)
    Z = Z * (Z > 0)
    L = np.abs(Z ** alpha)
    L = L / L.max()
    L = 0.5 * (L + L.T)
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack',
                                          affinity='precomputed',
                                          assign_labels='discretize')
    spectral.fit(L)
    grp = spectral.fit_predict(L) + 1
    return grp, L
def post_proC(C, K, d, alpha):
    # C: coefficient matrix, K: number of clusters, d: dimension of each subspace
    n = C.shape[0]
    C = 0.5 * (C + C.T)
    # for sparse C, this step will make the algorithm more numerically stable
    C = C - np.diag(np.diag(C)) + np.eye(n, n)
    r = d * K + 1
    U, S, _ = svds(C, r, v0=np.ones(n))
    U = U[:, ::-1]
    S = np.sqrt(S[::-1])
    S = np.diag(S)
    U = U.dot(S)
    U = normalize(U, norm='l2', axis=1)
    Z = U.dot(U.T)
    Z = Z * (Z > 0)
    L = np.abs(Z ** alpha)
    L = L / L.max()
    L = 0.5 * (L + L.T)
    spectral = cluster.SpectralClustering(n_clusters=K, eigen_solver='arpack',
                                          affinity='precomputed',
                                          assign_labels='discretize')
    spectral.fit(L)
    grp = spectral.fit_predict(L) + 1
    return grp, L
def rede_neural(X, y):
    print("Iniciando treinamento da Rede Neural")
    X2 = normalize(X)

    clf = MLPClassifier(hidden_layer_sizes=(100, 50), activation='tanh',
                        algorithm='adam', alpha=1e-5,
                        learning_rate='constant', tol=1e-8,
                        learning_rate_init=0.0002,
                        early_stopping=True, validation_fraction=0.2)

    kf = KFold(len(y), n_folds=3)
    i = 0
    for train, test in kf:
        start = time.time()
        i = i + 1
        print("Treinamento", i)
        # split the dataset into train and test
        #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=1)
        X_train, X_test, y_train, y_test = X2[train], X2[test], y[train], y[test]
        # fit
        clf.fit(X_train, y_train)
        print("score:", clf.score(X_test, y_test), "(", (time.time() - start) / 60.0, "minutos )")

    return clf
def vectorize(features, vocab):
    """ Transform a features list into a numeric vector
        with a given vocab

    :type dpvocab: dict
    :param dpvocab: vocab for distributional representation

    :type projmat: scipy.lil_matrix
    :param projmat: projection matrix for disrep
    """
    vec = lil_matrix((1, len(vocab)))
    for feat in features:
        try:
            fidx = vocab[feat]
            vec[0, fidx] += 1.0
        except KeyError:
            pass
    # Normalization
    vec = normalize(vec)
    return vec
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    if normalized == s:
        return s
    else:
        return ''.join([c for c in normalized if not unicodedata.combining(c)])
def _char_wb_ngrams(self, text_document):
    """Whitespace sensitive char-n-gram tokenization.

    Tokenize text_document into a sequence of character n-grams
    excluding any whitespace (operating only inside word boundaries)"""
    # normalize white spaces
    text_document = self._white_spaces.sub(" ", text_document)

    min_n, max_n = self.ngram_range
    ngrams = []
    for w in text_document.split():
        w = ' ' + w + ' '
        w_len = len(w)
        for n in xrange(min_n, max_n + 1):
            offset = 0
            ngrams.append(w[offset:offset + n])
            while offset + n < w_len:
                offset += 1
                ngrams.append(w[offset:offset + n])
            if offset == 0:  # count a short word (w_len < n) only once
                break
    return ngrams
def fit(self, X_raw, y=None):
    cents = self.vect.fit_transform(X_raw)
    # print("Largest singular value: {:.2f}".format(
    #     np.linalg.norm(cents, ord=2)))
    # cents = all_but_the_top(cents, 1)
    # print("Largest singular value: {:.2f}".format(
    #     np.linalg.norm(cents, ord=2)))
    # print("Renormalizing")
    # normalize(cents, copy=False)
    # print("Largest singular value: {:.2f}".format(
    #     np.linalg.norm(cents, ord=2)))
    self.centroids = cents
    print(' FIT centroids shape', self.centroids.shape)
    self._y = y
    if self.matching:
        self.matching.fit(X_raw)
    else:
        self.nn.fit(cents)
def test_lsi():
    cache_dir = check_cache()
    n_components = 2

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=n_components, alpha=1.0)
    assert lsi_res.components_.shape[0] == 5
    assert lsi_res.components_.shape[1] == fe.n_features_
    assert lsi._load_pars() is not None
    lsi._load_model()
    X_lsi = lsi._load_features()

    assert_allclose(normalize(X_lsi), X_lsi)

    lsi.list_models()
    lsi.delete()
def test_feature_extraction_tokenization(analyzer, ngram_range, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(analyzer=analyzer, ngram_range=ngram_range,
                    use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)

    assert np.isfinite(res2.data).all()

    assert_allclose(normalize(res2).data, res2.data)  # data is l2 normalized

    fe.delete()
def test_feature_extraction_weighting(weighting, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(weighting=weighting, use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()

    assert_allclose(normalize(res2).data, res2.data)  # data is l2 normalized

    fe.delete()
def load_pretrained():
    #glove_vec = ["glove_wiki_50","glove_wiki_150","glove_wiki_300"]
    glove_vec = ["glove_wiki_300"]
    #glove_vec = ["glove_wiki_50"]
    filename = 'glove_pretrained.h5'
    #import tensorflow as tf
    #sess = tf.InteractiveSession()
    features, words = load_h5py('glove_wiki_300', filename=root + glove_vec_fold + filename)
    filename = 'glove.h5'
    features = normalize(np.array(features), axis=1, norm='l2')
    with h5py.File(root + glove_vec_fold + filename, "w") as hf:
        hf.create_dataset(glove_vec[0], data=features)
        string_dt = h5py.special_dtype(vlen=str)
        hf.create_dataset(glove_vec[0] + "_words", data=words, dtype=string_dt)

    for vec in glove_vec:
        data, words = load_h5py(vec, filename=root + glove_vec_fold + "glove.h5")
        print(data.shape, words.shape)
    time.sleep(5)
def testWord2Vec(testWords, weights, num_display=3):
    ## Generate inverse word mapping for easy lookup
    invWordDict = {v: k for k, v in wordDict.iteritems()}

    ## Normalize the trained weights for cosine similarity
    trainedWeights = normalize(weights, norm='l2', axis=1)

    for word in testWords:
        try:
            embedding = trainedWeights[wordDict[word], :]
            prox = np.argsort(np.dot(embedding, trainedWeights.transpose()) /
                              np.linalg.norm(embedding))[-num_display:].tolist()
            prox.reverse()
            print 'Closest word vector (by cosine similarity) for %s : ' % word, [invWordDict[item] for item in prox]
        except KeyError:
            print '"%s" not found in the Trained Word Embeddings. Skipping...' % word
            pass
def testWord2Vec(word_list, weights, num_display=3):
    ## Generate inverse word mapping for easy lookup
    invWordDict = {v: k for k, v in wordDict.iteritems()}

    ## Normalize the trained weights for cosine similarity
    trainedWeights = normalize(weights, norm='l2', axis=1)

    for word in word_list:
        try:
            embedding = trainedWeights[wordDict[word], :]
            prox = np.argsort(np.dot(embedding, trainedWeights.transpose()) /
                              np.linalg.norm(embedding))[-num_display:].tolist()
            prox.reverse()
            print 'Closest word vector (by cosine similarity) for %s : ' % word, [invWordDict[item] for item in prox]
        except KeyError:
            print '"%s" not found in the Trained Word Embeddings. Skipping...' % word
            pass
def trainingPCA(features, n_components=256, whiten=True, pca_model_name=None):
    print 'loaded features! {}'.format(features.shape)
    print np.sqrt(sum(features[0, :]**2))
    #print 'Features l2 normalization'
    #features = normalize(features)
    #print np.sqrt(sum(features[0,:]**2))
    print 'Feature PCA-whitenning'
    pca_model = PCA(n_components=n_components, whiten=whiten)
    features = pca_model.fit_transform(features)
    print np.sqrt(sum(features[0, :]**2))
    print 'Features l2 normalization'
    features = normalize(features)
    print np.sqrt(sum(features[0, :]**2))
    if pca_model_name is not None:
        print 'saving model...'
        check_path_file(pca_model_name, create_if_missing=True)
        save_obj(pca_model, pca_model_name)
        print 'done! {}'.format(pca_model_name)
    return pca_model
def gen_network_matrix(num_nodes, net_df, node1, node2, weight, node2index):
    """Generates network adjacency matrix and normalizes it"""
    # Transform the first two columns of the DataFrame -- the nodes -- to their indexes
    net_df[node1] = net_df[node1].apply(lambda x: node2index[x])
    net_df[node2] = net_df[node2].apply(lambda x: node2index[x])

    # Create the sparse matrix
    network_matrix = sparse.csr_matrix((net_df[weight].values,
                                        (net_df[node1].values, net_df[node2].values)),
                                       shape=(num_nodes, num_nodes), dtype=float)

    # Make the adjacency matrix symmetric
    network_matrix = (network_matrix + network_matrix.T)
    network_matrix.setdiag(0)

    # Normalize the rows of network_matrix because we are multiplying vector by matrix (from left)
    network_matrix = normalize(network_matrix, norm='l1', axis=1)
    return (net_df, network_matrix)

###############################################################################
def get_Temporal_Network(edges, firstday, lastday, directed, number_of_nodes, normalized):
    # Dictionary indexed by times from 0 to firstday-lastday: time: edge_list
    time_to_edges = {t: set() for t in xrange(0, lastday - firstday + 1)}
    for u, v, t in edges:
        if u != v:  # ignore self loops
            time_to_edges[t - firstday].add((u, v))
            if not directed:
                time_to_edges[t - firstday].add((v, u))

    # Initialize the temporal network
    Temporal_Network = {}
    for time, edges in time_to_edges.items():
        col = [u for u, v in edges]
        row = [v for u, v in edges]
        dat = [True for i in range(len(edges))]
        Adj_Matrix = sp.csr_matrix((dat, (row, col)),
                                   shape=(number_of_nodes, number_of_nodes), dtype=bool)

        # Assumption: an edge u -> v with p(t+1) = A p(t), i.e. A[v, u] = 1
        if normalized:
            Adj_Matrix = normalize(Adj_Matrix.transpose(), norm='l1', axis=1, copy=False).transpose()
            Temporal_Network[time] = Adj_Matrix
        else:
            Temporal_Network[time] = Adj_Matrix

    return Temporal_Network
def main(test, base, align, project, r):
    outdir = os.path.join(os.getcwd(), project)
    tmp_dir = os.path.join(outdir, 'tmp.{}'.format(project))
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    print('temporary dir: {}'.format(tmp_dir))

    basedWordVectors, testedWordVectors, aligned_test, subsetTest = \
        align_vec(base, test, align, tmp_dir)

    test_cols = len(testedWordVectors)
    base_cols = len(basedWordVectors)

    print('normalizing matrix')
    baseX = preprocessing.normalize(dict_to_matrix(basedWordVectors))
    testX = preprocessing.normalize(dict_to_matrix(testedWordVectors))
    aligned_testX = preprocessing.normalize(dict_to_matrix(aligned_test))
    subtestX = preprocessing.normalize(dict_to_matrix(subsetTest))

    cca = CCA(n_components=200)
    print('computing CCA')
    cca.fit(subtestX, aligned_testX)

    ccaed_test = trans(testX, cca.x_weights_)
    ccaed_base = trans(baseX, cca.y_weights_)

    output(outdir, test, ccaed_test, testedWordVectors)
    output(outdir, base, ccaed_base, basedWordVectors)
def pool(biz_dict, vlad_dict, mode):
    if mode == 'train':
        y_dict = read_y()
    y = np.zeros((0, 9))
    x = np.array([])
    x_vlad = np.array([])

    for key, value in sorted(biz_dict.items()):
        avg = np.array(value).sum(axis=0) / len(value)
        vlad = vlad_dict.get(key)
        # vlad = preprocessing.normalize(vlad)
        # print(vlad.shape)
        # feat = np.concatenate([avg, vlad], axis=0)
        # feat = preprocessing.Normalizer().fit_transform(feat)
        # feat = avg
        x = np.vstack((x, avg)) if x.size else avg
        x_vlad = np.vstack((x_vlad, vlad)) if x_vlad.size else vlad
        if mode == 'train':
            y = np.vstack((y, y_dict.get(key)))

    return (x, x_vlad, y) if mode == 'train' else (x, x_vlad)
def normalizeEnc(enc, method):
    """
    normalize encoding w. global normalization scheme(s)
    parameters:
        enc: the encoding vector to normalize
        method:
            'ssr': signed square root
            'l2g': global l2 normalization
    """
    # ssr-normalization (kinda hellinger-normalization)
    if 'ssr' in method:
        enc = np.sign(enc) * np.sqrt(np.abs(enc))

    if 'l2g' in method:
        enc = preprocessing.normalize(enc)

    return enc
def vlad(data, means, assignments, components, normalize=['l2c']):
    """
    compute 'vector of locally aggregated descriptors'
    """
    def encode(k):
        uk_ = assignments[:, k].T.dot(data)
        clustermass = assignments[:, k].sum()
        if clustermass > 0:
            uk_ -= clustermass * means[k]

        if 'l2c' in normalize:
            n = max(math.sqrt(np.sum(uk_ * uk_)), 1e-12)
            uk_ /= n

        return uk_

    uk = map(encode, range(components))
    uk = np.concatenate(uk, axis=0).reshape(1, -1)

    return uk
def predict(self, X):
    """Predict the class labels for the provided data

    Parameters
    ----------
    X : scipy.sparse matrix, shape (n_test_samples, vocab_size)
        Test samples.

    Returns
    -------
    y : array of shape [n_samples]
        Class labels for each data sample.
    """
    X = check_array(X, accept_sparse='csr', copy=True)
    X = normalize(X, norm='l1', copy=False)
    dist = self._pairwise_wmd(sp.sparse.csr_matrix(X))
    return super(WordMoversKNN, self).predict(dist)
def feed_forward(self, X):
    X = np.asarray(X)
    for index, (matrix, b) in enumerate(zip(self.W[:-1], self.b)):
        size_output = self.topology[index + 1]
        if index == 0:
            X = normalize(X[:, np.newaxis], axis=0).ravel()
            dot_ = np.dot(matrix, X)
        else:
            dot_ = np.dot(matrix, output)
        output = self._activation_(dot_ + b, size_output)
    self.output = output[0]
def save_mean_representations(model, model_filename, X, labels, pred_file):
    n_items, dv = X.shape
    n_classes = model.n_classes
    n_topics = model.d_t

    # try normalizing input vectors
    test_X = normalize(np.array(X, dtype='float32'), axis=1)

    model.load_params(model_filename)

    # evaluate bound on test set
    item_mus = []
    for item in range(n_items):
        y = labels[item]

        # save the mean document representation
        r_mu = model.get_mean_doc_rep(test_X[item, :], y)
        item_mus.append(np.array(r_mu))

    # write all the test doc representations to file
    if pred_file is not None and n_topics > 1:
        np.savez_compressed(pred_file, X=np.array(item_mus), y=labels)
def predict_image(self, test_img):
    """
    predicts classes of input image
    :param test_img: filepath to image to predict on
    :param show: displays segmentation results
    :return: segmented result
    """
    img = np.array(rgb2gray(imread(test_img).astype('float')).reshape(5, 216, 160)[-2]) / 256

    plist = []

    # create patches from an entire slice
    img_1 = adjust_sigmoid(img).astype(float)
    edges_1 = adjust_sigmoid(img, inv=True).astype(float)
    edges_2 = img_1
    edges_5_n = normalize(laplace(img_1))
    edges_5_n = img_as_float(img_as_ubyte(edges_5_n))

    plist.append(extract_patches_2d(edges_1, (23, 23)))
    plist.append(extract_patches_2d(edges_2, (23, 23)))
    plist.append(extract_patches_2d(edges_5_n, (23, 23)))
    patches = np.array(zip(np.array(plist[0]), np.array(plist[1]), np.array(plist[2])))

    # predict classes of each pixel based on model
    full_pred = self.model.predict_classes(patches)
    fp1 = full_pred.reshape(194, 138)

    return fp1
def make_drop_duplicate(self, _df_csv_read_ori, _drop_duplicate, _label):
    """ Drop rows that are duplicated on every column except the label column.
    Args:
        params:
            * _preprocessing_type: ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
            * _df_csv_read_ori : pandas dataframe
            * _label
    Returns:
        Preprocessing Dataframe
    """
    if _drop_duplicate == None or _drop_duplicate == 'null' or _drop_duplicate == False:
        logging.info("No Duplicate")
        result_df = _df_csv_read_ori
    else:
        cell_features = _df_csv_read_ori.columns.tolist()
        cell_features.remove(_label)
        result_df = _df_csv_read_ori.drop_duplicates(cell_features, keep="first")
        logging.info("duplicated row delete {0}".format(len(_df_csv_read_ori.index) - len(result_df.index)))
        temp_duplicate_filename = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + "_dup.csvbk"
        result_df.to_csv(self.data_src_path + "/backup/" + temp_duplicate_filename)

    return result_df
def normalize(datastream: DataStream) -> DataStream:
    """
    :param datastream:
    :return:
    """
    result = DataStream.from_datastream(input_streams=[datastream])
    if datastream.data is None or len(datastream.data) == 0:
        result.data = []
        return result

    input_data = np.array([i.sample for i in datastream.data])

    data = preprocessing.normalize(input_data, axis=0)

    result.data = [DataPoint.from_tuple(start_time=v.start_time, sample=data[i])
                   for i, v in enumerate(datastream.data)]

    return result
def compute_preprocessor(self, method):
    self.data = {}
    if method == 'none':
        self.data = self.orig_data
    elif method == 'min_max':
        transform = preprocessing.MinMaxScaler()
        self.data['X_train'] = transform.fit_transform(self.orig_data['X_train'])
        self.data['X_val'] = transform.transform(self.orig_data['X_val'])
        self.data['X_test'] = transform.transform(self.orig_data['X_test'])
    elif method == 'scaled':
        self.data['X_train'] = preprocessing.scale(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.scale(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.scale(self.orig_data['X_test'])
    elif method == 'normalized':
        self.data['X_train'] = preprocessing.normalize(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.normalize(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.normalize(self.orig_data['X_test'])
    self.data['y_train'] = self.orig_data['y_train']
    self.data['y_val'] = self.orig_data['y_val']
    self.data['y_test'] = self.orig_data['y_test']
def compute_preprocessor(self, method):
    self.data = {}
    if method == 'min_max':
        transform = preprocessing.MinMaxScaler()
        self.data['X_train'] = transform.fit_transform(self.orig_data['X_train'])
        self.data['X_val'] = transform.transform(self.orig_data['X_val'])
        self.data['X_test'] = transform.transform(self.orig_data['X_test'])
    elif method == 'scaled':
        self.data['X_train'] = preprocessing.scale(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.scale(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.scale(self.orig_data['X_test'])
    elif method == 'normalized':
        self.data['X_train'] = preprocessing.normalize(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.normalize(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.normalize(self.orig_data['X_test'])
    self.data['y_train'] = self.orig_data['y_train']
    self.data['y_val'] = self.orig_data['y_val']
    self.data['y_test'] = self.orig_data['y_test']
def get_sils_matrix(method, scores, wordlist):
    '''
    See get_sims_matrix for definitions, which are the same here. The
    difference is that the resulting matrix contains distances instead of
    similarities.

    :return: 2-dimensional np.ndarray of size len(wordlist) x len(wordlist)
    '''
    if method == 'direct':
        sims = get_sims_matrix(method, scores, wordlist)
        sims = preprocessing.normalize(np.matrix(sims), norm='l2')
        sils = 1 - sims
    elif method == 'dict_cosine':  # cosine dist of word-PPDB2.0Score matrix
        sils = np.array([[dict_cosine_dist(scores.get(i, {}), scores.get(j, {}))
                          for j in wordlist] for i in wordlist])
    elif method == 'dict_JS':  # JS divergence of word-PPDB2.0Score matrix
        sils = np.array([[dict_js_divergence(scores.get(i, {}), scores.get(j, {}))[0]
                          for j in wordlist] for i in wordlist])
    elif method == 'vec_cosine':
        d = scores.values()[0].shape[0]
        sils = np.array([[cosine(scores.get(i, np.zeros(d)), scores.get(j, np.zeros(d)))
                          for j in wordlist] for i in wordlist])
    else:
        sys.stderr.write('Unknown sil method: %s' % method)
        return None
    sils = np.nan_to_num(sils)
    return sils
def __init__(self, n_clusters=5, posterior_type='soft', force_weights=None,
             n_init=10, n_jobs=1, max_iter=300, verbose=False,
             init='random-class', random_state=None, tol=1e-6, copy_x=True,
             normalize=True):
    self.n_clusters = n_clusters
    self.posterior_type = posterior_type
    self.force_weights = force_weights
    self.n_init = n_init
    self.n_jobs = n_jobs
    self.max_iter = max_iter
    self.verbose = verbose
    self.init = init
    self.random_state = random_state
    self.tol = tol
    self.copy_x = copy_x
    self.normalize = normalize

    # results from algorithm
    self.cluster_centers_ = None
    self.labels = None
    self.intertia_ = None
    self.weights_ = None
    self.concentrations_ = None
    self.posterior_ = None
def fit(self, X, y=None):
    """Compute mixture of von Mises Fisher clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
    """
    if self.normalize:
        X = normalize(X)

    self._check_force_weights()
    random_state = check_random_state(self.random_state)
    X = self._check_fit_data(X)

    (self.cluster_centers_, self.labels_, self.inertia_, self.weights_,
     self.concentrations_, self.posterior_) = movMF(
        X, self.n_clusters, posterior_type=self.posterior_type,
        force_weights=self.force_weights, n_init=self.n_init,
        n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
        init=self.init, random_state=random_state, tol=self.tol,
        copy_x=self.copy_x
    )

    return self
def transform(self, X, y=None):
    """Transform X to a cluster-distance space.

    In the new space, each dimension is the cosine distance to the cluster
    centers. Note that even if X is sparse, the array returned by
    `transform` will typically be dense.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to transform.

    Returns
    -------
    X_new : array, shape [n_samples, k]
        X transformed in the new space.
    """
    if self.normalize:
        X = normalize(X)

    check_is_fitted(self, 'cluster_centers_')
    X = self._check_test_data(X)
    return self._transform(X)
def predict(self, X):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Note: Does not check that each point is on the sphere.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    if self.normalize:
        X = normalize(X)

    check_is_fitted(self, 'cluster_centers_')
    X = self._check_test_data(X)
    return _labels_inertia(X, self.cluster_centers_)[0]
def test_cosine_similarity():
    # Test the cosine_similarity.
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)