The following 35 code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.Normalizer().
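As a quick orientation before the project examples: Normalizer rescales each sample (row) independently to unit norm ('l1', 'l2', or 'max') and is effectively stateless, so fit() only validates the input. Below is a minimal, self-contained sketch; the 2x2 array is purely illustrative and not taken from any of the projects that follow.

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[4.0, 3.0],
              [1.0, -1.0]])

# Each row is scaled independently; fit() learns nothing, transform() does the work.
l2 = Normalizer(norm='l2').fit_transform(X)   # rows have Euclidean length 1
l1 = Normalizer(norm='l1').fit_transform(X)   # row absolute values sum to 1
mx = Normalizer(norm='max').fit_transform(X)  # rows divided by their max absolute value

print(l2)  # [[0.8, 0.6], [0.7071..., -0.7071...]]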
def test_boston_OHE_pipeline(self):
    data = load_boston()

    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        # Put it in a pipeline so that we can test whether the output dimension
        # handling is correct.
        model = Pipeline([("OHE", OneHotEncoder(categorical_features=categorical_features)),
                          ("Normalizer", Normalizer())])
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, 'out').get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data.copy())]

        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
def decompose(doc_vecs, n_features=100, normalize=False, flip=False):
    svd = TruncatedSVD(n_features)
    if normalize:
        if flip:
            lsa = make_pipeline(svd, Normalizer(copy=False))
            doc_mat = lsa.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            lsa = make_pipeline(svd, Normalizer(copy=False))
            doc_mat = lsa.fit_transform(doc_vecs)
        return doc_mat
    else:
        if flip:
            doc_mat = svd.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            doc_mat = svd.fit_transform(doc_vecs)
        return doc_mat
def train(labeled_featuresets, C=1e5):
    """
    :param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples ``(featureset, label)``.
    """
    feat = [featureset for featureset, label in labeled_featuresets]
    feature_vectorizer = MVectorizer.DictsVectorizer()
    X = feature_vectorizer.fit_transform(feat)
    X = Normalizer().fit_transform(X)
    label_set = set([label for featureset, label in labeled_featuresets])
    label_vectorizer = dict([(label, num) for num, label in enumerate(label_set)])
    y = numpy.array([label_vectorizer[label] for featureset, label in labeled_featuresets])
    # print "Training on %d examples with %d features..." % (X.shape[0], X.shape[1]),
    classifier = OneVsRestClassifier(LinearSVC(loss='squared_hinge', penalty='l2',
                                               dual=True, tol=1e-5, C=C))
    classifier.fit(X, y)
    # print "done"
    return scikit_classifier(feature_vectorizer, label_vectorizer, classifier)
def pool(biz_dict, vlad_dict, mode):
    if mode == 'train':
        y_dict = read_y()
        y = np.zeros((0, 9))
    x = np.array([])
    x_vlad = np.array([])

    for key, value in sorted(biz_dict.items()):
        avg = np.array(value).sum(axis=0) / len(value)
        vlad = vlad_dict.get(key)
        # vlad = preprocessing.normalize(vlad)
        # print(vlad.shape)
        # feat = np.concatenate([avg, vlad], axis=0)
        # feat = preprocessing.Normalizer().fit_transform(feat)
        # feat = avg
        x = np.vstack((x, avg)) if x.size else avg
        x_vlad = np.vstack((x_vlad, vlad)) if x_vlad.size else vlad
        if mode == 'train':
            y = np.vstack((y, y_dict.get(key)))

    return (x, x_vlad, y) if mode == 'train' else (x, x_vlad)
def main():
    features = []

    for i in list:
        im = cv2.imread(i)
        hist, bins = np.histogram(im.ravel(), 256, [0, 256])
        features.append(hist)

    lsa = TruncatedSVD(10)
    features = lsa.fit_transform(features)
    features = Normalizer(copy=False).fit_transform(features)

    km = KMeans(
        init='k-means++',
        n_clusters=n_clusters,
    )
    km.fit(features)

    for i in range(n_clusters):
        if not os.path.exists('./result/' + str(i)):
            os.makedirs('./result/' + str(i))

    cnt = 0
    for i in list:
        filename = i.split('/')[-1]
        print filename,
        print km.labels_[cnt]
        shutil.copyfile(i, './result/' + str(km.labels_[cnt]) + '/' + filename)
        cnt += 1
def convert(model, input_features, output_features):
    """Convert a normalizer model to the protobuf spec.

    Parameters
    ----------
    model: Normalizer
        A Normalizer.

    input_features: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Normalizer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'norm'))

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION
    spec = _set_transform_interface_params(spec, input_features, output_features)

    # Set the normalizer parameters
    _normalizer_spec = spec.normalizer
    if model.norm == 'l1':
        _normalizer_spec.normType = _proto__normalizer.L1
    elif model.norm == 'l2':
        _normalizer_spec.normType = _proto__normalizer.L2
    elif model.norm == 'max':
        _normalizer_spec.normType = _proto__normalizer.LMax

    return _MLModel(spec)
def truncated_svd(self):
    # https://github.com/chrisjmccormick/LSA_Classification/blob/master/inspect_LSA.py
    svd = TruncatedSVD(self.dimensions)
    lsa = make_pipeline(svd, Normalizer(copy=False))
    X_reduced = lsa.fit_transform(self.bag_of_words_matrix)

    print(svd.components_[0])
    print(svd.explained_variance_ratio_)
    print(svd.explained_variance_ratio_.sum())
def get_data_preprocessor_rescaling(params):
    dpr = None
    d_rescaling = params['layer_dict_list'][0]

    if params['rescaling'] == str(d_rescaling['None']) or params['rescaling'] == 'None':
        dpr = None
    elif params['rescaling'] == str(d_rescaling['MinMax']) or params['rescaling'] == 'MinMax':
        dpr = MinMaxScaler()
    elif params['rescaling'] == str(d_rescaling['Standardize']) or params['rescaling'] == 'Standardize':
        dpr = StandardScaler()
    elif params['rescaling'] == str(d_rescaling['Normalize']) or params['rescaling'] == 'Normalize':
        dpr = Normalizer()

    return dpr
def test_random(self):
    # Generate some random data_imputeValue.multiArrayValue[i]
    X = _np.random.random(size=(50, 3))

    for param in ('l1', 'l2', 'max'):
        cur_model = Normalizer(norm=param)

        output = cur_model.fit_transform(X)

        spec = converter.convert(cur_model, ["a", 'b', 'c'], 'out')

        metrics = evaluate_transformer(spec,
                                       [dict(zip(["a", "b", "c"], row)) for row in X],
                                       [{"out": row} for row in output])
def test_boston(self):
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = Normalizer(norm='l2').fit(scikit_data.data)

    spec = converter.convert(scikit_model, scikit_data.feature_names, 'out')

    input_data = [dict(zip(scikit_data.feature_names, row)) for row in scikit_data.data]
    output_data = [{"out": row} for row in scikit_model.transform(scikit_data.data)]

    evaluate_transformer(spec, input_data, output_data)
def make_ward_clustering(self, short_filenames, input_texts):

    output_dir = self.output_dir + 'WARD/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    ward = AgglomerativeClustering(n_clusters=self.ward_clusters_count, linkage='ward')
    predict_result = ward.fit_predict(X)

    self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += (' ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Written to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def make_spectral_clustering(self, short_filenames, input_texts):

    output_dir = self.output_dir + 'spectral/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    spectral = SpectralClustering(n_clusters=self.spectral_clusters_count)
    predict_result = spectral.fit_predict(X)

    self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += (' ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Written to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)

# aa = Affinity Propagation
def make_aa_clustering(self, short_filenames, input_texts):

    output_dir = self.output_dir + 'affinity_propagation/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    aa_clusterizator = AffinityPropagation(damping=self.aa_damping,
                                           max_iter=self.aa_max_iter,
                                           convergence_iter=self.aa_no_change_stop)
    predict_result = aa_clusterizator.fit_predict(X)

    self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += (' ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Written to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def make_birch_clustering(self, short_filenames, input_texts):

    output_dir = self.output_dir + 'birch/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    birch = Birch(threshold=self.birch_threshold,
                  branching_factor=self.birch_branching_factor,
                  n_clusters=self.birch_clusters_count)
    predict_result = birch.fit_predict(X)

    self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += (' ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Written to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def avg_spelling_error(lang=None):
    pipeline = Pipeline([('feature', SpellingError(language=lang)),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('avg_spelling_error', pipeline)
def punctuation_features():
    pipeline = Pipeline([('feature', PunctuationFeatures()),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('punctuation_features', pipeline)
def word_bigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    pipeline = Pipeline([('vect', CountVectorizer(preprocessor=preprocessor,
                                                  ngram_range=(2, 2))),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_bigrams', pipeline)
def char_ngrams():
    vectorizer = CountVectorizer(min_df=1,
                                 preprocessor=TextCleaner(filter_urls=True,
                                                          filter_mentions=True,
                                                          filter_hashtags=True,
                                                          lowercase=False),
                                 analyzer='char_wb',
                                 ngram_range=(4, 4))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('char_ngrams', pipeline)
def doPCA(X, output_columns_count):
    # Do PCA on the data and use it to transform
    svd = TruncatedSVD(output_columns_count)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)
    return X
def normalize(matrix):
    '''Normalize each row (L2-norm) of a CSR sparse matrix
    (it should work with most sparse matrices though)'''
    sparsy = matrix.tocoo()
    data = [float(d) for d in sparsy.data]
    return Normalizer().transform(csr_matrix((data, (sparsy.row, sparsy.col))))

#
# Simple tests
#
def batch_classify(self, featuresets):
    X = self.feature_vectorizer.transform(featuresets)
    X = Normalizer().fit_transform(X)
    y = self.classifier.predict(X)
    return [self.inverse_label_vectorizer[cls] for cls in y]
def classify(self, featureset):
    X = self.feature_vectorizer.transform([featureset])
    X = Normalizer().fit_transform(X)
    y = self.classifier.predict(X)
    assert(len(y) == 1)
    return self.inverse_label_vectorizer[y[0]]
def l2_norm(dataset, **kwargs):
    return prep.Normalizer(norm='l2', copy=True).fit_transform(dataset)
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples.

    Returns
    -------
    self : detector
        Return self.
    """
    X = check_array(X)

    if not self.assume_normalized:
        self._normalizer = Normalizer().fit(X)
        X = self._normalizer.transform(X)

    mean = np.mean(X, axis=0)
    self.mean_direction_ = mean / np.linalg.norm(mean)

    self.y_score_ = self.anomaly_score(X)
    df, loc, scale = chi2.fit(self.y_score_)
    self.threshold_ = chi2.ppf(1.0 - self.fpr, df, loc, scale)

    return self
def __init__(self):
    self.scaler = preprocessing.StandardScaler()
    self.normer = preprocessing.Normalizer()
def featuresByLSA(features, ncomponents=100):
    svd = TruncatedSVD(n_components=ncomponents)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    dtm_lsa = lsa.fit_transform(features)
    return dtm_lsa
def test_Normalizer():
    '''
    test the method
    :return: None
    '''
    X = [[1, 2, 3, 4, 5],
         [5, 4, 3, 2, 1],
         [1, 3, 5, 2, 4],
         [2, 4, 1, 3, 5]]
    print("before transform:", X)
    normalizer = Normalizer(norm='l2')
    print("after transform:", normalizer.transform(X))
def test_normalizer():
    from sklearn.preprocessing import Normalizer
    arr = np.array([[3, -1], [-4, 2]])
    print Normalizer().fit_transform(arr)
    # [[ 0.9486833  -0.31622777]
    #  [-0.89442719  0.4472136 ]]
def get_pipeline_builder():
    pipe_builder = PipelineBuilder()

    # Feature Extraction
    params = {'ngram_range': [(1, 1), (1, 2), (1, 3)]}
    pipe_builder.add_extractor('CountVectorizer', CountVectorizer, 'Count Vectorizer', params)

    params = {}
    pipe_builder.add_extractor('HashingVectorizer', HashingVectorizer, 'Hashing Vectorizer', params)

    params = {}
    pipe_builder.add_extractor('TfidfVectorizer', TfidfVectorizer, 'TfIdf Vectorizer', params)

    # Dimension Reduction
    params = {}
    pipe_builder.add_reductor('No_Reduction', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_reductor('TruncatedSVD', TruncatedSVD, 'Truncated SVD', params)

    # Normalization
    params = {}
    pipe_builder.add_normalizer('No_Normalization', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_normalizer('Normalizer', Normalizer, 'Normalizer', params)

    # Classification Models
    params = {}
    pipe_builder.add_classifier('MultinomialNB', MultinomialNB, 'Multinomial Naive Bayes', params)

    params = {}
    pipe_builder.add_classifier('BernoulliNB', BernoulliNB, 'Bernoulli Naive Bayes', params)

    params = {}
    pipe_builder.add_classifier('KNeighborsClassifier', KNeighborsClassifier, 'K-Neighbors', params)

    params = {}
    pipe_builder.add_classifier('RadiusNeighborsClassifier', RadiusNeighborsClassifier, 'Radius Neighbors', params)

    return pipe_builder
def make_k_means_clustering(self, short_filenames, input_texts):

    output_dir = self.output_dir + 'K_MEANS/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    km = KMeans(n_clusters=self.kmeans_cluster_count, init='k-means++', max_iter=100, n_init=10)
    km.fit(X)

    predict_result = km.predict(X)

    self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += (' ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Written to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.signals.PrintInfo.emit('')
    self.signals.PrintInfo.emit('Cluster centers:')
    for index, cluster_center in enumerate(km.cluster_centers_):
        self.signals.PrintInfo.emit(' ' + str(index) + ':' + str(cluster_center))

    self.draw_clusters_plot(X, predict_result, short_filenames)
def make_dbscan_clustering(self, short_filenames, input_texts):

    output_dir = self.output_dir + 'DBSCAN/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Computing TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    db = DBSCAN(eps=self.dbscan_eps, min_samples=self.dbscan_min_pts)
    predict_result = db.fit_predict(X)
    db.fit(X)

    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')
    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += (' ' + str(document) + '\n')
        clasters_output += '\n'

    clasters_output += ('Noise points (-1):\n')
    for predict, document in zip(predict_result, short_filenames):
        if predict == -1:
            clasters_output += (' ' + str(document) + '\n')
    clasters_output += '\n'

    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Written to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def new(stop_words=[], decomposition='SVD', n_components=5):
    # Prepare vectoriser engines
    idf = TfidfVectorizer(
        ngram_range=(1, 3),  # Unigram, bigram, & trigram
        stop_words=stop_words
    )

    # Prepare normaliser
    norm = Normalizer(norm='max')
    print(colored('Texthasher model created', 'yellow'))

    # Prepare dimensionality reduction
    if decomposition and n_components:
        if decomposition == 'LDA':  # Results in non-negative matrix
            reducer = LatentDirichletAllocation(  # TFIDF --> Topic term
                n_topics=n_components,
                max_doc_update_iter=20,
                max_iter=8
            )
            return [idf, norm, reducer]

        elif decomposition == 'SVD':
            reducer = TruncatedSVD(  # Best for small dataset,
                n_components,        # nightmare for large dataset
                n_iter=8)            # Damn slow
            return [idf, norm, reducer]

        elif decomposition == 'PCA':
            # When using IPCA, remember to always keep:
            # n_samples > n_components > batch_size
            # reducer = IncrementalPCA(n_components)

            # Sparse -> Dense greedily consumes large amount of mem
            # to_dense = SparseToDense()
            # return [idf, norm, to_dense, reducer]

            reducer = SparsePCA(n_components)
            return [idf, norm, reducer]

        return [idf, norm]
    else:
        return [idf, norm]
def __init__(self, num_class=2):
    """
    :type num_classes: int
    :rtype: None
    """
    self.__ctrl__ = None
    self.__case__ = None

    with open('../../.dbname', 'r') as f:
        self.__DB_NAME__ = json.load(f)['dbname']

    self.__MG_DOCS_COLL__ = 'raw-docs'          # raw docs
    self.__MG_SENTS_COLL__ = 'bag-of-sents'     # raw sentences
    self.__MG_TOKENS_COLL__ = 'sample-tokens'   # clean tokens (words)
    self.__PG_STATS_TBL__ = 'stats'             # stylometric features
    self.__PG_RESULTS_TBL__ = 'results_' + \
                              str(num_class) + \
                              'class'           # cross val results
    self.__PG_PROBAS_TBL__ = 'probabilities'    # cross val probabilities

    self.__model__ = Pipeline([
        # ('scaler2', StandardScaler()),
        # ('scaler', MinMaxScaler()),
        # ('scaler3', Normalizer()),
        ('classifier', SVC(probability=True,
                           kernel='poly',
                           degree=2,
                           class_weight='balanced')
                       if num_class - 1
                       else OneClassSVM(kernel='rbf', nu=0.7, gamma=1. / 250))
    ])
    print 'Instantiated classifier %s.' % \
          self.__model__.named_steps['classifier'].__class__.__name__

    self.__io__ = DBIO(MG_DB_NAME=self.__DB_NAME__, PG_DB_NAME=self.__DB_NAME__)

    self.__tagger__ = None      # initialise if re-creating samples
    self.__bootstrap__ = None   # initialise in fit