Python sklearn.preprocessing module: Normalizer() example source code

The following 35 code examples, drawn from open-source Python projects, illustrate how to use sklearn.preprocessing.Normalizer().
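Before the project snippets, here is a minimal, self-contained sketch of what Normalizer does (the array values are illustrative only, not taken from any project below): it rescales each sample, i.e. each row, independently to unit norm.

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[4.0, 3.0],
              [1.0, 1.0]])

# norm='l2' (the default) divides each row by its Euclidean length;
# norm='l1' and norm='max' divide by the absolute sum and the maximum absolute value instead.
print(Normalizer(norm='l2').fit_transform(X))
# [[0.8        0.6       ]
#  [0.70710678 0.70710678]]

Normalizer is stateless: fit() learns nothing from the data, which is why several of the snippets below simply call fit_transform() on whatever data they are given.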

Project: coremltools    Author: apple    | project source | file source
def test_boston_OHE_pipeline(self): 
        data = load_boston()

        for categorical_features in [ [3], [8], [3, 8], [8,3] ]:

            # Put it in a pipeline so that we can test whether the output dimension
            # handling is correct. 

            model = Pipeline([("OHE", OneHotEncoder(categorical_features = categorical_features)),
                 ("Normalizer", Normalizer())])

            model.fit(data.data.copy(), data.target)

            # Convert the model
            spec = sklearn.convert(model, data.feature_names, 'out').get_spec()

            input_data = [dict(zip(data.feature_names, row)) for row in data.data]
            output_data = [{"out" : row} for row in model.transform(data.data.copy())]

            result = evaluate_transformer(spec, input_data, output_data)

            assert result["num_errors"] == 0
Project: magic    Author: pan-webis-de    | project source | file source
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
Project: document_classification    Author: scotthlee    | project source | file source
def decompose(doc_vecs, n_features=100, normalize=False, flip=False):
    svd = TruncatedSVD(n_features)  
    if normalize:   
        if flip:
            lsa = make_pipeline(svd, Normalizer(copy=False))
            doc_mat = lsa.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            lsa = make_pipeline(svd, Normalizer(copy=False))        
            doc_mat = lsa.fit_transform(doc_vecs)
        return doc_mat
    else:
        if flip:
            doc_mat = svd.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            doc_mat = svd.fit_transform(doc_vecs)
        return doc_mat
Project: QuestionAnswerNLP    Author: debjyoti385    | project source | file source
def train(labeled_featuresets, C=1e5):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        feat = [featureset for featureset, label in labeled_featuresets]
        feature_vectorizer = MVectorizer.DictsVectorizer()
        X = feature_vectorizer.fit_transform(feat)
        X = Normalizer().fit_transform(X)
        label_set = set( [label for featureset, label in labeled_featuresets] )
        label_vectorizer = dict( [(label,num) for num,label in enumerate(label_set)] )
        y = numpy.array([label_vectorizer[label] for featureset, label in labeled_featuresets])
        # print "Training on %d examples with %d features..."%(X.shape[0],X.shape[1]),
        classifier = OneVsRestClassifier(LinearSVC(loss='squared_hinge', penalty='l2', dual=True, tol=1e-5, C=C))
        classifier.fit(X,y)
        # print "done"

        return scikit_classifier(feature_vectorizer,label_vectorizer,classifier)
Project: kaggle-yelp-restaurant-photo-classification    Author: u1234x1234    | project source | file source
def pool(biz_dict, vlad_dict, mode):
    if mode == 'train':
        y_dict = read_y()
    y = np.zeros((0, 9))
    x = np.array([])
    x_vlad = np.array([])

    for key, value in sorted(biz_dict.items()):
        avg = np.array(value).sum(axis=0) / len(value)
        vlad = vlad_dict.get(key)
#        vlad = preprocessing.normalize(vlad)
#        print(vlad.shape)
#        feat = np.concatenate([avg, vlad], axis=0)
#        feat = preprocessing.Normalizer().fit_transform(feat)
#        feat = avg
        x = np.vstack((x, avg)) if x.size else avg
        x_vlad = np.vstack((x_vlad, vlad)) if x_vlad.size else vlad

        if mode == 'train':
            y = np.vstack((y, y_dict.get(key)))        
    return (x, x_vlad, y) if mode == 'train' else (x, x_vlad)
Project: onionstack    Author: ntddk    | project source | file source
def main():
    features = []

    for i in list:
        im = cv2.imread(i)
        hist, bins = np.histogram(im.ravel(), 256, [0, 256])
        features.append(hist)

    lsa = TruncatedSVD(10)
    features = lsa.fit_transform(features)
    features = Normalizer(copy = False).fit_transform(features)

    km = KMeans(
        init='k-means++',
        n_clusters=n_clusters,
    )
    km.fit(features)

    for i in range(n_clusters):
        if not os.path.exists('./result/' + str(i)):
            os.makedirs('./result/' + str(i))

    cnt = 0

    for i in list:
        filename = i.split('/')[-1]
        print filename,
        print km.labels_[cnt]
        shutil.copyfile(i, './result/' +  str(km.labels_[cnt]) + '/' + filename)
        cnt += 1
Project: coremltools    Author: gsabran    | project source | file source
def convert(model, input_features, output_features):
    """Convert a normalizer model to the protobuf spec.

    Parameters
    ----------
    model: Normalizer
        A Normalizer.

    input_features: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """

    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Normalizer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'norm'))

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION
    spec = _set_transform_interface_params(spec, input_features, output_features)

    # Set the normalizer parameters
    _normalizer_spec = spec.normalizer
    if model.norm == 'l1':
        _normalizer_spec.normType = _proto__normalizer.L1
    elif model.norm == 'l2':
        _normalizer_spec.normType = _proto__normalizer.L2
    elif model.norm == 'max':
        _normalizer_spec.normType = _proto__normalizer.LMax
    return _MLModel(spec)
Project: nlp-lt    Author: minven    | project source | file source
def truncated_svd(self):
        # https://github.com/chrisjmccormick/LSA_Classification/blob/master/inspect_LSA.py
        svd = TruncatedSVD(self.dimensions)   
        lsa = make_pipeline(svd, Normalizer(copy=False))
        X_reduced = lsa.fit_transform(self.bag_of_words_matrix)
        print(svd.components_[0])
        print(svd.explained_variance_ratio_) 
        print(svd.explained_variance_ratio_.sum())
Project: FLASH    Author: yuyuz    | project source | file source
def get_data_preprocessor_rescaling(params):
    dpr = None
    d_rescaling = params['layer_dict_list'][0]

    if params['rescaling'] == str(d_rescaling['None']) or params['rescaling'] == 'None':
        dpr = None
    elif params['rescaling'] == str(d_rescaling['MinMax']) or params['rescaling'] == 'MinMax':
        dpr = MinMaxScaler()
    elif params['rescaling'] == str(d_rescaling['Standardize']) or params['rescaling'] == 'Standardize':
        dpr = StandardScaler()
    elif params['rescaling'] == str(d_rescaling['Normalize']) or params['rescaling'] == 'Normalize':
        dpr = Normalizer()

    return dpr
Project: coremltools    Author: apple    | project source | file source
def convert(model, input_features, output_features):
    """Convert a normalizer model to the protobuf spec.

    Parameters
    ----------
    model: Normalizer
        A Normalizer.

    input_features: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """

    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Normalizer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'norm'))

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION
    spec = _set_transform_interface_params(spec, input_features, output_features)

    # Set the normalizer parameters
    _normalizer_spec = spec.normalizer
    if model.norm == 'l1':
        _normalizer_spec.normType = _proto__normalizer.L1
    elif model.norm == 'l2':
        _normalizer_spec.normType = _proto__normalizer.L2
    elif model.norm == 'max':
        _normalizer_spec.normType = _proto__normalizer.LMax
    return _MLModel(spec)
Project: coremltools    Author: apple    | project source | file source
def test_random(self):
        # Generate some random data
        X = _np.random.random(size = (50, 3))

        for param in ('l1', 'l2', 'max'):

            cur_model = Normalizer(norm=param)

            output = cur_model.fit_transform(X)

            spec = converter.convert(cur_model, ["a", 'b', 'c'], 'out')

            metrics = evaluate_transformer(spec, 
                    [dict(zip(["a", "b", "c"], row)) for row in X], 
                    [{"out" : row} for row in output])
Project: coremltools    Author: apple    | project source | file source
def test_boston(self):
        from sklearn.datasets import load_boston

        scikit_data = load_boston()
        scikit_model = Normalizer(norm='l2').fit(scikit_data.data)

        spec = converter.convert(scikit_model, scikit_data.feature_names, 'out')

        input_data = [dict(zip(scikit_data.feature_names, row)) 
                for row in scikit_data.data]

        output_data = [{"out" : row} for row in scikit_model.transform(scikit_data.data)]

        evaluate_transformer(spec, input_data, output_data)
Project: TextStageProcessor    Author: mhyhre    | project source | file source
def make_ward_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'WARD/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        ward = AgglomerativeClustering(n_clusters=self.ward_clusters_count, linkage='ward')
        predict_result = ward.fit_predict(X)

        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
Project: TextStageProcessor    Author: mhyhre    | project source | file source
def make_spectral_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'spectral/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        spectral = SpectralClustering(n_clusters=self.spectral_clusters_count)
        predict_result = spectral.fit_predict(X)
        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)

    # aa = Affinity Propagation
Project: TextStageProcessor    Author: mhyhre    | project source | file source
def make_aa_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'affinity_propagation/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        aa_clusterizator = AffinityPropagation(damping=self.aa_damping,
                                               max_iter=self.aa_max_iter,
                                               convergence_iter=self.aa_no_change_stop)

        predict_result = aa_clusterizator.fit_predict(X)
        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
Project: TextStageProcessor    Author: mhyhre    | project source | file source
def make_birch_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'birch/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        birch = Birch(threshold=self.birch_threshold,
                      branching_factor=self.birch_branching_factor,
                      n_clusters=self.birch_clusters_count)

        predict_result = birch.fit_predict(X)
        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
Project: magic    Author: pan-webis-de    | project source | file source
def avg_spelling_error(lang=None):
    pipeline = Pipeline([('feature', SpellingError(language=lang)),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('avg_spelling_error', pipeline)
Project: magic    Author: pan-webis-de    | project source | file source
def punctuation_features():
    pipeline = Pipeline([('feature', PunctuationFeatures()),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('punctuation_features', pipeline)
Project: magic    Author: pan-webis-de    | project source | file source
def word_bigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    pipeline = Pipeline([('vect', CountVectorizer(preprocessor=preprocessor,
                                                  ngram_range=(2, 2))),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_bigrams', pipeline)
Project: magic    Author: pan-webis-de    | project source | file source
def char_ngrams():
    vectorizer = CountVectorizer(min_df=1,
                                 preprocessor=TextCleaner(filter_urls=True,
                                                          filter_mentions=True,
                                                          filter_hashtags=True,
                                                          lowercase=False),
                                 analyzer='char_wb',
                                 ngram_range=(4, 4))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('char_ngrams', pipeline)
Project: mlprojects-py    Author: srinathperera    | project source | file source
def doPCA(X, output_columns_count):
    #DO PCA on the data and use it to transform
    svd = TruncatedSVD(output_columns_count)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)
    return X
Project: QuestionAnswerNLP    Author: debjyoti385    | project source | file source
def normalize(matrix):
  '''Normalize each row (L2-norm) of a CSR sparse matrix (it should work with most sparse matrices though)'''
  sparsy = matrix.tocoo()
  data = [float(d) for d in sparsy.data]
  return Normalizer().transform(csr_matrix((data, (sparsy.row, sparsy.col))))


#
# Simple tests
#
Project: QuestionAnswerNLP    Author: debjyoti385    | project source | file source
def batch_classify(self, featuresets):
        X = self.feature_vectorizer.transform(featuresets)
        X = Normalizer().fit_transform(X)
        y = self.classifier.predict(X)
        return [self.inverse_label_vectorizer[cls] for cls in y]
Project: QuestionAnswerNLP    Author: debjyoti385    | project source | file source
def classify(self, featureset):
        X = self.feature_vectorizer.transform([featureset])
        X = Normalizer().fit_transform(X)
        y = self.classifier.predict(X)
        assert(len(y) == 1)
        return self.inverse_label_vectorizer[y[0]]
Project: AppsOfDataAnalysis    Author: nhanloukiala    | project source | file source
def l2_norm(dataset, **kwargs):
    return prep.Normalizer(norm='l2', copy=True).fit_transform(dataset)
Project: kenchi    Author: Y-oHr-N    | project source | file source
def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples.

        Returns
        -------
        self : detector
            Return self.
        """

        X                    = check_array(X)

        if not self.assume_normalized:
            self._normalizer = Normalizer().fit(X)
            X                = self._normalizer.transform(X)

        mean                 = np.mean(X, axis=0)
        self.mean_direction_ = mean / np.linalg.norm(mean)

        self.y_score_        = self.anomaly_score(X)
        df, loc, scale       = chi2.fit(self.y_score_)
        self.threshold_      = chi2.ppf(1.0 - self.fpr, df, loc, scale)

        return self
Project: mars_express    Author: wsteitz    | project source | file source
def __init__(self):
        self.scaler = preprocessing.StandardScaler()
        self.normer = preprocessing.Normalizer()
Project: DocumentClassification    Author: bahmanh    | project source | file source
def featuresByLSA(features,ncomponents=100):
    svd = TruncatedSVD(n_components=ncomponents)
    normalizer =  Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    dtm_lsa = lsa.fit_transform(features)
    return dtm_lsa
Project: ML-note    Author: JasonK93    | project source | file source
def test_Normalizer():
    '''
    test the method
    :return: None
    '''
    X=[   [1,2,3,4,5],
          [5,4,3,2,1],
          [1,3,5,2,4,],
          [2,4,1,3,5] ]
    print("before transform:",X)
    normalizer=Normalizer(norm='l2')
    print("after transform:",normalizer.transform(X))
Project: python_utils    Author: Jayhello    | project source | file source
def test_normalizer():
    from sklearn.preprocessing import Normalizer
    arr = np.array([[3, -1],
                    [-4, 2]])

    print Normalizer().fit_transform(arr)
    # [[ 0.9486833  -0.31622777]
    #  [-0.89442719  0.4472136 ]]
Project: texta    Author: texta-tk    | project source | file source
def get_pipeline_builder():

    pipe_builder = PipelineBuilder()

    # Feature Extraction
    params = {'ngram_range': [(1, 1), (1, 2), (1, 3)]}
    pipe_builder.add_extractor('CountVectorizer', CountVectorizer, 'Count Vectorizer', params)

    params = {}
    pipe_builder.add_extractor('HashingVectorizer', HashingVectorizer, 'Hashing Vectorizer', params)

    params = {}
    pipe_builder.add_extractor('TfidfVectorizer', TfidfVectorizer, 'TfIdf Vectorizer', params)

    # Dimension Reduction
    params = {}
    pipe_builder.add_reductor('No_Reduction', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_reductor('TruncatedSVD', TruncatedSVD, 'Truncated SVD', params)

    # Normalization
    params = {}
    pipe_builder.add_normalizer('No_Normalization', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_normalizer('Normalizer', Normalizer, 'Normalizer', params)

    # Classification Models
    params = {}
    pipe_builder.add_classifier('MultinomialNB', MultinomialNB, 'Multinomial Naive Bayes', params)

    params = {}
    pipe_builder.add_classifier('BernoulliNB', BernoulliNB, 'Bernoulli Naive Bayes', params)

    params = {}
    pipe_builder.add_classifier('KNeighborsClassifier', KNeighborsClassifier, 'K-Neighbors', params)

    params = {}
    pipe_builder.add_classifier('RadiusNeighborsClassifier', RadiusNeighborsClassifier, 'Radius Neighbors', params)

    return pipe_builder
Project: TextStageProcessor    Author: mhyhre    | project source | file source
def make_k_means_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'K_MEANS/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        km = KMeans(n_clusters=self.kmeans_cluster_count, init='k-means++', max_iter=100, n_init=10)
        km.fit(X)

        predict_result = km.predict(X)

        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)

        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.signals.PrintInfo.emit('')
        self.signals.PrintInfo.emit('Cluster centers:')
        for index, cluster_center in enumerate(km.cluster_centers_):
            self.signals.PrintInfo.emit('  ' + str(index) + ':' + str(cluster_center))

        self.draw_clusters_plot(X, predict_result, short_filenames)
Project: TextStageProcessor    Author: mhyhre    | project source | file source
def make_dbscan_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'DBSCAN/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        db = DBSCAN(eps=self.dbscan_eps, min_samples=self.dbscan_min_pts)
        predict_result = db.fit_predict(X)
        db.fit(X)

        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_

        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')
        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'

        clasters_output += ('Noise points (-1):\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == -1:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)

        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
Project: pantip-libr    Author: starcolon    | project source | file source
def new(stop_words=[],decomposition='SVD',n_components=5):

  # Prepare vectoriser engines
  idf = TfidfVectorizer(
    ngram_range=(1,3), #Unigram,bigram,& trigram
    stop_words=stop_words
  )

  # Prepare normaliser
  norm = Normalizer(norm='max')

  print(colored('Texthasher model created','yellow'))

  # Prepare dimensionality reduction
  if decomposition and n_components:
    if decomposition=='LDA': # Results in Non-negative matrix
      reducer = LatentDirichletAllocation( # TFIDF --> Topic term
        n_topics=n_components,
        max_doc_update_iter=20,
        max_iter=8  
      )
      return [idf,norm,reducer]

    elif decomposition=='SVD':
      reducer = TruncatedSVD( # Best for small dataset, 
        n_components,         # nightmare for large dataset
        n_iter=8) # Damn slow

      return [idf,norm,reducer]

    elif decomposition=='PCA':
      # When using IPCA, remember to always keep:
      # n_samples > n_components > batch_size
      # reducer = IncrementalPCA(n_components)

      # Sparse -> Dense greedily consumes large amount of mem
      # to_dense = SparseToDense()

      # return [idf,norm,to_dense,reducer]

      reducer = SparsePCA(n_components)
      return [idf,norm,reducer]

    return [idf,norm]
  else:
    return [idf,norm]
Project: satoshi-mission    Author: lilychai    | project source | file source
def __init__(self, num_class=2):
        """
        :type num_classes: int
        :rtype: None
        """

        self.__ctrl__ = None
        self.__case__ = None

        with open('../../.dbname', 'r') as f:
            self.__DB_NAME__ = json.load(f)['dbname']
        self.__MG_DOCS_COLL__   = 'raw-docs'           # raw docs
        self.__MG_SENTS_COLL__  = 'bag-of-sents'       # raw sentences
        self.__MG_TOKENS_COLL__ = 'sample-tokens'      # clean tokens (words)
        self.__PG_STATS_TBL__   = 'stats'              # stylometric features
        self.__PG_RESULTS_TBL__ = 'results_' + \
                                  str(num_class) + \
                                  'class'              # cross val results
        self.__PG_PROBAS_TBL__  = 'probabilities'      # cross val probabilities


        self.__model__ = Pipeline([ \
                                 # ('scaler2', StandardScaler()),
                                 # ('scaler', MinMaxScaler()),
                                 # ('scaler3', Normalizer()),
                                  ('classifier', SVC(probability=True,
                                                     kernel='poly',
                                                     degree=2,
                                                     class_weight='balanced') \
                                                 if num_class-1 \
                                            else OneClassSVM(kernel='rbf',
                                                             nu=0.7,
                                                             gamma=1./250))
                                 ])

        print 'Instantiated classifier %s.' % \
              self.__model__.named_steps['classifier'].__class__.__name__


        self.__io__ = DBIO(MG_DB_NAME=self.__DB_NAME__,
                           PG_DB_NAME=self.__DB_NAME__)

        self.__tagger__ = None     # initialise if re-creating samples
        self.__bootstrap__ = None  # initialise in fit