Python sklearn.feature_extraction module: DictVectorizer() code examples

The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_extraction.DictVectorizer().
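
Before the project snippets, here is a minimal, self-contained sketch of the basic DictVectorizer workflow (the feature names and values are illustrative only):

from sklearn.feature_extraction import DictVectorizer

# Each sample is a dict; string values are one-hot encoded, numeric values pass through.
measurements = [
    {'city': 'Dubai', 'temperature': 33.0},
    {'city': 'London', 'temperature': 12.0},
    {'city': 'San Francisco', 'temperature': 18.0},
]

vec = DictVectorizer(sparse=False)      # dense output for easy inspection
X = vec.fit_transform(measurements)     # shape (3, 4)
print(vec.get_feature_names())          # ['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']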

Project: kaggle-review    Author: daxiongshu
def onehot_encode_bar(tr,te,cols=None,bar=10000):
    if cols is None:
        cols = [i for i in tr.columns.values if i in te.columns.values]
    vec = DictVectorizer()
    cat,num = [],[]
    for col in cols:
        nu = tr[col].unique().shape[0]
        if (nu<bar and nu>2) or tr[col].dtype=='object':
            cat.append(col)
            tr[col] = tr[col].map(str)
            te[col] = te[col].map(str)
        else:
            num.append(col)
    print("start fitting num of cat features:",len(cat))
    X = vec.fit_transform(tr[cat].T.to_dict().values())
    Xt = vec.transform(te[cat].T.to_dict().values())
    print("done fitting",X.shape,Xt.shape)
    X = sparse.hstack([X,tr[num].values],format='csr')
    Xt = sparse.hstack([Xt,te[num].values],format='csr') 
    return X,Xt
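
A hypothetical usage sketch of the helper above, assuming the module-level imports the snippet relies on (pandas as pd, from scipy import sparse, and DictVectorizer):

# Toy DataFrames: 'color' (object dtype) is one-hot encoded, 'price' (2 distinct floats) stays numeric
tr = pd.DataFrame({'color': ['red', 'blue', 'red'], 'price': [1.0, 2.0, 1.0]})
te = pd.DataFrame({'color': ['blue', 'green'], 'price': [2.0, 1.0]})
X, Xt = onehot_encode_bar(tr, te)   # X: (3, 3) csr matrix, Xt: (2, 3); unseen 'green' becomes an all-zero encoding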
Project: DiscourseSenser    Author: WladimirSidorenko
def __init__(self, a_clf=None, a_grid_search=False):
        """Class constructor.

        Args:
          a_clf (classifier or None):
            classifier to use or None for default
          a_grid_search (bool): use grid search for estimating
            hyper-parameters

        """
        classifier = a_clf
        self._gs = a_grid_search
        if a_clf is None:
            classifier = XGBClassifier(max_depth=MAX_DEPTH,
                                       n_estimators=NTREES,
                                       learning_rate=ALPHA,
                                       objective="multi:softprob")
            self._clf = classifier
        # latest version of XGBoost cannot deal with non-sparse feature vectors
        self._model = Pipeline([("vect", DictVectorizer()),
                                ("clf", classifier)])
Project: auto_ml    Author: doordash
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
        # First, restrict our DictVectorizer or DataFrameVectorizer
        # This goes through and has DV only output the items that have passed our support mask
        # This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
        # It also significantly reduces the size of dv.vocabulary_ which can get quite large

        dv = transformation_pipeline.named_steps['dv']

        try:
            feature_selection = transformation_pipeline.named_steps['feature_selection']
            feature_selection_mask = feature_selection.support_mask
            dv.restrict(feature_selection_mask)
        except KeyError:
            pass

        # We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline
        # In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow
        trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)

        return trained_pipeline_without_feature_selection
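
The restrict() call above is a standard DictVectorizer method; a minimal standalone sketch of the same pattern (applying a feature-selection support mask back onto the vectorizer) might look like this:

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2

D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
y = [0, 1]

dv = DictVectorizer()
X = dv.fit_transform(D)

support = SelectKBest(chi2, k=2).fit(X, y).get_support()   # boolean mask over the fitted features
dv.restrict(support)                 # dv now outputs only the selected features
print(dv.get_feature_names())        # the two surviving feature names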
Project: dmon-adp    Author: igabriel85
def ohEncoding(data, cols, replace=False):
    if cols is None:
        cols = []
        for el, v in data.dtypes.items():
            if v == 'object':
                if el == 'key':
                    pass
                else:
                    cols.append(el)
        print("Categorical features not set, detected as categorical: %s" % str(cols))
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec


# df, t, v = ohEncoding(df, col, replace=True)
Project: dmon-adp    Author: igabriel85
def ohEncoding(data, cols=None, replace=False):
    if cols is None:
        cols = []
        for el, v in data.dtypes.items():
            if v == 'object':
                cols.append(el)
        print("Categorical features not set, detected as categorical: %s" % str(cols))
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec
Project: StrepHit    Author: Wikidata
def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
        """ Initializes the extractor.

            :param language: The language of the sentences that will be used
            :param window_width: how many tokens to look at before and after each
             token when building its features.
            :param collapse_fes: Whether to collapse FEs to a single token
             or to keep them split.
            :param target_size: if set, reduce the feature vectors to this many
             dimensions with truncated SVD.
        """
        self.language = language
        self.tagger = TTPosTagger(language)
        self.window_width = window_width
        self.collapse_fes = collapse_fes
        self.unk_feature = 'UNK'
        self.vectorizer = DictVectorizer()
        self.target_size = target_size
        self.reducer = TruncatedSVD(target_size) if target_size else None
        self.vocabulary = set()
        self.label_index = {}
        self.lu_index = {}
        self.stopwords = set(w.lower() for w in StopWords().words(language))
        self.start()
Project: DiscourseSenser    Author: WladimirSidorenko
def __init__(self, a_clf=None, a_grid_search=False):
        """Class constructor.

        Initialize classifier.

        Args:
          a_clf (classifier or None):
            classifier to use or None for default
          a_grid_search (bool): use grid search for estimating hyper-parameters

        """
        classifier = a_clf or LinearSVC(C=DFLT_C,
                                        **DFLT_PARAMS)
        self._gs = a_grid_search
        self._model = Pipeline([("vect", DictVectorizer()),
                                ("clf", classifier)])
Project: SPHERE-HyperStream    Author: IRC-SPHERE
def _execute(self, sources, alignment_stream, interval):
        time_interval = TimeInterval(MIN_DATE, interval.end)
        param_doc = sources[0].window(time_interval, force_calculation=True).last()
        if param_doc is None:
            logging.debug("No model found in {} for time interval {}".format(sources[0].stream_id, time_interval))
            return

        steps = deserialise_json_pipeline({
            'vectorisation': DictVectorizer(sparse=False),
            'fill_missing': FillZeros(),
            'classifier': LinearDiscriminantAnalysis(),
            'label_encoder': LabelEncoder()
        }, param_doc.value)

        clf = Pipeline([(kk, steps[kk]) for kk in ('vectorisation', 'fill_missing', 'classifier')])
        locations = steps['label_encoder'].classes_

        data = sources[1].window(interval, force_calculation=True)
        for tt, dd in data:
            yield StreamInstance(tt, {locations[ii]: pp for ii, pp in enumerate(clf.predict_proba(dd)[0])})
Project: Price-Comparator    Author: Thejas-1
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
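
A brief standalone illustration of the dtype / sparse options described in the docstring above (not part of the wrapper class itself):

from sklearn.feature_extraction import DictVectorizer

feats = [{'contains(good)': True, 'contains(bad)': False}]

dense_vec = DictVectorizer(dtype=float, sparse=False)
print(dense_vec.fit_transform(feats))         # plain numpy array: [[0. 1.]]

sparse_vec = DictVectorizer(dtype=float, sparse=True)
print(type(sparse_vec.fit_transform(feats)))  # scipy.sparse CSR matrix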
Project: 100knock2016    Author: tmu-nlp
def predict_function():
    x_list = []
    line_list = []
    line_dict = {}
    predict_doc = joblib.load('logreg.pkl')
    feature_doc = joblib.load("word_vec.pkl")
    y_train, x_train = get_feature()
    line = "bad bad good good"
    line_list = line.split()
    for line in x_train:
        for key in line:
            line_dict[key] = 0
    line_dict.update(dict(Counter(line_list)))
    for a in sorted(line_dict.items(), key = lambda x:x[1]):
        print(a)
    x_list.append(line_dict)
    print(x_list)
    exit()
    X = DictVectorizer().fit_transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    for pred, prob in zip(pred,prob):
        print(pred, prob)
Project: 100knock2016    Author: tmu-nlp
def dimension_reduction():
    X = PPMI_matrix()
    word_list = list()
    vecdict_list = list()
    for word, vector in sorted(X.items()):
        word_list.append(word)
        vecdict_list.append(dict(vector))
    Dic2Vec = DictVectorizer(sparse=True)
    vector_list = Dic2Vec.fit_transform(vecdict_list)

    X_svd = svds(vector_list, 300)
    X_pca = np.dot(X_svd[0], np.diag(X_svd[1]))
    word_matrix = dict()
    for word, vector in zip(word_list, X_pca):
        word_matrix[word] = vector

    return word_matrix
Project: 100knock2016    Author: tmu-nlp
def predict_function():
    x_list = []
    line_list = []
    line_dict = {}
    predict_doc = joblib.load('logreg.pkl')
    feature_doc = joblib.load("word_vec.pkl")
    y_train, x_train = get_feature()
    line = "bad bad good good"
    line_list = line.split()
    for line in x_train:
        for key in line:
            line_dict[key] = 0
    line_dict.update(dict(Counter(line_list)))
    for a in sorted(line_dict.items(), key = lambda x:x[1]):
        print(a)
    x_list.append(line_dict)
    print(x_list)
    exit()
    X = DictVectorizer().fit_transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    for pred, prob in zip(pred,prob):
        print(pred, prob)
Project: 100knock2016    Author: tmu-nlp
def dimension_compression():
    X_t_c = make_matrix()
    token_list = []
    contexts_list = []
    for token, contexts in sorted(X_t_c.items()):
        token_list.append(token)
        contexts_list.append(contexts)

    pca = PCA(n_components = 300)
    DictoVec = DictVectorizer(sparse = True)

    sparse = DictoVec.fit_transform(contexts_list)

    print(sparse.shape)

    vec_list = pca.fit_transform(sparse.todense())

    word_vec = {}
    for token, vec in zip(token_list, vec_list):
        word_vec[token] = vec

    return word_vec
Project: 100knock2016    Author: tmu-nlp
def dim_reduction():
  dic2vec = DictVectorizer(sparse=True)
  PPMI = getPPMI()
  tc = list()
  token_list = list()
  for token, contexts in sorted(PPMI.items()):
    token_list.append(token)
    contexts = dict(contexts)
    tc.append(contexts)

  tc_vec = dic2vec.fit_transform(tc)
  tc_svd = svds(tc_vec, 300)
  tc_pca = np.dot(tc_svd[0], np.diag(tc_svd[1]))

  word_vec = dict()
  for token, vec in zip(token_list, tc_pca):
    word_vec[token] = vec

  return word_vec
Project: auto_ml    Author: ClimbsRocks
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
        # First, restrict our DictVectorizer or DataFrameVectorizer
        # This goes through and has DV only output the items that have passed our support mask
        # This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
        # It also significantly reduces the size of dv.vocabulary_ which can get quite large

        try:
            feature_selection = transformation_pipeline.named_steps['feature_selection']
            feature_selection_mask = feature_selection.support_mask
            transformation_pipeline.named_steps['dv'].restrict(feature_selection_mask)
        except KeyError:
            pass

        # We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline
        # In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow
        trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)

        return trained_pipeline_without_feature_selection
Project: MLLearning    Author: buptdjd
def data2Vector(self):
        vec = DictVectorizer()
        dummy_x = vec.fit_transform(self.feature_list).toarray()
        lb = LabelBinarizer()
        dummy_y = lb.fit_transform(self.label_list)
        return dummy_x, dummy_y

    # The decision tree here uses the ID3 algorithm, which selects features by
    # information gain.
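
The tree-building method the comment refers to is not shown here; as a rough standalone sketch of information-gain-based splitting on the dummy_x / dummy_y produced by data2Vector() above (an assumption, not the project's code):

from sklearn.tree import DecisionTreeClassifier

# criterion='entropy' makes scikit-learn's tree split on information gain, as ID3 does
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(dummy_x, dummy_y)
print(clf.predict(dummy_x[:1]))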
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: neighborhood_mood_aws    Author: jarrellmark
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: hate-to-hugs    Author: sdoran35
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: partisan-discourse    Author: DistrictDataLabs
def transform(self, documents):
        """
        Returns a dictionary of text features in advance of a DictVectorizer.
        """
        for document in documents:
            # Collect token and vocabulary counts
            counts = Counter(
                item[0] for para in document for sent in para for item in sent
            )

            # Yield structured information about the document
            yield {
                'paragraphs': len(document),
                'sentences': sum(len(para) for para in document),
                'words': sum(counts.values()),
                'vocab': len(counts),
            }


##########################################################################
## Model Building Functions
##########################################################################
Project: 100knock2017    Author: tmu-nlp
def create_feature(sent_list):
    feature_ = []
    polarity = []
    # feature dicts for every sentence
    features_ = []
    # vectorizer that turns the feature dicts into a sparse matrix
    vec = DictVectorizer()

    for line in sent_list:
        sentence = line.strip('\n').split()
        sentence2 = sentence.pop(0)
        polarity.append(int(sentence2))
        #print(polarity)
        feature_ = feature(sentence)
        '''
        for word in feature(sentence):
            feature_.append(word)
            print(feature_)
            '''
        features_.append(feature_vector(feature_))
    x_feature = vec.fit_transform(features_)
    return x_feature, polarity
Project: FancyWord    Author: EastonLee
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: nlpSentiment    Author: ClimbsRocks
def getFeatures(numWordsToUse, allTweets, allTweetsSentiment):
    # Each corpus's getFeatures function is responsible for loading its own allTweets and allTweetsSentiment data,
    # ensuring that data is tokenized (leveraging the modular tokenization functionality in utils),
    # shuffling the dataset,
    # creating the frequency distribution and popularWords,
    # and then extracting features from each tweet and separating out the sentiment labels again.


    global popularWords
    formattedTweets, sentiment, popularWords = utils.nlpFeatureEngineering(
            allTweets, allTweetsSentiment,0,numWordsToUse,'counts'
        )

    # right now we have a data structure roughly equivalent to a dense matrix, except each row is a dictionary
    # DictVectorizer performs two key functions for us:
        # 1. transforms each row from a dictionary into a vector using consistent placing of keys into indexed positions within each vector
        # 2. returns sparse vectors, saving enormous amounts of memory which becomes very useful when training our models
    sparseFeatures = dv.fit_transform(formattedTweets)

    return sparseFeatures, sentiment
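
The two DictVectorizer properties the comments above describe (consistent key-to-column placement across transform calls, and sparse output) can be seen in a tiny standalone example:

from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform([{'good': 2, 'bad': 1}, {'good': 1}])
X_new = dv.transform([{'bad': 3, 'unseen': 7}])   # 'bad' lands in the same column; 'unseen' is dropped
print(dv.get_feature_names())    # ['bad', 'good'] -- fixed column order
print(X_new.toarray())           # [[3. 0.]]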
Project: beepboop    Author: nicolehe
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: kind2anki    Author: prz3m
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: but_sentiment    Author: MixedEmotions
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
Project: Parallel-SGD    Author: angadgill
def test_unseen_or_no_features():
    D = [{"camelot": 0, "spamalot": 1}]
    for sparse in [True, False]:
        v = DictVectorizer(sparse=sparse).fit(D)

        X = v.transform({"push the pram a lot": 2})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        X = v.transform({})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        try:
            v.transform([])
        except ValueError as e:
            assert_in("empty", str(e))
Project: kaggle-review    Author: daxiongshu
def onehot_encode(tr,te,cols=None):
    if cols is None:
        cols = [i for i in tr.columns.values if i in te.columns.values]
    vec = DictVectorizer()
    for col in cols:
        tr[col] = tr[col].map(str)
        te[col] = te[col].map(str)
    print("start fitting")
    X = vec.fit_transform(tr[cols].T.to_dict().values())
    Xt = vec.transform(te[cols].T.to_dict().values())
    print("done fitting",X.shape,Xt.shape)
    return X,Xt
Project: auto_ml    Author: doordash
def _validate_input_col_descriptions(self):
        found_output_column = False
        self.cols_to_ignore = []
        expected_vals = set(['categorical', 'text', 'nlp'])

        for key, value in self.column_descriptions.items():
            value = value.lower()
            self.column_descriptions[key] = value
            if value == 'output':
                self.output_column = key
                found_output_column = True
            elif value == 'date':
                self.date_cols.append(key)
            elif value == 'ignore':
                self.cols_to_ignore.append(key)
            elif value in expected_vals:
                pass
            else:
                raise ValueError('We are not sure how to process this column of data: ' + str(value) + '. Please pass in "output", "categorical", "ignore", "nlp", or "date".')
        if found_output_column is False:
            print('Here is the column_descriptions that was passed in:')
            print(self.column_descriptions)
            raise ValueError('In your column_descriptions, please make sure exactly one column has the value "output", which is the value we will be training models to predict.')

        # We will be adding one new categorical variable for each date col
        # Be sure to add it here so the rest of the pipeline knows to handle it as a categorical column
        for date_col in self.date_cols:
            self.column_descriptions[date_col + '_day_part'] = 'categorical'


    # We use _construct_pipeline at both the start and end of our training.
    # At the start, it constructs the pipeline from scratch
    # At the end, it takes FeatureSelection out after we've used it to restrict DictVectorizer, and adds final_model back in if we did grid search on it
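
A hypothetical column_descriptions dict using the values the validator above accepts (the column names are made up):

column_descriptions = {
    'sale_price': 'output',        # exactly one column must be marked as the output
    'listing_date': 'date',        # also gets a derived '_day_part' categorical column
    'listing_id': 'ignore',
    'neighborhood': 'categorical',
    'description': 'nlp',
}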
Project: rdocChallenge    Author: Elyne
def chiSquare(train_data, train_classes, topK):
    vectorizer = DictVectorizer()  

    # Fit and transform the train data.        
    x_train = vectorizer.fit_transform(train_data)
    y_train = train_classes

    if (x_train.shape[1] < topK):
        topK = x_train.shape[1]

    selector = SelectKBest(chi2, k=topK)
    x_new = selector.fit_transform(x_train, y_train)

    return vectorizer.inverse_transform(selector.inverse_transform(x_new))
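
A hypothetical usage sketch of chiSquare() with the dict-style features used elsewhere on this page:

train_data = [{'good': 2, 'bad': 0, 'len': 5},
              {'good': 0, 'bad': 3, 'len': 4}]
train_classes = [1, 0]
selected = chiSquare(train_data, train_classes, topK=2)
print(selected)   # each row as a dict restricted to the top-2 chi2 features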
Project: rdocChallenge    Author: Elyne
def varianceFilter(train_data, train_classes, threshold):
    #if True:
    #    return frequencyFilter(train_data, train_classes, threshold)
    '''
    Variance filter
    '''
    vectorizer = DictVectorizer()  
    # Fit and transform the train data.        
    x_train = vectorizer.fit_transform(train_data)
    #y_train = train_classes

    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    x_new = sel.fit_transform(x_train)
    return vectorizer.inverse_transform(sel.inverse_transform(x_new))
Project: rdocChallenge    Author: Elyne
def grid_search(estimator, data, featTypes=('BoW',), nFolds=10, random_seed=44, param_grid=()):

    labels = [x.severity for x in data]

    generatePrimaryFeats(data, featTypes)

    featurized = []
    for d in data:
        instance = {}
        for featname, values in d.feats.items():
            # Give each feature a unique name to avoid overwriting features.
            # If e.g. a concept feature has the same name as a bow word, the old code
            # would overwrite one of the features.
            instance.update({"{0}-{1}".format(featname, k): v for k, v in values.items()})

        featurized.append(instance)

    d = DictVectorizer()
    x_train = d.fit_transform(featurized)

    folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds, shuffle=True, random_state=random_seed)
    grid = GridSearchCV(estimator, param_grid=param_grid, scoring="f1", n_jobs=-1, cv=folds)
    fit_grid = grid.fit(x_train, labels)

    print(fit_grid.best_params_)
    return fit_grid.best_params_
Project: rdocChallenge    Author: Elyne
def get_bootstrapped_trainset(trainSet, y_train, bootstrap_data, es, estimator, th_bs):
    new_train_set = list(trainSet)
    new_y_train = list(y_train)

    trainAndBSData = trainSet + bootstrap_data

    generateDataDrivenFeats(trainSet, trainAndBSData, es)

    featurized = featurize(trainAndBSData)

    train_feats = [featurized[idx] for idx in range(0, len(trainSet), 1)]
    test_feats = [featurized[idx] for idx in range(len(trainSet), len(trainAndBSData), 1)]

    #Do feature selection on train data
    train_feats = fs.runFeatureSelection(train_feats, y_train, es)
    train_feats, y_train, train_bucket = ss.runSampleSelection(train_feats, y_train,[i for i in range(0, len(trainSet), 1)], es)

    # calculate Inter-annotator weighting. 
    weights_train = getWeights(trainAndBSData, train_bucket, es.weighInterAnnot)

    vectorizer = DictVectorizer()   
    x_train = vectorizer.fit_transform(train_feats)
    x_test = vectorizer.transform(test_feats)

    if es.scaleData:
        min_max_scalar = MinMaxScaler()
        x_train = min_max_scalar.fit_transform(x_train.toarray())
        x_test = min_max_scalar.transform(x_test.toarray())

    model = train(estimator, x_train, y_train, weights_train, model=None)

    y_pred_prob = model.predict_proba(x_test)
    for i, cur_y in enumerate(y_pred_prob):
        if np.max(cur_y) > th_bs:
            new_train_set.append(bootstrap_data[i])
            new_y_train.append(np.argmax(cur_y))

    return (new_train_set, new_y_train) #update none to confidence vector
Project: kindred    Author: jakelever
def _vectorize(self,corpus,fit):
        assert isinstance(corpus,kindred.Corpus)

        matrices = []
        for feature in self.chosenFeatures:
            assert feature in self.featureInfo.keys()
            featureFunction = self.featureInfo[feature]['func']
            never_tfidf = self.featureInfo[feature]['never_tfidf']
            data = featureFunction(corpus)
            notEmpty = any( len(d)>0 for d in data )
            if fit:
                if notEmpty:
                    self.dictVectorizers[feature] = DictVectorizer()
                    if self.tfidf and not never_tfidf:
                        self.tfidfTransformers[feature] = TfidfTransformer()
                        intermediate = self.dictVectorizers[feature].fit_transform(data)
                        matrices.append(self.tfidfTransformers[feature].fit_transform(intermediate))
                    else:
                        matrices.append(self.dictVectorizers[feature].fit_transform(data))
            else:
                if feature in self.dictVectorizers:
                    if self.tfidf and not never_tfidf:
                        intermediate = self.dictVectorizers[feature].transform(data)
                        matrices.append(self.tfidfTransformers[feature].transform(intermediate))
                    else:
                        matrices.append(self.dictVectorizers[feature].transform(data))

        mergedMatrix = hstack(matrices)
        return mergedMatrix
Project: dmon-adp    Author: igabriel85
def ohEncoding(data, cols, replace=False):
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec
Project: dmon-adp    Author: igabriel85
def ohEncoding(data, cols, replace=False):
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec
Project: dmon-adp    Author: igabriel85
def ohEncoding(data, cols, replace=False):
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec
Project: bionlp17    Author: leebird
def __init__(self, name, warm_start=True):
        self.vocal = DictVectorizer()
        self.model = linear_model.LogisticRegression(warm_start=warm_start,
                                                     solver='sag',
                                                     max_iter=200,
                                                     verbose=0,
                                                     penalty='l2',
                                                     n_jobs=4)
Project: whereami    Author: kootenpv
def get_pipeline(clf=RandomForestClassifier(n_estimators=100, class_weight="balanced")):
    return make_pipeline(DictVectorizer(sparse=False), clf)
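
A usage sketch with hypothetical Wi-Fi signal-strength dicts as features:

pipeline = get_pipeline()
X = [{'ap_1': -40, 'ap_2': -70}, {'ap_1': -80, 'ap_2': -30}]
y = ['kitchen', 'bedroom']
pipeline.fit(X, y)
print(pipeline.predict([{'ap_1': -42, 'ap_2': -65}]))   # most likely ['kitchen']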
Project: 100knock2016    Author: tmu-nlp
def predict_function():
    x_list = []
    predict_doc = joblib.load('logreg.pkl')
    y_train, x_train = get_feature()
    for line in x_train:
        x_list.append(dict(Counter(line)))
    X = DictVectorizer().fit_transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    return pred, y_train, prob
Project: 100knock2016    Author: tmu-nlp
def log_regression():
    x_list = []
    logreg = LogisticRegression()
    y_train, x_train = get_feature()
    for line in x_train:
        x_list.append(dict(Counter(line)))
    word_vec = DictVectorizer()
    X = word_vec.fit_transform(x_list)
    logreg.fit(X, y_train)
    joblib.dump(logreg, 'logreg.pkl')
    joblib.dump(word_vec,"word_vec.pkl")
Project: 100knock2016    Author: tmu-nlp
def cv_prediction(feature_dict, feature, polarity, threshold, folds):
    accuracy = 0
    precision = 0
    recall = 0
    f1 = 0
    count = 0
    dicvec = DictVectorizer()
    LR = LogisticRegression()
    kfold = KFold(len(polarity), n_folds=folds)
    for train, test in kfold:
        count += 1
        x = list()
        y = list()
        [x.append(feature[i]) for i in train]
        [y.append(polarity[i]) for i in train]
        x.append(feature_dict)
        y.append(0)
        LR.fit(dicvec.fit_transform(x), y)
        test_label = list()
        answer_label = list()
        [answer_label.append(polarity[j]) for j in test]
        for j in test:
            query = fit_feature(feature[j], feature_dict)
            result = -1 if query.shape[1] != len(feature_dict) else prediction(LR, query, threshold)
            test_label.append(result)
        accuracy += accuracy_score(answer_label, test_label)
        precision += precision_score(answer_label, test_label)
        recall += recall_score(answer_label, test_label)
        f1 += f1_score(answer_label, test_label)
        print('{}_fold finished.'.format(count))

    return accuracy, precision, recall, f1
Project: 100knock2016    Author: tmu-nlp
def predict_function():
    x_list = []
    predict_doc = joblib.load('logreg.pkl')
    y_train, x_train = get_feature()
    for line in x_train:
        x_list.append(dict(Counter(line)))
    X = DictVectorizer().fit_transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    return pred, y_train, prob
Project: 100knock2016    Author: tmu-nlp
def log_regression():
    x_list = []
    logreg = LogisticRegression()
    y_train, x_train = get_feature()
    for line in x_train:
        x_list.append(dict(Counter(line)))
    word_vec = DictVectorizer()
    X = word_vec.fit_transform(x_list)
    logreg.fit(X, y_train)
    joblib.dump(logreg, 'logreg.pkl')
    joblib.dump(word_vec,"word_vec.pkl")
Project: 100knock2016    Author: tmu-nlp
def main():
    lr = joblib.load('./lr.pkl')
    dic2vec = DictVectorizer()
    features = list()
    y = list()
    for line in open('sentiment.txt'):
        word_list = line[3:].strip('\n').strip().split()
        features.append(getFeature(word_list))
    x = dic2vec.fit_transform(features)
    with open('sentiment_prediction.txt', 'w') as fp:
        for sentiment, prob in zip(lr.predict(x), lr.predict_proba(x)):
            print('{}\t{}'.format(sentiment, prob), file=fp)
Project: pandas-pipelines-custom-transformers    Author: jem1031
def fit(self, X, y=None):
        # assumes all columns of X are strings
        Xdict = X.to_dict('records')
        self.dv = DictVectorizer(sparse=False)
        self.dv.fit(Xdict)
        return self
Project: coremltools    Author: apple
def test_dictvectorizer(self):

        D = [{"foo": 1, "bar": 3},
             {"bar": 4, "baz": 2},
             {"bar": 1, "quux": 1, "quuux": 2}]

        for sparse in (True, False):
            for dtype in (int, np.float32, np.int16):
                for sort in (True, False):
                    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                    v = v.fit(D)
                    self._test_conversion(D, v)
Project: coremltools    Author: apple
def test_unseen_or_no_features(self):
        D1 = [{"camelot": 0, "spamalot": 1}]
        D2 = [{}, {"nothing" : 21}]

        for sparse in (True, False):
            for dtype in (int, np.float32, np.int16):
                for sort in (True, False):
                    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                    v = v.fit(D1) 
                    self._test_conversion(D2, v)
Project: coremltools    Author: apple
def test_int_features_in_pipeline(self): 

        import numpy.random as rn
        import pandas as pd
        rn.seed(0)

        x_train_dict = [dict((rn.randint(100), 1) for i in range(20))
                        for j in range(100)]
        y_train = [0,1]*50

        from sklearn.pipeline import Pipeline
        from sklearn.feature_extraction import DictVectorizer
        from sklearn.linear_model import LogisticRegression

        pl = Pipeline([("dv", DictVectorizer()),  ("lm", LogisticRegression())])
        pl.fit(x_train_dict, y_train)

        import coremltools

        model = coremltools.converters.sklearn.convert(pl, input_features = "features", output_feature_names = "target")

        x = pd.DataFrame( {"features" : x_train_dict, 
                           "prediction" : pl.predict(x_train_dict)})

        cur_eval_metics = evaluate_classifier(model, x)
        self.assertEquals(cur_eval_metics['num_errors'], 0)
Project: auto_ml    Author: ClimbsRocks
def _validate_input_col_descriptions(self):
        found_output_column = False
        self.cols_to_ignore = []
        expected_vals = set(['categorical', 'text', 'nlp'])

        for key, value in self.column_descriptions.items():
            value = value.lower()
            self.column_descriptions[key] = value
            if value == 'output':
                self.output_column = key
                found_output_column = True
            elif value == 'date':
                self.date_cols.append(key)
            elif value == 'ignore':
                self.cols_to_ignore.append(key)
            elif value in expected_vals:
                pass
            else:
                raise ValueError('We are not sure how to process this column of data: ' + str(value) + '. Please pass in "output", "categorical", "ignore", "nlp", or "date".')
        if found_output_column is False:
            print('Here is the column_descriptions that was passed in:')
            print(self.column_descriptions)
            raise ValueError('In your column_descriptions, please make sure exactly one column has the value "output", which is the value we will be training models to predict.')

        # We will be adding one new categorical variable for each date col
        # Be sure to add it here so the rest of the pipeline knows to handle it as a categorical column
        for date_col in self.date_cols:
            self.column_descriptions[date_col + '_day_part'] = 'categorical'

        self.cols_to_ignore = set(self.cols_to_ignore)


    # We use _construct_pipeline at both the start and end of our training.
    # At the start, it constructs the pipeline from scratch
    # At the end, it takes FeatureSelection out after we've used it to restrict DictVectorizer, and adds final_model back in if we did grid search on it