The following 37 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_selection.chi2().
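Before the project examples, here is a minimal, self-contained sketch (with synthetic data invented purely for illustration) of the pattern nearly all of them follow: wrap chi2 in SelectKBest, fit on non-negative features and class labels, then reduce the matrix to the k highest-scoring columns.

# Minimal usage sketch; the data below is synthetic, for illustration only.
# chi2 requires non-negative feature values (e.g. counts or tf-idf weights).
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

X = np.array([[1, 0, 3],
              [0, 2, 1],
              [2, 0, 4],
              [0, 3, 0]])   # 4 samples, 3 non-negative features
y = np.array([1, 0, 1, 0])  # class labels

selector = SelectKBest(chi2, k=2)     # keep the 2 highest-scoring features
X_new = selector.fit_transform(X, y)
print(selector.scores_)               # chi-squared statistic per feature
print(selector.get_support())         # boolean mask of selected features
print(X_new.shape)                    # (4, 2)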
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)     # anomaly samples labeled 1
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)     # normal samples labeled 0
    name = []
    for i in data.columns:
        name.append(i)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
def analyseReasonWithXsqure(anamolySample, normalSample, topk):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    name = []
    for i in data.columns:
        name.append(i)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
def de_c2(X, y):
    """ chi2 """
    dim = X.shape[1]
    de = min(2000, dim)
    clf = SelectKBest(chi2, k=de)
    clf.fit(X, y)
    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func

# def de_mic(X, y):
#     """ MIC """
#     dim = X.shape[1]
#     de = min(2000, dim)
#     clf = SelectKBest(MIC, k=de)
#     clf.fit(X, y)
#     def _func(X1, X2):
#         return clf.transform(X1), clf.transform(X2)
#     return _func
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    print('x2:', warnstr)
    return warnstr
def ngrams_selection(train_data, train_labels, ind, model_file,
                     ngram_range_=(1, 1), max_num_features=100,
                     analyzer_type='word'):
    """Create and save vectorizers and feature selectors on given train data.

    Args:
        train_data: list of train text samples
        train_labels: list of train labels
        ind: index of vectorizer/selector to save file
        model_file: model filename
        ngram_range_: range of n-grams
        max_num_features: maximum number of features to select
        analyzer_type: analyzer type for TfidfVectorizer 'word' or 'char'

    Returns:
        nothing
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range_, sublinear_tf=True,
                                 analyzer=analyzer_type)
    X_train = vectorizer.fit_transform(train_data)

    if max_num_features < X_train.shape[1]:
        ch2 = SelectKBest(chi2, k=max_num_features)
        ch2.fit(X_train, train_labels)
        data_struct = {'vectorizer': vectorizer, 'selector': ch2}
        print('creating ', model_file + '_ngrams_vect_' + ind + '.bin')
        with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
            pickle.dump(data_struct, f)
    else:
        data_struct = {'vectorizer': vectorizer}
        print('creating', model_file + '_ngrams_vect_' + ind + '.bin')
        with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
            pickle.dump(data_struct, f)
    return
def chiSquare(train_data, train_classes, topK):
    vectorizer = DictVectorizer()

    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    y_train = train_classes

    if x_train.shape[1] < topK:
        topK = x_train.shape[1]

    selector = SelectKBest(chi2, k=topK)
    x_new = selector.fit_transform(x_train, y_train)

    return vectorizer.inverse_transform(selector.inverse_transform(x_new))
def export_best_feature_names(self, df, labels, out_folder_path, k):
    columns, repos, observations = self.decompose_df(df)
    feature_scores = SelectKBest(chi2, k=k).fit(observations, labels).scores_
    feature_scores = np.nan_to_num(feature_scores)
    k_best_features = np.argpartition(feature_scores.ravel(), (-1) * k)[(-1) * k:]
    k_best_feature_names = columns[k_best_features]

    out_file_path = os.path.join(out_folder_path, "feature_selection.txt")
    with open(out_file_path, "w") as output_file:
        for feature_name in k_best_feature_names:
            output_file.write(feature_name + "\n")
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(chi2, k=conf.num_components)
def getFeature():
    fileData = open("data")
    row = []
    col = []
    data = []
    evalRes = []
    rowIndex = -1
    fileList = fileData.readlines()
    random.shuffle(fileList)
    for line in fileList:
        line = line.rstrip('\n')
        dataList = re.split(' |:', line)
        if int(dataList[0]) >= 7:
            evalRes.append(1)
        elif int(dataList[0]) <= 4:
            evalRes.append(-1)
        else:
            continue
        del dataList[0]
        rowIndex = rowIndex + 1
        row.extend([rowIndex] * int(len(dataList) / 2))
        col.extend(map(int, dataList[::2]))
        data.extend(map(int, dataList[1::2]))
    featureMatrix = csr_matrix((data, (row, col)))
    featureMNew = SelectKBest(chi2, k=20000).fit_transform(featureMatrix, evalRes)
    return featureMNew, evalRes
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_regression(args.input, encoding='windows-1252')

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    test = read_test_data(args.test, encoding='windows-1252')

    regressor = pipeline.fit(data[0], data[1])
    y = regressor.predict(test[2])

    with open('%sc%f-k%i-C.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], y):
            print(id_, topic, rate, sep='\t', file=outfile)
def select_feats(X, y, nb_features, nb_features_to_keep=2048):
    X, y = preproc_for_sklearn(X, y, nb_features)

    if nb_features < nb_features_to_keep:
        nb_features_to_keep = nb_features_to_keep // 4  # integer division so k stays an int

    feature_selector = SelectKBest(chi2, k=nb_features_to_keep).fit(X, y)
    selected_indices = feature_selector.get_support(indices=True)
    return selected_indices
def predictKFoldKNN(X, y, K=10, kfold=10, selectKBest=0):
    """
    Classifies the data using K-nearest neighbors and k-fold CV
    :param X: The list of feature vectors
    :type X: list
    :param y: The list of labels corresponding to the feature vectors
    :type y: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param kfold: The number of folds in the CV
    :type kfold: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        # Select K best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []

    return predicted
def predictAndTestKNN(X, y, Xtest, ytest, K=10, selectKBest=0):
    """
    Trains a K-NN using the training data and tests it using the test data
    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: Two lists of the validation and test predictions
    """
    try:
        predicted, predicted_test = [], []
        # Define classifier and cross validation iterator
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        # Start the cross validation learning
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Select K best features if enabled
        prettyPrint("Selecting %s best features from feature vectors" % selectKBest)
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        Xtest_new = SelectKBest(chi2, k=selectKBest).fit_transform(Xtest, ytest) if selectKBest > 0 else Xtest
        # Fit model
        prettyPrint("Fitting model")
        clf.fit(X_new, y)
        # Validate and test model
        prettyPrint("Validating model using training data")
        predicted = clf.predict(X_new)
        prettyPrint("Testing model")
        predicted_test = clf.predict(Xtest_new)
    except Exception as e:
        prettyPrintError(e)
        return [], []

    return predicted, predicted_test
def predictKFoldSVMSSK(X, y, kfold=10, subseqLength=3, selectKBest=0):
    """
    Classifies the data using Support vector machines with the SSK kernel and k-fold CV
    :param X: The list of text documents containing traces
    :type X: list
    :param y: The labels of documents in 'X'
    :type y: list
    :param kfold: The number of folds
    :type kfold: int (default: 10)
    :param subseqLength: Length of subsequence used by the SSK
    :type subseqLength: int (default: 3)
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        predicted = []
        # Retrieve Gram Matrix from string kernel
        if verboseON():
            prettyPrint("Generating Gram Matrix from documents", "debug")
        X_gram = string_kernel(X, X)
        y = numpy.array(y)
        # Define classifier
        clf = svm.SVC(kernel="precomputed")
        X_gram_new = SelectKBest(chi2, k=selectKBest).fit_transform(X_gram, y) if selectKBest > 0 else X_gram
        prettyPrint("Performing %s-fold CV on the %s best features" % (kfold, selectKBest))
        predicted = cross_val_predict(clf, X_gram_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []

    return predicted
def predictKFoldSVM(X, y, kernel="linear", C=1, selectKBest=0, kfold=10): """ Classifies the data using Support vector machines and k-fold CV :param X: The matrix of feature vectors :type X: list :param y: The vector containing the labels corresponding to feature vectors :type y: list :param kernel: The kernel used to elevate data into higher dimensionalities :type kernel: str :param C: The penalty parameter of the error term :type C: int :param selectKBest: The number of best features to select :type selectKBest: int :param kfold: The number of folds to use in K-fold CV :type kfold: int :return: A list of predicted labels across the k-folds """ try: # Prepare data X, y = numpy.array(X), numpy.array(y) # Define classifier clf = svm.SVC(kernel=kernel, C=C) # Select K Best features if enabled X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist() except Exception as e: prettyPrintError(e) return [] return predicted
def predictKFoldRandomForest(X, y, estimators=10, criterion="gini", maxdepth=None, selectKBest=0, kfold=10): """ Classifies the data using decision trees and k-fold CV :param X: The matrix of feature vectors :type X: list :param y: The vector containing labels corresponding to the feature vectors :type y: list :param estimators: The number of random trees to use in classification :type estimators: int :param criterion: The splitting criterion employed by the decision tree :type criterion: str :param splitter: The method used to split the data :type splitter: str :param maxDepth: The maximum depth the tree is allowed to grow :type maxDepth: int :param selectKBest: The number of best features to select :type selectKBest: int :param kfold: The number of folds to use in K-fold CV :type kfold: int :return: A list of predicted labels across the k-folds """ try: # Prepare data X, y = numpy.array(X), numpy.array(y) # Define classifier clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth) X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist() except Exception as e: prettyPrintError(e) return [] return predicted
def de_c2(X, y):
    dim = X.shape[1]
    de = min(2000, dim)
    clf = SelectKBest(chi2, k=de)
    clf.fit(X, y)
    def _func(X1, X2):
        return clf.transform(X1), clf.transform(X2)
    return _func
def get_local_words(word_count, threshold, y_train, train_seq, num_words):
    feature_index = delete_low_freq_words(word_count, threshold)
    print(len(train_seq), len(feature_index))
    word_freq_matrix = np.zeros([len(train_seq), len(feature_index)])

    for (seq_idx, seq) in enumerate(train_seq):
        word_freq_list = np.zeros(len(feature_index))
        for word in seq:
            if word not in feature_index:
                continue
            else:
                word_idx = feature_index[word]
                word_freq_matrix[seq_idx][word_idx] += 1

    sk = SelectKBest(chi2, k="all")
    sk.fit_transform(csr_matrix(word_freq_matrix), y_train)
    score_list = sk.scores_

    word_score = {}
    for (feature, idx) in feature_index.items():
        word_score[feature] = score_list[idx]
    word_score = sorted(word_score.items(), key=lambda x: x[1], reverse=True)

    local_word_list = []
    for (word, score) in word_score[:num_words]:
        local_word_list.append(word)

    del word_freq_matrix
    return local_word_list
def featuresByChiSq(features, labels, nFeature=5000):
    chi2_model = SelectKBest(chi2, k=nFeature)
    dtm = chi2_model.fit_transform(features, labels)
    return dtm, chi2_model
def train_and_score(X, y):
    X_train, X_test, y_train, y_test = split_data(X, y)

    clf = Pipeline([
        ('reduce_dim', SelectKBest(chi2, k=2)),
        ('train', LinearSVC(C=100))
    ])

    scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=2)
    print("Mean Model Accuracy:", np.array(scores).mean())

    clf.fit(X_train, y_train)

    confuse(y_test, clf.predict(X_test))
    print()
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, k=topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr
def to_weka_arff(ngram, number_of_features):
    count_vect = TfidfVectorizer(ngram_range=(1, ngram), norm='l2', sublinear_tf=True)

    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    features = count_vect.fit_transform(tweet_list)
    features = SelectKBest(chi2, k=number_of_features).fit_transform(features, label_list)
    print(features.shape)

    arff_data = []
    arff_data.append("@RELATION sport")
    for i in range(features.shape[1]):
        arff_data.append("@ATTRIBUTE feature" + str(i) + " REAL")
    arff_data.append("@ATTRIBUTE sportclass {neutral,neg,pos}")
    arff_data.append("@DATA")

    array_features = features.toarray()
    for i in range(len(array_features)):
        feature = array_features[i]
        label = label_list[i]
        csv_feature = ",".join(str(x) for x in feature)
        csv_feature = csv_feature + "," + label
        arff_data.append(csv_feature)

    with open('data/sport.arff', 'w') as file:
        for item in arff_data:
            file.write("%s\n" % item)
def basic_quality(self, target, feature_vector):
    assert (len(target) == len(feature_vector))
    if self.quality == 'NWP':
        sort_data_p = np.array([x for (y, x) in sorted(zip(feature_vector, target), key=lambda x: x[0])])
        sort_data_n = np.array([x for (y, x) in sorted(zip(-1.0 * feature_vector, target), key=lambda x: x[0])])
        p_nwp = QualityMeasure.calc_nwp(sort_data_p)
        n_nwp = QualityMeasure.calc_nwp(sort_data_n)
        return min(n_nwp, p_nwp)
    if self.quality == 'corrcoef':
        return 1 - abs(np.corrcoef(target, feature_vector)[0][1])
    if self.quality == 'mutual_info':
        m = MINE()
        m.compute_score(target, feature_vector)
        return 1.0 - m.mic()
    if self.quality == 'chi2':
        return 1 - chi2(abs(feature_vector.reshape(len(feature_vector), 1)), target)[0][0]
    if self.quality == 'distcorr':
        return 1 - distcorr(target, feature_vector)
    if self.quality == 'distree':
        data = np.column_stack((feature_vector, self.random_feature))
        clf = DecisionTreeClassifier(max_depth=5, random_state=0)
        clf.fit(data, target)
        return 1.0 - clf.feature_importances_[0]
    if self.quality == 'knnscore':
        errors = []
        clf = KNeighborsClassifier()
        data = np.array([feature_vector]).transpose()
        loo = LeaveOneOut()
        for train, test in loo.split(data):
            clf = KNeighborsClassifier()
            clf.fit(data[train], target[train])
            errors.append(accuracy_score(target[test], clf.predict(data[test])))
        return 1.0 - np.mean(errors)
    return 'WRONG QUALITY NAME'
def test_feature_selection():
    # make two feature dicts with two useful features and a bunch of useless
    # ones, in terms of chi2
    d1 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=1, useful2=20)
    d2 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=20, useful2=1)

    for indices in (True, False):
        v = DictVectorizer().fit([d1, d2])
        X = v.transform([d1, d2])
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        v.restrict(sel.get_support(indices=indices), indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])
def chi2_feature_test(X, y, feature_index):
    """
    Performs the chi square test on the desired feature

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    feature_index -- The selected feature (a zero-based index)
    """
    feature_column = X[:, feature_index].reshape(-1, 1)
    min_val = feature_column.min()
    if min_val < 0:
        # shift the column so all values are positive, as chi2 requires
        feature_column = feature_column + min_val * -1 + 1
    return chi2(feature_column, y)
def buildVectorizer(classes, examples, parameters):
    featureChoice = None
    doFeatureSelection = False
    tfidf = False
    featureSelectPerc = 10

    if "featureChoice" in parameters:
        featureChoice = parameters["featureChoice"]
    if "doFeatureSelection" in parameters and parameters["doFeatureSelection"] == "True":
        doFeatureSelection = True
    if "featureSelectPerc" in parameters:
        featureSelectPerc = int(parameters["featureSelectPerc"])
    if "tfidf" in parameters and parameters["tfidf"] == "True":
        tfidf = True

    print("Starting vectorizer...")
    vectorizer = Vectorizer(classes, examples, featureChoice, tfidf)
    vectors = vectorizer.getTrainingVectors()
    print("Vectors of size:", vectors.shape)

    if doFeatureSelection:
        print("Trimming training vectors...")
        from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2
        # featureSelector = SelectKBest(chi2, k=100)
        featureSelector = SelectPercentile(chi2, percentile=featureSelectPerc)
        vectorsTrimmed = featureSelector.fit_transform(vectors, classes)
        vectorsTrimmed = coo_matrix(vectorsTrimmed)
        print("Trimmed training vectors of size:", vectorsTrimmed.shape)
    else:
        vectorsTrimmed = vectors
        featureSelector = None

    return vectorsTrimmed, vectorizer, featureSelector
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_quantification_regression(args.input, encoding='windows-1252')

    texts = list()
    labels = list()
    topics = list()
    for topic in data:
        topic_texts, topic_labels = data[topic]
        texts.extend(topic_texts)
        labels.extend(topic_labels)
        topics.extend([topic for _ in topic_labels])

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    _, test_topics, test_texts = read_test_data(args.test, encoding='windows-1252')

    quantifier = RegressionQuantifier(pipeline)
    quantifier.fit(texts, labels, topics)
    quantification = quantifier.predict(test_texts, test_topics)

    sorted_topics = list(quantification)
    sorted_topics.sort()
    with open('%sc%f-k%i-plain-E.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as plainfile, \
            open('%sc%f-k%i-corrected_train-E.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as corrected_trainfile, \
            open('%sc%f-k%i-corrected_test-E.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as corrected_testfile:
        for topic in sorted_topics:
            plain, corrected_train, corrected_test = quantification[topic]
            print(topic, *plain, sep='\t', file=plainfile)
            print(topic, *corrected_train, sep='\t', file=corrected_trainfile)
            print(topic, *corrected_test, sep='\t', file=corrected_testfile)
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-b', '--binary',
                        help='Polarity classification, i.e., positive vs negative (default: positive/negative/neutral classification)',
                        action='store_true')
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_classification(args.input, encoding='windows-1252')
    if args.binary:
        data = filter_polarity_classification(data)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', LinearSVC(C=args.c)),
    ])

    test = read_test_data(args.test, args.binary, encoding='windows-1252', topic=args.binary)

    classifier = pipeline.fit(data[0], data[1])
    y = classifier.predict(test[1])

    if args.binary:
        task = 'B'
    else:
        task = 'A'

    with open('%sc%f-k%i-%s.output' % (args.output, args.c, args.k, task), 'w', encoding='utf8') as outfile:
        if args.binary:
            for id_, topic, label in zip(test[0], test[2], y):
                print(id_, topic, label, sep='\t', file=outfile)
        else:
            for id_, label in zip(test[0], y):
                print(id_, label, sep='\t', file=outfile)
def predictAndTestEnsemble(X, y, Xtest, ytest, classifiers=[], selectKBest=0):
    """
    Trains an ensemble of classifiers (with default params) using a training dataset, and returns the majority vote on the same training dataset and an out-of-sample test dataset
    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param classifiers: A list of classifiers to use in the ensemble
    :type classifiers: list of str
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: Two lists of the validation and test predictions
    """
    try:
        predicted, predicted_test = [], []
        # Prepare the data
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Define classifiers
        ensembleClassifiers = []
        for c in classifiers:
            if c.lower().find("knn") != -1:
                K = int(c.split('-')[-1])
                clf = neighbors.KNeighborsClassifier(n_neighbors=K)
            elif c.lower().find("svm") != -1:
                clf = svm.SVC(kernel='linear', C=1)
            elif c.lower().find("forest") != -1:
                E = int(c.split('-')[-1])
                clf = ensemble.RandomForestClassifier(n_estimators=E)
            # Add to list
            ensembleClassifiers.append((c, clf))
        # Select K best features if applicable
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        Xtest_new = SelectKBest(chi2, k=selectKBest).fit_transform(Xtest, ytest) if selectKBest > 0 else Xtest
        # Train and fit the voting classifier
        voting = VotingClassifier(estimators=ensembleClassifiers, voting='hard')
        prettyPrint("Fitting ensemble model")
        voting = voting.fit(X_new, y)
        prettyPrint("Validating model")
        predicted = voting.predict(X_new)
        # Same for the test dataset
        prettyPrint("Testing the model")
        predicted_test = voting.predict(Xtest_new)
    except Exception as e:
        prettyPrintError(e)
        return [], []

    return predicted, predicted_test
def predictAndTestRandomForest(X, y, Xtest, ytest, estimators=10, criterion="gini", maxdepth=None, selectKBest=0): """ Trains a tree using the training data and tests it using the test data using K-fold cross validation :param Xtr: The matrix of training feature vectors :type Xtr: list :param ytr: The labels corresponding to the training feature vectors :type ytr: list :param Xte: The matrix of test feature vectors :type yte: list :param estimators: The number of random trees to use in classification :type estimators: int :param criterion: The splitting criterion employed by the decision tree :type criterion: str :param maxdepth: The maximum depth the tree is allowed to grow :type maxdepth: int :param selectKBest: The number of best features to select :type selectKBest: int :return: Two lists of the validation and test accuracies across the 10 folds """ try: predicted, predicted_test = [], [] # Define classifier and cross validation iterator clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth) # Start the cross validation learning X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest) # Select K Best features if enabled prettyPrint("Selecting %s best features from feature vectors" % selectKBest) X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X Xtest_new = SelectKBest(chi2, k=selectKBest).fit_transform(Xtest, ytest) if selectKBest > 0 else Xtest # Fit model prettyPrint("Fitting model") clf.fit(X_new, y) # Validate and test model prettyPrint("Validating model using training data") predicted = clf.predict(X_new) prettyPrint("Testing model") predicted_test = clf.predict(Xtest_new) except Exception as e: prettyPrintError(e) return [], [] return predicted, predicted_test
def train(self):
    parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)],
                  #'vect__binary': (True, False),
                  'clf__alpha': (1e-2, 1e-3, 1e-1, 1e-4, 1e-5),
                  'clf__loss': ('hinge', 'log'),
                  'clf__penalty': ('l2', 'l1', 'elasticnet')
                  # 'clf__nu': (0.5, 0.6),
                  #'clf__kernel': ('rbf', 'linear', 'poly'),
                  # 'clf__tol': (1e-3, 1e-4, 1e-2, 1e-4),
                  #'clf__n_estimators': (10, 50, 100, 500),
                  #'clf__criterion': ('gini', 'entropy'),
                  #'clf__max_features': ("auto", "log2", 100,),
                  #'clf__alpha': (0, 1e-2, 1e-3, 1e-1, 1e-4, 1e-5),
                  #'clf__fit_prior': (False, True),
                  }
    # gs_clf = GridSearchCV(self.text_clf, parameters, n_jobs=-1, scoring=self.posfmeasure)
    # gs_clf = gs_clf.fit(self.features, self.labels)
    # print gs_clf.best_params_
    logging.info("Training with {}/{} true pairs".format(str(sum(self.labels)), str(len(self.labels))))
    try:
        self.text_clf = self.text_clf.fit(self.features, self.labels)
    except ValueError:
        print("error training {}".format(self.modelname))
        return
    if not os.path.exists(self.basedir + self.modelname):
        os.makedirs(self.basedir + self.modelname)
    logging.info("Training complete, saving to {}/{}/{}.pkl".format(self.basedir, self.modelname, self.modelname))
    joblib.dump(self.text_clf, "{}/{}/{}.pkl".format(self.basedir, self.modelname, self.modelname))

    ch2 = SelectKBest(chi2, k=20)
    half_point = int(len(self.features) * 0.5)
    X_train = self.text_clf.named_steps["vect"].fit_transform(self.features[:half_point])
    X_test = self.text_clf.named_steps["vect"].transform(self.features[half_point:])
    X_train = ch2.fit_transform(X_train, self.labels[:half_point])
    X_test = ch2.transform(X_test)
    feature_names = self.text_clf.named_steps["vect"].get_feature_names()
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    print(feature_names)
    # joblib.dump(gs_clf.best_estimator_, "{}/{}/{}.pkl".format(self.basedir, self.modelname, self.modelname))
    # self.test()
def train_sentence_classifier(self, pairtype):
    self.text_clf = Pipeline([
        ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(7, 20), min_df=0.2, max_df=0.5)),
        #('vect', CountVectorizer(analyzer='word', ngram_range=(1, 5), stop_words="english", min_df=0.1)),
        # ('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
        #('tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(6, 20))),
        #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.01, n_iter=5, random_state=42)),
        #('clf', SGDClassifier())
        #('clf', svm.SVC(kernel='rbf', C=10, verbose=True, tol=1e-5))
        #('clf', RandomForestClassifier(n_estimators=10))
        #('feature_selection', feature_selection.SelectFromModel(LinearSVC(penalty="l1"))),
        ('clf', MultinomialNB(alpha=0.1, fit_prior=False))
        #('clf', DummyClassifier(strategy="constant", constant=True))
    ])
    f, labels, sids = self.get_features(pairtype)
    half_point = int(len(f) * 0.5)
    self.train_sentences = sids[:half_point]

    """ch2 = SelectKBest(chi2, k=20)
    X_train = text_clf.named_steps["vect"].fit_transform(f[:half_point])
    X_test = text_clf.named_steps["vect"].transform(f[half_point:])
    X_train = ch2.fit_transform(X_train, labels[:half_point])
    X_test = ch2.transform(X_test)
    feature_names = text_clf.named_steps["vect"].get_feature_names()
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    # print feature_names"""

    # train
    text_clf = self.text_clf.fit(f[:half_point], labels[:half_point])

    # save model
    if not os.path.exists("models/kernel_models/" + pairtype + "_sentence_classifier/"):
        os.makedirs("models/kernel_models/" + pairtype + "_sentence_classifier/")
    logging.info("Training complete, saving to {}/{}/{}.pkl".format("models/kernel_models/", pairtype + "_sentence_classifier/", pairtype))
    joblib.dump(text_clf, "{}/{}/{}.pkl".format("models/kernel_models/", pairtype + "_sentence_classifier/", pairtype))

    # evaluate
    pred = text_clf.predict(f[half_point:])
    # print len(pred), sum(pred)
    self.type_sentences[pairtype] = []
    for ip, p in enumerate(pred):
        if p:
            self.type_sentences[pairtype].append(sids[half_point + ip])

    res = metrics.confusion_matrix(labels[half_point:], pred)
    return res[1][1], res[0][1], res[1][0]