# Rank the features of the COVID training set by their univariate
# F-statistic against the regression target and print the 15 best.

import pandas as pd
from sklearn import preprocessing  # not used in the visible code; kept for later cells
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

data = pd.read_csv('/kaggle/input/ml2021spring-hw1/covid.train.csv')

# data.columns is used to pick columns by position: labels 1..93 are the
# features and label 94 is the target. Column 0 is skipped
# (presumably a row id -- TODO confirm against the CSV header).
x = data[data.columns[1:94]]
y = data[data.columns[94]]

# Min-max scale every feature column to [0, 1].
# A constant column has max == min; dividing by that zero range would fill
# the column with NaN, so a zero range is replaced by 1 and the column
# simply becomes all zeros.
col_min = x.min()
col_range = x.max() - col_min
x = (x - col_min) / col_range.replace(0, 1)

# k only sets how many features the fitted selector would keep; scores_
# still holds one score per input feature, which is what gets ranked below.
bestfeatures = SelectKBest(score_func=f_regression, k=5)
fit = bestfeatures.fit(x, y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)

# Put feature names and scores side by side for easier reading.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # name the two columns

# Show the 15 highest-scoring features.
print(featureScores.nlargest(15, 'Score'))
class SelectKBest(_BaseFilter):
    """Select features according to the k highest scores.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
        Default is f_classif (see below "See also"). The default function only
        works with classification tasks.

    k : int or "all", optional, default=10
        Number of top features to select.
        The "all" option bypasses selection, for use in a parameter search.

    Attributes
    ----------
    scores_ : array-like, shape=(n_features,)
        Scores of features.

    pvalues_ : array-like, shape=(n_features,)
        p-values of feature scores, None if `score_func` returned only scores.

    Notes
    -----
    Ties between features with equal scores will be broken in an unspecified
    way.

    See also
    --------
    f_classif: ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif: Mutual information for a discrete target.
    chi2: Chi-squared stats of non-negative features for classification tasks.
    f_regression: F-value between label/feature for regression tasks.
    mutual_info_regression: Mutual information for a continuous target.
    SelectPercentile: Select features based on percentile of the highest scores.
    SelectFpr: Select features based on a false positive rate test.
    SelectFdr: Select features based on an estimated false discovery rate.
    SelectFwe: Select features based on family-wise error rate.
    GenericUnivariateSelect: Univariate feature selector with configurable mode.
    """