12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364 |
- from sklearn.feature_selection import VarianceThreshold
- from sklearn.feature_selection import SelectKBest
- from sklearn.feature_selection import chi2
- from sklearn.svm import LinearSVC
- from sklearn.feature_selection import SelectFromModel
- from sklearn.svm import SVC
- from sklearn.model_selection import StratifiedKFold
- from sklearn.feature_selection import RFECV
- from sklearn.ensemble import ExtraTreesClassifier
# Strategy names accepted by features_selection_method().
features_selection_list = [
    "variance_threshold",
    "kbest",
    "linearSVC",
    "tree",
    "rfecv",
]
def features_selection_method(name, params, X_train, y_train, problem_size):
    """Run the feature-selection strategy ``name`` and return the kept indices.

    Args:
        name: Strategy identifier. Recognised values are
            "variance_threshold", "kbest", "linearSVC", "tree" and "rfecv".
        params: The single strategy-specific setting, as a number or a
            numeric string:

            * "variance_threshold": variance threshold (converted with float).
            * "kbest": fraction of ``problem_size`` to keep (converted with
              float, then the product is truncated to int).
            * "linearSVC": regularisation parameter ``C`` (converted with
              float).
            * "tree": number of estimators (converted with int).
            * "rfecv": number of stratified folds (converted with int).
        X_train: Training feature matrix passed to the scikit-learn
            estimators.
        y_train: Training targets; unused by "variance_threshold".
        problem_size: Count that the "kbest" fraction is applied to.

    Returns:
        The selected feature indices as returned by
        ``get_support(indices=True)``, or an empty list when ``name`` is not
        one of the recognised strategies.
    """
    indices = []

    if name == "variance_threshold":
        variance_threshold = float(params)
        selector = VarianceThreshold(threshold=variance_threshold)
        # Only the fitted support mask is needed, not the transformed data.
        selector.fit(X_train)
        indices = selector.get_support(indices=True)

    elif name == "kbest":
        # params is a fraction of problem_size, not an absolute count.
        k_param = int(float(params) * problem_size)
        # Use fit(), not fit_transform(): the latter returns the reduced
        # array, which has no get_support() method.
        selector = SelectKBest(chi2, k=k_param).fit(X_train, y_train)
        indices = selector.get_support(indices=True)

    elif name == "linearSVC":
        C_param = float(params)
        # The L1 penalty yields sparse coefficients for SelectFromModel.
        lsvc = LinearSVC(C=C_param, penalty="l1", dual=False).fit(X_train, y_train)
        selector = SelectFromModel(lsvc, prefit=True)
        indices = selector.get_support(indices=True)

    elif name == "tree":
        n_estimators_param = int(params)
        forest = ExtraTreesClassifier(n_estimators=n_estimators_param)
        forest = forest.fit(X_train, y_train)
        selector = SelectFromModel(forest, prefit=True)
        indices = selector.get_support(indices=True)

    elif name == "rfecv":
        cv_param = int(params)
        # Recursive feature elimination with a linear SVC, dropping one
        # feature per step; each candidate subset is scored by ROC AUC
        # under stratified k-fold cross-validation.
        svc = SVC(kernel="linear")
        rfecv = RFECV(
            estimator=svc,
            step=1,
            cv=StratifiedKFold(cv_param),
            scoring="roc_auc",
        )
        rfecv.fit(X_train, y_train)
        indices = rfecv.get_support(indices=True)

    return indices
|