# methods.py -- feature-selection helpers built on scikit-learn.
  1. from sklearn.feature_selection import VarianceThreshold
  2. from sklearn.feature_selection import SelectKBest
  3. from sklearn.feature_selection import chi2
  4. from sklearn.svm import LinearSVC
  5. from sklearn.feature_selection import SelectFromModel
  6. from sklearn.svm import SVC
  7. from sklearn.model_selection import StratifiedKFold
  8. from sklearn.feature_selection import RFECV
  9. from sklearn.ensemble import ExtraTreesClassifier
  10. features_selection_list = [
  11. "variance_threshold",
  12. "kbest",
  13. "linearSVC",
  14. "tree",
  15. "rfecv"
  16. ]
  17. def features_selection_method(name, params, X_train, y_train, problem_size):
  18. indices = []
  19. if name == "variance_threshold":
  20. percent_to_keep = float(params)
  21. #sel = VarianceThreshold(threshold=(percent_to_keep * (1 - percent_to_keep)))
  22. sel = VarianceThreshold(threshold=(percent_to_keep))
  23. sel.fit_transform(X_train)
  24. indices = sel.get_support(indices=True)
  25. if name == "kbest":
  26. k_param = int(float(params) * problem_size) # here it's a percent over the whole dataset
  27. model = SelectKBest(chi2, k=k_param).fit_transform(X_train, y_train)
  28. indices = model.get_support(indices=True)
  29. if name == "linearSVC":
  30. C_param = float(params)
  31. lsvc = LinearSVC(C=C_param, penalty="l1", dual=False).fit(X_train, y_train)
  32. model = SelectFromModel(lsvc, prefit=True)
  33. indices = model.get_support(indices=True)
  34. if name == "tree":
  35. n_estimarors_param = int(params)
  36. clf = ExtraTreesClassifier(n_estimators=n_estimarors_param)
  37. clf = clf.fit(X_train, y_train)
  38. model = SelectFromModel(clf, prefit=True)
  39. indices = model.get_support(indices=True)
  40. if name == "rfecv":
  41. cv_param = int(params)
  42. # Create the RFE object and compute a cross-validated score
  43. svc = SVC(kernel="linear")
  44. # The "accuracy" scoring is proportional to the number of correct
  45. # classifications
  46. rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(cv_param),
  47. scoring='roc_auc')
  48. rfecv.fit(X_train, y_train)
  49. indices = rfecv.get_support(indices=True)
  50. return indices