check_random_forest_perfomance_rfe.py

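"""Train a Random Forest classifier on the given .train/.test dataset and
compare its test ROC AUC against a model retrained on the feature subset
selected by RFECV (recursive feature elimination with cross-validation)."""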
# main imports
import os
import sys
import argparse
import pandas as pd
import numpy as np
import logging
import datetime
import random

# model imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
import joblib
import sklearn.svm as svm
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFECV

# modules and config imports
sys.path.insert(0, '') # trick to enable import of main folder modules
import custom_config as cfg
import models as mdl
def loadDataset(filename):

    ########################
    # 1. Get and prepare data
    ########################

    # expected columns: scene_name; zone_id; image_index_end; label; data...
    dataset_train = pd.read_csv(filename + '.train', header=None, sep=";")
    dataset_test = pd.read_csv(filename + '.test', header=None, sep=";")

    # default first shuffle of data
    dataset_train = shuffle(dataset_train)
    dataset_test = shuffle(dataset_test)

    # split each set by class (label column: 1 = noisy, 0 = not noisy)
    noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 0]

    noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 0]

    # use of all data
    final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
    final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])

    # shuffle data another time
    final_df_train = shuffle(final_df_train)
    final_df_test = shuffle(final_df_test)

    # use of the whole data set for training
    x_dataset_train = final_df_train.iloc[:, 4:]
    x_dataset_test = final_df_test.iloc[:, 4:]

    y_dataset_train = final_df_train.iloc[:, 3]
    y_dataset_test = final_df_test.iloc[:, 3]

    return x_dataset_train, y_dataset_train, x_dataset_test, y_dataset_test
def train_predict_random_forest(x_train, y_train, x_test, y_test):

    print('Start training Random Forest model')

    start = datetime.datetime.now()

    random_forest_model = RandomForestClassifier(n_estimators=500, class_weight='balanced', bootstrap=True, max_samples=0.75, n_jobs=-1)

    # fit on the training set and evaluate on the test set
    random_forest_model = random_forest_model.fit(x_train, y_train)

    y_test_model = random_forest_model.predict(x_test)
    test_roc_auc = roc_auc_score(y_test, y_test_model)

    end = datetime.datetime.now()
    diff = end - start

    print("Evaluation took: {}, AUC score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))

    return random_forest_model
def train_predict_selector(model, x_train, y_train, x_test, y_test):

    start = datetime.datetime.now()

    print("Using RFECV feature selection with Random Forest")

    # recursive feature elimination with cross-validation, keeping at least 13 features
    selector = RFECV(estimator=model, min_features_to_select=13, verbose=1, n_jobs=4)
    selector.fit(x_train, y_train)

    x_train_transformed = selector.transform(x_train)
    x_test_transformed = selector.transform(x_test)

    print('Previous shape:', x_train.shape)
    print('New shape:', x_train_transformed.shape)

    # retrain a fresh model using only the selected features
    model = RandomForestClassifier(n_estimators=500, class_weight='balanced', bootstrap=True, max_samples=0.75, n_jobs=-1)
    model = model.fit(x_train_transformed, y_train)

    y_test_model = model.predict(x_test_transformed)
    test_roc_auc = roc_auc_score(y_test, y_test_model)

    end = datetime.datetime.now()
    diff = end - start

    print("Evaluation took: {}, AUC score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))
def main():

    parser = argparse.ArgumentParser(description="Train a Random Forest model and evaluate it with and without RFECV feature selection")

    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
    parser.add_argument('--output', type=str, help='output surrogate model name')

    args = parser.parse_args()

    p_data_file = args.data
    p_output = args.output

    print(p_data_file)

    # load data from file
    x_train, y_train, x_test, y_test = loadDataset(p_data_file)

    # train classical random forest
    random_forest_model = train_predict_random_forest(x_train, y_train, x_test, y_test)

    # train again using RFECV feature selection
    train_predict_selector(random_forest_model, x_train, y_train, x_test, y_test)


if __name__ == "__main__":
    main()
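# Example invocation (the dataset prefix below is only illustrative; any prefix
# with matching .train and .test files works):
#   python check_random_forest_perfomance_rfe.py --data data/my_dataset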