# check_random_forest_perfomance.py
  1. # main imports
  2. import os
  3. import sys
  4. import argparse
  5. import pandas as pd
  6. import numpy as np
  7. import logging
  8. import datetime
  9. import random
  10. # model imports
  11. from sklearn.model_selection import train_test_split
  12. from sklearn.model_selection import GridSearchCV
  13. from sklearn.linear_model import LogisticRegression
  14. from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  15. from sklearn.feature_selection import SelectFromModel
  16. import joblib
  17. import sklearn.svm as svm
  18. from sklearn.utils import shuffle
  19. from sklearn.metrics import roc_auc_score
  20. from sklearn.model_selection import cross_val_score
  21. # modules and config imports
  22. sys.path.insert(0, '') # trick to enable import of main folder module
  23. import custom_config as cfg
  24. import models as mdl
  25. #from sklearn.ensemble import RandomForestClassifier
  26. def loadDataset(filename):
  27. ########################
  28. # 1. Get and prepare data
  29. ########################
  30. # scene_name; zone_id; image_index_end; label; data
  31. dataset_train = pd.read_csv(filename + '.train', header=None, sep=";")
  32. dataset_test = pd.read_csv(filename + '.test', header=None, sep=";")
  33. # default first shuffle of data
  34. dataset_train = shuffle(dataset_train)
  35. dataset_test = shuffle(dataset_test)
  36. # get dataset with equal number of classes occurences
  37. noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 1]
  38. not_noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 0]
  39. #nb_noisy_train = len(noisy_df_train.index)
  40. noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 1]
  41. not_noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 0]
  42. #nb_noisy_test = len(noisy_df_test.index)
  43. # use of all data
  44. final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
  45. final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])
  46. # shuffle data another time
  47. final_df_train = shuffle(final_df_train)
  48. final_df_test = shuffle(final_df_test)
  49. # use of the whole data set for training
  50. x_dataset_train = final_df_train.iloc[:, 4:]
  51. x_dataset_test = final_df_test.iloc[:, 4:]
  52. y_dataset_train = final_df_train.iloc[:, 3]
  53. y_dataset_test = final_df_test.iloc[:, 3]
  54. return x_dataset_train, y_dataset_train, x_dataset_test, y_dataset_test
  55. def train_predict_random_forest(x_train, y_train, x_test, y_test):
  56. print('Start training Random forest model')
  57. start = datetime.datetime.now()
  58. # model = _get_best_model(x_train_filters, y_train_filters)
  59. random_forest_model = RandomForestClassifier(n_estimators=500, class_weight='balanced', bootstrap=True, max_samples=0.75, n_jobs=-1)
  60. random_forest_model = random_forest_model.fit(x_train, y_train)
  61. y_test_model = random_forest_model.predict(x_test)
  62. test_roc_auc = roc_auc_score(y_test, y_test_model)
  63. end = datetime.datetime.now()
  64. diff = end - start
  65. print("Evaluation took: {}, AUC score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))
  66. return random_forest_model
  67. def train_predict_selector(model, x_train, y_train, x_test, y_test):
  68. start = datetime.datetime.now()
  69. print("Using Select from model with Random Forest")
  70. selector = SelectFromModel(estimator=model, prefit=True)
  71. x_train_transformed = selector.transform(x_train)
  72. x_test_transformed = selector.transform(x_test)
  73. print('Previous shape:', x_train.shape)
  74. print('New shape:', x_train_transformed.shape)
  75. # using specific features
  76. model = RandomForestClassifier(n_estimators=500, class_weight='balanced', bootstrap=True, max_samples=0.75, n_jobs=-1)
  77. model = model.fit(x_train_transformed, y_train)
  78. y_test_model= model.predict(x_test_transformed)
  79. test_roc_auc = roc_auc_score(y_test, y_test_model)
  80. end = datetime.datetime.now()
  81. diff = end - start
  82. print("Evaluation took: {}, AUC score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))
  83. def main():
  84. parser = argparse.ArgumentParser(description="Train and find using all data to use for model")
  85. parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
  86. parser.add_argument('--output', type=str, help='output surrogate model name')
  87. args = parser.parse_args()
  88. p_data_file = args.data
  89. p_output = args.output
  90. print(p_data_file)
  91. # load data from file
  92. x_train, y_train, x_test, y_test = loadDataset(p_data_file)
  93. # train classical random forest
  94. random_forest_model = train_predict_random_forest(x_train, y_train, x_test, y_test)
  95. # train using select from model
  96. train_predict_selector(random_forest_model, x_train, y_train, x_test, y_test)
  97. if __name__ == "__main__":
  98. main()