# find_best_attributes.py
  1. # main imports
  2. import os
  3. import sys
  4. import argparse
  5. import pandas as pd
  6. import numpy as np
  7. import logging
  8. # model imports
  9. from sklearn.model_selection import train_test_split
  10. from sklearn.model_selection import GridSearchCV
  11. from sklearn.linear_model import LogisticRegression
  12. from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  13. import sklearn.svm as svm
  14. from sklearn.utils import shuffle
  15. from sklearn.externals import joblib
  16. from sklearn.metrics import roc_auc_score
  17. from sklearn.model_selection import cross_val_score
  18. # modules and config imports
  19. sys.path.insert(0, '') # trick to enable import of main folder module
  20. import custom_config as cfg
  21. import models as mdl
  22. from optimization.algorithms.IteratedLocalSearch import IteratedLocalSearch as ILS
  23. from optimization.solutions.BinarySolution import BinarySolution
  24. from optimization.updators.mutators.SimpleMutation import SimpleMutation, SimpleBinaryMutation
  25. from optimization.updators.policies.RandomPolicy import RandomPolicy
# variables and parameters
# model names accepted by the `--choice` command-line option (read from the project config)
models_list = cfg.models_names_list
# NOTE(review): not referenced in the code shown here; presumably 13 filters x 2 values
# per filter (see the index*2 / index*2+1 column selection in `evaluate`) — confirm
number_of_values = 26
  29. # default validator
  30. def validator(solution):
  31. if list(solution.data).count(1) < 5:
  32. return False
  33. return True
  34. # init solution (13 filters)
  35. def init():
  36. return BinarySolution([], 13).random(validator)
  37. def loadDataset(filename):
  38. ########################
  39. # 1. Get and prepare data
  40. ########################
  41. dataset_train = pd.read_csv(filename + '.train', header=None, sep=";")
  42. dataset_test = pd.read_csv(filename + '.test', header=None, sep=";")
  43. # default first shuffle of data
  44. dataset_train = shuffle(dataset_train)
  45. dataset_test = shuffle(dataset_test)
  46. # get dataset with equal number of classes occurences
  47. noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
  48. not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
  49. nb_noisy_train = len(noisy_df_train.index)
  50. noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
  51. not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
  52. nb_noisy_test = len(noisy_df_test.index)
  53. final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
  54. final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
  55. # shuffle data another time
  56. final_df_train = shuffle(final_df_train)
  57. final_df_test = shuffle(final_df_test)
  58. # use of the whole data set for training
  59. x_dataset_train = final_df_train.iloc[:,1:]
  60. x_dataset_test = final_df_test.iloc[:,1:]
  61. y_dataset_train = final_df_train.iloc[:,0]
  62. y_dataset_test = final_df_test.iloc[:,0]
  63. return x_dataset_train, y_dataset_train, x_dataset_test, y_dataset_test
  64. def main():
  65. parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
  66. parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)')
  67. parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list)
  68. args = parser.parse_args()
  69. p_data_file = args.data
  70. p_choice = args.choice
  71. # load data from file
  72. x_train, y_train, x_test, y_test = loadDataset(p_data_file)
  73. # create `logs` folder if necessary
  74. if not os.path.exists(cfg.logs_folder):
  75. os.makedirs(cfg.logs_folder)
  76. logging.basicConfig(format='%(asctime)s %(message)s', filename='logs/%s.log' % p_data_file.split('/')[-1], level=logging.DEBUG)
  77. # define evaluate function here (need of data information)
  78. def evaluate(solution):
  79. # get indices of filters data to use (filters selection from solution)
  80. indices = []
  81. for index, value in enumerate(solution.data):
  82. if value == 1:
  83. indices.append(index*2)
  84. indices.append(index*2+1)
  85. # keep only selected filters from solution
  86. x_train_filters = x_train.iloc[:, indices]
  87. y_train_filters = y_train
  88. x_test_filters = x_test.iloc[:, indices]
  89. model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
  90. y_test_model = model.predict(x_test_filters)
  91. test_roc_auc = roc_auc_score(y_test, y_test_model)
  92. return test_roc_auc
  93. # prepare optimization algorithm
  94. updators = [SimpleBinaryMutation, SimpleMutation]
  95. policy = RandomPolicy(updators)
  96. algo = ILS(init, evaluate, updators, policy, validator, True)
  97. bestSol = algo.run(100, 10)
  98. # print best solution found
  99. print("Found ", bestSol)
# run the search only when this file is executed as a script (not on import)
if __name__ == "__main__":
    main()