find_best_attributes.py

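"""Find the best subset of filter attributes to use for a given model.

Loads a dataset from `<prefix>.train` / `<prefix>.test` files, then runs an
Iterated Local Search (macop) over binary attribute-selection solutions, with
a UCB policy choosing between mutation and crossover operators. Each candidate
selection is evaluated by training the chosen model on the selected attributes
and scoring ROC AUC on the test set. The best solution found is appended to a
results CSV file.
"""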
# main imports
import os
import sys
import argparse
import pandas as pd
import numpy as np
import logging
import datetime

# model imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import joblib
import sklearn.svm as svm
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# modules and config imports
sys.path.insert(0, '') # trick to enable import of main folder module
import custom_config as cfg
import models as mdl

from macop.algorithms.mono.IteratedLocalSearch import IteratedLocalSearch as ILS
from macop.solutions.BinarySolution import BinarySolution
from macop.operators.mutators.SimpleMutation import SimpleMutation
from macop.operators.mutators.SimpleBinaryMutation import SimpleBinaryMutation
from macop.operators.crossovers.SimpleCrossover import SimpleCrossover
from macop.operators.crossovers.RandomSplitCrossover import RandomSplitCrossover
from macop.operators.policies.UCBPolicy import UCBPolicy
from macop.callbacks.BasicCheckpoint import BasicCheckpoint
from macop.callbacks.UCBCheckpoint import UCBCheckpoint

# variables and parameters
models_list = cfg.models_names_list
# default validator: a solution must select at least 5 attributes
def validator(solution):

    if list(solution.data).count(1) < 5:
        return False

    return True
def loadDataset(filename):

    ########################
    # 1. Get and prepare data
    ########################
    dataset_train = pd.read_csv(filename + '.train', header=None, sep=";")
    dataset_test = pd.read_csv(filename + '.test', header=None, sep=";")

    # default first shuffle of data
    dataset_train = shuffle(dataset_train)
    dataset_test = shuffle(dataset_test)
    # split each dataset by class label (first column: 1 = noisy, 0 = not noisy)
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    # use of all data
    final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
    final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])

    # shuffle data another time
    final_df_train = shuffle(final_df_train)
    final_df_test = shuffle(final_df_test)

    # use of the whole data set for training: first column is the label, the rest are attributes
    x_dataset_train = final_df_train.iloc[:, 1:]
    x_dataset_test = final_df_test.iloc[:, 1:]

    y_dataset_train = final_df_train.iloc[:, 0]
    y_dataset_test = final_df_test.iloc[:, 0]

    return x_dataset_train, y_dataset_train, x_dataset_test, y_dataset_test
def main():

    parser = argparse.ArgumentParser(description="Train and find best filters to use for model")

    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
    parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list, required=True)
    parser.add_argument('--length', type=int, help='max data length (needs to be specified for the evaluator)', required=True)
    parser.add_argument('--ils', type=int, help='total number of iterations for the ILS algorithm', required=True)
    parser.add_argument('--ls', type=int, help='number of iterations for the Local Search algorithm', required=True)

    args = parser.parse_args()

    p_data_file = args.data
    p_choice = args.choice
    p_length = args.length
    p_ils_iteration = args.ils
    p_ls_iteration = args.ls

    print(p_data_file)

    # load data from file
    x_train, y_train, x_test, y_test = loadDataset(p_data_file)

    # create `logs` folder if necessary
    if not os.path.exists(cfg.output_logs_folder):
        os.makedirs(cfg.output_logs_folder)

    _, data_file_name = os.path.split(p_data_file)
    # log into the configured logs folder (created above)
    logging.basicConfig(format='%(asctime)s %(message)s', filename=os.path.join(cfg.output_logs_folder, '{0}.log'.format(data_file_name)), level=logging.DEBUG)

    # init a random solution of `p_length` binary attributes
    def init():
        return BinarySolution([], p_length).random(validator)
    # define evaluate function here (needs access to the loaded data)
    def evaluate(solution):

        start = datetime.datetime.now()

        # get indices of filters data to use (filters selection from solution)
        indices = []

        for index, value in enumerate(solution.data):
            if value == 1:
                indices.append(index)

        # keep only selected filters from solution
        x_train_filters = x_train.iloc[:, indices]
        y_train_filters = y_train
        x_test_filters = x_test.iloc[:, indices]

        # TODO : use of GPU implementation of SVM
        model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)

        y_test_model = model.predict(x_test_filters)
        test_roc_auc = roc_auc_score(y_test, y_test_model)

        end = datetime.datetime.now()
        diff = end - start
        print("Evaluation took:", divmod(diff.days * 86400 + diff.seconds, 60), "(minutes, seconds)")

        return test_roc_auc
    backup_model_folder = os.path.join(cfg.output_backup_folder, data_file_name)

    if not os.path.exists(backup_model_folder):
        os.makedirs(backup_model_folder)

    backup_file_path = os.path.join(backup_model_folder, data_file_name + '.csv')
    ucb_backup_file_path = os.path.join(backup_model_folder, data_file_name + '_ucbPolicy.csv')

    # prepare optimization algorithm
    operators = [SimpleBinaryMutation(), SimpleMutation(), SimpleCrossover(), RandomSplitCrossover()]
    policy = UCBPolicy(operators)

    algo = ILS(init, evaluate, operators, policy, validator, True)
    algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))
    algo.addCallback(UCBCheckpoint(_every=1, _filepath=ucb_backup_file_path))

    bestSol = algo.run(p_ils_iteration, p_ls_iteration)

    # print best solution found
    print("Found", bestSol)

    # save model information into .csv file
    if not os.path.exists(cfg.results_information_folder):
        os.makedirs(cfg.results_information_folder)

    filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)

    filters_counter = 0

    # count number of filters: each filter spans two consecutive attributes
    for index, item in enumerate(bestSol.data):
        if index != 0 and index % 2 == 1:

            # a filter counts as used if either of its two attributes is selected
            if item == 1 or bestSol.data[index - 1] == 1:
                filters_counter += 1

    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness)

    with open(filename_path, 'a') as f:
        f.write(line_info + '\n')

    print('Result saved into %s' % filename_path)


if __name__ == "__main__":
    main()
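
# Example invocation (hypothetical dataset prefix and model name; valid
# --choice values come from cfg.models_names_list):
#
#   python find_best_attributes.py --data data/datasets/my_dataset \
#       --choice svm_model --length 32 --ils 1000 --ls 100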