find_best_attributes.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. # main imports
  2. import os
  3. import sys
  4. import argparse
  5. import pandas as pd
  6. import numpy as np
  7. import logging
  8. import datetime
  9. # model imports
  10. from sklearn.model_selection import train_test_split
  11. from sklearn.model_selection import GridSearchCV
  12. from sklearn.linear_model import LogisticRegression
  13. from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  14. import joblib
  15. import sklearn.svm as svm
  16. from sklearn.utils import shuffle
  17. from sklearn.metrics import roc_auc_score
  18. from sklearn.model_selection import cross_val_score
  19. # modules and config imports
  20. sys.path.insert(0, '') # trick to enable import of main folder module
  21. import custom_config as cfg
  22. import models as mdl
  23. from macop.algorithms.IteratedLocalSearch import IteratedLocalSearch as ILS
  24. from macop.solutions.BinarySolution import BinarySolution
  25. from macop.operators.mutators.SimpleMutation import SimpleMutation
  26. from macop.operators.mutators.SimpleBinaryMutation import SimpleBinaryMutation
  27. from macop.operators.crossovers.SimpleCrossover import SimpleCrossover
  28. from macop.operators.crossovers.RandomSplitCrossover import RandomSplitCrossover
  29. from macop.operators.policies.UCBPolicy import UCBPolicy
  30. from macop.checkpoints.BasicCheckpoint import BasicCheckpoint
  31. from macop.checkpoints.UCBCheckpoint import UCBCheckpoint
  32. # variables and parameters
  33. models_list = cfg.models_names_list
  34. number_of_values = 26
  35. ils_iteration = 2000
  36. ls_iteration = 10
  37. # default validator
  38. def validator(solution):
  39. if list(solution.data).count(1) < 5:
  40. return False
  41. return True
  42. def loadDataset(filename):
  43. ########################
  44. # 1. Get and prepare data
  45. ########################
  46. dataset_train = pd.read_csv(filename + '.train', header=None, sep=";")
  47. dataset_test = pd.read_csv(filename + '.test', header=None, sep=";")
  48. # default first shuffle of data
  49. dataset_train = shuffle(dataset_train)
  50. dataset_test = shuffle(dataset_test)
  51. # get dataset with equal number of classes occurences
  52. noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
  53. not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
  54. #nb_noisy_train = len(noisy_df_train.index)
  55. noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
  56. not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
  57. #nb_noisy_test = len(noisy_df_test.index)
  58. # use of all data
  59. final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
  60. final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])
  61. # shuffle data another time
  62. final_df_train = shuffle(final_df_train)
  63. final_df_test = shuffle(final_df_test)
  64. # use of the whole data set for training
  65. x_dataset_train = final_df_train.iloc[:,1:]
  66. x_dataset_test = final_df_test.iloc[:,1:]
  67. y_dataset_train = final_df_train.iloc[:,0]
  68. y_dataset_test = final_df_test.iloc[:,0]
  69. return x_dataset_train, y_dataset_train, x_dataset_test, y_dataset_test
  70. def main():
  71. parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
  72. parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
  73. parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list, required=True)
  74. parser.add_argument('--length', type=str, help='max data length (need to be specify for evaluator)', required=True)
  75. args = parser.parse_args()
  76. p_data_file = args.data
  77. p_choice = args.choice
  78. p_length = args.length
  79. global number_of_values
  80. number_of_values = p_length
  81. print(p_data_file)
  82. # load data from file
  83. x_train, y_train, x_test, y_test = loadDataset(p_data_file)
  84. # create `logs` folder if necessary
  85. if not os.path.exists(cfg.output_logs_folder):
  86. os.makedirs(cfg.output_logs_folder)
  87. logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/%s.log' % p_data_file.split('/')[-1], level=logging.DEBUG)
  88. # init solution (`n` attributes)
  89. def init():
  90. return BinarySolution([], number_of_values).random(validator)
  91. # define evaluate function here (need of data information)
  92. def evaluate(solution):
  93. start = datetime.datetime.now()
  94. # get indices of filters data to use (filters selection from solution)
  95. indices = []
  96. for index, value in enumerate(solution.data):
  97. if value == 1:
  98. indices.append(index)
  99. # keep only selected filters from solution
  100. x_train_filters = x_train.iloc[:, indices]
  101. y_train_filters = y_train
  102. x_test_filters = x_test.iloc[:, indices]
  103. # TODO : use of GPU implementation of SVM
  104. model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
  105. y_test_model = model.predict(x_test_filters)
  106. test_roc_auc = roc_auc_score(y_test, y_test_model)
  107. end = datetime.datetime.now()
  108. diff = end - start
  109. print("Evaluation took :", divmod(diff.days * 86400 + diff.seconds, 60))
  110. return test_roc_auc
  111. if not os.path.exists(cfg.output_backup_folder):
  112. os.makedirs(cfg.output_backup_folder)
  113. backup_file_path = os.path.join(cfg.output_backup_folder, p_data_file.split('/')[-1] + '.csv')
  114. ucb_backup_file_path = os.path.join(cfg.backup_folder, p_data_file.split('/')[-1] + '_ucbPolicy.csv')
  115. # prepare optimization algorithm
  116. operators = [SimpleBinaryMutation(), SimpleMutation(), SimpleCrossover(), RandomSplitCrossover()]
  117. policy = UCBPolicy(updators)
  118. algo = ILS(init, evaluate, updators, policy, validator, True)
  119. algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))
  120. algo.addCallback(UCBCheckpoint(_every=1, _filepath=ucb_backup_file_path))
  121. bestSol = algo.run(ils_iteration, ls_iteration)
  122. # print best solution found
  123. print("Found ", bestSol)
  124. # save model information into .csv file
  125. if not os.path.exists(cfg.results_information_folder):
  126. os.makedirs(cfg.results_information_folder)
  127. filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)
  128. filters_counter = 0
  129. # count number of filters
  130. for index, item in enumerate(bestSol.data):
  131. if index != 0 and index % 2 == 1:
  132. # if two attributes are used
  133. if item == 1 or bestSol.data[index - 1] == 1:
  134. filters_counter += 1
  135. line_info = p_data_file + ';' + str(ils_iteration) + ';' + str(ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness())
  136. with open(filename_path, 'a') as f:
  137. f.write(line_info + '\n')
  138. print('Result saved into %s' % filename_path)
  139. if __name__ == "__main__":
  140. main()