# find_best_filters.py
  1. # main imports
  2. import os
  3. import sys
  4. import argparse
  5. import pandas as pd
  6. import numpy as np
  7. import logging
  8. # model imports
  9. from sklearn.model_selection import train_test_split
  10. from sklearn.model_selection import GridSearchCV
  11. from sklearn.linear_model import LogisticRegression
  12. from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  13. import sklearn.svm as svm
  14. from sklearn.utils import shuffle
  15. from sklearn.externals import joblib
  16. from sklearn.metrics import roc_auc_score
  17. from sklearn.model_selection import cross_val_score
  18. # modules and config imports
  19. sys.path.insert(0, '') # trick to enable import of main folder module
  20. import custom_config as cfg
  21. import models as mdl
  22. from macop.algorithms.mono.IteratedLocalSearch import IteratedLocalSearch as ILS
  23. from macop.solutions.BinarySolution import BinarySolution
  24. from macop.operators.mutators.SimpleMutation import SimpleMutation
  25. from macop.operators.mutators.SimpleBinaryMutation import SimpleBinaryMutation
  26. from macop.operators.crossovers.SimpleCrossover import SimpleCrossover
  27. from macop.operators.crossovers.RandomSplitCrossover import RandomSplitCrossover
  28. from macop.operators.policies.UCBPolicy import UCBPolicy
  29. from macop.callbacks.BasicCheckpoint import BasicCheckpoint
  30. from macop.callbacks.UCBCheckpoint import UCBCheckpoint
# variables and parameters
models_list = cfg.models_names_list  # model names accepted by the --choice CLI option

# total number of feature columns: 13 filters x 2 values each
# (matches the index*2 / index*2+1 column mapping used in evaluate())
number_of_values = 26

# optimization budget: outer ILS iterations and inner local-search iterations
ils_iteration = 10000
ls_iteration = 20
  36. # default validator
  37. def validator(solution):
  38. if list(solution.data).count(1) < 5:
  39. return False
  40. return True
  41. # init solution (13 filters)
  42. def init():
  43. return BinarySolution([], 13).random(validator)
  44. def loadDataset(filename):
  45. ########################
  46. # 1. Get and prepare data
  47. ########################
  48. dataset_train = pd.read_csv(filename + '.train', header=None, sep=";")
  49. dataset_test = pd.read_csv(filename + '.test', header=None, sep=";")
  50. # default first shuffle of data
  51. dataset_train = shuffle(dataset_train)
  52. dataset_test = shuffle(dataset_test)
  53. # get dataset with equal number of classes occurences
  54. noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
  55. not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
  56. #nb_noisy_train = len(noisy_df_train.index)
  57. noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
  58. not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
  59. #nb_noisy_test = len(noisy_df_test.index)
  60. final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
  61. final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])
  62. # shuffle data another time
  63. final_df_train = shuffle(final_df_train)
  64. final_df_test = shuffle(final_df_test)
  65. # use of the whole data set for training
  66. x_dataset_train = final_df_train.iloc[:,1:]
  67. x_dataset_test = final_df_test.iloc[:,1:]
  68. y_dataset_train = final_df_train.iloc[:,0]
  69. y_dataset_test = final_df_test.iloc[:,0]
  70. return x_dataset_train, y_dataset_train, x_dataset_test, y_dataset_test
  71. def main():
  72. parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
  73. parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)')
  74. parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list)
  75. args = parser.parse_args()
  76. p_data_file = args.data
  77. p_choice = args.choice
  78. # load data from file
  79. x_train, y_train, x_test, y_test = loadDataset(p_data_file)
  80. # create `logs` folder if necessary
  81. if not os.path.exists(cfg.logs_folder):
  82. os.makedirs(cfg.logs_folder)
  83. logging.basicConfig(format='%(asctime)s %(message)s', filename='logs/%s.log' % p_data_file.split('/')[-1], level=logging.DEBUG)
  84. # define evaluate function here (need of data information)
  85. def evaluate(solution):
  86. # get indices of filters data to use (filters selection from solution)
  87. indices = []
  88. for index, value in enumerate(solution.data):
  89. if value == 1:
  90. indices.append(index*2)
  91. indices.append(index*2+1)
  92. # keep only selected filters from solution
  93. x_train_filters = x_train.iloc[:, indices]
  94. y_train_filters = y_train
  95. x_test_filters = x_test.iloc[:, indices]
  96. model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
  97. y_test_model = model.predict(x_test_filters)
  98. test_roc_auc = roc_auc_score(y_test, y_test_model)
  99. return test_roc_auc
  100. if not os.path.exists(cfg.output_backup_folder):
  101. os.makedirs(cfg.output_backup_folder)
  102. backup_file_path = os.path.join(cfg.output_backup_folder, p_data_file.split('/')[-1] + '.csv')
  103. ucb_backup_file_path = os.path.join(cfg.output_backup_folder, p_data_file.split('/')[-1] + '_ucbPolicy.csv')
  104. # prepare optimization algorithm
  105. operators = [SimpleBinaryMutation(), SimpleMutation(), SimpleCrossover(), RandomSplitCrossover()]
  106. policy = UCBPolicy(operators)
  107. algo = ILS(init, evaluate, operators, policy, validator, True)
  108. algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))
  109. algo.addCallback(UCBCheckpoint(_every=1, _filepath=ucb_backup_file_path))
  110. bestSol = algo.run(ils_iteration, ls_iteration)
  111. # print best solution found
  112. print("Found ", bestSol)
  113. # save model information into .csv file
  114. if not os.path.exists(cfg.results_information_folder):
  115. os.makedirs(cfg.results_information_folder)
  116. filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_filters_result_filename)
  117. line_info = p_data_file + ';' + str(ils_iteration) + ';' + str(ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(bestSol.fitness())
  118. with open(filename_path, 'a') as f:
  119. f.write(line_info + '\n')
  120. print('Result saved into %s' % filename_path)
# script entry point
if __name__ == "__main__":
    main()