# find_best_attributes_surrogate_svm.py
  1. # main imports
  2. import os
  3. import sys
  4. import argparse
  5. import pandas as pd
  6. import numpy as np
  7. import logging
  8. import datetime
  9. import random
  10. import math
  11. # model imports
  12. from sklearn.model_selection import train_test_split
  13. from sklearn.model_selection import GridSearchCV
  14. from sklearn.linear_model import LogisticRegression
  15. from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  16. import joblib
  17. import sklearn.svm as svm
  18. from sklearn.utils import shuffle
  19. from sklearn.metrics import roc_auc_score
  20. from sklearn.model_selection import cross_val_score
  21. # modules and config imports
  22. sys.path.insert(0, '') # trick to enable import of main folder module
  23. import custom_config as cfg
  24. import models as mdl
  25. from optimization.ILSPopSurrogate import ILSPopSurrogate
  26. from macop.solutions.discrete import BinarySolution
  27. from macop.evaluators.base import Evaluator
  28. from macop.operators.discrete.mutators import SimpleMutation
  29. from macop.operators.discrete.mutators import SimpleBinaryMutation
  30. from macop.operators.discrete.crossovers import SimpleCrossover
  31. from macop.operators.discrete.crossovers import RandomSplitCrossover
  32. from optimization.operators.SimplePopCrossover import SimplePopCrossover, RandomPopCrossover
  33. from macop.policies.reinforcement import UCBPolicy
  34. from macop.callbacks.classicals import BasicCheckpoint
  35. from macop.callbacks.policies import UCBCheckpoint
  36. from optimization.callbacks.MultiPopCheckpoint import MultiPopCheckpoint
  37. from optimization.callbacks.SurrogateMonoCheckpoint import SurrogateMonoCheckpoint
  38. #from sklearn.ensemble import RandomForestClassifier
  39. # variables and parameters
  40. models_list = cfg.models_names_list
  41. from warnings import simplefilter
  42. simplefilter("ignore")
  43. # default validator
  44. def validator(solution):
  45. # at least 5 attributes and at most 16
  46. if list(solution.data).count(1) < 4 or list(solution.data).count(1) > 20:
  47. return False
  48. return True
  49. def loadDataset(filename):
  50. ########################
  51. # 1. Get and prepare data
  52. ########################
  53. # scene_name; zone_id; image_index_end; label; data
  54. dataset_train = pd.read_csv(filename + '.train', header=None, sep=";")
  55. dataset_test = pd.read_csv(filename + '.test', header=None, sep=";")
  56. # default first shuffle of data
  57. dataset_train = shuffle(dataset_train)
  58. dataset_test = shuffle(dataset_test)
  59. # get dataset with equal number of classes occurences
  60. noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 1]
  61. not_noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 0]
  62. #nb_noisy_train = len(noisy_df_train.index)
  63. noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 1]
  64. not_noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 0]
  65. #nb_noisy_test = len(noisy_df_test.index)
  66. # use of all data
  67. final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
  68. final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])
  69. # shuffle data another time
  70. final_df_train = shuffle(final_df_train)
  71. final_df_test = shuffle(final_df_test)
  72. # use of the whole data set for training
  73. x_dataset_train = final_df_train.iloc[:, 4:]
  74. x_dataset_test = final_df_test.iloc[:, 4:]
  75. y_dataset_train = final_df_train.iloc[:, 3]
  76. y_dataset_test = final_df_test.iloc[:, 3]
  77. return x_dataset_train, y_dataset_train, x_dataset_test, y_dataset_test
  78. def _get_best_model(X_train, y_train):
  79. Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
  80. gammas = [0.001, 0.01, 0.1, 5, 10, 100]
  81. param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
  82. svc = svm.SVC(probability=True, class_weight='balanced')
  83. #clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, scoring=my_accuracy_scorer, n_jobs=-1)
  84. clf = GridSearchCV(svc, param_grid, cv=5, verbose=0, n_jobs=22)
  85. clf.fit(X_train, y_train)
  86. model = clf.best_estimator_
  87. return model
  88. def main():
  89. parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
  90. parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
  91. parser.add_argument('--start_surrogate', type=int, help='number of evalution before starting surrogare model', required=True)
  92. parser.add_argument('--train_every', type=int, help='max number of evalution before retraining surrogare model', required=True)
  93. parser.add_argument('--length', type=int, help='max data length (need to be specify for evaluator)', required=True)
  94. parser.add_argument('--pop', type=int, help='pop size', required=True)
  95. parser.add_argument('--order', type=int, help='walsh order function', required=True)
  96. parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
  97. parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
  98. parser.add_argument('--output', type=str, help='output surrogate model name')
  99. args = parser.parse_args()
  100. p_data_file = args.data
  101. p_length = args.length
  102. p_pop = args.pop
  103. p_order = args.order
  104. p_start = args.start_surrogate
  105. p_retrain = args.train_every
  106. p_ils_iteration = args.ils
  107. p_ls_iteration = args.ls
  108. p_output = args.output
  109. print(p_data_file)
  110. # load data from file
  111. x_train, y_train, x_test, y_test = loadDataset(p_data_file)
  112. # create `logs` folder if necessary
  113. if not os.path.exists(cfg.output_logs_folder):
  114. os.makedirs(cfg.output_logs_folder)
  115. logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(p_output), level=logging.DEBUG)
  116. # init solution (`n` attributes)
  117. def init():
  118. return BinarySolution.random(p_length, validator)
  119. class ModelEvaluator(Evaluator):
  120. # define evaluate function here (need of data information)
  121. def compute(self, solution):
  122. print(f'Solution is composed of {list(solution.data).count(1)} attributes')
  123. start = datetime.datetime.now()
  124. # get indices of filters data to use (filters selection from solution)
  125. indices = []
  126. for index, value in enumerate(solution.data):
  127. if value == 1:
  128. indices.append(index)
  129. # keep only selected filters from solution
  130. x_train_filters = self._data['x_train'].iloc[:, indices]
  131. y_train_filters = self._data['y_train']
  132. x_test_filters = self._data['x_test'].iloc[:, indices]
  133. model = _get_best_model(x_train_filters, y_train_filters)
  134. # model = RandomForestClassifier(n_estimators=500, class_weight='balanced', bootstrap=True, max_samples=0.75, n_jobs=-1)
  135. # model = model.fit(x_train_filters, y_train_filters)
  136. y_test_model = model.predict(x_test_filters)
  137. y_train_model = model.predict(x_train_filters)
  138. test_roc_auc = roc_auc_score(self._data['y_test'], y_test_model)
  139. train_roc_auc = roc_auc_score(y_train_filters, y_train_model)
  140. end = datetime.datetime.now()
  141. diff = end - start
  142. print('----')
  143. print("Real evaluation took: {}, score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))
  144. return test_roc_auc * (1 - math.pow(test_roc_auc - train_roc_auc, 2))
  145. # build all output folder and files based on `output` name
  146. backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
  147. surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
  148. surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)
  149. if not os.path.exists(backup_model_folder):
  150. os.makedirs(backup_model_folder)
  151. if not os.path.exists(cfg.output_surrogates_model_folder):
  152. os.makedirs(cfg.output_surrogates_model_folder)
  153. if not os.path.exists(cfg.output_surrogates_data_folder):
  154. os.makedirs(cfg.output_surrogates_data_folder)
  155. backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
  156. ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
  157. surrogate_performanche_file_path = os.path.join(cfg.output_surrogates_data_folder, p_output + '_performance.csv')
  158. # prepare optimization algorithm (only use of mutation as only ILS are used here, and local search need only local permutation)
  159. operators = [SimpleBinaryMutation(), SimpleMutation(), RandomPopCrossover(), SimplePopCrossover()]
  160. policy = UCBPolicy(operators, C=100, exp_rate=0.1)
  161. # define first line if necessary
  162. if not os.path.exists(surrogate_output_data):
  163. with open(surrogate_output_data, 'w') as f:
  164. f.write('x;y\n')
  165. # custom ILS for surrogate use
  166. algo = ILSPopSurrogate(initalizer=init,
  167. evaluator=ModelEvaluator(data={'x_train': x_train, 'y_train': y_train, 'x_test': x_test, 'y_test': y_test}), # same evaluator by default, as we will use the surrogate function
  168. operators=operators,
  169. policy=policy,
  170. validator=validator,
  171. population_size=p_pop,
  172. surrogate_file_path=surrogate_output_model,
  173. start_train_surrogate=p_start, # start learning and using surrogate after 1000 real evaluation
  174. solutions_file=surrogate_output_data,
  175. walsh_order=p_order,
  176. inter_policy_ls_file=os.path.join(backup_model_folder, p_output + '_ls_ucbPolicy.csv'),
  177. ls_train_surrogate=p_retrain,
  178. maximise=True)
  179. algo.addCallback(MultiPopCheckpoint(every=1, filepath=backup_file_path))
  180. algo.addCallback(UCBCheckpoint(every=1, filepath=ucb_backup_file_path))
  181. algo.addCallback(SurrogateMonoCheckpoint(every=1, filepath=surrogate_performanche_file_path))
  182. bestSol = algo.run(p_ils_iteration, p_ls_iteration)
  183. # print best solution found
  184. print("Found ", bestSol)
  185. # save model information into .csv file
  186. if not os.path.exists(cfg.results_information_folder):
  187. os.makedirs(cfg.results_information_folder)
  188. filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)
  189. filters_counter = 0
  190. # count number of filters
  191. for index, item in enumerate(bestSol.data):
  192. if index != 0 and index % 2 == 1:
  193. # if two attributes are used
  194. if item == 1 or bestSol.data[index - 1] == 1:
  195. filters_counter += 1
  196. line_info = p_output + ';' + p_data_file + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness)
  197. # check if results are already saved...
  198. already_saved = False
  199. if os.path.exists(filename_path):
  200. with open(filename_path, 'r') as f:
  201. lines = f.readlines()
  202. for line in lines:
  203. output_name = line.split(';')[0]
  204. if p_output == output_name:
  205. already_saved = True
  206. if not already_saved:
  207. with open(filename_path, 'a') as f:
  208. f.write(line_info + '\n')
  209. print('Result saved into %s' % filename_path)
  210. if __name__ == "__main__":
  211. main()