# find_best_attributes_surrogate_svm.py
  1. # main imports
  2. import os
  3. import sys
  4. import argparse
  5. import pandas as pd
  6. import numpy as np
  7. import logging
  8. import datetime
  9. import random
  10. # model imports
  11. from sklearn.model_selection import train_test_split
  12. from sklearn.model_selection import GridSearchCV
  13. from sklearn.linear_model import LogisticRegression
  14. from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  15. import joblib
  16. import sklearn.svm as svm
  17. from sklearn.utils import shuffle
  18. from sklearn.metrics import roc_auc_score
  19. from sklearn.model_selection import cross_val_score
  20. # modules and config imports
  21. sys.path.insert(0, '') # trick to enable import of main folder module
  22. import custom_config as cfg
  23. import models as mdl
  24. from optimization.ILSPopSurrogate import ILSPopSurrogate
  25. from macop.solutions.discrete import BinarySolution
  26. from macop.evaluators.base import Evaluator
  27. from macop.operators.discrete.mutators import SimpleMutation
  28. from macop.operators.discrete.mutators import SimpleBinaryMutation
  29. from macop.operators.discrete.crossovers import SimpleCrossover
  30. from macop.operators.discrete.crossovers import RandomSplitCrossover
  31. from optimization.operators.SimplePopCrossover import SimplePopCrossover, RandomPopCrossover
  32. from macop.policies.reinforcement import UCBPolicy
  33. from macop.callbacks.classicals import BasicCheckpoint
  34. from macop.callbacks.policies import UCBCheckpoint
  35. from optimization.callbacks.MultiPopCheckpoint import MultiPopCheckpoint
  36. from optimization.callbacks.SurrogateMonoCheckpoint import SurrogateMonoCheckpoint
  37. #from sklearn.ensemble import RandomForestClassifier
  38. # variables and parameters
  39. models_list = cfg.models_names_list
  40. from warnings import simplefilter
  41. simplefilter("ignore")
  42. # default validator
  43. def validator(solution):
  44. # at least 5 attributes and at most 16
  45. if list(solution.data).count(1) < 4 or list(solution.data).count(1) > 20:
  46. return False
  47. return True
  48. def loadDataset(filename):
  49. ########################
  50. # 1. Get and prepare data
  51. ########################
  52. # scene_name; zone_id; image_index_end; label; data
  53. dataset_train = pd.read_csv(filename + '.train', header=None, sep=";")
  54. dataset_test = pd.read_csv(filename + '.test', header=None, sep=";")
  55. # default first shuffle of data
  56. dataset_train = shuffle(dataset_train)
  57. dataset_test = shuffle(dataset_test)
  58. # get dataset with equal number of classes occurences
  59. noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 1]
  60. not_noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 0]
  61. #nb_noisy_train = len(noisy_df_train.index)
  62. noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 1]
  63. not_noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 0]
  64. #nb_noisy_test = len(noisy_df_test.index)
  65. # use of all data
  66. final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
  67. final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])
  68. # shuffle data another time
  69. final_df_train = shuffle(final_df_train)
  70. final_df_test = shuffle(final_df_test)
  71. # use of the whole data set for training
  72. x_dataset_train = final_df_train.iloc[:, 4:]
  73. x_dataset_test = final_df_test.iloc[:, 4:]
  74. y_dataset_train = final_df_train.iloc[:, 3]
  75. y_dataset_test = final_df_test.iloc[:, 3]
  76. return x_dataset_train, y_dataset_train, x_dataset_test, y_dataset_test
  77. def _get_best_model(X_train, y_train):
  78. Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
  79. gammas = [0.001, 0.01, 0.1, 5, 10, 100]
  80. param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
  81. svc = svm.SVC(probability=True, class_weight='balanced')
  82. #clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, scoring=my_accuracy_scorer, n_jobs=-1)
  83. clf = GridSearchCV(svc, param_grid, cv=5, verbose=0, n_jobs=22)
  84. clf.fit(X_train, y_train)
  85. model = clf.best_estimator_
  86. return model
  87. def main():
  88. parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
  89. parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
  90. parser.add_argument('--start_surrogate', type=int, help='number of evalution before starting surrogare model', required=True)
  91. parser.add_argument('--train_every', type=int, help='max number of evalution before retraining surrogare model', required=True)
  92. parser.add_argument('--length', type=int, help='max data length (need to be specify for evaluator)', required=True)
  93. parser.add_argument('--pop', type=int, help='pop size', required=True)
  94. parser.add_argument('--order', type=int, help='walsh order function', required=True)
  95. parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
  96. parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
  97. parser.add_argument('--output', type=str, help='output surrogate model name')
  98. args = parser.parse_args()
  99. p_data_file = args.data
  100. p_length = args.length
  101. p_pop = args.pop
  102. p_order = args.order
  103. p_start = args.start_surrogate
  104. p_retrain = args.train_every
  105. p_ils_iteration = args.ils
  106. p_ls_iteration = args.ls
  107. p_output = args.output
  108. print(p_data_file)
  109. # load data from file
  110. x_train, y_train, x_test, y_test = loadDataset(p_data_file)
  111. # create `logs` folder if necessary
  112. if not os.path.exists(cfg.output_logs_folder):
  113. os.makedirs(cfg.output_logs_folder)
  114. logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(p_output), level=logging.DEBUG)
  115. # init solution (`n` attributes)
  116. def init():
  117. return BinarySolution.random(p_length, validator)
  118. class ModelEvaluator(Evaluator):
  119. # define evaluate function here (need of data information)
  120. def compute(self, solution):
  121. print(f'Solution is composed of {list(solution.data).count(1)} attributes')
  122. start = datetime.datetime.now()
  123. # get indices of filters data to use (filters selection from solution)
  124. indices = []
  125. for index, value in enumerate(solution.data):
  126. if value == 1:
  127. indices.append(index)
  128. # keep only selected filters from solution
  129. x_train_filters = self._data['x_train'].iloc[:, indices]
  130. y_train_filters = self._data['y_train']
  131. x_test_filters = self._data['x_test'].iloc[:, indices]
  132. model = _get_best_model(x_train_filters, y_train_filters)
  133. # model = RandomForestClassifier(n_estimators=500, class_weight='balanced', bootstrap=True, max_samples=0.75, n_jobs=-1)
  134. # model = model.fit(x_train_filters, y_train_filters)
  135. y_test_model = model.predict(x_test_filters)
  136. test_roc_auc = roc_auc_score(self._data['y_test'], y_test_model)
  137. end = datetime.datetime.now()
  138. diff = end - start
  139. print('----')
  140. print("Real evaluation took: {}, score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))
  141. return test_roc_auc
  142. # build all output folder and files based on `output` name
  143. backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
  144. surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
  145. surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)
  146. if not os.path.exists(backup_model_folder):
  147. os.makedirs(backup_model_folder)
  148. if not os.path.exists(cfg.output_surrogates_model_folder):
  149. os.makedirs(cfg.output_surrogates_model_folder)
  150. if not os.path.exists(cfg.output_surrogates_data_folder):
  151. os.makedirs(cfg.output_surrogates_data_folder)
  152. backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
  153. ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
  154. surrogate_performanche_file_path = os.path.join(cfg.output_surrogates_data_folder, p_output + '_performance.csv')
  155. # prepare optimization algorithm (only use of mutation as only ILS are used here, and local search need only local permutation)
  156. operators = [SimpleBinaryMutation(), SimpleMutation(), RandomPopCrossover(), SimplePopCrossover()]
  157. policy = UCBPolicy(operators, C=100, exp_rate=0.1)
  158. # define first line if necessary
  159. if not os.path.exists(surrogate_output_data):
  160. with open(surrogate_output_data, 'w') as f:
  161. f.write('x;y\n')
  162. # custom ILS for surrogate use
  163. algo = ILSPopSurrogate(initalizer=init,
  164. evaluator=ModelEvaluator(data={'x_train': x_train, 'y_train': y_train, 'x_test': x_test, 'y_test': y_test}), # same evaluator by default, as we will use the surrogate function
  165. operators=operators,
  166. policy=policy,
  167. validator=validator,
  168. population_size=p_pop,
  169. surrogate_file_path=surrogate_output_model,
  170. start_train_surrogate=p_start, # start learning and using surrogate after 1000 real evaluation
  171. solutions_file=surrogate_output_data,
  172. walsh_order=p_order,
  173. inter_policy_ls_file=os.path.join(backup_model_folder, p_output + '_ls_ucbPolicy.csv'),
  174. ls_train_surrogate=p_retrain,
  175. maximise=True)
  176. algo.addCallback(MultiPopCheckpoint(every=1, filepath=backup_file_path))
  177. algo.addCallback(UCBCheckpoint(every=1, filepath=ucb_backup_file_path))
  178. algo.addCallback(SurrogateMonoCheckpoint(every=1, filepath=surrogate_performanche_file_path))
  179. bestSol = algo.run(p_ils_iteration, p_ls_iteration)
  180. # print best solution found
  181. print("Found ", bestSol)
  182. # save model information into .csv file
  183. if not os.path.exists(cfg.results_information_folder):
  184. os.makedirs(cfg.results_information_folder)
  185. filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)
  186. filters_counter = 0
  187. # count number of filters
  188. for index, item in enumerate(bestSol.data):
  189. if index != 0 and index % 2 == 1:
  190. # if two attributes are used
  191. if item == 1 or bestSol.data[index - 1] == 1:
  192. filters_counter += 1
  193. line_info = p_output + ';' + p_data_file + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness)
  194. # check if results are already saved...
  195. already_saved = False
  196. if os.path.exists(filename_path):
  197. with open(filename_path, 'r') as f:
  198. lines = f.readlines()
  199. for line in lines:
  200. output_name = line.split(';')[0]
  201. if p_output == output_name:
  202. already_saved = True
  203. if not already_saved:
  204. with open(filename_path, 'a') as f:
  205. f.write(line_info + '\n')
  206. print('Result saved into %s' % filename_path)
  207. if __name__ == "__main__":
  208. main()