# find_best_attributes_surrogate_openML_multi_specific.py

# main imports
import os
import sys
import argparse
import pandas as pd
import numpy as np
import logging
import datetime
import random

# model imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import joblib
import sklearn
import sklearn.svm as svm
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

# modules and config imports
sys.path.insert(0, '') # trick to enable import of main folder module

import custom_config as cfg
import models as mdl

from optimization.ILSMultiSpecificSurrogate import ILSMultiSpecificSurrogate
from macop.solutions.BinarySolution import BinarySolution

from macop.operators.mutators.SimpleMutation import SimpleMutation
from macop.operators.mutators.SimpleBinaryMutation import SimpleBinaryMutation
from macop.operators.crossovers.SimpleCrossover import SimpleCrossover
from macop.operators.crossovers.RandomSplitCrossover import RandomSplitCrossover

from macop.operators.policies.UCBPolicy import UCBPolicy
from macop.operators.policies.RandomPolicy import RandomPolicy

from macop.callbacks.BasicCheckpoint import BasicCheckpoint
from macop.callbacks.UCBCheckpoint import UCBCheckpoint
from optimization.callbacks.SurrogateCheckpoint import SurrogateCheckpoint
from optimization.callbacks.MultiSurrogateCheckpoint import MultiSurrogateCheckpoint
from optimization.callbacks.MultiSurrogateSpecificCheckpoint import MultiSurrogateSpecificCheckpoint
# avoid display of warnings
def warn(*args, **kwargs):
    pass

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.warn = warn
# default validator: require at least 2 selected attributes
def validator(solution):

    if list(solution._data).count(1) < 2:
        return False

    return True
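# e.g. the validator accepts a mask such as [1, 0, 1, 0] (two selected
# attributes) and rejects [0, 0, 1, 0] (only one selected attribute)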
def train_model(X_train, y_train):

    #print('Creating model...')
    # here use of an SVM with grid search cross-validation
    Cs = [0.001, 0.01, 0.1, 1, 10, 100]
    gammas = [0.001, 0.01, 0.1, 10, 100]
    param_grid = {'kernel': ['rbf'], 'C': Cs, 'gamma': gammas}

    svc = svm.SVC(probability=True, class_weight='balanced')
    #clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, scoring=my_accuracy_scorer, n_jobs=-1)
    clf = GridSearchCV(svc, param_grid, cv=4, verbose=0, n_jobs=-1)

    clf.fit(X_train, y_train)

    model = clf.best_estimator_

    return model
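# Illustrative use of `train_model` (the column indices below are arbitrary,
# not taken from this project):
#   model = train_model(X_train[:, [0, 3]], y_train)
#   probabilities = model.predict_proba(X_test[:, [0, 3]])  # available since probability=True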
def loadDataset(filename):

    ########################
    # 1. Get and prepare data
    ########################
    dataset = pd.read_csv(filename, sep=',')

    # map the two label values onto the common {0, 1} encoding
    min_label_value = min(dataset.iloc[:, -1])
    max_label_value = max(dataset.iloc[:, -1])

    dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(min_label_value, 0)
    dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(max_label_value, 1)

    X_dataset = dataset.iloc[:, :-1]
    y_dataset = dataset.iloc[:, -1]

    problem_size = len(X_dataset.columns)

    # min/max normalisation over each feature: create a scaler object,
    # then fit and transform the data
    scaler = MinMaxScaler()
    X_dataset = np.array(pd.DataFrame(scaler.fit_transform(X_dataset), columns=X_dataset.columns))

    # prepare train and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X_dataset, y_dataset, test_size=0.3, shuffle=True)

    return X_train, y_train, X_test, y_test, problem_size
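# Expected input layout (inferred from the loader above, shown here only as an
# illustration): a comma-separated file whose columns are numeric features and
# whose last column is a two-valued label, remapped to {0, 1} at load time, e.g.
#   f0,f1,f2,label
#   0.3,1.2,5.0,2
#   0.1,0.9,4.2,1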
def main():

    parser = argparse.ArgumentParser(description="Train and find best filters to use for model")

    parser.add_argument('--data', type=str, help='OpenML dataset file (csv)', required=True)
    parser.add_argument('--every_ls', type=int, help='retrain surrogate models every `every_ls` local search iterations', default=50)
    parser.add_argument('--k_division', type=int, help='number of expected sub surrogate models', default=20)
    parser.add_argument('--k_dynamic', type=int, help='specify if indices of each sub surrogate model are changed at each training or not', default=0, choices=[0, 1])
    parser.add_argument('--k_random', type=int, help='specify if the split is random or not', default=1, choices=[0, 1])
    parser.add_argument('--ils', type=int, help='number of total iterations for the ILS algorithm', required=True)
    parser.add_argument('--ls', type=int, help='number of iterations for the Local Search algorithm', required=True)
    parser.add_argument('--generate_only', type=int, help='only generate surrogate training data (no optimization run)', default=0, choices=[0, 1])
    parser.add_argument('--output', type=str, help='output surrogate model name')

    args = parser.parse_args()
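    # Example invocation (values and paths are illustrative, not prescribed by
    # the project):
    #   python find_best_attributes_surrogate_openML_multi_specific.py \
    #       --data data/datasets/my_dataset.csv --ils 5000 --ls 50 \
    #       --k_division 20 --output openml_surrogate_run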
    p_data_file = args.data
    p_every_ls = args.every_ls
    p_k_division = args.k_division
    p_k_dynamic = bool(args.k_dynamic)
    p_k_random = bool(args.k_random)
    p_ils_iteration = args.ils
    p_ls_iteration = args.ls
    p_generate_only = bool(args.generate_only)
    p_output = args.output

    # load data from file and get problem size
    X_train, y_train, X_test, y_test, problem_size = loadDataset(p_data_file)

    # create `logs` folder if necessary
    if not os.path.exists(cfg.output_logs_folder):
        os.makedirs(cfg.output_logs_folder)

    logging.basicConfig(format='%(asctime)s %(message)s', filename=os.path.join(cfg.output_logs_folder, '{0}.log'.format(p_output)), level=logging.DEBUG)
    # init solution (`n` attributes)
    def init():
        return BinarySolution([], problem_size).random(validator)
    # define evaluate function here (needs access to the data)
    def evaluate(solution):
        start = datetime.datetime.now()

        # get indices of filters data to use (filters selection from solution)
        indices = []

        for index, value in enumerate(solution._data):
            if value == 1:
                indices.append(index)

        print(f'Training SVM with {len(indices)} of {len(solution._data)} available features')

        # keep only selected filters from solution
        x_train_filters = X_train[:, indices]
        x_test_filters = X_test[:, indices]

        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
        model = train_model(x_train_filters, y_train)

        y_test_model = model.predict(x_test_filters)
        # thresholding is a no-op on hard class labels, but kept for compatibility with probability outputs
        y_test_predict = [1 if x > 0.5 else 0 for x in y_test_model]

        test_roc_auc = roc_auc_score(y_test, y_test_predict)

        end = datetime.datetime.now()
        diff = end - start

        # elapsed time is reported as a (minutes, seconds) pair
        print("Real evaluation took: {}, score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))

        return test_roc_auc
    def sub_evaluate(solution, index_number, targeted_indices):
        start = datetime.datetime.now()

        # get indices of filters data to use (filters selection from solution,
        # mapped back to global feature indices via `targeted_indices`)
        indices = []

        for index, value in enumerate(solution._data):
            if value == 1:
                indices.append(targeted_indices[index])

        print(f'Training sub-model SVM n°{index_number} with {len(indices)} of {len(solution._data)} available features')

        # keep only selected filters from solution
        x_train_filters = X_train[:, indices]
        x_test_filters = X_test[:, indices]

        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
        model = train_model(x_train_filters, y_train)

        y_test_model = model.predict(x_test_filters)
        y_test_predict = [1 if x > 0.5 else 0 for x in y_test_model]

        test_roc_auc = roc_auc_score(y_test, y_test_predict)

        end = datetime.datetime.now()
        diff = end - start

        # elapsed time is reported as a (minutes, seconds) pair
        print(f"Real sub-evaluation n°{index_number} took: {divmod(diff.days * 86400 + diff.seconds, 60)}, score found: {test_roc_auc}")

        return test_roc_auc
    # build all output folders and files based on `output` name
    backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
    surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
    surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)

    if not os.path.exists(backup_model_folder):
        os.makedirs(backup_model_folder)

    if not os.path.exists(cfg.output_surrogates_model_folder):
        os.makedirs(cfg.output_surrogates_model_folder)

    if not os.path.exists(cfg.output_surrogates_data_folder):
        os.makedirs(cfg.output_surrogates_data_folder)

    backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
    ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
    surrogate_backup_file_path = os.path.join(backup_model_folder, p_output + '_train.csv')
    surrogate_k_indices_backup_file_path = os.path.join(backup_model_folder, p_output + '_k_indices.csv')
    surrogate_population_backup_file_path = os.path.join(backup_model_folder, p_output + '_population.csv')
    # prepare the optimization algorithm (mutation operators only, as only an
    # ILS is used here and the local search needs only local permutations)
    operators = [SimpleBinaryMutation(), SimpleMutation()]
    #policy = UCBPolicy(operators)
    policy = RandomPolicy(operators)

    # custom surrogate start threshold based on problem size:
    # twice the number of features handled by each sub-model
    p_start = int(problem_size / p_k_division * 2)

    # fixed minimal number of real evaluations
    if p_start < 50:
        p_start = 50

    print(f'Start using surrogates after {p_start} real evaluations')
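    # worked example of the rule above: with problem_size = 200 and
    # k_division = 20, p_start = int(200 / 20 * 2) = 20, which is then raised
    # to the minimum of 50; with problem_size = 1000, p_start stays at 100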
    # custom ILS for surrogate use
    algo = ILSMultiSpecificSurrogate(initalizer=init,
                        evaluator=evaluate, # same evaluator by default, as we will use the surrogate function
                        sub_evaluator=sub_evaluate,
                        operators=operators,
                        policy=policy,
                        validator=validator,
                        output_log_surrogates=os.path.join(cfg.output_surrogates_data_folder, 'logs', p_output),
                        surrogates_file_path=surrogate_output_model,
                        start_train_surrogates=p_start, # start learning and using surrogates after `p_start` real evaluations
                        solutions_folder=surrogate_output_data,
                        ls_train_surrogates=p_every_ls, # retrain surrogates every `every_ls` iterations
                        k_division=p_k_division,
                        k_dynamic=p_k_dynamic,
                        k_random=p_k_random,
                        generate_only=p_generate_only,
                        maximise=True)
    algo.addCallback(BasicCheckpoint(every=1, filepath=backup_file_path))
    #algo.addCallback(UCBCheckpoint(every=1, filepath=ucb_backup_file_path))
    algo.addCallback(SurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_backup_file_path)) # try every LS like this
    algo.addCallback(MultiSurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_k_indices_backup_file_path)) # try every LS like this
    algo.addCallback(MultiSurrogateSpecificCheckpoint(every=p_ls_iteration, filepath=surrogate_population_backup_file_path)) # try every LS like this

    bestSol = algo.run(p_ils_iteration, p_ls_iteration)

    # print best solution found
    print("Found ", bestSol)
    # save model information into .csv file
    if not os.path.exists(cfg.results_information_folder):
        os.makedirs(cfg.results_information_folder)

    filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)

    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol._data) + ';' + str(list(bestSol._data).count(1)) + ';' + str(bestSol.fitness)
    with open(filename_path, 'a') as f:
        f.write(line_info + '\n')

    print('Result saved into %s' % filename_path)
if __name__ == "__main__":
    main()