# find_best_attributes_surrogate_openML.py
  1. # main imports
  2. import os
  3. import sys
  4. import argparse
  5. import pandas as pd
  6. import numpy as np
  7. import logging
  8. import datetime
  9. import random
  10. # model imports
  11. from sklearn.model_selection import train_test_split
  12. from sklearn.model_selection import GridSearchCV
  13. from sklearn.linear_model import LogisticRegression
  14. from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  15. from keras.layers import Dense, Dropout, LSTM, Embedding, GRU, BatchNormalization
  16. from keras.preprocessing.sequence import pad_sequences
  17. from keras.models import Sequential
  18. import joblib
  19. import sklearn
  20. import sklearn.svm as svm
  21. from sklearn.utils import shuffle
  22. from sklearn.metrics import roc_auc_score
  23. from sklearn.model_selection import cross_val_score
  24. from sklearn.preprocessing import MinMaxScaler
  25. # modules and config imports
  26. sys.path.insert(0, '') # trick to enable import of main folder module
  27. import custom_config as cfg
  28. import models as mdl
  29. from optimization.ILSSurrogate import ILSSurrogate
  30. from macop.solutions.BinarySolution import BinarySolution
  31. from macop.operators.mutators.SimpleMutation import SimpleMutation
  32. from macop.operators.mutators.SimpleBinaryMutation import SimpleBinaryMutation
  33. from macop.operators.crossovers.SimpleCrossover import SimpleCrossover
  34. from macop.operators.crossovers.RandomSplitCrossover import RandomSplitCrossover
  35. from macop.operators.policies.UCBPolicy import UCBPolicy
  36. from macop.callbacks.BasicCheckpoint import BasicCheckpoint
  37. from macop.callbacks.UCBCheckpoint import UCBCheckpoint
  38. from sklearn.ensemble import RandomForestClassifier
  39. # default validator
  40. def validator(solution):
  41. # at least 5 attributes
  42. if list(solution.data).count(1) < 5:
  43. return False
  44. return True
  45. def train_model(X_train, y_train):
  46. print ('Creating model...')
  47. # here use of SVM with grid search CV
  48. Cs = [0.001, 0.01, 0.1, 1, 10, 100]
  49. gammas = [0.001, 0.01, 0.1,10, 100]
  50. param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
  51. svc = svm.SVC(probability=True, class_weight='balanced')
  52. #clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, scoring=my_accuracy_scorer, n_jobs=-1)
  53. clf = GridSearchCV(svc, param_grid, cv=4, verbose=1, n_jobs=-1)
  54. clf.fit(X_train, y_train)
  55. model = clf.best_estimator_
  56. return model
  57. def loadDataset(filename):
  58. ########################
  59. # 1. Get and prepare data
  60. ########################
  61. dataset = pd.read_csv(filename, sep=',')
  62. # change label as common
  63. min_label_value = min(dataset.iloc[:, -1])
  64. max_label_value = max(dataset.iloc[:, -1])
  65. dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(min_label_value, 0)
  66. dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(max_label_value, 1)
  67. X_dataset = dataset.iloc[:, :-1]
  68. y_dataset = dataset.iloc[:, -1]
  69. problem_size = len(X_dataset.columns)
  70. # min/max normalisation over feature
  71. # create a scaler object
  72. scaler = MinMaxScaler()
  73. # fit and transform the data
  74. X_dataset = np.array(pd.DataFrame(scaler.fit_transform(X_dataset), columns=X_dataset.columns))
  75. # prepare train, validation and test datasets
  76. X_train, X_test, y_train, y_test = train_test_split(X_dataset, y_dataset, test_size=0.3, shuffle=True)
  77. return X_train, y_train, X_test, y_test, problem_size
def main():
    """Entry point: surrogate-assisted ILS search for the best attribute subset.

    Parses CLI arguments, loads the dataset, then runs ILSSurrogate where each
    *real* evaluation trains an SVM on the selected attributes and scores it
    with ROC AUC on the held-out test split.  Checkpoints, surrogate models and
    the final result line are written under the folders declared in cfg.
    """

    parser = argparse.ArgumentParser(description="Train and find best filters to use for model")

    parser.add_argument('--data', type=str, help='open ml dataset filename prefix', required=True)
    parser.add_argument('--every_ls', type=int, help='train every ls surrogate model', default=50) # default value
    parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
    parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
    parser.add_argument('--output', type=str, help='output surrogate model name')

    args = parser.parse_args()

    p_data_file = args.data
    p_every_ls = args.every_ls
    p_ils_iteration = args.ils
    p_ls_iteration = args.ls
    p_output = args.output

    # load data from file and get problem size (number of feature columns)
    X_train, y_train, X_test, y_test, problem_size = loadDataset(p_data_file)

    # create `logs` folder if necessary
    if not os.path.exists(cfg.output_logs_folder):
        os.makedirs(cfg.output_logs_folder)

    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(p_output), level=logging.DEBUG)

    # init solution (`n` attributes): random binary solution accepted by `validator`
    def init():
        return BinarySolution([], problem_size).random(validator)

    # define evaluate function here (need of data information)
    def evaluate(solution):
        # real (non-surrogate) fitness: ROC AUC of an SVM trained on the
        # attributes whose bit is set to 1 in `solution`
        start = datetime.datetime.now()

        # get indices of filters data to use (filters selection from solution)
        indices = []
        for index, value in enumerate(solution.data):
            if value == 1:
                indices.append(index)

        print(f'Training SVM with {len(indices)} from {len(solution.data)} available features')

        # keep only selected filters from solution
        x_train_filters = X_train[:, indices]
        x_test_filters = X_test[ :, indices]

        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
        model = train_model(x_train_filters, y_train)

        y_test_model = model.predict(x_test_filters)
        # threshold predictions at 0.5 to obtain binary labels for the AUC score
        y_test_predict = [ 1 if x > 0.5 else 0 for x in y_test_model ]
        test_roc_auc = roc_auc_score(y_test, y_test_predict)

        end = datetime.datetime.now()
        diff = end - start
        print("Real evaluation took: {}, score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))

        return test_roc_auc

    # build all output folder and files based on `output` name
    backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
    surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
    surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)

    if not os.path.exists(backup_model_folder):
        os.makedirs(backup_model_folder)

    if not os.path.exists(cfg.output_surrogates_model_folder):
        os.makedirs(cfg.output_surrogates_model_folder)

    if not os.path.exists(cfg.output_surrogates_data_folder):
        os.makedirs(cfg.output_surrogates_data_folder)

    backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
    ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')

    # prepare optimization algorithm (only use of mutation as only ILS are used here, and local search need only local permutation)
    operators = [SimpleBinaryMutation(), SimpleMutation()]
    policy = UCBPolicy(operators)

    # define first line (CSV header) of the surrogate data file if necessary
    if not os.path.exists(surrogate_output_data):
        folder, _ = os.path.split(surrogate_output_data)

        if not os.path.exists(folder):
            os.makedirs(folder)

        with open(surrogate_output_data, 'w') as f:
            f.write('x;y\n')

    # custom start surrogate variable based on problem size:
    # use real evaluations for the first half of `problem_size` iterations
    p_start = int(0.5 * problem_size)
    print(f'Starting using surrogate after {p_start} reals training')

    # custom ILS for surrogate use
    algo = ILSSurrogate(_initalizer=init,
                        _evaluator=evaluate, # same evaluator by default, as we will use the surrogate function
                        _operators=operators,
                        _policy=policy,
                        _validator=validator,
                        _surrogate_file_path=surrogate_output_model,
                        _start_train_surrogate=p_start, # start learning and using surrogate after `p_start` real evaluations
                        _solutions_file=surrogate_output_data,
                        _ls_train_surrogate=p_every_ls, # retrain surrogate every `every_ls` iterations
                        _maximise=True)

    algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))
    algo.addCallback(UCBCheckpoint(_every=1, _filepath=ucb_backup_file_path))

    bestSol = algo.run(p_ils_iteration, p_ls_iteration)

    # print best solution found
    print("Found ", bestSol)

    # save model information into .csv file
    if not os.path.exists(cfg.results_information_folder):
        os.makedirs(cfg.results_information_folder)

    filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)

    # one result line: data file; ILS iters; LS iters; solution bits; #selected; fitness
    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(bestSol.fitness())
    with open(filename_path, 'a') as f:
        f.write(line_info + '\n')

    print('Result saved into %s' % filename_path)
  170. if __name__ == "__main__":
  171. main()