find_best_attributes_surrogate_dl.py

# main imports
import os
import sys
import argparse
import pandas as pd
import numpy as np
import logging
import datetime
import random

# model imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from keras.layers import Dense, Dropout, LSTM, Embedding, GRU, BatchNormalization
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential

import joblib
import sklearn
import sklearn.svm as svm
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# modules and config imports
sys.path.insert(0, '') # trick to enable import of main folder module

import custom_config as cfg
import models as mdl

from optimization.ILSSurrogate import ILSSurrogate
from macop.solutions.BinarySolution import BinarySolution

from macop.operators.mutators.SimpleMutation import SimpleMutation
from macop.operators.mutators.SimpleBinaryMutation import SimpleBinaryMutation
from macop.operators.crossovers.SimpleCrossover import SimpleCrossover
from macop.operators.crossovers.RandomSplitCrossover import RandomSplitCrossover

from macop.operators.policies.UCBPolicy import UCBPolicy

from macop.callbacks.BasicCheckpoint import BasicCheckpoint
from macop.callbacks.UCBCheckpoint import UCBCheckpoint
# SurrogateCheckpoint is used below but was missing from the original imports;
# the project-local module path is assumed here
from optimization.callbacks.SurrogateCheckpoint import SurrogateCheckpoint

# variables and parameters
models_list = cfg.models_names_list
def build_input(df):
    """Convert dataframe to numpy array input with timesteps as float array

    Arguments:
        df: {pd.DataFrame} -- dataframe input

    Returns:
        {np.ndarray} -- LSTM input data as numpy array
    """

    arr = df.to_numpy()

    final_arr = []
    for v in arr:
        v_data = []
        for vv in v:
            #scaled_vv = np.array(vv, 'float') - np.mean(np.array(vv, 'float'))
            #v_data.append(scaled_vv)
            v_data.append(vv)

        final_arr.append(v_data)

    final_arr = np.array(final_arr, 'float32')

    return final_arr
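# Illustrative note (shapes inferred from the usage below, example values hypothetical):
# each dataframe cell is expected to already hold the list of attribute values of one
# timestep, so build_input returns a float32 array of shape
# (n_samples, n_timesteps, n_attributes), e.g.:
#   build_input(pd.DataFrame([[['0.1', '0.2'], ['0.3', '0.4']]])).shape == (1, 2, 2)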
# default validator
def validator(solution):

    # at least 5 attributes
    if list(solution._data).count(1) < 5:
        return False

    return True


def create_model(input_shape):

    print('Creating model...')
    model = Sequential()
    #model.add(Embedding(input_dim=1000, output_dim=50, input_length=input_length))
    model.add(LSTM(input_shape=input_shape, units=512, activation='tanh', recurrent_activation='sigmoid', dropout=0.4, return_sequences=True))
    model.add(LSTM(units=128, activation='tanh', recurrent_activation='sigmoid', dropout=0.4, return_sequences=True))
    model.add(LSTM(units=32, activation='tanh', dropout=0.4, recurrent_activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))

    print('Compiling...')
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  #metrics=['accuracy', tf.keras.metrics.AUC()])
                  metrics=['accuracy'])

    return model
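# Usage sketch (shapes are illustrative only): with 24 timesteps and 10 selected
# attributes, create_model((24, 10)) stacks three LSTM layers (512 -> 128 -> 32 units)
# followed by a single sigmoid unit for the binary noisy / not-noisy decision.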
def loadDataset(filename):

    # TODO : load data using DL RNN

    ########################
    # 1. Get and prepare data
    ########################
    dataset_train = pd.read_csv(filename + '.train', header=None, sep=';')
    dataset_test = pd.read_csv(filename + '.test', header=None, sep=';')

    # compute class weights over the whole dataset
    # each line is composed of :: [scene_name; zone_id; image_index_end; label; data]
    noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 0]
    nb_noisy_train = len(noisy_df_train.index)
    nb_not_noisy_train = len(not_noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 0]
    nb_noisy_test = len(noisy_df_test.index)
    nb_not_noisy_test = len(not_noisy_df_test.index)

    noisy_samples = nb_noisy_test + nb_noisy_train
    not_noisy_samples = nb_not_noisy_test + nb_not_noisy_train
    total_samples = noisy_samples + not_noisy_samples

    print('noisy', noisy_samples)
    print('not_noisy', not_noisy_samples)
    print('total', total_samples)

    class_weight = {
        0: noisy_samples / float(total_samples),
        1: not_noisy_samples / float(total_samples),
    }

    # shuffle data
    final_df_train = sklearn.utils.shuffle(dataset_train)
    final_df_test = sklearn.utils.shuffle(dataset_test)

    # split dataset into X_train, y_train, X_test, y_test
    X_train_all = final_df_train.loc[:, 4:].apply(lambda x: x.astype(str).str.split(' '))
    X_train_all = build_input(X_train_all)
    y_train_all = final_df_train.loc[:, 3].astype('int')

    X_test = final_df_test.loc[:, 4:].apply(lambda x: x.astype(str).str.split(' '))
    X_test = build_input(X_test)
    y_test = final_df_test.loc[:, 3].astype('int')

    input_shape = (X_train_all.shape[1], X_train_all.shape[2])
    print('Training data input shape', input_shape)

    # prepare train and validation datasets
    X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.3, shuffle=False)

    return X_train, X_val, y_train, y_val, X_test, y_test, class_weight
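# Expected .train/.test line layout (illustrative line, values are hypothetical):
#   scene_A;3;850;1;0.12 0.34 0.56;0.11 0.30 0.52;...
# column 3 holds the binary label and each column from 4 onwards holds the
# space-separated attribute values of one timestep.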
def main():

    parser = argparse.ArgumentParser(description="Train and find the best filters to use for the model")

    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
    parser.add_argument('--start_surrogate', type=int, help='number of evaluations before starting the surrogate model', default=100)
    parser.add_argument('--length', type=int, help='max data length (needs to be specified for the evaluator)', required=True)
    parser.add_argument('--ils', type=int, help='total number of iterations for the ILS algorithm', required=True)
    parser.add_argument('--ls', type=int, help='number of iterations for the Local Search algorithm', required=True)
    parser.add_argument('--every_ls', type=int, help='max number of iterations before retraining the surrogate model', required=True)
    parser.add_argument('--output', type=str, help='output surrogate model name')

    args = parser.parse_args()
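    # Example invocation (paths and values are illustrative only):
    #   python find_best_attributes_surrogate_dl.py --data data/datasets/my_dataset \
    #       --length 32 --ils 5000 --ls 100 --every_ls 50 --output surrogate_dl_run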
    p_data_file = args.data
    p_length = args.length
    p_start = args.start_surrogate
    p_ils_iteration = args.ils
    p_ls_iteration = args.ls
    p_every_ls = args.every_ls
    p_output = args.output

    print(p_data_file)

    # load data from file
    X_train, X_val, y_train, y_val, X_test, y_test, class_weight = loadDataset(p_data_file)

    # create `logs` folder if necessary
    if not os.path.exists(cfg.output_logs_folder):
        os.makedirs(cfg.output_logs_folder)
    logging.basicConfig(format='%(asctime)s %(message)s', filename=os.path.join(cfg.output_logs_folder, '{0}.log'.format(p_output)), level=logging.DEBUG)
    # init solution (`n` attributes)
    def init():
        return BinarySolution([], p_length).random(validator)

    # define evaluate function here (it needs access to the loaded data)
    def evaluate(solution):

        start = datetime.datetime.now()

        # get indices of the attributes to use (filter selection from solution)
        indices = []

        for index, value in enumerate(solution._data):
            if value == 1:
                indices.append(index)

        # keep only the selected attributes from the solution
        x_train_filters = X_train[:, :, indices]
        x_val_filters = X_val[:, :, indices]
        x_test_filters = X_test[:, :, indices]

        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
        # model = RandomForestClassifier(n_estimators=10)

        input_shape = (x_train_filters.shape[1], x_train_filters.shape[2])
        print('Training data input shape', input_shape)
        model = create_model(input_shape)
        model.summary()

        # model = model.fit(x_train_filters, y_train_filters)
        print("Fitting model with custom class_weight", class_weight)
        history = model.fit(x_train_filters, y_train, batch_size=128, epochs=30, validation_data=(x_val_filters, y_val), verbose=1, shuffle=True, class_weight=class_weight)

        y_test_model = model.predict(x_test_filters)
        y_test_predict = [ 1 if x > 0.5 else 0 for x in y_test_model ]
        test_roc_auc = roc_auc_score(y_test, y_test_predict)

        end = datetime.datetime.now()

        del model

        diff = end - start

        print("Real evaluation took: {}, score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))

        return test_roc_auc
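    # Note: each real call to evaluate() trains a fresh LSTM for 30 epochs and uses
    # the ROC AUC of its 0.5-thresholded test predictions as fitness; this cost is
    # why the surrogate model (trained once `p_start` real evaluations have been
    # collected, see below) is meant to stand in for most evaluations afterwards.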
    # build all output folders and files based on the `output` name
    backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
    surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
    surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)

    if not os.path.exists(backup_model_folder):
        os.makedirs(backup_model_folder)

    if not os.path.exists(cfg.output_surrogates_model_folder):
        os.makedirs(cfg.output_surrogates_model_folder)

    if not os.path.exists(cfg.output_surrogates_data_folder):
        os.makedirs(cfg.output_surrogates_data_folder)

    backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
    ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
    surrogate_backup_file_path = os.path.join(cfg.output_surrogates_data_folder, p_output + '_train.csv')

    # prepare optimization algorithm (only mutation operators are used, since only ILS runs here and its local search needs only local permutations)
    operators = [SimpleBinaryMutation(), SimpleMutation()]
    policy = UCBPolicy(operators)

    # write the header line if the solutions file does not exist yet
    if not os.path.exists(surrogate_output_data):
        folder, _ = os.path.split(surrogate_output_data)

        if not os.path.exists(folder):
            os.makedirs(folder)

        with open(surrogate_output_data, 'w') as f:
            f.write('x;y\n')

    # custom ILS for surrogate use
    algo = ILSSurrogate(initalizer=init,
                        evaluator=evaluate, # same evaluator by default, as we will use the surrogate function
                        operators=operators,
                        policy=policy,
                        validator=validator,
                        surrogate_file_path=surrogate_output_model,
                        start_train_surrogate=p_start, # start learning and using the surrogate after `p_start` real evaluations
                        solutions_file=surrogate_output_data,
                        ls_train_surrogate=p_every_ls,
                        maximise=True)

    algo.addCallback(BasicCheckpoint(every=1, filepath=backup_file_path))
    algo.addCallback(UCBCheckpoint(every=1, filepath=ucb_backup_file_path))
    algo.addCallback(SurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_backup_file_path)) # try every LS like this

    bestSol = algo.run(p_ils_iteration, p_ls_iteration)
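    # run() is expected to alternate ILS iterations with surrogate-assisted local
    # search once enough real evaluations have been gathered (see ILSSurrogate);
    # bestSol holds the best attribute-selection solution found.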
    # print best solution found
    print("Found ", bestSol)

    # save model information into .csv file
    if not os.path.exists(cfg.results_information_folder):
        os.makedirs(cfg.results_information_folder)

    filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)

    # count the number of filters actually used: attributes come in pairs, and a
    # filter is counted if at least one attribute of its pair is selected
    filters_counter = 0

    for index, item in enumerate(bestSol._data):
        if index != 0 and index % 2 == 1:
            # at least one of the two attributes of this pair is used
            if item == 1 or bestSol._data[index - 1] == 1:
                filters_counter += 1

    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol._data) + ';' + str(list(bestSol._data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness)
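    # Illustrative appended line (values are hypothetical):
    #   data/my_dataset;5000;100;[1, 0, 1, ...];14;9;0.84
    # i.e. dataset;ils iterations;ls iterations;solution;nb selected attributes;nb filters;fitness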
    with open(filename_path, 'a') as f:
        f.write(line_info + '\n')

    print('Result saved into %s' % filename_path)

if __name__ == "__main__":
    main()