ILSMultiSurrogate.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. """Iterated Local Search Algorithm implementation using multiple-surrogate (weighted sum surrogate) as fitness approximation
  2. """
  3. # main imports
  4. import os
  5. import logging
  6. import joblib
  7. import time
  8. import math
  9. import numpy as np
  10. import pandas as pd
  11. import random
  12. # parallel imports
  13. from joblib import Parallel, delayed
  14. import multiprocessing
  15. # module imports
  16. from macop.algorithms.base import Algorithm
  17. from .LSSurrogate import LocalSearchSurrogate
  18. from .utils.SurrogateAnalysis import SurrogateAnalysis
  19. from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
  20. LassoCV, ElasticNet)
  21. from wsao.sao.problems.nd3dproblem import ND3DProblem
  22. from wsao.sao.surrogates.walsh import WalshSurrogate
  23. from wsao.sao.algos.fitter import FitterAlgo
  24. from wsao.sao.utils.analysis import SamplerAnalysis, FitterAnalysis, OptimizerAnalysis
  25. class ILSMultiSurrogate(Algorithm):
  26. """Iterated Local Search used to avoid local optima and increave EvE (Exploration vs Exploitation) compromise using multiple-surrogate
  27. Attributes:
  28. initalizer: {function} -- basic function strategy to initialize solution
  29. evaluator: {function} -- basic function in order to obtained fitness (mono or multiple objectives)
  30. operators: {[Operator]} -- list of operator to use when launching algorithm
  31. policy: {Policy} -- Policy class implementation strategy to select operators
  32. validator: {function} -- basic function to check if solution is valid or not under some constraints
  33. maximise: {bool} -- specify kind of optimization problem
  34. currentSolution: {Solution} -- current solution managed for current evaluation
  35. bestSolution: {Solution} -- best solution found so far during running algorithm
  36. ls_iteration: {int} -- number of evaluation for each local search algorithm
  37. surrogates_file_path: {str} -- Surrogates model folder to load (models trained using https://gitlab.com/florianlprt/wsao)
  38. output_log_surrogates: {str} -- Log folder for surrogates training model
  39. start_train_surrogates: {int} -- number of evaluation expected before start training and use surrogate
  40. surrogates: [{Surrogate}] -- Surrogates model instance loaded
  41. ls_train_surrogates: {int} -- Specify if we need to retrain our surrogate model (every Local Search)
  42. k_division: {int} -- number of expected division for current features problem
  43. k_dynamic: {bool} -- specify if indices are changed for each time we train a new surrogate model
  44. k_random: {bool} -- random initialization of k_indices for each surrogate features model data
  45. generate_only: {bool} -- generate only a specific number of expected real solutions evaluated
  46. solutions_file: {str} -- Path where real evaluated solutions are saved in order to train surrogate again
  47. callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
  48. """
  49. def __init__(self,
  50. initalizer,
  51. evaluator,
  52. operators,
  53. policy,
  54. validator,
  55. surrogates_file_path,
  56. output_log_surrogates,
  57. start_train_surrogates,
  58. ls_train_surrogates,
  59. k_division,
  60. solutions_file,
  61. k_random=True,
  62. k_dynamic=False,
  63. generate_only=False,
  64. maximise=True,
  65. parent=None):
  66. # set real evaluator as default
  67. super().__init__(initalizer, evaluator, operators, policy,
  68. validator, maximise, parent)
  69. self._n_local_search = 0
  70. self._total_n_local_search = 0
  71. self._main_evaluator = evaluator
  72. self._surrogates_file_path = surrogates_file_path
  73. self._start_train_surrogates = start_train_surrogates
  74. self._output_log_surrogates = output_log_surrogates
  75. self._surrogate_evaluator = None
  76. self._surrogate_analyser = None
  77. self._ls_train_surrogates = ls_train_surrogates
  78. self._solutions_file = solutions_file
  79. self._k_division = k_division
  80. self._k_dynamic = k_dynamic
  81. self._k_random = k_random
  82. self._k_indices = None
  83. self._surrogates = None
  84. self._generate_only = generate_only
  85. def init_k_split_indices(self):
  86. """Initialize k_indices for the new training of surrogate
  87. Returns:
  88. k_indices: [description]
  89. """
  90. a = list(range(self._bestSolution._size))
  91. n_elements = int(math.ceil(self._bestSolution._size / self._k_division)) # use of ceil to avoid loss of data
  92. if self._k_random:
  93. random.shuffle(a) # random subset
  94. splitted_indices = [a[x:x+n_elements] for x in range(0, len(a), n_elements)]
  95. return splitted_indices
  96. def train_surrogate(self, index, learn, test, indices):
  97. current_learn = learn.copy()
  98. current_learn.x = current_learn.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
  99. current_test = test.copy()
  100. current_test.x = current_test.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
  101. problem = ND3DProblem(size=len(indices)) # problem size based on best solution size (need to improve...)
  102. model = Lasso(alpha=1e-5)
  103. surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
  104. analysis = FitterAnalysis(logfile=os.path.join(self._output_log_surrogates, f"train_surrogate_{index}.log"), problem=problem)
  105. algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
  106. print(f"Start fitting again the surrogate model n°{index}")
  107. for r in range(10):
  108. print(f"Iteration n°{r}: for fitting surrogate n°{index}")
  109. algo.run_samples(learn=current_learn, test=current_test, step=10)
  110. # keep well ordered surrogate into file manager
  111. str_index = str(index)
  112. while len(str_index) < 6:
  113. str_index = "0" + str_index
  114. joblib.dump(algo, os.path.join(self._surrogates_file_path, f'surrogate_{str_index}'))
  115. return str_index
  116. def train_surrogates(self):
  117. """Retrain if necessary the whole surrogate fitness approximation function
  118. """
  119. # Following https://gitlab.com/florianlprt/wsao, we re-train the model
  120. # ---------------------------------------------------------------------------
  121. # cli_restart.py problem=nd3d,size=30,filename="data/statistics_extended_svdn" \
  122. # model=lasso,alpha=1e-5 \
  123. # surrogate=walsh,order=3 \
  124. # algo=fitter,algo_restarts=10,samplefile=stats_extended.csv \
  125. # sample=1000,step=10 \
  126. # analysis=fitter,logfile=out_fit.csv
  127. # 1. Data sets preparation (train and test)
  128. # dynamic number of samples based on dataset real evaluations
  129. nsamples = None
  130. with open(self._solutions_file, 'r') as f:
  131. nsamples = len(f.readlines()) - 1 # avoid header
  132. training_samples = int(0.7 * nsamples) # 70% used for learning part at each iteration
  133. df = pd.read_csv(self._solutions_file, sep=';')
  134. # learning set and test set
  135. learn = df.sample(training_samples)
  136. test = df.drop(learn.index)
  137. print(f'Training all surrogate models using {training_samples} of {nsamples} samples for train dataset')
  138. # 2. for each sub space indices, learn new surrogate
  139. if not os.path.exists(self._surrogates_file_path):
  140. os.makedirs(self._surrogates_file_path)
  141. num_cores = multiprocessing.cpu_count()
  142. if not os.path.exists(self._output_log_surrogates):
  143. os.makedirs(self._output_log_surrogates)
  144. Parallel(n_jobs=num_cores)(delayed(self.train_surrogate)(index, learn, test, indices) for index, indices in enumerate(self._k_indices))
  145. def load_surrogates(self):
  146. """Load algorithm with surrogate model and create lambda evaluator function
  147. """
  148. # need to first train surrogate if not exist
  149. if not os.path.exists(self._surrogates_file_path):
  150. self.train_surrogates()
  151. self._surrogates = []
  152. surrogates_path = sorted(os.listdir(self._surrogates_file_path))
  153. for surrogate_p in surrogates_path:
  154. model_path = os.path.join(self._surrogates_file_path, surrogate_p)
  155. surrogate_model = joblib.load(model_path)
  156. self._surrogates.append(surrogate_model)
  157. def surrogate_evaluator(self, solution):
  158. """Compute mean of each surrogate model using targeted indices
  159. Args:
  160. solution: {Solution} -- current solution to evaluate using multi-surrogate evaluation
  161. Return:
  162. mean: {float} -- mean score of surrogate models
  163. """
  164. scores = []
  165. solution_data = np.array(solution._data)
  166. # for each indices set, get trained surrogate model and made prediction score
  167. for i, indices in enumerate(self._k_indices):
  168. current_data = solution_data[indices]
  169. current_score = self._surrogates[i].surrogate.predict([current_data])[0]
  170. scores.append(current_score)
  171. return sum(scores) / len(scores)
  172. def surrogates_coefficient_of_determination(self):
  173. """Compute r² for each sub surrogate model
  174. Return:
  175. r_squared_scores: [{float}] -- mean score of r_squred obtained from surrogate models
  176. """
  177. # for each indices set, get r^2 surrogate model and made prediction score
  178. num_cores = multiprocessing.cpu_count()
  179. r_squared_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.coefficient_of_determination)(s_model.surrogate) for s_model in self._surrogates)
  180. # for i, _ in enumerate(self._k_indices):
  181. # r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
  182. # r_squared_scores.append(r_squared)
  183. #print(r_squared_scores)
  184. return r_squared_scores
  185. def surrogates_mae(self):
  186. """Compute mae for each sub surrogate model
  187. Return:
  188. mae_scores: [{float}] -- mae scores from model
  189. """
  190. # for each indices set, get r^2 surrogate model and made prediction score
  191. num_cores = multiprocessing.cpu_count()
  192. mae_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.mae)(s_model.surrogate) for s_model in self._surrogates)
  193. # for i, _ in enumerate(self._k_indices):
  194. # r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
  195. # r_squared_scores.append(r_squared)
  196. #print(mae_scores)
  197. return mae_scores
  198. def add_to_surrogate(self, solution):
  199. # save real evaluated solution into specific file for surrogate
  200. with open(self._solutions_file, 'a') as f:
  201. line = ""
  202. for index, e in enumerate(solution._data):
  203. line += str(e)
  204. if index < len(solution._data) - 1:
  205. line += ","
  206. line += ";"
  207. line += str(solution._score)
  208. f.write(line + "\n")
  209. def run(self, evaluations, ls_evaluations=100):
  210. """
  211. Run the iterated local search algorithm using local search (EvE compromise)
  212. Args:
  213. evaluations: {int} -- number of global evaluations for ILS
  214. ls_evaluations: {int} -- number of Local search evaluations (default: 100)
  215. Returns:
  216. {Solution} -- best solution found
  217. """
  218. # by default use of mother method to initialize variables
  219. super().run(evaluations)
  220. # initialize current solution
  221. self.initRun()
  222. # based on best solution found, initialize k pool indices
  223. if self._k_indices == None:
  224. self._k_indices = self.init_k_split_indices()
  225. # enable resuming for ILS
  226. self.resume()
  227. # count number of surrogate obtained and restart using real evaluations done
  228. nsamples = None
  229. with open(self._solutions_file, 'r') as f:
  230. nsamples = len(f.readlines()) - 1 # avoid header
  231. if self.getGlobalEvaluation() < nsamples:
  232. print(f'Restart using {nsamples} of {self._start_train_surrogates} real evaluations obtained')
  233. self._numberOfEvaluations = nsamples
  234. if self._start_train_surrogates > self.getGlobalEvaluation():
  235. # get `self.start_train_surrogate` number of real evaluations and save it into surrogate dataset file
  236. # using randomly generated solutions (in order to cover seearch space)
  237. while self._start_train_surrogates > self.getGlobalEvaluation():
  238. print(f'Real solutions extraction {self.getGlobalEvaluation()} of {self._start_train_surrogates}')
  239. newSolution = self._initializer()
  240. # evaluate new solution
  241. newSolution.evaluate(self._evaluator)
  242. # add it to surrogate pool
  243. self.add_to_surrogate(newSolution)
  244. self.increaseEvaluation()
  245. # stop this process after generating solution
  246. if self._generate_only:
  247. return self._bestSolution
  248. # train surrogate on real evaluated solutions file
  249. self.train_surrogates()
  250. self.load_surrogates()
  251. # local search algorithm implementation
  252. while not self.stop():
  253. # set current evaluator based on used or not of surrogate function
  254. self._evaluator = self.surrogate_evaluator if self._start_train_surrogates <= self.getGlobalEvaluation() else self._main_evaluator
  255. # create new local search instance
  256. # passing global evaluation param from ILS
  257. ls = LocalSearchSurrogate(self._initializer,
  258. self._evaluator,
  259. self._operators,
  260. self._policy,
  261. self._validator,
  262. self._maximise,
  263. parent=self)
  264. # add same callbacks
  265. for callback in self._callbacks:
  266. ls.addCallback(callback)
  267. # create and search solution from local search
  268. newSolution = ls.run(ls_evaluations)
  269. # if better solution than currently, replace it (solution saved in training pool, only if surrogate process is in a second process step)
  270. # Update : always add new solution into surrogate pool, not only if solution is better
  271. #if self.isBetter(newSolution) and self.start_train_surrogate < self.getGlobalEvaluation():
  272. if self._start_train_surrogates <= self.getGlobalEvaluation():
  273. # if better solution found from local search, retrained the found solution and test again
  274. # without use of surrogate
  275. fitness_score = self._main_evaluator(newSolution)
  276. # self.increaseEvaluation() # dot not add evaluation
  277. newSolution._score = fitness_score
  278. # if solution is really better after real evaluation, then we replace
  279. if self.isBetter(newSolution):
  280. self._bestSolution = newSolution
  281. self.add_to_surrogate(newSolution)
  282. self.progress()
  283. # check using specific dynamic criteria based on r^2
  284. r_squared_scores = self.surrogates_coefficient_of_determination()
  285. r_squared = sum(r_squared_scores) / len(r_squared_scores)
  286. mae_scores = self.surrogates_mae()
  287. mae_score = sum(mae_scores) / len(mae_scores)
  288. r_squared_value = 0 if r_squared < 0 else r_squared
  289. training_surrogate_every = int(r_squared_value * self._ls_train_surrogates) # use of absolute value for r²
  290. # avoid issue when lauching every each local search
  291. if training_surrogate_every <= 0:
  292. training_surrogate_every = 1
  293. print(f"=> R² of surrogate is of {r_squared} | MAE is of {mae_score} -- [Retraining model after {self._n_local_search % training_surrogate_every} of {training_surrogate_every} LS]")
  294. # check if necessary or not to train again surrogate
  295. if self._n_local_search % training_surrogate_every == 0 and self._start_train_surrogates <= self.getGlobalEvaluation():
  296. # reinitialization of k_indices for the new training
  297. if self._k_dynamic:
  298. print(f"Reinitialization of k_indices using `k={self._k_division} `for the new training")
  299. self.init_k_split_indices()
  300. # train again surrogate on real evaluated solutions file
  301. start_training = time.time()
  302. self.train_surrogates()
  303. training_time = time.time() - start_training
  304. self._surrogate_analyser = SurrogateAnalysis(training_time, training_surrogate_every, r_squared_scores, r_squared, mae_scores, mae_score, self.getGlobalMaxEvaluation(), self._total_n_local_search)
  305. # reload new surrogate function
  306. self.load_surrogates()
  307. # reinitialize number of local search
  308. self._n_local_search = 0
  309. # increase number of local search done
  310. self._n_local_search += 1
  311. self._total_n_local_search += 1
  312. self.information()
  313. logging.info(f"End of {type(self).__name__}, best solution found {self._bestSolution}")
  314. self.end()
  315. return self._bestSolution
  316. def addCallback(self, callback):
  317. """Add new callback to algorithm specifying usefull parameters
  318. Args:
  319. callback: {Callback} -- specific Callback instance
  320. """
  321. # specify current main algorithm reference
  322. if self.getParent() is not None:
  323. callback.setAlgo(self.getParent())
  324. else:
  325. callback.setAlgo(self)
  326. # set as new
  327. self._callbacks.append(callback)