  1. """Iterated Local Search Algorithm implementation using multiple-surrogate (weighted sum surrogate) as fitness approximation
  2. """
  3. # main imports
  4. import os
  5. import logging
  6. import joblib
  7. import time
  8. import math
  9. import numpy as np
  10. import pandas as pd
  11. import random
  12. # parallel imports
  13. from joblib import Parallel, delayed
  14. import multiprocessing
  15. # module imports
  16. from macop.algorithms.Algorithm import Algorithm
  17. from macop.solutions.BinarySolution import BinarySolution
  18. from .LSSurrogate import LocalSearchSurrogate
  19. from .utils.SurrogateAnalysis import SurrogateAnalysis
  20. from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
  21. LassoCV, ElasticNet)
  22. from wsao.sao.problems.nd3dproblem import ND3DProblem
  23. from wsao.sao.surrogates.walsh import WalshSurrogate
  24. from wsao.sao.algos.fitter import FitterAlgo
  25. from wsao.sao.utils.analysis import SamplerAnalysis, FitterAnalysis, OptimizerAnalysis


class ILSMultiSpecificSurrogate(Algorithm):
    """Iterated Local Search used to avoid local optima and improve the EvE (Exploration vs Exploitation) compromise, using a multiple-surrogate approach where each sub-surrogate learns from its own specific dataset

    Attributes:
        initalizer: {function} -- basic function strategy to initialize solution
        evaluator: {function} -- basic function in order to obtain fitness (mono or multiple objectives)
        sub_evaluator: {function} -- sub evaluator function in order to obtain fitness for sub-model
        operators: {[Operator]} -- list of operators to use when launching algorithm
        policy: {Policy} -- Policy class implementation strategy to select operators
        validator: {function} -- basic function to check if solution is valid or not under some constraints
        maximise: {bool} -- specify kind of optimization problem
        currentSolution: {Solution} -- current solution managed for current evaluation
        bestSolution: {Solution} -- best solution found so far during running algorithm
        ls_iteration: {int} -- number of evaluations for each local search algorithm
        surrogates_file_path: {str} -- folder from which to load the surrogate models (models trained using https://gitlab.com/florianlprt/wsao)
        output_log_surrogates: {str} -- log folder for surrogate training models
        start_train_surrogates: {int} -- number of evaluations expected before starting to train and use the surrogates
        surrogates: [{Surrogate}] -- loaded surrogate model instances
        ls_train_surrogates: {int} -- specify if we need to retrain our surrogate models (every Local Search)
        k_division: {int} -- number of expected divisions for current features problem
        k_dynamic: {bool} -- specify if indices are changed each time we train a new surrogate model
        k_random: {bool} -- random initialization of k_indices for each surrogate features model data
        generate_only: {bool} -- generate only a specific number of expected real evaluated solutions
        solutions_folder: {str} -- path where real evaluated solutions on each subset are saved
        callbacks: {[Callback]} -- list of Callback class implementations to run some instructions every number of evaluations and `load` when initializing algorithm
    """
    def __init__(self,
                 initalizer,
                 evaluator,
                 sub_evaluator,
                 operators,
                 policy,
                 validator,
                 surrogates_file_path,
                 output_log_surrogates,
                 start_train_surrogates,
                 ls_train_surrogates,
                 k_division,
                 solutions_folder,
                 k_random=True,
                 k_dynamic=False,
                 generate_only=False,
                 maximise=True,
                 parent=None):

        # set real evaluator as default
        super().__init__(initalizer, evaluator, operators, policy,
                         validator, maximise, parent)

        self._n_local_search = 0
        self._total_n_local_search = 0

        self._main_evaluator = evaluator
        self._sub_evaluator = sub_evaluator

        self._surrogates_file_path = surrogates_file_path
        self._start_train_surrogates = start_train_surrogates
        self._output_log_surrogates = output_log_surrogates

        self._surrogate_evaluator = None
        self._surrogate_analyser = None

        self._ls_train_surrogates = ls_train_surrogates

        self._k_division = k_division
        self._k_dynamic = k_dynamic
        self._k_random = k_random
        self._k_indices = None
        self._surrogates = None
        self._population = None

        self._generate_only = generate_only
        self._solutions_folder = solutions_folder
    def init_solutions_files(self):
        self._solutions_files = []

        if not os.path.exists(self._solutions_folder):
            os.makedirs(self._solutions_folder)

        # for each sub surrogate, associate its own solutions file
        for i in range(len(self._k_indices)):
            index_str = str(i).zfill(3)  # zero-pad so files stay lexicographically ordered

            solutions_path = os.path.join(self._solutions_folder, f'surrogate_data_{index_str}')

            # initialize solutions file if it does not exist
            if not os.path.exists(solutions_path):
                with open(solutions_path, 'w') as f:
                    f.write('x;y\n')

            self._solutions_files.append(solutions_path)
    def define_sub_evaluators(self):
        self._sub_evaluators = []

        for i in range(len(self._k_indices)):
            # need to pass the indices as default arguments (avoids Python's late-binding closure issue)
            current_evaluator = lambda s, number=i, indices=self._k_indices[i]: self._sub_evaluator(s, number, indices)
            self._sub_evaluators.append(current_evaluator)
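
    # Illustrative note (not part of the original flow): without the default
    # arguments above, every lambda would capture the *variable* `i`, not its
    # value, so all sub-evaluators would end up using the last subset:
    #
    #     fns = [lambda: i for i in range(3)]
    #     [f() for f in fns]            # -> [2, 2, 2]
    #     fns = [lambda i=i: i for i in range(3)]
    #     [f() for f in fns]            # -> [0, 1, 2]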
    def init_population(self):

        self._population = []

        # initialize the population
        for i in range(len(self._k_indices)):

            current_solution = self.pop_initializer(i)

            # compute fitness using sub-problem evaluator
            fitness_score = self._sub_evaluators[i](current_solution)
            current_solution._score = fitness_score

            self._population.append(current_solution)

    def pop_initializer(self, index):
        problem_size = len(self._k_indices[index])
        return BinarySolution([], problem_size).random(self._validator)
    def init_k_split_indices(self):
        """Initialize k_indices for the next training of the surrogates

        Returns:
            k_indices: [[int]] -- list of indices subsets, one per sub-surrogate
        """
        a = list(range(self._bestSolution._size))
        n_elements = int(math.ceil(self._bestSolution._size / self._k_division))  # use of ceil to avoid loss of data

        # TODO : (check) if random is possible or not
        # if self._k_random:
        #     random.shuffle(a) # random subset

        splitted_indices = [a[x:x + n_elements] for x in range(0, len(a), n_elements)]

        self._k_division = len(splitted_indices)  # update size of k if necessary
        self._k_indices = splitted_indices
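
    # Illustrative example (values assumed for clarity): with a solution of
    # size 10 and k_division=3, n_elements = ceil(10 / 3) = 4, so the split is
    # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]] and k_division is updated to 3.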
    def train_surrogate(self, index, indices):

        # 1. Dataset preparation (train and test), now using the specific dataset of this surrogate

        # dynamic number of samples based on real evaluations stored in the dataset
        nsamples = None
        with open(self._solutions_files[index], 'r') as f:
            nsamples = len(f.readlines()) - 1  # avoid header

        training_samples = int(0.7 * nsamples)  # 70% used for the learning part at each iteration

        df = pd.read_csv(self._solutions_files[index], sep=';')

        # learning set and test set
        current_learn = df.sample(training_samples)
        current_test = df.drop(current_learn.index)

        problem = ND3DProblem(size=len(indices))  # problem size based on the current indices subset
        model = Lasso(alpha=1e-5)
        surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
        analysis = FitterAnalysis(logfile=os.path.join(self._output_log_surrogates, f"train_surrogate_{index}.log"), problem=problem)
        algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)

        print(f"Start fitting again the surrogate model n°{index}, using {training_samples} of {nsamples} samples for the training dataset")
        for r in range(10):
            print(f"Iteration n°{r}: fitting surrogate n°{index}")
            algo.run_samples(learn=current_learn, test=current_test, step=10)

        # zero-pad the index to keep surrogate files well ordered on disk
        str_index = str(index).zfill(6)

        joblib.dump(algo, os.path.join(self._surrogates_file_path, f'surrogate_{str_index}'))

        return str_index
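
    # Note: train_surrogate above saves, e.g., index=3 as 'surrogate_000003';
    # the zero-padding guarantees that sorted(os.listdir(...)) in
    # load_surrogates() returns the models in the same order as self._k_indices.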
    def train_surrogates(self):
        """Retrain if necessary the whole surrogate fitness approximation function
        """
        # Following https://gitlab.com/florianlprt/wsao, we re-train the model
        # ---------------------------------------------------------------------------
        # cli_restart.py problem=nd3d,size=30,filename="data/statistics_extended_svdn" \
        #     model=lasso,alpha=1e-5 \
        #     surrogate=walsh,order=3 \
        #     algo=fitter,algo_restarts=10,samplefile=stats_extended.csv \
        #     sample=1000,step=10 \
        #     analysis=fitter,logfile=out_fit.csv

        # 1. for each subset of indices, learn a new surrogate
        if not os.path.exists(self._surrogates_file_path):
            os.makedirs(self._surrogates_file_path)

        num_cores = multiprocessing.cpu_count()

        if not os.path.exists(self._output_log_surrogates):
            os.makedirs(self._output_log_surrogates)

        Parallel(n_jobs=num_cores)(delayed(self.train_surrogate)(index, indices) for index, indices in enumerate(self._k_indices))
    def load_surrogates(self):
        """Load algorithm with surrogate models and create the lambda evaluator functions
        """
        # need to first train the surrogates if they do not exist yet
        if not os.path.exists(self._surrogates_file_path):
            self.train_surrogates()

        self._surrogates = []

        surrogates_path = sorted(os.listdir(self._surrogates_file_path))

        for surrogate_p in surrogates_path:
            model_path = os.path.join(self._surrogates_file_path, surrogate_p)
            surrogate_model = joblib.load(model_path)

            self._surrogates.append(surrogate_model)
    def surrogate_evaluator(self, solution):
        """Compute the mean prediction of the surrogate models over their targeted indices

        Args:
            solution: {Solution} -- current solution to evaluate using multi-surrogate evaluation

        Return:
            mean: {float} -- mean score of surrogate models
        """
        scores = []
        solution_data = np.array(solution._data)

        # for each indices subset, get the trained surrogate model and make a prediction
        for i, indices in enumerate(self._k_indices):
            current_data = solution_data[indices]
            current_score = self._surrogates[i].surrogate.predict([current_data])[0]
            scores.append(current_score)

        return sum(scores) / len(scores)
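
    # Illustrative example (numbers assumed): with k_division=3 and sub-surrogate
    # predictions of 0.7, 0.8 and 0.9 on their respective slices of the solution,
    # surrogate_evaluator returns (0.7 + 0.8 + 0.9) / 3 = 0.8.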
    def surrogates_coefficient_of_determination(self):
        """Compute r² for each sub surrogate model

        Return:
            r_squared_scores: [{float}] -- r² scores obtained from the surrogate models
        """
        # for each sub surrogate model, compute its r² score in parallel
        num_cores = multiprocessing.cpu_count()
        r_squared_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.coefficient_of_determination)(s_model.surrogate) for s_model in self._surrogates)

        return r_squared_scores

    def surrogates_mae(self):
        """Compute MAE for each sub surrogate model

        Return:
            mae_scores: [{float}] -- MAE scores obtained from the surrogate models
        """
        # for each sub surrogate model, compute its MAE score in parallel
        num_cores = multiprocessing.cpu_count()
        mae_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.mae)(s_model.surrogate) for s_model in self._surrogates)

        return mae_scores
    def add_to_surrogate(self, solution, index):

        # save real evaluated solution into the specific file of this surrogate
        with open(self._solutions_files[index], 'a') as f:

            line = ""

            # use a dedicated loop variable (do not shadow the `index` parameter)
            for i, e in enumerate(solution._data):

                line += str(e)

                if i < len(solution._data) - 1:
                    line += ","

            line += ";"
            line += str(solution._score)

            f.write(line + "\n")
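
    # For example, a solution with _data = [1, 0, 1] and _score = 0.85 is
    # appended as the line "1,0,1;0.85", matching the 'x;y' header written in
    # init_solutions_files() and the sep=';' used by pd.read_csv in train_surrogate().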
    def run(self, evaluations, ls_evaluations=100):
        """
        Run the iterated local search algorithm using local search (EvE compromise)

        Args:
            evaluations: {int} -- number of global evaluations for ILS
            ls_evaluations: {int} -- number of Local search evaluations (default: 100)

        Returns:
            {Solution} -- best solution found
        """
        # by default use the mother method to initialize variables
        super().run(evaluations)

        # initialize current solution
        self.initRun()

        self.init_k_split_indices()

        # create one solutions file per sub-problem in order to identify its data
        self.init_solutions_files()

        # here we define each surrogate sub-evaluator
        self.define_sub_evaluators()
        self.init_population()

        # enable resuming for ILS
        self.resume()
        # restart using the real evaluations already done for each surrogate (sub-model)
        if (self._start_train_surrogates * self._k_division) > self.getGlobalEvaluation():

            # for each sub problem (surrogate)
            for i in range(self._k_division):

                nsamples = None
                with open(self._solutions_files[i], 'r') as f:
                    nsamples = len(f.readlines()) - 1  # avoid header

                if nsamples is None:
                    nsamples = 0

                # get `self._start_train_surrogates` real evaluations and save them into the surrogate dataset file,
                # using randomly generated solutions (in order to cover the search space)
                while self._start_train_surrogates > nsamples:

                    print(f'Real solutions extraction for surrogate n°{i}: {nsamples} of {self._start_train_surrogates}')

                    newSolution = self.pop_initializer(i)

                    # evaluate new solution
                    newSolution.evaluate(self._sub_evaluators[i])

                    # add it to the surrogate pool
                    self.add_to_surrogate(newSolution, i)

                    nsamples += 1

                    # increase number of evaluations
                    self.increaseEvaluation()

        # stop this process after generating solutions
        if self._generate_only:
            return self._bestSolution

        # train surrogates on the real evaluated solutions files
        self.train_surrogates()
        self.load_surrogates()
        # local search algorithm implementation
        while not self.stop():

            # set current evaluator depending on whether the surrogate function is used or not
            self._evaluator = self.surrogate_evaluator if self._start_train_surrogates <= self.getGlobalEvaluation() else self._main_evaluator

            local_search_list = []

            for i in range(self._k_division):

                # use specific initializer for pop_initializer and a specific
                # surrogate evaluator for this local search
                # (note: `i` is bound as a default argument, otherwise every
                # lambda would capture the last value of `i` after the loop)
                ls = LocalSearchSurrogate(lambda index=i: self.pop_initializer(index),
                                          lambda s, index=i: self._surrogates[index].surrogate.predict([s._data])[0],
                                          self._operators,
                                          self._policy,
                                          self._validator,
                                          self._maximise,
                                          parent=self)

                # add same callbacks
                for callback in self._callbacks:
                    ls.addCallback(callback)

                local_search_list.append(ls)

            # parallel run of each local search
            num_cores = multiprocessing.cpu_count()
            ls_solutions = Parallel(n_jobs=num_cores)(delayed(ls.run)(ls_evaluations) for ls in local_search_list)

            # count the evaluations spent by the local searches
            self._numberOfEvaluations += ls_evaluations * self._k_division
            # for each sub problem, update population
            for i, sub_problem_solution in enumerate(ls_solutions):

                # if better solution than current one, replace it (solution saved in training pool, only if surrogate process is in a second process step)
                # Update: always add the new solution into the surrogate pool, not only if the solution is better
                # if self.isBetter(newSolution) and self._start_train_surrogates < self.getGlobalEvaluation():
                if self._start_train_surrogates <= self.getGlobalEvaluation():

                    # if a better solution is found by the local search, re-evaluate it
                    # without use of surrogate
                    fitness_score = self._sub_evaluators[i](sub_problem_solution)
                    # self.increaseEvaluation() # do not add evaluation

                    sub_problem_solution._score = fitness_score

                    # if the solution is really better after real evaluation, then we replace it (depending on problem nature (minimizing / maximizing))
                    if self._maximise:
                        if sub_problem_solution.fitness() > self._population[i].fitness():
                            self._population[i] = sub_problem_solution
                    else:
                        if sub_problem_solution.fitness() < self._population[i].fitness():
                            self._population[i] = sub_problem_solution

                    self.add_to_surrogate(sub_problem_solution, i)
            # main best solution update from the current population
            if self._start_train_surrogates <= self.getGlobalEvaluation():

                # need to create a virtual solution from the current population
                obtained_solution_data = np.array([s._data for s in self._population], dtype='object').flatten().tolist()

                if list(obtained_solution_data) == list(self._bestSolution._data):
                    print(f'-- No updates found from sub-model surrogates LS (best solution score: {self._bestSolution._score})')
                else:
                    print('-- Updates found into population from sub-model surrogates LS')

                    # init random solution
                    current_solution = self._initializer()
                    current_solution._data = obtained_solution_data

                    fitness_score = self._main_evaluator(current_solution)

                    # new computed solution score
                    current_solution._score = fitness_score

                    print(f'-- Current main solution from population is {current_solution._score} vs. {self._bestSolution._score}')

                    # if the solution is really better after real evaluation, then we replace it
                    if self.isBetter(current_solution):
                        self._bestSolution = current_solution

                self.progress()
            # main best solution update from the local search solutions
            if self._start_train_surrogates <= self.getGlobalEvaluation():

                # need to create a virtual solution from the local search solutions
                obtained_solution_data = np.array([s._data for s in ls_solutions], dtype='object').flatten().tolist()

                if list(obtained_solution_data) == list(self._bestSolution._data):
                    print(f'-- No updates found from sub-model surrogates LS (best solution score: {self._bestSolution._score})')
                else:
                    print('-- Updates found from sub-model surrogates LS')

                    # init random solution
                    current_solution = self._initializer()
                    current_solution._data = obtained_solution_data

                    fitness_score = self._main_evaluator(current_solution)

                    # new computed solution score
                    current_solution._score = fitness_score

                    # if the solution is really better after real evaluation, then we replace it
                    if self.isBetter(current_solution):
                        print('Exploration solution obtained from LS surrogates enables improvement of main solution')
                        self._bestSolution = current_solution

                        # also update the whole population, as a restarting process, if the main solution is better
                        for i, sub_problem_solution in enumerate(ls_solutions):
                            # sub solution already evaluated
                            self._population[i] = sub_problem_solution

                    print(f'-- Current main solution obtained from `LS solutions` is {current_solution._score} vs. {self._bestSolution._score}')
                    logging.info(f'-- Current main solution obtained from `LS solutions` is {current_solution._score} vs. {self._bestSolution._score}')

                self.progress()
            print(f'State of current population for surrogates ({len(self._population)} members)')
            for i, s in enumerate(self._population):
                print(f'Population[{i}]: best solution fitness is {s.fitness()}')

            # check using a specific dynamic criterion based on r²
            r_squared_scores = self.surrogates_coefficient_of_determination()
            r_squared = sum(r_squared_scores) / len(r_squared_scores)

            mae_scores = self.surrogates_mae()
            mae_score = sum(mae_scores) / len(mae_scores)

            r_squared_value = 0 if r_squared < 0 else r_squared  # clamp negative r² values to 0

            training_surrogate_every = int(r_squared_value * self._ls_train_surrogates)

            # avoid retraining after every single local search
            if training_surrogate_every <= 0:
                training_surrogate_every = 1

            logging.info(f"=> R² of surrogate is {r_squared} | MAE is {mae_score} -- [Retraining model after {self._n_local_search % training_surrogate_every} of {training_surrogate_every} LS]")
            print(f"=> R² of surrogate is {r_squared} | MAE is {mae_score} -- [Retraining model after {self._n_local_search % training_surrogate_every} of {training_surrogate_every} LS]")
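
            # Illustrative example (numbers assumed): with ls_train_surrogates=50
            # and a mean r² of 0.8, the surrogates are retrained every
            # int(0.8 * 50) = 40 local searches; the better the surrogates fit,
            # the less often they are retrained.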
            # check if it is necessary or not to train the surrogates again
            if self._n_local_search % training_surrogate_every == 0 and self._start_train_surrogates <= self.getGlobalEvaluation():

                # reinitialization of k_indices for the new training
                # TODO : this part is temporarily removed
                # if self._k_dynamic:
                #     print(f"Reinitialization of k_indices using `k={self._k_division}` for the new training")
                #     self.init_k_split_indices()

                # train the surrogates again on the real evaluated solutions files
                start_training = time.time()
                self.train_surrogates()
                training_time = time.time() - start_training

                self._surrogate_analyser = SurrogateAnalysis(training_time, training_surrogate_every, r_squared_scores, r_squared, mae_scores, mae_score, self.getGlobalMaxEvaluation(), self._total_n_local_search)

                # reload the new surrogate functions
                self.load_surrogates()

                # reinitialize the number of local searches
                self._n_local_search = 0

            # increase number of local searches done
            self._n_local_search += 1
            self._total_n_local_search += 1

            self.information()

        logging.info(f"End of {type(self).__name__}, best solution found {self._bestSolution}")

        self.end()
        return self._bestSolution
    def addCallback(self, callback):
        """Add new callback to algorithm, specifying useful parameters

        Args:
            callback: {Callback} -- specific Callback instance
        """
        # specify current main algorithm reference
        if self.getParent() is not None:
            callback.setAlgo(self.getParent())
        else:
            callback.setAlgo(self)

        # set as new
        self._callbacks.append(callback)
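

# Minimal usage sketch (illustrative only): the initializer, evaluators,
# operators, policy and validator below are hypothetical placeholders, not part
# of this module; real runs wire in problem-specific implementations.
#
#     from macop.solutions.BinarySolution import BinarySolution
#
#     def initializer():
#         # full-size solution used for the main (real) evaluation
#         return BinarySolution([], 30).random(lambda s: True)
#
#     def evaluator(solution):
#         return sum(solution._data)  # toy fitness on the whole solution
#
#     def sub_evaluator(solution, number, indices):
#         return sum(solution._data)  # toy fitness on a sub-problem solution
#
#     algo = ILSMultiSpecificSurrogate(initializer, evaluator, sub_evaluator,
#                                      operators, policy, validator,
#                                      surrogates_file_path='models/surrogates',
#                                      output_log_surrogates='logs/surrogates',
#                                      start_train_surrogates=100,
#                                      ls_train_surrogates=50,
#                                      k_division=3,
#                                      solutions_folder='data/solutions')
#     best = algo.run(evaluations=1000, ls_evaluations=100)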