ILSMultiSpecificSurrogate.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564
  1. """Iterated Local Search Algorithm implementation using multiple-surrogate (weighted sum surrogate) as fitness approximation
  2. """
  3. # main imports
  4. import os
  5. import logging
  6. import joblib
  7. import time
  8. import math
  9. import numpy as np
  10. import pandas as pd
  11. import random
  12. # parallel imports
  13. from joblib import Parallel, delayed
  14. import multiprocessing
  15. # module imports
  16. from macop.algorithms.Algorithm import Algorithm
  17. from macop.solutions.BinarySolution import BinarySolution
  18. from .LSSurrogate import LocalSearchSurrogate
  19. from .utils.SurrogateAnalysis import SurrogateAnalysis
  20. from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
  21. LassoCV, ElasticNet)
  22. from wsao.sao.problems.nd3dproblem import ND3DProblem
  23. from wsao.sao.surrogates.walsh import WalshSurrogate
  24. from wsao.sao.algos.fitter import FitterAlgo
  25. from wsao.sao.utils.analysis import SamplerAnalysis, FitterAnalysis, OptimizerAnalysis
  26. class ILSMultiSpecificSurrogate(Algorithm):
  27. """Iterated Local Search used to avoid local optima and increave EvE (Exploration vs Exploitation) compromise using multiple-surrogate where each sub-surrogate learn from specific dataset
  28. Attributes:
  29. initalizer: {function} -- basic function strategy to initialize solution
  30. evaluator: {function} -- basic function in order to obtained fitness (mono or multiple objectives)
  31. sub_evaluator: {function} -- sub evaluator function in order to obtained fitness for sub-model
  32. operators: {[Operator]} -- list of operator to use when launching algorithm
  33. policy: {Policy} -- Policy class implementation strategy to select operators
  34. validator: {function} -- basic function to check if solution is valid or not under some constraints
  35. maximise: {bool} -- specify kind of optimization problem
  36. currentSolution: {Solution} -- current solution managed for current evaluation
  37. bestSolution: {Solution} -- best solution found so far during running algorithm
  38. ls_iteration: {int} -- number of evaluation for each local search algorithm
  39. surrogates_file_path: {str} -- Surrogates model folder to load (models trained using https://gitlab.com/florianlprt/wsao)
  40. output_log_surrogates: {str} -- Log folder for surrogates training model
  41. start_train_surrogates: {int} -- number of evaluation expected before start training and use surrogate
  42. surrogates: [{Surrogate}] -- Surrogates model instance loaded
  43. ls_train_surrogates: {int} -- Specify if we need to retrain our surrogate model (every Local Search)
  44. k_division: {int} -- number of expected division for current features problem
  45. k_dynamic: {bool} -- specify if indices are changed for each time we train a new surrogate model
  46. k_random: {bool} -- random initialization of k_indices for each surrogate features model data
  47. generate_only: {bool} -- generate only a specific number of expected real solutions evaluated
  48. solutions_folder: {str} -- Path where real evaluated solutions on subset are saved
  49. callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
  50. """
  51. def __init__(self,
  52. initalizer,
  53. evaluator,
  54. sub_evaluator,
  55. operators,
  56. policy,
  57. validator,
  58. surrogates_file_path,
  59. output_log_surrogates,
  60. start_train_surrogates,
  61. ls_train_surrogates,
  62. k_division,
  63. solutions_folder,
  64. k_random=True,
  65. k_dynamic=False,
  66. generate_only=False,
  67. maximise=True,
  68. parent=None):
  69. # set real evaluator as default
  70. super().__init__(initalizer, evaluator, operators, policy,
  71. validator, maximise, parent)
  72. self._n_local_search = 0
  73. self._total_n_local_search = 0
  74. self._main_evaluator = evaluator
  75. self._sub_evaluator = sub_evaluator
  76. self._surrogates_file_path = surrogates_file_path
  77. self._start_train_surrogates = start_train_surrogates
  78. self._output_log_surrogates = output_log_surrogates
  79. self._surrogate_evaluator = None
  80. self._surrogate_analyser = None
  81. self._ls_train_surrogates = ls_train_surrogates
  82. self._k_division = k_division
  83. self._k_dynamic = k_dynamic
  84. self._k_random = k_random
  85. self._k_indices = None
  86. self._surrogates = None
  87. self._generate_only = generate_only
  88. self._solutions_folder = solutions_folder
  89. def init_solutions_files(self):
  90. self._solutions_files = []
  91. if not os.path.exists(self._solutions_folder):
  92. os.makedirs(self._solutions_folder)
  93. # for each sub surrogate, associate its own surrogate file
  94. for i in range(len(self._k_indices)):
  95. index_str = str(i)
  96. while len(index_str) < 3:
  97. index_str = "0" + index_str
  98. solutions_path = os.path.join(self._solutions_folder, f'surrogate_data_{index_str}')
  99. # initialize solutions file if not exist
  100. if not os.path.exists(solutions_path):
  101. with open(solutions_path, 'w') as f:
  102. f.write('x;y\n')
  103. self._solutions_files.append(solutions_path)
  104. def define_sub_evaluators(self):
  105. self._sub_evaluators = []
  106. for i in range(len(self._k_indices)):
  107. # need to pass as default argument indices
  108. current_evaluator = lambda s, indices=self._k_indices[i]: self._sub_evaluator(s, indices)
  109. self._sub_evaluators.append(current_evaluator)
  110. def init_population(self):
  111. self._population = []
  112. # initialize the population
  113. for i in range(len(self._k_indices)):
  114. current_solution = self.pop_initializer(i)
  115. # compute fitness using sub-problem evaluator
  116. fitness_score = self._sub_evaluators[i](current_solution)
  117. current_solution._score = fitness_score
  118. self._population.append(current_solution)
  119. def pop_initializer(self, index):
  120. problem_size = len(self._k_indices[index])
  121. return BinarySolution([], problem_size).random(self._validator)
  122. def init_k_split_indices(self):
  123. """Initialize k_indices for the new training of surrogate
  124. Returns:
  125. k_indices: [description]
  126. """
  127. a = list(range(self._bestSolution._size))
  128. n_elements = int(math.ceil(self._bestSolution._size / self._k_division)) # use of ceil to avoid loss of data
  129. # TODO : (check) if random is possible or not
  130. # if self._k_random:
  131. # random.shuffle(a) # random subset
  132. splitted_indices = [a[x:x+n_elements] for x in range(0, len(a), n_elements)]
  133. self._k_division = len(splitted_indices) # update size of k if necessary
  134. self._k_indices = splitted_indices
  135. def train_surrogate(self, index, indices):
  136. # 1. Data sets preparation (train and test) use now of specific dataset for surrogate
  137. # dynamic number of samples based on dataset real evaluations
  138. nsamples = None
  139. with open(self._solutions_files[index], 'r') as f:
  140. nsamples = len(f.readlines()) - 1 # avoid header
  141. training_samples = int(0.7 * nsamples) # 70% used for learning part at each iteration
  142. df = pd.read_csv(self._solutions_files[index], sep=';')
  143. # learning set and test set
  144. current_learn = df.sample(training_samples)
  145. current_test = df.drop(learn.index)
  146. # TODO : (check) not necessary now to select specific features indices into set
  147. # current_learn = learn.copy()
  148. # current_test = test.copy()
  149. problem = ND3DProblem(size=len(indices)) # problem size based on best solution size (need to improve...)
  150. model = Lasso(alpha=1e-5)
  151. surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
  152. analysis = FitterAnalysis(logfile=os.path.join(self._output_log_surrogates, f"train_surrogate_{index}.log"), problem=problem)
  153. algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
  154. print(f"Start fitting again the surrogate model n°{index}, using {training_samples} of {nsamples} samples for train dataset")
  155. for r in range(10):
  156. print(f"Iteration n°{r}: for fitting surrogate n°{index}")
  157. algo.run_samples(learn=current_learn, test=current_test, step=10)
  158. # keep well ordered surrogate into file manager
  159. str_index = str(index)
  160. while len(str_index) < 6:
  161. str_index = "0" + str_index
  162. joblib.dump(algo, os.path.join(self._surrogates_file_path, f'surrogate_{str_index}'))
  163. return str_index
  164. def train_surrogates(self):
  165. """Retrain if necessary the whole surrogate fitness approximation function
  166. """
  167. # Following https://gitlab.com/florianlprt/wsao, we re-train the model
  168. # ---------------------------------------------------------------------------
  169. # cli_restart.py problem=nd3d,size=30,filename="data/statistics_extended_svdn" \
  170. # model=lasso,alpha=1e-5 \
  171. # surrogate=walsh,order=3 \
  172. # algo=fitter,algo_restarts=10,samplefile=stats_extended.csv \
  173. # sample=1000,step=10 \
  174. # analysis=fitter,logfile=out_fit.csv
  175. # 1. for each sub space indices, learn new surrogate
  176. if not os.path.exists(self._surrogates_file_path):
  177. os.makedirs(self._surrogates_file_path)
  178. num_cores = multiprocessing.cpu_count()
  179. if not os.path.exists(self._output_log_surrogates):
  180. os.makedirs(self._output_log_surrogates)
  181. Parallel(n_jobs=num_cores)(delayed(self.train_surrogate)(index, indices) for index, indices in enumerate(self._k_indices))
  182. def load_surrogates(self):
  183. """Load algorithm with surrogate model and create lambda evaluator function
  184. """
  185. # need to first train surrogate if not exist
  186. if not os.path.exists(self._surrogates_file_path):
  187. self.train_surrogates()
  188. self._surrogates = []
  189. surrogates_path = sorted(os.listdir(self._surrogates_file_path))
  190. for surrogate_p in surrogates_path:
  191. model_path = os.path.join(self._surrogates_file_path, surrogate_p)
  192. surrogate_model = joblib.load(model_path)
  193. self._surrogates.append(surrogate_model)
  194. def surrogate_evaluator(self, solution):
  195. """Compute mean of each surrogate model using targeted indices
  196. Args:
  197. solution: {Solution} -- current solution to evaluate using multi-surrogate evaluation
  198. Return:
  199. mean: {float} -- mean score of surrogate models
  200. """
  201. scores = []
  202. solution_data = np.array(solution._data)
  203. # for each indices set, get trained surrogate model and made prediction score
  204. for i, indices in enumerate(self._k_indices):
  205. current_data = solution_data[indices]
  206. current_score = self._surrogates[i].surrogate.predict([current_data])[0]
  207. scores.append(current_score)
  208. return sum(scores) / len(scores)
  209. def surrogates_coefficient_of_determination(self):
  210. """Compute r² for each sub surrogate model
  211. Return:
  212. r_squared_scores: [{float}] -- mean score of r_squred obtained from surrogate models
  213. """
  214. # for each indices set, get r^2 surrogate model and made prediction score
  215. num_cores = multiprocessing.cpu_count()
  216. r_squared_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.coefficient_of_determination)(s_model.surrogate) for s_model in self._surrogates)
  217. # for i, _ in enumerate(self._k_indices):
  218. # r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
  219. # r_squared_scores.append(r_squared)
  220. #print(r_squared_scores)
  221. return r_squared_scores
  222. def surrogates_mae(self):
  223. """Compute mae for each sub surrogate model
  224. Return:
  225. mae_scores: [{float}] -- mae scores from model
  226. """
  227. # for each indices set, get r^2 surrogate model and made prediction score
  228. num_cores = multiprocessing.cpu_count()
  229. mae_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.mae)(s_model.surrogate) for s_model in self._surrogates)
  230. # for i, _ in enumerate(self._k_indices):
  231. # r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
  232. # r_squared_scores.append(r_squared)
  233. #print(mae_scores)
  234. return mae_scores
  235. def add_to_surrogate(self, solution, index):
  236. # save real evaluated solution into specific file for surrogate
  237. with open(self._solutions_files[index], 'a') as f:
  238. line = ""
  239. for index, e in enumerate(solution._data):
  240. line += str(e)
  241. if index < len(solution._data) - 1:
  242. line += ","
  243. line += ";"
  244. line += str(solution._score)
  245. f.write(line + "\n")
  246. def run(self, evaluations, ls_evaluations=100):
  247. """
  248. Run the iterated local search algorithm using local search (EvE compromise)
  249. Args:
  250. evaluations: {int} -- number of global evaluations for ILS
  251. ls_evaluations: {int} -- number of Local search evaluations (default: 100)
  252. Returns:
  253. {Solution} -- best solution found
  254. """
  255. # by default use of mother method to initialize variables
  256. super().run(evaluations)
  257. # initialize current solution
  258. self.initRun()
  259. # enable resuming for ILS
  260. self.resume()
  261. if self._k_indices is None:
  262. self.init_k_split_indices()
  263. # add norm to indentify sub problem data
  264. self.init_solutions_files()
  265. # here we each surrogate sub evaluator
  266. self.define_sub_evaluators()
  267. self.init_population()
  268. # count number of surrogate obtained and restart using real evaluations done for each surrogate (sub-model)
  269. if (self._start_train_surrogates * self._k_division) > self.getGlobalEvaluation():
  270. # for each sub problem (surrogate)
  271. for i in range(self._k_division):
  272. nsamples = None
  273. with open(self._solutions_files[i], 'r') as f:
  274. nsamples = len(f.readlines()) - 1 # avoid header
  275. if nsamples is None:
  276. nsamples = 0
  277. # get `self.start_train_surrogate` number of real evaluations and save it into surrogate dataset file
  278. # using randomly generated solutions (in order to cover seearch space)
  279. while self._start_train_surrogates > nsamples:
  280. print(f'Real solutions extraction for surrogate n°{i}: {nsamples} of {self._start_train_surrogates}')
  281. newSolution = self.pop_initializer(i)
  282. # evaluate new solution
  283. newSolution.evaluate(self._sub_evaluators[i])
  284. # add it to surrogate pool
  285. self.add_to_surrogate(newSolution, i)
  286. nsamples += 1
  287. # increase number of evaluation
  288. self.increaseEvaluation()
  289. # stop this process after generating solution
  290. if self._generate_only:
  291. return self._bestSolution
  292. # train surrogate on real evaluated solutions file
  293. self.train_surrogates()
  294. self.load_surrogates()
  295. # local search algorithm implementation
  296. while not self.stop():
  297. # set current evaluator based on used or not of surrogate function
  298. self._evaluator = self.surrogate_evaluator if self._start_train_surrogates <= self.getGlobalEvaluation() else self._main_evaluator
  299. local_search_list = []
  300. for i in range(self._k_division):
  301. # create new local search instance
  302. # passing global evaluation param from ILS
  303. # use specific initializer for pop_initialiser
  304. # specific surrogate evaluator for this local search
  305. # TODO : check this part
  306. ls = LocalSearchSurrogate(lambda index=i: self.pop_initializer(index),
  307. lambda s: self._surrogates[i].surrogate.predict([s._data])[0],
  308. self._operators,
  309. self._policy,
  310. self._validator,
  311. self._maximise,
  312. parent=self)
  313. # add same callbacks
  314. for callback in self._callbacks:
  315. ls.addCallback(callback)
  316. local_search_list.append(ls)
  317. # parallel run of local search
  318. num_cores = multiprocessing.cpu_count()
  319. ls_solutions = Parallel(n_jobs=num_cores)(delayed(ls.run)(ls_evaluations) for ls in local_search_list)
  320. # create and search solution from local search
  321. self._numberOfEvaluations += ls_evaluations * self._k_division
  322. # for each sub problem, update population
  323. for i, sub_problem_solution in enumerate(ls_solutions):
  324. # if better solution than currently, replace it (solution saved in training pool, only if surrogate process is in a second process step)
  325. # Update : always add new solution into surrogate pool, not only if solution is better
  326. #if self.isBetter(newSolution) and self.start_train_surrogate < self.getGlobalEvaluation():
  327. if self._start_train_surrogates <= self.getGlobalEvaluation():
  328. # if better solution found from local search, retrained the found solution and test again
  329. # without use of surrogate
  330. fitness_score = self._sub_evaluators[i](sub_problem_solution)
  331. # self.increaseEvaluation() # dot not add evaluation
  332. sub_problem_solution._score = fitness_score
  333. # if solution is really better after real evaluation, then we replace
  334. if self.isBetter(self._population[i]):
  335. self._population[i] = sub_problem_solution
  336. self.add_to_surrogate(sub_problem_solution, i)
  337. # main best solution update
  338. if self._start_train_surrogates <= self.getGlobalEvaluation():
  339. # need to create virtual solution
  340. obtained_solution_data = np.array([ s._data for s in ls_solutions ]).flatten().tolist()
  341. # init random solution
  342. current_solution = self._initializer()
  343. current_solution.data = obtained_solution_data
  344. fitness_score = self._main_evaluator(current_solution)
  345. # new computed solution score
  346. current_solution._score = fitness_score
  347. # if solution is really better after real evaluation, then we replace
  348. if self.isBetter(current_solution):
  349. self._bestSolution = current_solution
  350. print(f'-- Current solution obtained is {current_solution._score} vs. {self._bestSolution._score}')
  351. self.progress()
  352. # check using specific dynamic criteria based on r^2
  353. r_squared_scores = self.surrogates_coefficient_of_determination()
  354. r_squared = sum(r_squared_scores) / len(r_squared_scores)
  355. mae_scores = self.surrogates_mae()
  356. mae_score = sum(mae_scores) / len(mae_scores)
  357. r_squared_value = 0 if r_squared < 0 else r_squared
  358. training_surrogate_every = int(r_squared_value * self._ls_train_surrogates) # use of absolute value for r²
  359. # avoid issue when lauching every each local search
  360. if training_surrogate_every <= 0:
  361. training_surrogate_every = 1
  362. print(f"=> R² of surrogate is of {r_squared} | MAE is of {mae_score} -- [Retraining model after {self._n_local_search % training_surrogate_every} of {training_surrogate_every} LS]")
  363. # check if necessary or not to train again surrogate
  364. if self._n_local_search % training_surrogate_every == 0 and self._start_train_surrogates <= self.getGlobalEvaluation():
  365. # reinitialization of k_indices for the new training
  366. # TODO : remove this part temporally
  367. # if self._k_dynamic:
  368. # print(f"Reinitialization of k_indices using `k={self._k_division} `for the new training")
  369. # self.init_k_split_indices()
  370. # train again surrogate on real evaluated solutions file
  371. start_training = time.time()
  372. self.train_surrogates()
  373. training_time = time.time() - start_training
  374. self._surrogate_analyser = SurrogateAnalysis(training_time, training_surrogate_every, r_squared_scores, r_squared, mae_scores, mae_score, self.getGlobalMaxEvaluation(), self._total_n_local_search)
  375. # reload new surrogate function
  376. self.load_surrogates()
  377. # reinitialize number of local search
  378. self._n_local_search = 0
  379. # increase number of local search done
  380. self._n_local_search += 1
  381. self._total_n_local_search += 1
  382. self.information()
  383. logging.info(f"End of {type(self).__name__}, best solution found {self._bestSolution}")
  384. self.end()
  385. return self._bestSolution
  386. def addCallback(self, callback):
  387. """Add new callback to algorithm specifying usefull parameters
  388. Args:
  389. callback: {Callback} -- specific Callback instance
  390. """
  391. # specify current main algorithm reference
  392. if self.getParent() is not None:
  393. callback.setAlgo(self.getParent())
  394. else:
  395. callback.setAlgo(self)
  396. # set as new
  397. self._callbacks.append(callback)