ILSMultiSpecificSurrogate.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. """Iterated Local Search Algorithm implementation using multiple-surrogate (weighted sum surrogate) as fitness approximation
  2. """
  3. # main imports
  4. import os
  5. import logging
  6. import joblib
  7. import time
  8. import math
  9. import numpy as np
  10. import pandas as pd
  11. import random
  12. # parallel imports
  13. from joblib import Parallel, delayed
  14. import multiprocessing
  15. # module imports
  16. from macop.algorithms.Algorithm import Algorithm
  17. from macop.solutions.BinarySolution import BinarySolution
  18. from .LSSurrogate import LocalSearchSurrogate
  19. from .utils.SurrogateAnalysis import SurrogateAnalysis
  20. from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
  21. LassoCV, ElasticNet)
  22. from wsao.sao.problems.nd3dproblem import ND3DProblem
  23. from wsao.sao.surrogates.walsh import WalshSurrogate
  24. from wsao.sao.algos.fitter import FitterAlgo
  25. from wsao.sao.utils.analysis import SamplerAnalysis, FitterAnalysis, OptimizerAnalysis
  26. class ILSMultiSpecificSurrogate(Algorithm):
  27. """Iterated Local Search used to avoid local optima and increave EvE (Exploration vs Exploitation) compromise using multiple-surrogate where each sub-surrogate learn from specific dataset
  28. Attributes:
  29. initalizer: {function} -- basic function strategy to initialize solution
  30. evaluator: {function} -- basic function in order to obtained fitness (mono or multiple objectives)
  31. sub_evaluator: {function} -- sub evaluator function in order to obtained fitness for sub-model
  32. operators: {[Operator]} -- list of operator to use when launching algorithm
  33. policy: {Policy} -- Policy class implementation strategy to select operators
  34. validator: {function} -- basic function to check if solution is valid or not under some constraints
  35. maximise: {bool} -- specify kind of optimization problem
  36. currentSolution: {Solution} -- current solution managed for current evaluation
  37. bestSolution: {Solution} -- best solution found so far during running algorithm
  38. ls_iteration: {int} -- number of evaluation for each local search algorithm
  39. surrogates_file_path: {str} -- Surrogates model folder to load (models trained using https://gitlab.com/florianlprt/wsao)
  40. output_log_surrogates: {str} -- Log folder for surrogates training model
  41. start_train_surrogates: {int} -- number of evaluation expected before start training and use surrogate
  42. surrogates: [{Surrogate}] -- Surrogates model instance loaded
  43. ls_train_surrogates: {int} -- Specify if we need to retrain our surrogate model (every Local Search)
  44. k_division: {int} -- number of expected division for current features problem
  45. k_dynamic: {bool} -- specify if indices are changed for each time we train a new surrogate model
  46. k_random: {bool} -- random initialization of k_indices for each surrogate features model data
  47. generate_only: {bool} -- generate only a specific number of expected real solutions evaluated
  48. solutions_folder: {str} -- Path where real evaluated solutions on subset are saved
  49. callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
  50. """
  51. def __init__(self,
  52. initalizer,
  53. evaluator,
  54. sub_evaluator,
  55. operators,
  56. policy,
  57. validator,
  58. surrogates_file_path,
  59. output_log_surrogates,
  60. start_train_surrogates,
  61. ls_train_surrogates,
  62. k_division,
  63. solutions_folder,
  64. k_random=True,
  65. k_dynamic=False,
  66. generate_only=False,
  67. maximise=True,
  68. parent=None):
  69. # set real evaluator as default
  70. super().__init__(initalizer, evaluator, operators, policy,
  71. validator, maximise, parent)
  72. self._n_local_search = 0
  73. self._total_n_local_search = 0
  74. self._main_evaluator = evaluator
  75. self._sub_evaluator = sub_evaluator
  76. self._surrogates_file_path = surrogates_file_path
  77. self._start_train_surrogates = start_train_surrogates
  78. self._output_log_surrogates = output_log_surrogates
  79. self._surrogate_evaluator = None
  80. self._surrogate_analyser = None
  81. self._ls_train_surrogates = ls_train_surrogates
  82. self._k_division = k_division
  83. self._k_dynamic = k_dynamic
  84. self._k_random = k_random
  85. self._k_indices = None
  86. self._surrogates = None
  87. self._generate_only = generate_only
  88. # add norm to indentify sub problem data
  89. self._solutions_files = []
  90. self._solutions_folder = solutions_folder
  91. if not os.path.exists(self._solutions_folder):
  92. os.makedirs(self._solutions_folder)
  93. # for each sub surrogate, associate its own surrogate file
  94. for i in range(self._k_division):
  95. index_str = str(i)
  96. while len(index_str) < 3:
  97. index_str = "0" + index_str
  98. solutions_path = os.path.join(self._solutions_folder, f'surrogate_data_{index_str}')
  99. # initialize solutions file if not exist
  100. if not os.path.exists(solutions_path):
  101. with open(solutions_path, 'w') as f:
  102. f.write('x;y\n')
  103. self._solutions_files.append(solutions_path)
  104. def define_sub_evaluators(self):
  105. self._sub_evaluators = []
  106. for i in range(self._k_division):
  107. self._sub_evaluators.append(lambda s: self._sub_evaluator(s, self._k_indices[i]))
  108. def init_population(self):
  109. self._population = []
  110. # initialize the population
  111. for i in range(self._k_division):
  112. current_solution = self.pop_initializer(i)
  113. # compute fitness using sub-problem evaluator
  114. fitness_score = self._sub_evaluators[i](current_solution)
  115. current_solution._score = fitness_score
  116. self._population.append(current_solution)
  117. def pop_initializer(self, index):
  118. problem_size = len(self._k_indices[index])
  119. return BinarySolution([], problem_size).random(self._validator)
  120. def init_k_split_indices(self):
  121. """Initialize k_indices for the new training of surrogate
  122. Returns:
  123. k_indices: [description]
  124. """
  125. a = list(range(self._bestSolution._size))
  126. n_elements = int(math.ceil(self._bestSolution._size / self._k_division)) # use of ceil to avoid loss of data
  127. # TODO : (check) if random is possible or not
  128. # if self._k_random:
  129. # random.shuffle(a) # random subset
  130. splitted_indices = [a[x:x+n_elements] for x in range(0, len(a), n_elements)]
  131. return splitted_indices
  132. def train_surrogate(self, index, indices):
  133. # 1. Data sets preparation (train and test) use now of specific dataset for surrogate
  134. # dynamic number of samples based on dataset real evaluations
  135. nsamples = None
  136. with open(self._solutions_files[index], 'r') as f:
  137. nsamples = len(f.readlines()) - 1 # avoid header
  138. training_samples = int(0.7 * nsamples) # 70% used for learning part at each iteration
  139. df = pd.read_csv(self._solutions_files[index], sep=';')
  140. # learning set and test set
  141. learn = df.sample(training_samples)
  142. test = df.drop(learn.index)
  143. # TODO : (check) not necessary now to select specific features indices into set
  144. current_learn = learn.copy()
  145. # current_learn.x = current_learn.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
  146. current_test = test.copy()
  147. # current_test.x = current_test.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
  148. problem = ND3DProblem(size=len(indices)) # problem size based on best solution size (need to improve...)
  149. model = Lasso(alpha=1e-5)
  150. surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
  151. analysis = FitterAnalysis(logfile=os.path.join(self._output_log_surrogates, f"train_surrogate_{index}.log"), problem=problem)
  152. algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
  153. print(f"Start fitting again the surrogate model n°{index}, using {training_samples} of {nsamples} samples for train dataset")
  154. for r in range(10):
  155. print(f"Iteration n°{r}: for fitting surrogate n°{index}")
  156. algo.run_samples(learn=current_learn, test=current_test, step=10)
  157. # keep well ordered surrogate into file manager
  158. str_index = str(index)
  159. while len(str_index) < 6:
  160. str_index = "0" + str_index
  161. joblib.dump(algo, os.path.join(self._surrogates_file_path, f'surrogate_{str_index}'))
  162. return str_index
  163. def train_surrogates(self):
  164. """Retrain if necessary the whole surrogate fitness approximation function
  165. """
  166. # Following https://gitlab.com/florianlprt/wsao, we re-train the model
  167. # ---------------------------------------------------------------------------
  168. # cli_restart.py problem=nd3d,size=30,filename="data/statistics_extended_svdn" \
  169. # model=lasso,alpha=1e-5 \
  170. # surrogate=walsh,order=3 \
  171. # algo=fitter,algo_restarts=10,samplefile=stats_extended.csv \
  172. # sample=1000,step=10 \
  173. # analysis=fitter,logfile=out_fit.csv
  174. # 1. for each sub space indices, learn new surrogate
  175. if not os.path.exists(self._surrogates_file_path):
  176. os.makedirs(self._surrogates_file_path)
  177. num_cores = multiprocessing.cpu_count()
  178. if not os.path.exists(self._output_log_surrogates):
  179. os.makedirs(self._output_log_surrogates)
  180. Parallel(n_jobs=num_cores)(delayed(self.train_surrogate)(index, indices) for index, indices in enumerate(self._k_indices))
  181. def load_surrogates(self):
  182. """Load algorithm with surrogate model and create lambda evaluator function
  183. """
  184. # need to first train surrogate if not exist
  185. if not os.path.exists(self._surrogates_file_path):
  186. self.train_surrogates()
  187. self._surrogates = []
  188. surrogates_path = sorted(os.listdir(self._surrogates_file_path))
  189. for surrogate_p in surrogates_path:
  190. model_path = os.path.join(self._surrogates_file_path, surrogate_p)
  191. surrogate_model = joblib.load(model_path)
  192. self._surrogates.append(surrogate_model)
  193. def surrogate_evaluator(self, solution):
  194. """Compute mean of each surrogate model using targeted indices
  195. Args:
  196. solution: {Solution} -- current solution to evaluate using multi-surrogate evaluation
  197. Return:
  198. mean: {float} -- mean score of surrogate models
  199. """
  200. scores = []
  201. solution_data = np.array(solution._data)
  202. # for each indices set, get trained surrogate model and made prediction score
  203. for i, indices in enumerate(self._k_indices):
  204. current_data = solution_data[indices]
  205. current_score = self._surrogates[i].surrogate.predict([current_data])[0]
  206. scores.append(current_score)
  207. return sum(scores) / len(scores)
  208. def surrogates_coefficient_of_determination(self):
  209. """Compute r² for each sub surrogate model
  210. Return:
  211. r_squared_scores: [{float}] -- mean score of r_squred obtained from surrogate models
  212. """
  213. # for each indices set, get r^2 surrogate model and made prediction score
  214. num_cores = multiprocessing.cpu_count()
  215. r_squared_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.coefficient_of_determination)(s_model.surrogate) for s_model in self._surrogates)
  216. # for i, _ in enumerate(self._k_indices):
  217. # r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
  218. # r_squared_scores.append(r_squared)
  219. #print(r_squared_scores)
  220. return r_squared_scores
  221. def surrogates_mae(self):
  222. """Compute mae for each sub surrogate model
  223. Return:
  224. mae_scores: [{float}] -- mae scores from model
  225. """
  226. # for each indices set, get r^2 surrogate model and made prediction score
  227. num_cores = multiprocessing.cpu_count()
  228. mae_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.mae)(s_model.surrogate) for s_model in self._surrogates)
  229. # for i, _ in enumerate(self._k_indices):
  230. # r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
  231. # r_squared_scores.append(r_squared)
  232. #print(mae_scores)
  233. return mae_scores
  234. def add_to_surrogate(self, solution, index):
  235. # save real evaluated solution into specific file for surrogate
  236. with open(self._solutions_files[index], 'a') as f:
  237. line = ""
  238. for index, e in enumerate(solution._data):
  239. line += str(e)
  240. if index < len(solution._data) - 1:
  241. line += ","
  242. line += ";"
  243. line += str(solution._score)
  244. f.write(line + "\n")
  245. def run(self, evaluations, ls_evaluations=100):
  246. """
  247. Run the iterated local search algorithm using local search (EvE compromise)
  248. Args:
  249. evaluations: {int} -- number of global evaluations for ILS
  250. ls_evaluations: {int} -- number of Local search evaluations (default: 100)
  251. Returns:
  252. {Solution} -- best solution found
  253. """
  254. # by default use of mother method to initialize variables
  255. super().run(evaluations)
  256. # initialize current solution
  257. self.initRun()
  258. # based on best solution found, initialize k pool indices
  259. if self._k_indices == None:
  260. self._k_indices = self.init_k_split_indices()
  261. # enable resuming for ILS
  262. self.resume()
  263. # here we each surrogate sub evaluator
  264. self.define_sub_evaluators()
  265. self.init_population()
  266. # count number of surrogate obtained and restart using real evaluations done for each surrogate (sub-model)
  267. if (self._start_train_surrogates * self._k_division) > self.getGlobalEvaluation():
  268. # for each sub problem (surrogate)
  269. for i in range(self._k_division):
  270. nsamples = None
  271. with open(self._solutions_files[i], 'r') as f:
  272. nsamples = len(f.readlines()) - 1 # avoid header
  273. if nsamples is None:
  274. nsamples = 0
  275. # get `self.start_train_surrogate` number of real evaluations and save it into surrogate dataset file
  276. # using randomly generated solutions (in order to cover seearch space)
  277. while self._start_train_surrogates > nsamples:
  278. print(f'Real solutions extraction for surrogate n°{i}: {nsamples} of {self._start_train_surrogates}')
  279. newSolution = self.pop_initializer(i)
  280. # evaluate new solution
  281. newSolution.evaluate(self._sub_evaluators[i])
  282. # add it to surrogate pool
  283. self.add_to_surrogate(newSolution, i)
  284. nsamples += 1
  285. # increase number of evaluation
  286. self.increaseEvaluation()
  287. # stop this process after generating solution
  288. if self._generate_only:
  289. return self._bestSolution
  290. # train surrogate on real evaluated solutions file
  291. self.train_surrogates()
  292. self.load_surrogates()
  293. # local search algorithm implementation
  294. while not self.stop():
  295. # set current evaluator based on used or not of surrogate function
  296. self._evaluator = self.surrogate_evaluator if self._start_train_surrogates <= self.getGlobalEvaluation() else self._main_evaluator
  297. local_search_list = []
  298. for i in range(self._k_division):
  299. # create new local search instance
  300. # passing global evaluation param from ILS
  301. # use specific initializer for pop_initialiser
  302. # specific surrogate evaluator for this local search
  303. # TODO : check this part
  304. ls = LocalSearchSurrogate(lambda index=i: self.pop_initializer(index),
  305. lambda s: self._surrogates[i].surrogate.predict([s._data])[0],
  306. self._operators,
  307. self._policy,
  308. self._validator,
  309. self._maximise,
  310. parent=self)
  311. # add same callbacks
  312. for callback in self._callbacks:
  313. ls.addCallback(callback)
  314. local_search_list.append(ls)
  315. # parallel run of local search
  316. num_cores = multiprocessing.cpu_count()
  317. ls_solutions = Parallel(n_jobs=num_cores)(delayed(ls.run)(ls_evaluations) for ls in local_search_list)
  318. # create and search solution from local search
  319. self._numberOfEvaluations += ls_evaluations * self._k_division
  320. # for each sub problem, update population
  321. for i, sub_problem_solution in enumerate(ls_solutions):
  322. # if better solution than currently, replace it (solution saved in training pool, only if surrogate process is in a second process step)
  323. # Update : always add new solution into surrogate pool, not only if solution is better
  324. #if self.isBetter(newSolution) and self.start_train_surrogate < self.getGlobalEvaluation():
  325. if self._start_train_surrogates <= self.getGlobalEvaluation():
  326. # if better solution found from local search, retrained the found solution and test again
  327. # without use of surrogate
  328. fitness_score = self._sub_evaluators[i](sub_problem_solution)
  329. # self.increaseEvaluation() # dot not add evaluation
  330. sub_problem_solution._score = fitness_score
  331. # if solution is really better after real evaluation, then we replace
  332. if self.isBetter(self._population[i]):
  333. self._population[i] = sub_problem_solution
  334. self.add_to_surrogate(sub_problem_solution, i)
  335. # main best solution update
  336. if self._start_train_surrogates <= self.getGlobalEvaluation():
  337. # need to create virtual solution
  338. obtained_solution_data = np.array([ s._data for s in ls_solutions ]).flatten().tolist()
  339. # init random solution
  340. current_solution = self._initializer()
  341. current_solution.data = obtained_solution_data
  342. fitness_score = self._main_evaluator(current_solution)
  343. # new computed solution score
  344. current_solution._score = fitness_score
  345. # if solution is really better after real evaluation, then we replace
  346. if self.isBetter(current_solution):
  347. self._bestSolution = current_solution
  348. print(f'-- Current solution obtained is {current_solution._score} vs. {self._bestSolution._score}')
  349. self.progress()
  350. # check using specific dynamic criteria based on r^2
  351. r_squared_scores = self.surrogates_coefficient_of_determination()
  352. r_squared = sum(r_squared_scores) / len(r_squared_scores)
  353. mae_scores = self.surrogates_mae()
  354. mae_score = sum(mae_scores) / len(mae_scores)
  355. r_squared_value = 0 if r_squared < 0 else r_squared
  356. training_surrogate_every = int(r_squared_value * self._ls_train_surrogates) # use of absolute value for r²
  357. # avoid issue when lauching every each local search
  358. if training_surrogate_every <= 0:
  359. training_surrogate_every = 1
  360. print(f"=> R² of surrogate is of {r_squared} | MAE is of {mae_score} -- [Retraining model after {self._n_local_search % training_surrogate_every} of {training_surrogate_every} LS]")
  361. # check if necessary or not to train again surrogate
  362. if self._n_local_search % training_surrogate_every == 0 and self._start_train_surrogates <= self.getGlobalEvaluation():
  363. # reinitialization of k_indices for the new training
  364. # TODO : remove this part temporally
  365. # if self._k_dynamic:
  366. # print(f"Reinitialization of k_indices using `k={self._k_division} `for the new training")
  367. # self.init_k_split_indices()
  368. # train again surrogate on real evaluated solutions file
  369. start_training = time.time()
  370. self.train_surrogates()
  371. training_time = time.time() - start_training
  372. self._surrogate_analyser = SurrogateAnalysis(training_time, training_surrogate_every, r_squared_scores, r_squared, mae_scores, mae_score, self.getGlobalMaxEvaluation(), self._total_n_local_search)
  373. # reload new surrogate function
  374. self.load_surrogates()
  375. # reinitialize number of local search
  376. self._n_local_search = 0
  377. # increase number of local search done
  378. self._n_local_search += 1
  379. self._total_n_local_search += 1
  380. self.information()
  381. logging.info(f"End of {type(self).__name__}, best solution found {self._bestSolution}")
  382. self.end()
  383. return self._bestSolution
  384. def addCallback(self, callback):
  385. """Add new callback to algorithm specifying usefull parameters
  386. Args:
  387. callback: {Callback} -- specific Callback instance
  388. """
  389. # specify current main algorithm reference
  390. if self.getParent() is not None:
  391. callback.setAlgo(self.getParent())
  392. else:
  393. callback.setAlgo(self)
  394. # set as new
  395. self._callbacks.append(callback)