Parcourir la source

use of backup for population; specific update of population

Jérôme BUISINE il y a 3 ans
Parent
commit
896e274d98

+ 5 - 2
find_best_attributes_surrogate_openML_multi_specific.py

@@ -43,6 +43,7 @@ from macop.callbacks.BasicCheckpoint import BasicCheckpoint
 from macop.callbacks.UCBCheckpoint import UCBCheckpoint
 from optimization.callbacks.SurrogateCheckpoint import SurrogateCheckpoint
 from optimization.callbacks.MultiSurrogateCheckpoint import MultiSurrogateCheckpoint
+from optimization.callbacks.MultiSurrogateSpecificCheckpoint import MultiSurrogateSpecificCheckpoint
 
 from sklearn.ensemble import RandomForestClassifier
 
@@ -234,8 +235,9 @@ def main():
 
     backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
     ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
-    surrogate_backup_file_path = os.path.join(cfg.output_surrogates_data_folder, p_output + '_train.csv')
-    surrogate_k_indices_backup_file_path = os.path.join(cfg.output_surrogates_data_folder, p_output + '_k_indices.csv')
+    surrogate_backup_file_path = os.path.join(backup_model_folder, p_output + '_train.csv')
+    surrogate_k_indices_backup_file_path = os.path.join(backup_model_folder, p_output + '_k_indices.csv')
+    surrogate_population_backup_file_path = os.path.join(backup_model_folder, p_output + '_population.csv')
 
     # prepare optimization algorithm (only use of mutation as only ILS are used here, and local search need only local permutation)
     operators = [SimpleBinaryMutation(), SimpleMutation()]
@@ -273,6 +275,7 @@ def main():
     #algo.addCallback(UCBCheckpoint(every=1, filepath=ucb_backup_file_path))
     algo.addCallback(SurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_backup_file_path)) # try every LS like this
     algo.addCallback(MultiSurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_k_indices_backup_file_path)) # try every LS like this
+    algo.addCallback(MultiSurrogateSpecificCheckpoint(every=p_ls_iteration, filepath=surrogate_population_backup_file_path)) # try every LS like this
 
     bestSol = algo.run(p_ils_iteration, p_ls_iteration)
 

+ 18 - 28
optimization/ILSMultiSpecificSurrogate.py

@@ -99,6 +99,7 @@ class ILSMultiSpecificSurrogate(Algorithm):
         self._k_random = k_random
         self._k_indices = None
         self._surrogates = None
+        self._population = None
 
         self._generate_only = generate_only
         self._solutions_folder = solutions_folder
@@ -193,10 +194,6 @@ class ILSMultiSpecificSurrogate(Algorithm):
         current_learn = df.sample(training_samples)
         current_test = df.drop(current_learn.index)
 
-        # TODO : (check) not necessary now to select specific features indices into set
-        # current_learn = learn.copy()
-        # current_test = test.copy()
-
         problem = ND3DProblem(size=len(indices)) # problem size based on best solution size (need to improve...)
         model = Lasso(alpha=1e-5)
         surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
@@ -290,17 +287,10 @@ class ILSMultiSpecificSurrogate(Algorithm):
         """
 
         # for each indices set, get r^2 surrogate model and made prediction score
-
         num_cores = multiprocessing.cpu_count()
 
         r_squared_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.coefficient_of_determination)(s_model.surrogate) for s_model in self._surrogates)
 
-        # for i, _ in enumerate(self._k_indices):
-        #     r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
-        #     r_squared_scores.append(r_squared)
-
-        #print(r_squared_scores)
-
         return r_squared_scores
 
     def surrogates_mae(self):
@@ -310,17 +300,11 @@ class ILSMultiSpecificSurrogate(Algorithm):
             mae_scores: [{float}] -- mae scores from model
         """
 
-        # for each indices set, get r^2 surrogate model and made prediction score
-
+        # for each set of indices, compute the MAE score of its surrogate model
         num_cores = multiprocessing.cpu_count()
 
         mae_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.mae)(s_model.surrogate) for s_model in self._surrogates)
 
-        # for i, _ in enumerate(self._k_indices):
-        #     r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
-        #     r_squared_scores.append(r_squared)
-
-        #print(mae_scores)
 
         return mae_scores
 
@@ -361,11 +345,7 @@ class ILSMultiSpecificSurrogate(Algorithm):
         # initialize current solution
         self.initRun()
 
-        # enable resuming for ILS
-        self.resume()
-
-        if self._k_indices is None:
-            self.init_k_split_indices()
+        self.init_k_split_indices()
 
         # add norm to identify sub problem data
         self.init_solutions_files()
@@ -374,6 +354,9 @@ class ILSMultiSpecificSurrogate(Algorithm):
         self.define_sub_evaluators()
         self.init_population()
 
+        # enable resuming for ILS
+        self.resume()
+
         # count number of surrogate obtained and restart using real evaluations done for each surrogate (sub-model)
         if (self._start_train_surrogates * self._k_division) > self.getGlobalEvaluation():
 
@@ -463,20 +446,27 @@ class ILSMultiSpecificSurrogate(Algorithm):
 
                     sub_problem_solution._score = fitness_score
 
-                    # if solution is really better after real evaluation, then we replace
-                    if self.isBetter(self._population[i]):
-                        self._population[i] = sub_problem_solution
+                    # if the solution is really better after real evaluation, then we replace it (depending on the nature of the problem: minimizing / maximizing)
+                    if self._maximise:
+                        if sub_problem_solution.fitness() > self._population[i].fitness():
+                            self._population[i] = sub_problem_solution
+                    else:
+                        if sub_problem_solution.fitness() < self._population[i].fitness():
+                            self._population[i] = sub_problem_solution
 
                     self.add_to_surrogate(sub_problem_solution, i)
 
+            print(f'State of current population for surrogates ({len(self._population)} members)')
+            for i, s in enumerate(self._population):
+                print(f'Population[{i}]: best solution fitness is {s.fitness()}')
             
             # main best solution update
             if self._start_train_surrogates <= self.getGlobalEvaluation():
 
                 # need to create virtual solution from current population
-                obtained_solution_data = np.array([ s._data for s in self._population ]).flatten().tolist()
+                obtained_solution_data = np.array([ s._data for s in self._population ], dtype='object').flatten().tolist()
 
-                if obtained_solution_data == self._bestSolution.data:
+                if list(obtained_solution_data) == list(self._bestSolution._data):
                     print(f'-- No updates found from sub-model surrogates LS (best solution score: {self._bestSolution._score}')
                 else:
                     print(f'-- Updates found from sub-model surrogates LS')

+ 92 - 0
optimization/callbacks/MultiSurrogateSpecificCheckpoint.py

@@ -0,0 +1,92 @@
+"""Multi surrogate specific checkpoint class implementation (backup and restore of the sub-problems population)
+"""
+
+# main imports
+import os
+import logging
+import numpy as np
+
+# module imports
+from macop.callbacks.Callback import Callback
+from macop.utils.color import macop_text, macop_line
+
+
+class MultiSurrogateSpecificCheckpoint(Callback):
+    """
+    MultiSurrogateSpecificCheckpoint is used to keep track of the algorithm population
+    (one solution per sub-surrogate problem) and to restore it when resuming a run
+
+    Each checkpoint is one line of the backup file with the following layout:
+    `<solution 0 data>;<solution 1 data>;...;<fitness 0> <fitness 1> ...`
+    where the values of a solution (and the fitness scores) are separated by spaces
+
+    Attributes:
+        algo: {Algorithm} -- main algorithm instance reference
+        every: {int} -- checkpoint frequency used (based on number of evaluations)
+        filepath: {str} -- file path where checkpoints will be saved
+    """
+    def run(self):
+        """
+        Check if necessary to do backup based on `every` variable and, if so,
+        append the current population state as a new line of the backup file
+        """
+        # get current population of the algorithm
+        population = self._algo._population
+
+        # Do nothing if the population does not exist yet
+        if population is None:
+            return
+
+        currentEvaluation = self._algo.getGlobalEvaluation()
+
+        # backup if necessary
+        if currentEvaluation % self._every == 0:
+
+            logging.info(f"Multi surrogate specific analysis checkpoint is done into {self._filepath}")
+
+            # one field per solution, then a last field with all the fitness scores
+            fields = [ ' '.join(map(str, s._data)) for s in population ]
+            fields.append(' '.join(str(s.fitness()) for s in population))
+
+            line = ';'.join(fields) + '\n'
+
+            # append mode creates the file when it does not exist yet
+            with open(self._filepath, 'a') as f:
+                f.write(line)
+
+    def load(self):
+        """
+        Load previous population from the last non-empty line of the backup file
+
+        A missing file, or a file without any checkpoint line, is handled as
+        "no backup found": the population of the algorithm is left untouched
+        """
+        lastline = None
+
+        if os.path.exists(self._filepath):
+
+            with open(self._filepath) as f:
+
+                # keep only the last non-empty line (blank lines would not be parsable)
+                for raw_line in f:
+                    if raw_line.strip():
+                        lastline = raw_line.rstrip('\n')
+
+        if lastline is not None:
+
+            logging.info('Load population solutions from last checkpoint')
+
+            # last field stores fitness scores, the other ones the solutions data
+            data = lastline.split(';')
+
+            fitness_scores = list(map(float, data[-1].split(' ')))
+
+            for i, solution_data in enumerate(data[:-1]):
+                self._algo._population[i]._data = list(map(int, solution_data.split(' ')))
+                self._algo._population[i]._score = fitness_scores[i]
+
+            print(macop_line())
+            print(macop_text(f' MultiSurrogateSpecificCheckpoint found from `{self._filepath}` file. Start running using previous `population` values'))
+
+            for i, s in enumerate(self._algo._population):
+                print(f'Population[{i}]: best solution fitness is {s.fitness()}')
+
+        else:
+            print(macop_text('No backup found... Start running using new `population` values'))
+            logging.info("Can't load MultiSurrogateSpecific backup... Backup filepath not valid in MultiSurrogateSpecificCheckpoint")
+
+        print(macop_line())
+