il y a 4 ans · 2cb283287a
--- a/find_best_attributes_surrogate_openML_multi.py
+++ b/find_best_attributes_surrogate_openML_multi.py
@@ -117,10 +117,12 @@ def main():
 
																     parser.add_argument('--data', type=str, help='open ml dataset filename prefix', required=True)
															
 
																     parser.add_argument('--every_ls', type=int, help='train every ls surrogate model', default=50) # default value
															
 
																-    parser.add_argument('--k_division', type=int, help='number of expected sub surrogate model', required=True)
															
 
																+    parser.add_argument('--k_division', type=int, help='number of expected sub surrogate model', default=20)
															
 
																     parser.add_argument('--k_dynamic', type=int, help='specify if indices for each sub surrogate model are changed or not for each training', default=0, choices=[0, 1])
															
 
																+    parser.add_argument('--k_random', type=int, help='specify if split is random or not', default=1, choices=[0, 1])
															
 
																     parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
															
 
																     parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
															
 
																+    parser.add_argument('--generate_only', type=int, help='number of iteration for Local Search algorithm', default=0, choices=[0, 1])
															
 
																     parser.add_argument('--output', type=str, help='output surrogate model name')
															
 
																     args = parser.parse_args()
															
@@ -129,8 +131,10 @@ def main():
 
																     p_every_ls   = args.every_ls
															
 
																     p_k_division = args.k_division
															
 
																     p_k_dynamic = bool(args.k_dynamic)
															
 
																+    p_k_random = bool(args.k_random)
															
 
																     p_ils_iteration = args.ils
															
 
																     p_ls_iteration  = args.ls
															
 
																+    p_generate_only = bool(args.generate_only)
															
 
																     p_output = args.output
															
 
																     # load data from file and get problem size
															
@@ -229,12 +233,15 @@ def main():
 
																                         operators=operators, 
															
 
																                         policy=policy, 
															
 
																                         validator=validator,
															
 
																+                        output_log_surrogates=os.path.join(cfg.output_surrogates_data_folder, 'logs', p_output),
															
 
																                         surrogates_file_path=surrogate_output_model,
															
 
																                         start_train_surrogates=p_start, # start learning and using surrogate after 1000 real evaluation
															
 
																                         solutions_file=surrogate_output_data,
															
 
																                         ls_train_surrogates=p_every_ls, # retrain surrogate every `x` iteration
															
 
																                         k_division=p_k_division,
															
 
																                         k_dynamic=p_k_dynamic,
															
 
																+                        k_random=p_k_random,
															
 
																+                        generate_only=p_generate_only,
															
 
																                         maximise=True)
															
 
																     algo.addCallback(BasicCheckpoint(every=1, filepath=backup_file_path))
															
@@ -253,7 +260,7 @@ def main():
 
																     filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)
															
 
																-    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(bestSol.fitness())
															
 
																+    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol._data) + ';' + str(list(bestSol._data).count(1)) + ';' + str(bestSol.fitness())
															
 
																     with open(filename_path, 'a') as f:
															
 
																         f.write(line_info + '\n')
															
--- a/optimization/ILSMultiSurrogate.py
+++ b/optimization/ILSMultiSurrogate.py
@@ -11,6 +11,10 @@ import numpy as np
 
																 import pandas as pd
															
 
																 import random
															
 
																+# parallel imports
															
 
																+from joblib import Parallel, delayed
															
 
																+import multiprocessing
															
 
																+
															
 
																 # module imports
															
 
																 from macop.algorithms.Algorithm import Algorithm
															
 
																 from .LSSurrogate import LocalSearchSurrogate
															
@@ -38,13 +42,15 @@ class ILSMultiSurrogate(Algorithm):
 
																         currentSolution: {Solution} -- current solution managed for current evaluation
															
 
																         bestSolution: {Solution} -- best solution found so far during running algorithm
															
 
																         ls_iteration: {int} -- number of evaluation for each local search algorithm
															
 
																-        surrogates_file: {str} -- Surrogates model folder to load (models trained using https://gitlab.com/florianlprt/wsao)
															
 
																+        surrogates_file_path: {str} -- Surrogates model folder to load (models trained using https://gitlab.com/florianlprt/wsao)
															
 
																+        output_log_surrogates: {str} -- Log folder for surrogates training model
															
 
																         start_train_surrogates: {int} -- number of evaluation expected before start training and use surrogate
															
 
																         surrogates: [{Surrogate}] -- Surrogates model instance loaded
															
 
																         ls_train_surrogates: {int} -- Specify if we need to retrain our surrogate model (every Local Search)
															
 
																         k_division: {int} -- number of expected division for current features problem
															
 
																         k_dynamic: {bool} -- specify if indices are changed for each time we train a new surrogate model
															
 
																         k_random: {bool} -- random initialization of k_indices for each surrogate features model data
															
 
																+        generate_only: {bool} -- generate only a specific number of expected real solutions evaluated
															
 
																         solutions_file: {str} -- Path where real evaluated solutions are saved in order to train surrogate again
															
 
																         callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
															
 
																     """
															
@@ -55,12 +61,14 @@ class ILSMultiSurrogate(Algorithm):
 
																                  policy,
															
 
																                  validator,
															
 
																                  surrogates_file_path,
															
 
																+                 output_log_surrogates,
															
 
																                  start_train_surrogates,
															
 
																                  ls_train_surrogates,
															
 
																                  k_division,
															
 
																                  solutions_file,
															
 
																                  k_random=True,
															
 
																                  k_dynamic=False,
															
 
																+                 generate_only=False,
															
 
																                  maximise=True,
															
 
																                  parent=None):
															
@@ -69,10 +77,12 @@ class ILSMultiSurrogate(Algorithm):
 
																                 validator, maximise, parent)
															
 
																         self._n_local_search = 0
															
 
																+        self._total_n_local_search = 0
															
 
																         self._main_evaluator = evaluator
															
 
																         self._surrogates_file_path = surrogates_file_path
															
 
																         self._start_train_surrogates = start_train_surrogates
															
 
																+        self._output_log_surrogates = output_log_surrogates
															
 
																         self._surrogate_evaluator = None
															
 
																         self._surrogate_analyser = None
															
@@ -86,6 +96,8 @@ class ILSMultiSurrogate(Algorithm):
 
																         self._k_indices = None
															
 
																         self._surrogates = None
															
 
																+        self._generate_only = generate_only
															
 
																+
															
 
																     def init_k_split_indices(self):
															
 
																         """Initialize k_indices for the new training of surrogate
															
@@ -101,6 +113,36 @@ class ILSMultiSurrogate(Algorithm):
 
																         splitted_indices = [a[x:x+n_elements] for x in range(0, len(a), n_elements)]
															
 
																         return splitted_indices
															
 
																+
															
 
																+
															
 
																+    def train_surrogate(self, index, learn, test, indices):
															
 
																+
															
 
																+        current_learn = learn.copy()
															
 
																+        current_learn.x = current_learn.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
															
 
																+
															
 
																+        current_test = test.copy()
															
 
																+        current_test.x = current_test.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
															
 
																+
															
 
																+        problem = ND3DProblem(size=len(indices)) # problem size based on best solution size (need to improve...)
															
 
																+        model = Lasso(alpha=1e-5)
															
 
																+        surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
															
 
																+        analysis = FitterAnalysis(logfile=os.path.join(self._output_log_surrogates, f"train_surrogate_{index}.log"), problem=problem)
															
 
																+        algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
															
 
																+
															
 
																+        print(f"Start fitting again the surrogate model n°{index}")
															
 
																+        for r in range(10):
															
 
																+            print(f"Iteration n°{r}: for fitting surrogate n°{index}")
															
 
																+            algo.run_samples(learn=current_learn, test=current_test, step=10)
															
 
																+
															
 
																+        # keep well ordered surrogate into file manager
															
 
																+        str_index = str(index)
															
 
																+
															
 
																+        while len(str_index) < 6:
															
 
																+            str_index = "0" + str_index
															
 
																+
															
 
																+        joblib.dump(algo, os.path.join(self._surrogates_file_path, f'surrogate_{str_index}'))
															
 
																+
															
 
																+        return str_index
															
 
																     def train_surrogates(self):
															
@@ -115,9 +157,6 @@ class ILSMultiSurrogate(Algorithm):
 
																         #        sample=1000,step=10 \
															
 
																         #        analysis=fitter,logfile=out_fit.csv
															
 
																-        # TODO : pass run samples directly using train and test
															
 
																-        # TODO : use of multiprocessing commands for each surrogate
															
 
																-        # TODO : save each surrogate model into specific folder
															
 
																         # 1. Data sets preparation (train and test)
															
@@ -139,32 +178,12 @@ class ILSMultiSurrogate(Algorithm):
 
																         if not os.path.exists(self._surrogates_file_path):
															
 
																             os.makedirs(self._surrogates_file_path)
															
 
																-        for i, indices in enumerate(self._k_indices):
															
 
																-
															
 
																-            current_learn = learn.copy()
															
 
																-            current_learn.x = current_learn.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
															
 
																-
															
 
																-            current_test = test.copy()
															
 
																-            current_test.x = current_test.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
															
 
																-
															
 
																-            problem = ND3DProblem(size=len(indices)) # problem size based on best solution size (need to improve...)
															
 
																-            model = Lasso(alpha=1e-5)
															
 
																-            surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
															
 
																-            analysis = FitterAnalysis(logfile=f"train_surrogate_{i}.log", problem=problem)
															
 
																-            algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
															
 
																-
															
 
																-            print(f"Start fitting again the surrogate model n°{i}")
															
 
																-            for r in range(10):
															
 
																-                print(f"Iteration n°{r}: for fitting surrogate n°{i}")
															
 
																-                algo.run_samples(learn=current_learn, test=current_test, step=10)
															
 
																+        num_cores = multiprocessing.cpu_count()
															
 
																-            # keep well ordered surrogate into file manager
															
 
																-            str_index = str(i)
															
 
																+        if not os.path.exists(self._output_log_surrogates):
															
 
																+            os.makedirs(self._output_log_surrogates)
															
 
																-            while len(str_index) < 6:
															
 
																-                str_index = "0" + str_index
															
 
																-
															
 
																-            joblib.dump(algo, os.path.join(self._surrogates_file_path, f'surrogate_{str_index}'))
															
 
																+        Parallel(n_jobs=num_cores)(delayed(self.train_surrogate)(index, learn, test, indices) for index, indices in enumerate(self._k_indices))
															
 
																     def load_surrogates(self):
															
@@ -213,18 +232,19 @@ class ILSMultiSurrogate(Algorithm):
 
																             r_squared: {float} -- mean score of r_squred obtained from surrogate models
															
 
																         """
															
 
																-        r_squared_scores = []
															
 
																-
															
 
																         # for each indices set, get r^2 surrogate model and made prediction score
															
 
																-        for i, _ in enumerate(self._k_indices):
															
 
																-            r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
															
 
																-            r_squared_scores.append(r_squared)
															
 
																+        num_cores = multiprocessing.cpu_count()
															
 
																-        print(r_squared_scores)
															
 
																+        r_squared_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.coefficient_of_determination)(s_model.surrogate) for s_model in self._surrogates)
															
 
																-        return sum(r_squared_scores) / len(r_squared_scores)
															
 
																+        # for i, _ in enumerate(self._k_indices):
															
 
																+        #     r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
															
 
																+        #     r_squared_scores.append(r_squared)
															
 
																+        print(r_squared_scores)
															
 
																+
															
 
																+        return r_squared_scores
															
 
																     def add_to_surrogate(self, solution):
															
@@ -285,6 +305,8 @@ class ILSMultiSurrogate(Algorithm):
 
																             # get `self.start_train_surrogate` number of real evaluations and save it into surrogate dataset file
															
 
																             # using randomly generated solutions (in order to cover search space)
															
 
																             while self._start_train_surrogates > self.getGlobalEvaluation():
															
 
																+
															
 
																+                print(f'Real solutions extraction {self.getGlobalEvaluation()} of {self._start_train_surrogates}')
															
 
																                 newSolution = self._initializer()
															
@@ -296,6 +318,10 @@ class ILSMultiSurrogate(Algorithm):
 
																                 self.increaseEvaluation()
															
 
																+        # stop this process after generating solution
															
 
																+        if self._generate_only:
															
 
																+            return self._bestSolution
															
 
																+
															
 
																         # train surrogate on real evaluated solutions file
															
 
																         self.train_surrogates()
															
 
																         self.load_surrogates()
															
@@ -344,9 +370,11 @@ class ILSMultiSurrogate(Algorithm):
 
																                 self.progress()
															
 
																             # check using specific dynamic criteria based on r^2
															
 
																-            r_squared = self.surrogates_coefficient_of_determination()
															
 
																+            r_squared_scores = self.surrogates_coefficient_of_determination()
															
 
																+            r_squared = sum(r_squared_scores) / len(r_squared_scores)
															
 
																+
															
 
																             training_surrogate_every = int(r_squared * self._ls_train_surrogates)
															
 
																-            print(f"=> R^2 of surrogate is of {r_squared}. Retraining model every {training_surrogate_every} LS")
															
 
																+            print(f"=> R² of surrogate is of {r_squared} -- [Retraining model after {self._n_local_search % training_surrogate_every} of {training_surrogate_every} LS]")
															
 
																             # avoid issue when launching at each local search
															
 
																             if training_surrogate_every <= 0:
															
@@ -365,13 +393,17 @@ class ILSMultiSurrogate(Algorithm):
 
																                 self.train_surrogates()
															
 
																                 training_time = time.time() - start_training
															
 
																-                self._surrogate_analyser = SurrogateAnalysis(training_time, training_surrogate_every, r_squared, self.getGlobalMaxEvaluation(), self._n_local_search)
															
 
																+                self._surrogate_analyser = SurrogateAnalysis(training_time, training_surrogate_every, r_squared_scores, r_squared, self.getGlobalMaxEvaluation(), self._total_n_local_search)
															
 
																                 # reload new surrogate function
															
 
																                 self.load_surrogates()
															
 
																+                # reinitialize number of local search
															
 
																+                self._n_local_search = 0
															
 
																+
															
 
																             # increase number of local search done
															
 
																             self._n_local_search += 1
															
 
																+            self._total_n_local_search += 1
															
 
																             self.information()
															
--- a/optimization/callbacks/SurrogateCheckpoint.py
+++ b/optimization/callbacks/SurrogateCheckpoint.py
@@ -48,7 +48,15 @@ class SurrogateCheckpoint(Callback):
 
																                 if index < solutionSize - 1:
															
 
																                     solutionData += ' '
															
 
																-            line = str(currentEvaluation) + ';' + str(surrogate_analyser._every_ls) + ';' + str(surrogate_analyser._time) + ';' + str(surrogate_analyser._r2) \
															
 
																+            r2_data = ""
															
 
																+            r2Size = len(surrogate_analyser._r2_scores)
															
 
																+            for index, val in enumerate(surrogate_analyser._r2_scores):
															
 
																+                r2_data += str(val)
															
 
																+
															
 
																+                if index < r2Size - 1:
															
 
																+                    r2_data += ' '
															
 
																+
															
 
																+            line = str(currentEvaluation) + ';' + str(surrogate_analyser._n_local_search) + ';' + str(surrogate_analyser._every_ls) + ';' + str(surrogate_analyser._time) + ';' + r2_data + ';' + str(surrogate_analyser._r2) \
															
 
																                 + ';' + solutionData + ';' + str(solution.fitness()) + ';\n'
															
 
																             # check if file exists
															
@@ -61,7 +69,28 @@ class SurrogateCheckpoint(Callback):
 
																     def load(self):
															
 
																         """
															
 
																-        Load nothing there, as we only log surrogate training information
															
 
																+        only load global n local search
															
 
																         """
															
 
																-        logging.info("No loading to do with surrogate checkpoint")
															
 
																+        if os.path.exists(self._filepath):
															
 
																+
															
 
																+            logging.info('Load n local search')
															
 
																+            with open(self._filepath) as f:
															
 
																+
															
 
																+                # get last line and read data
															
 
																+                lastline = f.readlines()[-1].replace(';\n', '')
															
 
																+                data = lastline.split(';')
															
 
																+
															
 
																+                n_local_search = int(data[1])
															
 
																+
															
 
																+                # set k_indices into main algorithm
															
 
																+                self._algo._total_n_local_search = n_local_search
															
 
																+
															
 
																+            print(macop_line())
															
 
																+            print(macop_text(f'SurrogateCheckpoint found from `{self._filepath}` file.'))
															
 
																+
															
 
																+        else:
															
 
																+            print(macop_text('No backup found...'))
															
 
																+            logging.info("Can't load Surrogate backup... Backup filepath not valid in SurrogateCheckpoint")
															
 
																+
															
 
																+        print(macop_line())
															
--- a/optimization/utils/SurrogateAnalysis.py
+++ b/optimization/utils/SurrogateAnalysis.py
@@ -1,9 +1,10 @@
 
																 # quick object for surrogate logging data
															
 
																 class SurrogateAnalysis():
															
 
																-    def __init__(self, time, every_ls, r2, evaluations, n_local_search):
															
 
																+    def __init__(self, time, every_ls, r2_scores, r2, evaluations, n_local_search):
															
 
																         self._time = time
															
 
																         self._every_ls = every_ls
															
 
																+        self._r2_scores = r2_scores
															
 
																         self._r2 = r2
															
 
																         self._evaluations = evaluations
															
 
																         self._n_local_search = n_local_search
															
--- a/run_openML_surrogate_multi.py
+++ b/run_openML_surrogate_multi.py
@@ -0,0 +1,102 @@
 
																+import os, argparse
															
 
																+import shutil
															
 
																+
															
 
# folder listed by `main`: each entry is handled as one OpenML problem file
open_ml_problems_folder = 'OpenML_datasets'
# folder where real evaluated solutions files are read from and copied to
surrogate_data_path = 'data/surrogate/data/'

# values tried for the `--k_division` option
k_params = [40, 30, 20]
# values tried for the `--k_random` option
k_random = [0, 1]
# values tried for the `--k_dynamic` option
k_reinit = [0, 1]
# value passed to the `--every_ls` option
every_ls = 50

# number of repeated runs when k_random is 1 and k_reinit is 0
n_times = 5
															
 
																+
															
 
																+def main():
															
 
																+
															
 
																+    parser = argparse.ArgumentParser(description="Find best features for each OpenML problems")
															
 
																+
															
 
																+    parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
															
 
																+    parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
															
 
																+
															
 
																+    args = parser.parse_args()
															
 
																+
															
 
																+    p_ils = args.ils
															
 
																+    p_ls  = args.ls
															
 
																+
															
 
																+    open_ml_problems = sorted(os.listdir(open_ml_problems_folder))
															
 
																+
															
 
																+    for ml_problem in open_ml_problems:
															
 
																+
															
 
																+        # for each problem prepare specific pre-computed real solution file
															
 
																+        ml_problem_name = ml_problem.replace('.csv', '')
															
 
																+        ml_problem_path = os.path.join(open_ml_problems_folder, ml_problem)
															
 
																+
															
 
																+        ml_surrogate_command = f"python find_best_attributes_surrogate_openML_multi.py " \
															
 
																+                               f"--data {ml_problem_path} " \
															
 
																+                               f"--ils {p_ils} " \
															
 
																+                               f"--ls {p_ls} " \
															
 
																+                               f"--output {ml_problem_name} " \
															
 
																+                               f"--generate_only 1"
															
 
																+        print(f'Running extraction real evaluations data for {ml_problem_name}')
															
 
																+        os.system(ml_surrogate_command)
															
 
																+
															
 
																+        real_evaluation_data_file_path = os.path.join(surrogate_data_path, ml_problem_name)
															
 
																+
															
 
																+        # for each multi param:
															
 
																+        # - copy precomputed real_evaluation_data_file
															
 
																+        # - run new instance using specific data
															
 
																+        for k in k_params:
															
 
																+            for k_r in k_random:
															
 
																+                for k_init in k_reinit:
															
 
																+
															
 
																+                    # if not use of k_reinit and use of random, then run multiple times this instance to do mean later
															
 
																+                    if k_init == 0 and k_r == 1:
															
 
																+
															
 
																+                        for i in range(n_times):
															
 
																+
															
 
																+                            str_index = str(i)
															
 
																+
															
 
																+                            while len(str_index) < 3:
															
 
																+                                str_index = "0" + str_index
															
 
																+
															
 
																+                            output_problem_name = f'{ml_problem_name}_everyLS_{every_ls}_k{k}_random{k_r}_reinit{k_init}_{str_index}'
															
 
																+
															
 
																+                            # copy pre-computed real evaluation data for this instance
															
 
																+                            current_output_real_eval_path = os.path.join(surrogate_data_path, output_problem_name)
															
 
																+                            shutil.copy2(real_evaluation_data_file_path, current_output_real_eval_path)
															
 
																+
															
 
																+                            ml_surrogate_multi_command = f"python find_best_attributes_surrogate_openML_multi.py " \
															
 
																+                                            f"--data {ml_problem_path} " \
															
 
																+                                            f"--ils {p_ils} " \
															
 
																+                                            f"--ls {p_ls} " \
															
 
																+                                            f"--every_ls {every_ls} " \
															
 
																+                                            f"--k_division {k} " \
															
 
																+                                            f"--k_random {k_r} " \
															
 
																+                                            f"--k_dynamic {k_init} " \
															
 
																+                                            f"--output {output_problem_name}"
															
 
																+                            print(f'Running extraction data for {ml_problem_name} with [ils: {p_ils}, ls: {p_ls}, k: {k}, k_r: {k_r}, k_reinit: {k_init}, i: {i}]')
															
 
																+                            os.system(ml_surrogate_multi_command)
															
 
																+
															
 
																+                    else:
															
 
																+                        output_problem_name = f'{ml_problem_name}_everyLS_{every_ls}_k{k}_random{k_r}_reinit{k_init}'
															
 
																+
															
 
																+                        # copy pre-computed real evaluation data for this instance
															
 
																+                        current_output_real_eval_path = os.path.join(surrogate_data_path, output_problem_name)
															
 
																+                        shutil.copy2(real_evaluation_data_file_path, current_output_real_eval_path)
															
 
																+
															
 
																+                        ml_surrogate_multi_command = f"python find_best_attributes_surrogate_openML_multi.py " \
															
 
																+                                        f"--data {ml_problem_path} " \
															
 
																+                                        f"--ils {p_ils} " \
															
 
																+                                        f"--ls {p_ls} " \
															
 
																+                                        f"--every_ls {every_ls} " \
															
 
																+                                        f"--k_division {k} " \
															
 
																+                                        f"--k_random {k_r} " \
															
 
																+                                        f"--k_dynamic {k_init} " \
															
 
																+                                        f"--output {output_problem_name}"
															
 
																+                        print(f'Running extraction data for {ml_problem_name} with [ils: {p_ils}, ls: {p_ls}, k: {k}, k_r: {k_r}, k_reinit: {k_init}]')
															
 
																+                        os.system(ml_surrogate_multi_command)
															
 
																+
															
 
																+
															
 
																+
															
 
# only run the experiments when this file is executed as a script
if __name__ == "__main__":
    main()