Parcourir la source

Add multi-run script using OpenML datasets

Jérôme BUISINE il y a 3 ans
Parent
commit
2cb283287a

+ 9 - 2
find_best_attributes_surrogate_openML_multi.py

@@ -117,10 +117,12 @@ def main():
 
     parser.add_argument('--data', type=str, help='open ml dataset filename prefix', required=True)
     parser.add_argument('--every_ls', type=int, help='train every ls surrogate model', default=50) # default value
-    parser.add_argument('--k_division', type=int, help='number of expected sub surrogate model', required=True)
+    parser.add_argument('--k_division', type=int, help='number of expected sub surrogate model', default=20)
     parser.add_argument('--k_dynamic', type=int, help='specify if indices for each sub surrogate model are changed or not for each training', default=0, choices=[0, 1])
+    parser.add_argument('--k_random', type=int, help='specify if split is random or not', default=1, choices=[0, 1])
     parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
     parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
+    parser.add_argument('--generate_only', type=int, help='number of iteration for Local Search algorithm', default=0, choices=[0, 1])
     parser.add_argument('--output', type=str, help='output surrogate model name')
 
     args = parser.parse_args()
@@ -129,8 +131,10 @@ def main():
     p_every_ls   = args.every_ls
     p_k_division = args.k_division
     p_k_dynamic = bool(args.k_dynamic)
+    p_k_random = bool(args.k_random)
     p_ils_iteration = args.ils
     p_ls_iteration  = args.ls
+    p_generate_only = bool(args.generate_only)
     p_output = args.output
 
     # load data from file and get problem size
@@ -229,12 +233,15 @@ def main():
                         operators=operators, 
                         policy=policy, 
                         validator=validator,
+                        output_log_surrogates=os.path.join(cfg.output_surrogates_data_folder, 'logs', p_output),
                         surrogates_file_path=surrogate_output_model,
                         start_train_surrogates=p_start, # start learning and using surrogate after 1000 real evaluation
                         solutions_file=surrogate_output_data,
                         ls_train_surrogates=p_every_ls, # retrain surrogate every `x` iteration
                         k_division=p_k_division,
                         k_dynamic=p_k_dynamic,
+                        k_random=p_k_random,
+                        generate_only=p_generate_only,
                         maximise=True)
     
     algo.addCallback(BasicCheckpoint(every=1, filepath=backup_file_path))
@@ -253,7 +260,7 @@ def main():
 
     filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)
 
-    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(bestSol.fitness())
+    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol._data) + ';' + str(list(bestSol._data).count(1)) + ';' + str(bestSol.fitness())
     with open(filename_path, 'a') as f:
         f.write(line_info + '\n')
     

+ 70 - 38
optimization/ILSMultiSurrogate.py

@@ -11,6 +11,10 @@ import numpy as np
 import pandas as pd
 import random
 
+# parallel imports
+from joblib import Parallel, delayed
+import multiprocessing
+
 # module imports
 from macop.algorithms.Algorithm import Algorithm
 from .LSSurrogate import LocalSearchSurrogate
@@ -38,13 +42,15 @@ class ILSMultiSurrogate(Algorithm):
         currentSolution: {Solution} -- current solution managed for current evaluation
         bestSolution: {Solution} -- best solution found so far during running algorithm
         ls_iteration: {int} -- number of evaluation for each local search algorithm
-        surrogates_file: {str} -- Surrogates model folder to load (models trained using https://gitlab.com/florianlprt/wsao)
+        surrogates_file_path: {str} -- Surrogates model folder to load (models trained using https://gitlab.com/florianlprt/wsao)
+        output_log_surrogates: {str} -- Log folder for surrogates training model
         start_train_surrogates: {int} -- number of evaluation expected before start training and use surrogate
         surrogates: [{Surrogate}] -- Surrogates model instance loaded
         ls_train_surrogates: {int} -- Specify if we need to retrain our surrogate model (every Local Search)
         k_division: {int} -- number of expected division for current features problem
         k_dynamic: {bool} -- specify if indices are changed for each time we train a new surrogate model
         k_random: {bool} -- random initialization of k_indices for each surrogate features model data
+        generate_only: {bool} -- generate only a specific number of expected real solutions evaluated
         solutions_file: {str} -- Path where real evaluated solutions are saved in order to train surrogate again
         callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
     """
@@ -55,12 +61,14 @@ class ILSMultiSurrogate(Algorithm):
                  policy,
                  validator,
                  surrogates_file_path,
+                 output_log_surrogates,
                  start_train_surrogates,
                  ls_train_surrogates,
                  k_division,
                  solutions_file,
                  k_random=True,
                  k_dynamic=False,
+                 generate_only=False,
                  maximise=True,
                  parent=None):
 
@@ -69,10 +77,12 @@ class ILSMultiSurrogate(Algorithm):
                 validator, maximise, parent)
 
         self._n_local_search = 0
+        self._total_n_local_search = 0
         self._main_evaluator = evaluator
 
         self._surrogates_file_path = surrogates_file_path
         self._start_train_surrogates = start_train_surrogates
+        self._output_log_surrogates = output_log_surrogates
 
         self._surrogate_evaluator = None
         self._surrogate_analyser = None
@@ -86,6 +96,8 @@ class ILSMultiSurrogate(Algorithm):
         self._k_indices = None
         self._surrogates = None
 
+        self._generate_only = generate_only
+
     def init_k_split_indices(self):
         """Initialize k_indices for the new training of surrogate
 
@@ -101,6 +113,36 @@ class ILSMultiSurrogate(Algorithm):
         splitted_indices = [a[x:x+n_elements] for x in range(0, len(a), n_elements)]
 
         return splitted_indices
+
+
+    def train_surrogate(self, index, learn, test, indices):
+        """Fit one sub surrogate model on a subset of features and dump it to disk
+
+        Args:
+            index: {int} -- position of the sub surrogate, used in its log file and model file names
+            learn: {DataFrame} -- learning samples (presumably a pandas DataFrame); each `x` value is a comma-separated string of integers
+            test: {DataFrame} -- test samples, same layout as `learn`
+            indices: {[int]} -- positions of the features kept for this sub surrogate
+
+        Returns:
+            {str} -- `index` left-padded with zeros up to 6 characters, as used in the dumped file name
+        """
+
+        def keep_selected(encoded):
+            # decode the comma-separated solution, keep only `indices`, encode it back
+            selected = np.fromstring(encoded, dtype=int, sep=',')[indices]
+            return ','.join(str(value) for value in selected)
+
+        # work on copies, the given `learn` and `test` objects are left untouched
+        sub_learn = learn.copy()
+        sub_learn.x = sub_learn.x.apply(keep_selected)
+
+        sub_test = test.copy()
+        sub_test.x = sub_test.x.apply(keep_selected)
+
+        # problem size is the number of features handled by this sub surrogate
+        sub_problem = ND3DProblem(size=len(indices))
+        lasso_model = Lasso(alpha=1e-5)
+        walsh_surrogate = WalshSurrogate(order=2, size=sub_problem.size, model=lasso_model)
+        log_path = os.path.join(self._output_log_surrogates, f"train_surrogate_{index}.log")
+        fitter_analysis = FitterAnalysis(logfile=log_path, problem=sub_problem)
+        fitter = FitterAlgo(problem=sub_problem, surrogate=walsh_surrogate, analysis=fitter_analysis, seed=sub_problem.seed)
+
+        print(f"Start fitting again the surrogate model n°{index}")
+        for fit_round in range(10):
+            print(f"Iteration n°{fit_round}: for fitting surrogate n°{index}")
+            fitter.run_samples(learn=sub_learn, test=sub_test, step=10)
+
+        # zero-padded index keeps the dumped surrogates well ordered in the folder
+        padded_index = str(index).rjust(6, "0")
+
+        joblib.dump(fitter, os.path.join(self._surrogates_file_path, f'surrogate_{padded_index}'))
+
+        return padded_index
         
 
     def train_surrogates(self):
@@ -115,9 +157,6 @@ class ILSMultiSurrogate(Algorithm):
         #        sample=1000,step=10 \
         #        analysis=fitter,logfile=out_fit.csv
 
-        # TODO : pass run samples directly using train and test
-        # TODO : use of multiprocessing commands for each surrogate
-        # TODO : save each surrogate model into specific folder
 
         # 1. Data sets preparation (train and test)
         
@@ -139,32 +178,12 @@ class ILSMultiSurrogate(Algorithm):
         if not os.path.exists(self._surrogates_file_path):
             os.makedirs(self._surrogates_file_path)
 
-        for i, indices in enumerate(self._k_indices):
-
-            current_learn = learn.copy()
-            current_learn.x = current_learn.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
-
-            current_test = test.copy()
-            current_test.x = current_test.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
-
-            problem = ND3DProblem(size=len(indices)) # problem size based on best solution size (need to improve...)
-            model = Lasso(alpha=1e-5)
-            surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
-            analysis = FitterAnalysis(logfile=f"train_surrogate_{i}.log", problem=problem)
-            algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
-
-            print(f"Start fitting again the surrogate model n°{i}")
-            for r in range(10):
-                print(f"Iteration n°{r}: for fitting surrogate n°{i}")
-                algo.run_samples(learn=current_learn, test=current_test, step=10)
+        num_cores = multiprocessing.cpu_count()
 
-            # keep well ordered surrogate into file manager
-            str_index = str(i)
+        if not os.path.exists(self._output_log_surrogates):
+            os.makedirs(self._output_log_surrogates)
 
-            while len(str_index) < 6:
-                str_index = "0" + str_index
-
-            joblib.dump(algo, os.path.join(self._surrogates_file_path, f'surrogate_{str_index}'))
+        Parallel(n_jobs=num_cores)(delayed(self.train_surrogate)(index, learn, test, indices) for index, indices in enumerate(self._k_indices))
 
 
     def load_surrogates(self):
@@ -213,18 +232,19 @@ class ILSMultiSurrogate(Algorithm):
             r_squared_scores: {[float]} -- r_squared score obtained from each surrogate model
         """
 
-        r_squared_scores = []
-
         # for each indices set, get r^2 surrogate model and made prediction score
-        for i, _ in enumerate(self._k_indices):
 
-            r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
-            r_squared_scores.append(r_squared)
+        num_cores = multiprocessing.cpu_count()
 
-        print(r_squared_scores)
+        r_squared_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.coefficient_of_determination)(s_model.surrogate) for s_model in self._surrogates)
 
-        return sum(r_squared_scores) / len(r_squared_scores)
+        # for i, _ in enumerate(self._k_indices):
+        #     r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
+        #     r_squared_scores.append(r_squared)
 
+        print(r_squared_scores)
+
+        return r_squared_scores
 
 
     def add_to_surrogate(self, solution):
@@ -285,6 +305,8 @@ class ILSMultiSurrogate(Algorithm):
             # get `self.start_train_surrogate` number of real evaluations and save it into surrogate dataset file
             # using randomly generated solutions (in order to cover search space)
             while self._start_train_surrogates > self.getGlobalEvaluation():
+
+                print(f'Real solutions extraction {self.getGlobalEvaluation()} of {self._start_train_surrogates}')
                 
                 newSolution = self._initializer()
 
@@ -296,6 +318,10 @@ class ILSMultiSurrogate(Algorithm):
 
                 self.increaseEvaluation()
 
+        # stop this process after generating solution
+        if self._generate_only:
+            return self._bestSolution
+
         # train surrogate on real evaluated solutions file
         self.train_surrogates()
         self.load_surrogates()
@@ -344,9 +370,11 @@ class ILSMultiSurrogate(Algorithm):
                 self.progress()
 
             # check using specific dynamic criteria based on r^2
-            r_squared = self.surrogates_coefficient_of_determination()
+            r_squared_scores = self.surrogates_coefficient_of_determination()
+            r_squared = sum(r_squared_scores) / len(r_squared_scores)
+
             training_surrogate_every = int(r_squared * self._ls_train_surrogates)
-            print(f"=> R^2 of surrogate is of {r_squared}. Retraining model every {training_surrogate_every} LS")
+            print(f"=> R² of surrogate is of {r_squared} -- [Retraining model after {self._n_local_search % training_surrogate_every} of {training_surrogate_every} LS]")
 
             # avoid issue when lauching every each local search
             if training_surrogate_every <= 0:
@@ -365,13 +393,17 @@ class ILSMultiSurrogate(Algorithm):
                 self.train_surrogates()
                 training_time = time.time() - start_training
 
-                self._surrogate_analyser = SurrogateAnalysis(training_time, training_surrogate_every, r_squared, self.getGlobalMaxEvaluation(), self._n_local_search)
+                self._surrogate_analyser = SurrogateAnalysis(training_time, training_surrogate_every, r_squared_scores, r_squared, self.getGlobalMaxEvaluation(), self._total_n_local_search)
 
                 # reload new surrogate function
                 self.load_surrogates()
 
+                # reinitialize number of local search
+                self._n_local_search = 0
+
             # increase number of local search done
             self._n_local_search += 1
+            self._total_n_local_search += 1
 
             self.information()
 

+ 32 - 3
optimization/callbacks/SurrogateCheckpoint.py

@@ -48,7 +48,15 @@ class SurrogateCheckpoint(Callback):
                 if index < solutionSize - 1:
                     solutionData += ' '
 
-            line = str(currentEvaluation) + ';' + str(surrogate_analyser._every_ls) + ';' + str(surrogate_analyser._time) + ';' + str(surrogate_analyser._r2) \
+            r2_data = ""
+            r2Size = len(surrogate_analyser._r2_scores)
+            for index, val in enumerate(surrogate_analyser._r2_scores):
+                r2_data += str(val)
+
+                if index < r2Size - 1:
+                    r2_data += ' '
+
+            line = str(currentEvaluation) + ';' + str(surrogate_analyser._n_local_search) + ';' + str(surrogate_analyser._every_ls) + ';' + str(surrogate_analyser._time) + ';' + r2_data + ';' + str(surrogate_analyser._r2) \
                 + ';' + solutionData + ';' + str(solution.fitness()) + ';\n'
 
             # check if file exists
@@ -61,7 +69,28 @@ class SurrogateCheckpoint(Callback):
 
     def load(self):
         """
-        Load nothing there, as we only log surrogate training information
+        Restore the total number of local searches from the last line of the checkpoint file
+
+        When `self._filepath` exists, the second `;`-separated field of its last line
+        is set as `_total_n_local_search` of the algorithm; otherwise nothing is restored
         """
 
-        logging.info("No loading to do with surrogate checkpoint")
+        if os.path.exists(self._filepath):
+
+            logging.info('Load n local search')
+            with open(self._filepath) as f:
+
+                # get last line and read data
+                # NOTE(review): `readlines()[-1]` raises IndexError when the file exists but is empty -- confirm this cannot happen
+                lastline = f.readlines()[-1].replace(';\n', '')
+                data = lastline.split(';')
+
+                # second field of a checkpoint line is the number of local searches
+                n_local_search = int(data[1])
+
+                # restore total number of local searches into main algorithm
+                self._algo._total_n_local_search = n_local_search
+
+            print(macop_line())
+            print(macop_text(f'SurrogateCheckpoint found from `{self._filepath}` file.'))
+
+        else:
+            # no checkpoint file: the counter of the algorithm is left as it is
+            print(macop_text('No backup found...'))
+            logging.info("Can't load Surrogate backup... Backup filepath not valid in SurrogateCheckpoint")
+
+        print(macop_line())

+ 2 - 1
optimization/utils/SurrogateAnalysis.py

@@ -1,9 +1,10 @@
 # quick object for surrogate logging data
 class SurrogateAnalysis():
 
+    # Plain container: every constructor argument is stored as-is in a private attribute.
+    # As built by ILSMultiSurrogate after a retraining:
+    # - time: duration of the surrogates training
+    # - every_ls: number of local searches between two trainings
+    # - r2_scores: r^2 score of each sub surrogate model
+    # - r2: mean of `r2_scores`
+    # - evaluations: global max evaluation of the algorithm
+    # - n_local_search: total number of local searches done so far
-    def __init__(self, time, every_ls, r2, evaluations, n_local_search):
+    def __init__(self, time, every_ls, r2_scores, r2, evaluations, n_local_search):
         self._time = time
         self._every_ls = every_ls
+        self._r2_scores = r2_scores
         self._r2 = r2
         self._evaluations = evaluations
         self._n_local_search = n_local_search

+ 102 - 0
run_openML_surrogate_multi.py

@@ -0,0 +1,102 @@
+import os, argparse
+import shutil
+
+# folder listed by `main` to find the OpenML problem files
+open_ml_problems_folder = 'OpenML_datasets'
+# folder where the real evaluations file of each problem is read and copied
+surrogate_data_path = 'data/surrogate/data/'
+
+# values passed as `--k_division`
+k_params = [40, 30, 20]
+# values passed as `--k_random`
+k_random = [0, 1]
+# values passed as `--k_dynamic`
+k_reinit = [0, 1]
+# value passed as `--every_ls`
+every_ls = 50
+
+# number of repeated runs for configurations with k_random = 1 and k_reinit = 0
+n_times = 5
+
+def main():
+    """Run the multi surrogate script over every OpenML problem
+
+    For each file found in `open_ml_problems_folder`:
+    - a first run with `--generate_only 1` is launched for the problem
+    - then one run is launched per (k, k_random, k_reinit) combination,
+      each one after copying the problem real evaluations file under its own output name
+
+    Combinations with k_random = 1 and k_reinit = 0 are run `n_times` (in order to do mean later)
+    """
+
+    parser = argparse.ArgumentParser(description="Find best features for each OpenML problems")
+
+    parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
+    parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
+
+    args = parser.parse_args()
+    p_ils, p_ls = args.ils, args.ls
+
+    script_call = "python find_best_attributes_surrogate_openML_multi.py "
+
+    for ml_problem in sorted(os.listdir(open_ml_problems_folder)):
+
+        ml_problem_name = ml_problem.replace('.csv', '')
+        ml_problem_path = os.path.join(open_ml_problems_folder, ml_problem)
+
+        # options shared by every run launched for this problem
+        common_options = f"--data {ml_problem_path} --ils {p_ils} --ls {p_ls} "
+
+        # first run: only generate the real evaluations data of the problem
+        print(f'Running extraction real evaluations data for {ml_problem_name}')
+        os.system(script_call + common_options + f"--output {ml_problem_name} --generate_only 1")
+
+        real_evaluation_data_file_path = os.path.join(surrogate_data_path, ml_problem_name)
+
+        configurations = ((k, k_r, k_init) for k in k_params for k_r in k_random for k_init in k_reinit)
+
+        for k, k_r, k_init in configurations:
+
+            # each run is described by (output name suffix, log message suffix)
+            if k_init == 0 and k_r == 1:
+                # no k_reinit but random split: several runs of this instance to do mean later
+                runs = [('_' + str(i).rjust(3, '0'), f', i: {i}') for i in range(n_times)]
+            else:
+                runs = [('', '')]
+
+            for name_suffix, log_suffix in runs:
+
+                output_problem_name = f'{ml_problem_name}_everyLS_{every_ls}_k{k}_random{k_r}_reinit{k_init}{name_suffix}'
+
+                # copy pre-computed real evaluation data for this instance
+                shutil.copy2(real_evaluation_data_file_path, os.path.join(surrogate_data_path, output_problem_name))
+
+                run_options = f"--every_ls {every_ls} --k_division {k} --k_random {k_r} --k_dynamic {k_init} --output {output_problem_name}"
+
+                print(f'Running extraction data for {ml_problem_name} with [ils: {p_ils}, ls: {p_ls}, k: {k}, k_r: {k_r}, k_reinit: {k_init}{log_suffix}]')
+                os.system(script_call + common_options + run_options)
+
+
+
+# launch the runs only when this file is executed as a script
+if __name__ == "__main__":
+    main()