Browse Source

Use surrogate from scratch as proposed framework

Jérôme BUISINE 1 year ago
parent
commit
e4f5839e36
4 changed files with 156 additions and 46 deletions
  1. 5 1
      custom_config.py
  2. 36 18
      find_best_attributes_surrogate.py
  3. 38 27
      optimization/ILSSurrogate.py
  4. 77 0
      optimization/LSSurrogate.py

+ 5 - 1
custom_config.py

@@ -6,7 +6,6 @@ import os
 context_vars = vars()
 
 # folders
-
 output_data_folder              = 'data'
 output_data_generated           = os.path.join(output_data_folder, 'generated')
 output_datasets                 = os.path.join(output_data_folder, 'datasets')
@@ -15,6 +14,11 @@ output_models                   = os.path.join(output_data_folder, 'saved_models
 output_results_folder           = os.path.join(output_data_folder, 'results')
 output_logs_folder              = os.path.join(output_data_folder, 'logs')
 output_backup_folder            = os.path.join(output_data_folder, 'backups')
+output_surrogates_folder        = os.path.join(output_data_folder, 'surrogate')
+
+
+output_surrogates_model_folder  = os.path.join(output_surrogates_folder, 'models')
+output_surrogates_data_folder  = os.path.join(output_surrogates_folder, 'data')
 
 results_information_folder      = os.path.join(output_data_folder, 'results')
 

+ 36 - 18
find_best_attributes_surrogate.py

@@ -39,6 +39,8 @@ from macop.operators.policies.UCBPolicy import UCBPolicy
 from macop.callbacks.BasicCheckpoint import BasicCheckpoint
 from macop.callbacks.UCBCheckpoint import UCBCheckpoint
 
+from sklearn.ensemble import RandomForestClassifier
+
 # variables and parameters
 models_list         = cfg.models_names_list
 
@@ -94,22 +96,22 @@ def main():
     parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
 
     parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
-    parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list, required=True)
+    parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list, default=models_list[0], required=False)
+    parser.add_argument('--start_surrogate', type=int, help='number of evalution before starting surrogare model', default=1000)
     parser.add_argument('--length', type=int, help='max data length (need to be specify for evaluator)', required=True)
-    parser.add_argument('--surrogate', type=str, help='surrogate .joblib model to approximate fitness', required=True)
-    parser.add_argument('--solutions', type=str, help='solutions files required to find surrogate model', required=True)
     parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
     parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
+    parser.add_argument('--output', type=str, help='output surrogate model name')
 
     args = parser.parse_args()
 
     p_data_file = args.data
     p_choice    = args.choice
     p_length    = args.length
-    p_surrogate = args.surrogate
-    p_solutions = args.solutions
+    p_start     = args.start_surrogate
     p_ils_iteration = args.ils
     p_ls_iteration  = args.ls
+    p_output = args.output
 
     print(p_data_file)
 
@@ -120,8 +122,7 @@ def main():
     if not os.path.exists(cfg.output_logs_folder):
         os.makedirs(cfg.output_logs_folder)
 
-    _, data_file_name = os.path.split(p_data_file)
-    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(data_file_name), level=logging.DEBUG)
+    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(p_output), level=logging.DEBUG)
 
     # init solution (`n` attributes)
     def init():
@@ -129,7 +130,7 @@ def main():
         ).random(validator)
 
     # define evaluate function here (need of data information)
-    def evaluate(solution, use_surrogate=True):
+    def evaluate(solution):
 
         start = datetime.datetime.now()
 
@@ -146,7 +147,10 @@ def main():
         x_test_filters = x_test.iloc[:, indices]
         
         # TODO : use of GPU implementation of SVM
-        model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
+        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
+
+        model = RandomForestClassifier(n_estimators=10)
+        model = model.fit(x_train_filters, y_train_filters)
         
         y_test_model = model.predict(x_test_filters)
         test_roc_auc = roc_auc_score(y_test, y_test_model)
@@ -160,28 +164,42 @@ def main():
         return test_roc_auc
 
 
-    backup_model_folder = os.path.join(cfg.output_backup_folder, data_file_name)
+    # build all output folder and files based on `output` name
+    backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
+    surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
+    surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)
 
     if not os.path.exists(backup_model_folder):
         os.makedirs(backup_model_folder)
 
-    backup_file_path = os.path.join(backup_model_folder, data_file_name + '.csv')
-    ucb_backup_file_path = os.path.join(backup_model_folder, data_file_name + '_ucbPolicy.csv')
+    if not os.path.exists(cfg.output_surrogates_model_folder):
+        os.makedirs(cfg.output_surrogates_model_folder)
+
+    if not os.path.exists(cfg.output_surrogates_data_folder):
+        os.makedirs(cfg.output_surrogates_data_folder)
 
-    # prepare optimization algorithm
-    operators = [SimpleBinaryMutation(), SimpleMutation(), SimpleCrossover(), RandomSplitCrossover()]
+    backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
+    ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
+
+    # prepare optimization algorithm (only use of mutation as only ILS are used here, and local search need only local permutation)
+    operators = [SimpleBinaryMutation(), SimpleMutation()]
     policy = UCBPolicy(operators)
 
+    # define first line if necessary
+    if not os.path.exists(surrogate_output_data):
+        with open(surrogate_output_data) as f:
+            f.write('x;y\n')
+
     # custom ILS for surrogate use
     algo = ILSSurrogate(_initalizer=init, 
-                        _evaluator=None, # by default no evaluator, as we will use the surrogate function
+                        _evaluator=evaluate, # same evaluator by defadefaultult, as we will use the surrogate function
                         _operators=operators, 
                         _policy=policy, 
                         _validator=validator,
-                        _surrogate_file_path=p_surrogate,
-                        _solutions_file=p_solutions,
+                        _surrogate_file_path=surrogate_output_model,
+                        _start_train_surrogate=p_start, # start learning and using surrogate after 1000 real evaluation
+                        _solutions_file=surrogate_output_data,
                         _ls_train_surrogate=1,
-                        _real_evaluator=evaluate,
                         _maximise=True)
     
     algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))

+ 38 - 27
optimization/ILSSurrogate.py

@@ -8,7 +8,7 @@ import joblib
 
 # module imports
 from macop.algorithms.Algorithm import Algorithm
-from macop.algorithms.mono.LocalSearch import LocalSearch
+from .LSSurrogate import LocalSearchSurrogate
 
 from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
                                     LassoCV, ElasticNet)
@@ -33,10 +33,10 @@ class ILSSurrogate(Algorithm):
         bestSolution: {Solution} -- best solution found so far during running algorithm
         ls_iteration: {int} -- number of evaluation for each local search algorithm
         surrogate_file: {str} -- Surrogate model file to load (model trained using https://gitlab.com/florianlprt/wsao)
+        start_train_surrogate: {int} -- number of evaluation expected before start training and use surrogate
         surrogate: {Surrogate} -- Surrogate model instance loaded
         ls_train_surrogate: {int} -- Specify if we need to retrain our surrogate model (every Local Search)
         solutions_file: {str} -- Path where real evaluated solutions are saved in order to train surrogate again
-        real_evaluator: {function} -- real expected evaluation to use
         callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
     """
     def __init__(self,
@@ -46,21 +46,22 @@ class ILSSurrogate(Algorithm):
                  _policy,
                  _validator,
                  _surrogate_file_path,
+                 _start_train_surrogate,
                  _ls_train_surrogate,
                  _solutions_file,
-                 _real_evaluator,
                  _maximise=True,
                  _parent=None):
 
+        # set real evaluator as default
         super().__init__(_initalizer, _evaluator, _operators, _policy,
                 _validator, _maximise, _parent)
 
         self.n_local_search = 0
 
         self.surrogate_file_path = _surrogate_file_path
-        self.load_surrogate()
+        self.start_train_surrogate = _start_train_surrogate
 
-        self.real_evaluator = _real_evaluator
+        self.surrogate_evaluator = None
 
         self.ls_train_surrogate = _ls_train_surrogate
         self.solutions_file = _solutions_file
@@ -103,8 +104,26 @@ class ILSSurrogate(Algorithm):
         self.surrogate = joblib.load(self.surrogate_file_path)
 
         # update evaluator function
-        self.evaluator = lambda s: self.surrogate.surrogate.predict([s.data])[0]
+        self.surrogate_evaluator = lambda s: self.surrogate.surrogate.predict([s.data])[0]
 
+    def add_to_surrogate(self, solution):
+        """Append one evaluated solution to the surrogate training file.
+
+        The record is written as a single line: every element of
+        `solution.data` separated by `,`, followed by `;` and `solution.score`.
+
+        Args:
+            solution: {Solution} -- solution whose `data` and `score` are saved
+        """
+
+        # append mode: solutions already stored in the file are kept
+        with open(self.solutions_file, 'a') as pool_file:
+
+            encoded_data = ",".join(str(value) for value in solution.data)
+            record = encoded_data + ";" + str(solution.score)
+
+            pool_file.write(record + "\n")
 
     def run(self, _evaluations, _ls_evaluations=100):
         """
@@ -124,16 +143,22 @@ class ILSSurrogate(Algorithm):
         # enable resuming for ILS
         self.resume()
 
+        if self.start_train_surrogate < self.getGlobalEvaluation():
+            self.load_surrogate()
+
         # initialize current solution
         self.initRun()
 
         # local search algorithm implementation
         while not self.stop():
+            
+            # set current evaluator based on used or not of surrogate function
+            current_evaluator = self.surrogate_evaluator if self.start_train_surrogate < self.getGlobalEvaluation() else self.evaluator
 
             # create new local search instance
             # passing global evaluation param from ILS
-            ls = LocalSearch(self.initializer,
-                         self.evaluator,
+            ls = LocalSearchSurrogate(self.initializer,
+                         current_evaluator,
                          self.operators,
                          self.policy,
                          self.validator,
@@ -147,12 +172,12 @@ class ILSSurrogate(Algorithm):
             # create and search solution from local search
             newSolution = ls.run(_ls_evaluations)
 
-            # if better solution than currently, replace it
-            if self.isBetter(newSolution):
+            # if better solution than currently, replace it (solution saved in training pool, only if surrogate process is in a second process step)
+            if self.isBetter(newSolution) and self.start_train_surrogate < self.getGlobalEvaluation():
 
                 # if better solution found from local search, retrained the found solution and test again
                 # without use of surrogate
-                fitness_score = self.real_evaluator(newSolution)
+                fitness_score = self.evaluator(newSolution)
                 self.increaseEvaluation()
 
                 newSolution.score = fitness_score
@@ -161,25 +186,11 @@ class ILSSurrogate(Algorithm):
                 if self.isBetter(newSolution):
                     self.bestSolution = newSolution
 
-                # save real evaluated solution into specific file for surrogate
-                with open(self.solutions_file, 'a') as f:
-
-                    line = ""
-
-                    for index, e in enumerate(newSolution.data):
-
-                        line += str(e)
-                        
-                        if index < len(newSolution.data) - 1:
-                            line += ","
-
-                    line += ";"
-                    line += str(newSolution.score)
+                self.add_to_surrogate(newSolution)
 
-                    f.write(line + "\n")
 
             # check if necessary or not to train again surrogate
-            if self.n_local_search % self.ls_train_surrogate == 0:
+            if self.n_local_search % self.ls_train_surrogate == 0 and self.start_train_surrogate < self.getGlobalEvaluation():
 
                 # train again surrogate on real evaluated solutions file
                 self.train_surrogate()

+ 77 - 0
optimization/LSSurrogate.py

@@ -0,0 +1,77 @@
+"""Local Search algorithm
+"""
+
+# main imports
+import logging
+
+# module imports
+from macop.algorithms.Algorithm import Algorithm
+
+
+class LocalSearchSurrogate(Algorithm):
+    """Local Search with surrogate used as exploitation optimization algorithm
+
+    Attributes:
+        initializer: {function} -- basic function strategy to initialize solution
+        evaluator: {function} -- basic function in order to obtained fitness (mono or multiple objectives)
+        operators: {[Operator]} -- list of operator to use when launching algorithm
+        policy: {Policy} -- Policy class implementation strategy to select operators
+        validator: {function} -- basic function to check if solution is valid or not under some constraints
+        maximise: {bool} -- specify kind of optimization problem 
+        currentSolution: {Solution} -- current solution managed for current evaluation
+        bestSolution: {Solution} -- best solution found so far during running algorithm
+        callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
+    """
+    def run(self, _evaluations):
+        """
+        Run the local search algorithm
+
+        When a parent algorithm is set, each new solution is also passed to
+        the parent's `add_to_surrogate` as long as the global number of
+        evaluations has not exceeded the parent's `start_train_surrogate`.
+
+        Args:
+            _evaluations: {int} -- number of Local search evaluations
+            
+        Returns:
+            {Solution} -- best solution found
+        """
+
+        # by default use of mother method to initialize variables
+        super().run(_evaluations)
+
+        # start from the best solution known by the parent, if any
+        if self.parent:
+            self.bestSolution = self.parent.bestSolution
+
+        # initialize current solution
+        self.initRun()
+
+        solutionSize = self.currentSolution.size
+
+        # local search algorithm implementation
+        while not self.stop():
+
+            for _ in range(solutionSize):
+
+                # update current solution using policy
+                newSolution = self.update(self.currentSolution)
+
+                # if better solution than currently, replace it
+                if self.isBetter(newSolution):
+                    self.bestSolution = newSolution
+
+                # increase number of evaluations
+                self.increaseEvaluation()
+
+                self.progress()
+                logging.info("---- Current %s - SCORE %s",
+                             newSolution, newSolution.fitness())
+
+                # add to surrogate pool file if necessary (using ILS parent reference);
+                # the parent is optional, so only touch it when it is set
+                if self.parent and self.parent.start_train_surrogate >= self.getGlobalEvaluation():
+                    self.parent.add_to_surrogate(newSolution)
+
+                # stop algorithm if necessary
+                if self.stop():
+                    break
+
+        logging.info("End of %s, best solution found %s",
+                     type(self).__name__, self.bestSolution)
+
+        return self.bestSolution