@@ -39,6 +39,8 @@ from macop.operators.policies.UCBPolicy import UCBPolicy
 from macop.callbacks.BasicCheckpoint import BasicCheckpoint
 from macop.callbacks.UCBCheckpoint import UCBCheckpoint
 
+from sklearn.ensemble import RandomForestClassifier
+
 # variables and parameters
 models_list = cfg.models_names_list
 
@@ -94,22 +96,22 @@ def main():
 
     parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
 
     parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
-    parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list, required=True)
+    parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list, default=models_list[0], required=False)
+    parser.add_argument('--start_surrogate', type=int, help='number of evaluations before starting the surrogate model', default=1000)
     parser.add_argument('--length', type=int, help='max data length (need to be specify for evaluator)', required=True)
-    parser.add_argument('--surrogate', type=str, help='surrogate .joblib model to approximate fitness', required=True)
-    parser.add_argument('--solutions', type=str, help='solutions files required to find surrogate model', required=True)
     parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
     parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
+    parser.add_argument('--output', type=str, help='output surrogate model name')
 
     args = parser.parse_args()
 
     p_data_file = args.data
     p_choice = args.choice
     p_length = args.length
-    p_surrogate = args.surrogate
-    p_solutions = args.solutions
+    p_start = args.start_surrogate
     p_ils_iteration = args.ils
     p_ls_iteration = args.ls
+    p_output = args.output
 
     print(p_data_file)
@@ -120,8 +122,7 @@ def main():
     if not os.path.exists(cfg.output_logs_folder):
         os.makedirs(cfg.output_logs_folder)
 
-    _, data_file_name = os.path.split(p_data_file)
-    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(data_file_name), level=logging.DEBUG)
+    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(p_output), level=logging.DEBUG)
 
     # init solution (`n` attributes)
     def init():
@@ -129,7 +130,7 @@ def main():
         ).random(validator)
 
     # define evaluate function here (need of data information)
-    def evaluate(solution, use_surrogate=True):
+    def evaluate(solution):
 
         start = datetime.datetime.now()
 
@@ -146,7 +147,10 @@ def main():
         x_test_filters = x_test.iloc[:, indices]
 
         # TODO : use of GPU implementation of SVM
-        model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
+        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
+
+        model = RandomForestClassifier(n_estimators=10)
+        model = model.fit(x_train_filters, y_train_filters)
 
         y_test_model = model.predict(x_test_filters)
         test_roc_auc = roc_auc_score(y_test, y_test_model)
@@ -160,28 +164,42 @@ def main():
         return test_roc_auc
 
-    backup_model_folder = os.path.join(cfg.output_backup_folder, data_file_name)
+    # build all output folders and files based on the `output` name
+    backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
+    surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
+    surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)
 
     if not os.path.exists(backup_model_folder):
         os.makedirs(backup_model_folder)
 
-    backup_file_path = os.path.join(backup_model_folder, data_file_name + '.csv')
-    ucb_backup_file_path = os.path.join(backup_model_folder, data_file_name + '_ucbPolicy.csv')
+    if not os.path.exists(cfg.output_surrogates_model_folder):
+        os.makedirs(cfg.output_surrogates_model_folder)
+
+    if not os.path.exists(cfg.output_surrogates_data_folder):
+        os.makedirs(cfg.output_surrogates_data_folder)
 
-    # prepare optimization algorithm
-    operators = [SimpleBinaryMutation(), SimpleMutation(), SimpleCrossover(), RandomSplitCrossover()]
+    backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
+    ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
+
+    # prepare optimization algorithm (mutation operators only: ILS is used here and the local search only needs local permutations)
+    operators = [SimpleBinaryMutation(), SimpleMutation()]
     policy = UCBPolicy(operators)
 
+    # write the header line of the surrogate data file if it does not exist yet
+    if not os.path.exists(surrogate_output_data):
+        with open(surrogate_output_data, 'w') as f:
+            f.write('x;y\n')
+
     # custom ILS for surrogate use
     algo = ILSSurrogate(_initalizer=init,
-                        _evaluator=None, # by default no evaluator, as we will use the surrogate function
+                        _evaluator=evaluate, # same evaluator by default, as we will use the surrogate function
                         _operators=operators,
                         _policy=policy,
                        _validator=validator,
-                        _surrogate_file_path=p_surrogate,
-                        _solutions_file=p_solutions,
+                        _surrogate_file_path=surrogate_output_model,
+                        _start_train_surrogate=p_start, # start learning and using the surrogate after `p_start` real evaluations
+                        _solutions_file=surrogate_output_data,
                         _ls_train_surrogate=1,
-                        _real_evaluator=evaluate,
                         _maximise=True)
 
     algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))