преди 5 години · e4f5839e36
--- a/custom_config.py
+++ b/custom_config.py
@@ -6,7 +6,6 @@ import os
 
				 context_vars = vars()
			
 
				 
			
 
				 # folders
			
 
				-
			
 
				 output_data_folder              = 'data'
			
 
				 output_data_generated           = os.path.join(output_data_folder, 'generated')
			
 
				 output_datasets                 = os.path.join(output_data_folder, 'datasets')
			
@@ -15,6 +14,11 @@ output_models                   = os.path.join(output_data_folder, 'saved_models
 
				 output_results_folder           = os.path.join(output_data_folder, 'results')
			
 
				 output_logs_folder              = os.path.join(output_data_folder, 'logs')
			
 
				 output_backup_folder            = os.path.join(output_data_folder, 'backups')
			
 
				+output_surrogates_folder        = os.path.join(output_data_folder, 'surrogate')
			
 
				+
			
 
				+
			
 
				+output_surrogates_model_folder  = os.path.join(output_surrogates_folder, 'models')
			
 
				+output_surrogates_data_folder  = os.path.join(output_surrogates_folder, 'data')
			
 
				 
			
 
				 results_information_folder      = os.path.join(output_data_folder, 'results')
			
 
				 
			
--- a/find_best_attributes_surrogate.py
+++ b/find_best_attributes_surrogate.py
@@ -39,6 +39,8 @@ from macop.operators.policies.UCBPolicy import UCBPolicy
 
				 from macop.callbacks.BasicCheckpoint import BasicCheckpoint
			
 
				 from macop.callbacks.UCBCheckpoint import UCBCheckpoint
			
 
				 
			
 
				+from sklearn.ensemble import RandomForestClassifier
			
 
				+
			
 
				 # variables and parameters
			
 
				 models_list         = cfg.models_names_list
			
 
				 
			
@@ -94,22 +96,22 @@ def main():
 
				     parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
			
 
				 
			
 
				     parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
			
 
				-    parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list, required=True)
			
 
				+    parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list, default=models_list[0], required=False)
			
 
				+    parser.add_argument('--start_surrogate', type=int, help='number of evalution before starting surrogare model', default=1000)
			
 
				     parser.add_argument('--length', type=int, help='max data length (need to be specify for evaluator)', required=True)
			
 
				-    parser.add_argument('--surrogate', type=str, help='surrogate .joblib model to approximate fitness', required=True)
			
 
				-    parser.add_argument('--solutions', type=str, help='solutions files required to find surrogate model', required=True)
			
 
				     parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
			
 
				     parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
			
 
				+    parser.add_argument('--output', type=str, help='output surrogate model name')
			
 
				 
			
 
				     args = parser.parse_args()
			
 
				 
			
 
				     p_data_file = args.data
			
 
				     p_choice    = args.choice
			
 
				     p_length    = args.length
			
 
				-    p_surrogate = args.surrogate
			
 
				-    p_solutions = args.solutions
			
 
				+    p_start     = args.start_surrogate
			
 
				     p_ils_iteration = args.ils
			
 
				     p_ls_iteration  = args.ls
			
 
				+    p_output = args.output
			
 
				 
			
 
				     print(p_data_file)
			
 
				 
			
@@ -120,8 +122,7 @@ def main():
 
				     if not os.path.exists(cfg.output_logs_folder):
			
 
				         os.makedirs(cfg.output_logs_folder)
			
 
				 
			
 
				-    _, data_file_name = os.path.split(p_data_file)
			
 
				-    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(data_file_name), level=logging.DEBUG)
			
 
				+    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(p_output), level=logging.DEBUG)
			
 
				 
			
 
				     # init solution (`n` attributes)
			
 
				     def init():
			
@@ -129,7 +130,7 @@ def main():
 
				         ).random(validator)
			
 
				 
			
 
				     # define evaluate function here (need of data information)
			
 
				-    def evaluate(solution, use_surrogate=True):
			
 
				+    def evaluate(solution):
			
 
				 
			
 
				         start = datetime.datetime.now()
			
 
				 
			
@@ -146,7 +147,10 @@ def main():
 
				         x_test_filters = x_test.iloc[:, indices]
			
 
				         
			
 
				         # TODO : use of GPU implementation of SVM
			
 
				-        model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
			
 
				+        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
			
 
				+
			
 
				+        model = RandomForestClassifier(n_estimators=10)
			
 
				+        model = model.fit(x_train_filters, y_train_filters)
			
 
				         
			
 
				         y_test_model = model.predict(x_test_filters)
			
 
				         test_roc_auc = roc_auc_score(y_test, y_test_model)
			
@@ -160,28 +164,42 @@ def main():
 
				         return test_roc_auc
			
 
				 
			
 
				 
			
 
				-    backup_model_folder = os.path.join(cfg.output_backup_folder, data_file_name)
			
 
				+    # build all output folder and files based on `output` name
			
 
				+    backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
			
 
				+    surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
			
 
				+    surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)
			
 
				 
			
 
				     if not os.path.exists(backup_model_folder):
			
 
				         os.makedirs(backup_model_folder)
			
 
				 
			
 
				-    backup_file_path = os.path.join(backup_model_folder, data_file_name + '.csv')
			
 
				-    ucb_backup_file_path = os.path.join(backup_model_folder, data_file_name + '_ucbPolicy.csv')
			
 
				+    if not os.path.exists(cfg.output_surrogates_model_folder):
			
 
				+        os.makedirs(cfg.output_surrogates_model_folder)
			
 
				+
			
 
				+    if not os.path.exists(cfg.output_surrogates_data_folder):
			
 
				+        os.makedirs(cfg.output_surrogates_data_folder)
			
 
				 
			
 
				-    # prepare optimization algorithm
			
 
				-    operators = [SimpleBinaryMutation(), SimpleMutation(), SimpleCrossover(), RandomSplitCrossover()]
			
 
				+    backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
			
 
				+    ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
			
 
				+
			
 
				+    # prepare optimization algorithm (only use of mutation as only ILS are used here, and local search need only local permutation)
			
 
				+    operators = [SimpleBinaryMutation(), SimpleMutation()]
			
 
				     policy = UCBPolicy(operators)
			
 
				 
			
 
    # Write the CSV header ("x;y") when the surrogate solutions file does not
    # exist yet, so that later appended lines land under a header row.
    if not os.path.exists(surrogate_output_data):
        # 'w' is required here: open() defaults to read mode, which raises
        # FileNotFoundError on a missing file and does not allow writing.
        with open(surrogate_output_data, 'w') as f:
            f.write('x;y\n')
			
 
				+
			
 
				     # custom ILS for surrogate use
			
 
				     algo = ILSSurrogate(_initalizer=init, 
			
 
				-                        _evaluator=None, # by default no evaluator, as we will use the surrogate function
			
 
				+                        _evaluator=evaluate, # same evaluator by defadefaultult, as we will use the surrogate function
			
 
				                         _operators=operators, 
			
 
				                         _policy=policy, 
			
 
				                         _validator=validator,
			
 
				-                        _surrogate_file_path=p_surrogate,
			
 
				-                        _solutions_file=p_solutions,
			
 
				+                        _surrogate_file_path=surrogate_output_model,
			
 
				+                        _start_train_surrogate=p_start, # start learning and using surrogate after 1000 real evaluation
			
 
				+                        _solutions_file=surrogate_output_data,
			
 
				                         _ls_train_surrogate=1,
			
 
				-                        _real_evaluator=evaluate,
			
 
				                         _maximise=True)
			
 
				     
			
 
				     algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))
			
--- a/optimization/ILSSurrogate.py
+++ b/optimization/ILSSurrogate.py
@@ -8,7 +8,7 @@ import joblib
 
				 
			
 
				 # module imports
			
 
				 from macop.algorithms.Algorithm import Algorithm
			
 
				-from macop.algorithms.mono.LocalSearch import LocalSearch
			
 
				+from .LSSurrogate import LocalSearchSurrogate
			
 
				 
			
 
				 from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
			
 
				                                     LassoCV, ElasticNet)
			
@@ -33,10 +33,10 @@ class ILSSurrogate(Algorithm):
 
				         bestSolution: {Solution} -- best solution found so far during running algorithm
			
 
				         ls_iteration: {int} -- number of evaluation for each local search algorithm
			
 
				         surrogate_file: {str} -- Surrogate model file to load (model trained using https://gitlab.com/florianlprt/wsao)
			
 
				+        start_train_surrogate: {int} -- number of evaluation expected before start training and use surrogate
			
 
				         surrogate: {Surrogate} -- Surrogate model instance loaded
			
 
				         ls_train_surrogate: {int} -- Specify if we need to retrain our surrogate model (every Local Search)
			
 
				         solutions_file: {str} -- Path where real evaluated solutions are saved in order to train surrogate again
			
 
				-        real_evaluator: {function} -- real expected evaluation to use
			
 
				         callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
			
 
				     """
			
 
				     def __init__(self,
			
@@ -46,21 +46,22 @@ class ILSSurrogate(Algorithm):
 
				                  _policy,
			
 
				                  _validator,
			
 
				                  _surrogate_file_path,
			
 
				+                 _start_train_surrogate,
			
 
				                  _ls_train_surrogate,
			
 
				                  _solutions_file,
			
 
				-                 _real_evaluator,
			
 
				                  _maximise=True,
			
 
				                  _parent=None):
			
 
				 
			
 
				+        # set real evaluator as default
			
 
				         super().__init__(_initalizer, _evaluator, _operators, _policy,
			
 
				                 _validator, _maximise, _parent)
			
 
				 
			
 
				         self.n_local_search = 0
			
 
				 
			
 
				         self.surrogate_file_path = _surrogate_file_path
			
 
				-        self.load_surrogate()
			
 
				+        self.start_train_surrogate = _start_train_surrogate
			
 
				 
			
 
				-        self.real_evaluator = _real_evaluator
			
 
				+        self.surrogate_evaluator = None
			
 
				 
			
 
				         self.ls_train_surrogate = _ls_train_surrogate
			
 
				         self.solutions_file = _solutions_file
			
@@ -103,8 +104,26 @@ class ILSSurrogate(Algorithm):
 
				         self.surrogate = joblib.load(self.surrogate_file_path)
			
 
				 
			
 
				         # update evaluator function
			
 
				-        self.evaluator = lambda s: self.surrogate.surrogate.predict([s.data])[0]
			
 
				+        self.surrogate_evaluator = lambda s: self.surrogate.surrogate.predict([s.data])[0]
			
 
				 
			
 
    def add_to_surrogate(self, solution):
        """Append one solution to the surrogate training file.

        The line appended to `self.solutions_file` is made of every element
        of `solution.data` converted with `str` and joined by ",", followed
        by ";" and `str(solution.score)`, e.g. ``1,0,1;0.5``.

        Args:
            solution: {Solution} -- solution whose `data` and `score` are saved
        """
        # str.join replaces the manual "add a comma unless last element" loop
        attributes = ",".join(str(e) for e in solution.data)

        # append mode: lines already present in the file are kept
        with open(self.solutions_file, 'a') as f:
            f.write(attributes + ";" + str(solution.score) + "\n")
			
 
				 
			
 
				     def run(self, _evaluations, _ls_evaluations=100):
			
 
				         """
			
@@ -124,16 +143,22 @@ class ILSSurrogate(Algorithm):
 
				         # enable resuming for ILS
			
 
				         self.resume()
			
 
				 
			
 
				+        if self.start_train_surrogate < self.getGlobalEvaluation():
			
 
				+            self.load_surrogate()
			
 
				+
			
 
				         # initialize current solution
			
 
				         self.initRun()
			
 
				 
			
 
				         # local search algorithm implementation
			
 
				         while not self.stop():
			
 
				+            
			
 
				+            # set current evaluator based on used or not of surrogate function
			
 
				+            current_evaluator = self.surrogate_evaluator if self.start_train_surrogate < self.getGlobalEvaluation() else self.evaluator
			
 
				 
			
 
				             # create new local search instance
			
 
				             # passing global evaluation param from ILS
			
 
				-            ls = LocalSearch(self.initializer,
			
 
				-                         self.evaluator,
			
 
				+            ls = LocalSearchSurrogate(self.initializer,
			
 
				+                         current_evaluator,
			
 
				                          self.operators,
			
 
				                          self.policy,
			
 
				                          self.validator,
			
@@ -147,12 +172,12 @@ class ILSSurrogate(Algorithm):
 
				             # create and search solution from local search
			
 
				             newSolution = ls.run(_ls_evaluations)
			
 
				 
			
 
				-            # if better solution than currently, replace it
			
 
				-            if self.isBetter(newSolution):
			
 
				+            # if better solution than currently, replace it (solution saved in training pool, only if surrogate process is in a second process step)
			
 
				+            if self.isBetter(newSolution) and self.start_train_surrogate < self.getGlobalEvaluation():
			
 
				 
			
 
				                 # if better solution found from local search, retrained the found solution and test again
			
 
				                 # without use of surrogate
			
 
				-                fitness_score = self.real_evaluator(newSolution)
			
 
				+                fitness_score = self.evaluator(newSolution)
			
 
				                 self.increaseEvaluation()
			
 
				 
			
 
				                 newSolution.score = fitness_score
			
@@ -161,25 +186,11 @@ class ILSSurrogate(Algorithm):
 
				                 if self.isBetter(newSolution):
			
 
				                     self.bestSolution = newSolution
			
 
				 
			
 
				-                # save real evaluated solution into specific file for surrogate
			
 
				-                with open(self.solutions_file, 'a') as f:
			
 
				-
			
 
				-                    line = ""
			
 
				-
			
 
				-                    for index, e in enumerate(newSolution.data):
			
 
				-
			
 
				-                        line += str(e)
			
 
				-                        
			
 
				-                        if index < len(newSolution.data) - 1:
			
 
				-                            line += ","
			
 
				-
			
 
				-                    line += ";"
			
 
				-                    line += str(newSolution.score)
			
 
				+                self.add_to_surrogate(newSolution)
			
 
				 
			
 
				-                    f.write(line + "\n")
			
 
				 
			
 
				             # check if necessary or not to train again surrogate
			
 
				-            if self.n_local_search % self.ls_train_surrogate == 0:
			
 
				+            if self.n_local_search % self.ls_train_surrogate == 0 and self.start_train_surrogate < self.getGlobalEvaluation():
			
 
				 
			
 
				                 # train again surrogate on real evaluated solutions file
			
 
				                 self.train_surrogate()
			
--- a/optimization/LSSurrogate.py
+++ b/optimization/LSSurrogate.py
@@ -0,0 +1,77 @@
 
				+"""Local Search algorithm
			
 
				+"""
			
 
				+
			
 
				+# main imports
			
 
				+import logging
			
 
				+
			
 
				+# module imports
			
 
				+from macop.algorithms.Algorithm import Algorithm
			
 
				+
			
 
				+
			
 
				+class LocalSearchSurrogate(Algorithm):
			
 
				+    """Local Search with surrogate used as exploitation optimization algorithm
			
 
				+
			
 
				+    Attributes:
			
 
				+        initalizer: {function} -- basic function strategy to initialize solution
			
 
				+        evaluator: {function} -- basic function in order to obtained fitness (mono or multiple objectives)
			
 
				+        operators: {[Operator]} -- list of operator to use when launching algorithm
			
 
				+        policy: {Policy} -- Policy class implementation strategy to select operators
			
 
				+        validator: {function} -- basic function to check if solution is valid or not under some constraints
			
 
				+        maximise: {bool} -- specify kind of optimization problem 
			
 
				+        currentSolution: {Solution} -- current solution managed for current evaluation
			
 
				+        bestSolution: {Solution} -- best solution found so far during running algorithm
			
 
				+        callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
			
 
				+    """
			
 
				+    def run(self, _evaluations):
			
 
				+        """
			
 
				+        Run the local search algorithm
			
 
				+
			
 
				+        Args:
			
 
				+            _evaluations: {int} -- number of Local search evaluations
			
 
				+            
			
 
				+        Returns:
			
 
				+            {Solution} -- best solution found
			
 
				+        """
			
 
				+
			
 
				+        # by default use of mother method to initialize variables
			
 
				+        super().run(_evaluations)
			
 
				+
			
 
				+        if self.parent:
			
 
				+            self.bestSolution = self.parent.bestSolution
			
 
				+
			
 
				+        # initialize current solution
			
 
				+        self.initRun()
			
 
				+
			
 
				+        solutionSize = self.currentSolution.size
			
 
				+
			
 
				+        # local search algorithm implementation
			
 
				+        while not self.stop():
			
 
				+
			
 
				+            for _ in range(solutionSize):
			
 
				+
			
 
				+                # update current solution using policy
			
 
				+                newSolution = self.update(self.currentSolution)
			
 
				+
			
 
				+                # if better solution than currently, replace it
			
 
				+                if self.isBetter(newSolution):
			
 
				+                    self.bestSolution = newSolution
			
 
				+
			
 
				+                # increase number of evaluations
			
 
				+                self.increaseEvaluation()
			
 
				+
			
 
				+                self.progress()
			
 
				+                logging.info("---- Current %s - SCORE %s" %
			
 
				+                             (newSolution, newSolution.fitness()))
			
 
				+
			
 
				+                # add to surrogate pool file if necessary (using ILS parent reference)
			
 
				+                if self.parent.start_train_surrogate >= self.getGlobalEvaluation():
			
 
				+                    self.parent.add_to_surrogate(newSolution)
			
 
				+
			
 
				+                # stop algorithm if necessary
			
 
				+                if self.stop():
			
 
				+                    break
			
 
				+
			
 
				+        logging.info("End of %s, best solution found %s" %
			
 
				+                     (type(self).__name__, self.bestSolution))
			
 
				+
			
 
				+        return self.bestSolution