Browse Source

use of wsao module to accelerate ILS (using surrogate)

Jérôme BUISINE 1 month ago
parent
commit
2fc4db3bfb
9 changed files with 277 additions and 53 deletions
  1. 4 1
      .gitmodules
  2. 2 1
      custom_config.py
  3. 16 15
      find_best_attributes_26.py
  4. 39 20
      find_best_attributes_30.py
  5. 16 16
      models.py
  6. 199 0
      optimization/ILSSurrogate.py
  7. 0 0
      optimization/__init__.py
  8. 0 0
      utils/extract_solution_log.py
  9. 1 0
      wsao

+ 4 - 1
.gitmodules

@@ -1,3 +1,6 @@
 [submodule "modules"]
 	path = modules
-	url = https://github.com/prise-3d/Thesis-CommonModules.git
+	url = https://github.com/prise-3d/Thesis-CommonModules.git
+[submodule "wsao"]
+	path = wsao
+	url = https://gitlab.com/jbuisine/wsao.git

+ 2 - 1
custom_config.py

@@ -27,7 +27,8 @@ optimization_filters_result_filename    = 'optimization_comparisons_filters.csv'
 optimization_attributes_result_filename = 'optimization_comparisons_attributes.csv'
 
 filter_reduction_choices                = ['attributes', 'filters']
-models_names_list                       = ["svm_model","ensemble_model","ensemble_model_v2","deep_keras", "svm_gpu"]
+# models_names_list                       = ["svm_model","ensemble_model","ensemble_model_v2","deep_keras", "svm_gpu"]
+models_names_list                       = ["svm_model","ensemble_model","ensemble_model_v2"]
 
 ## models_names_list               = ["svm_model","ensemble_model","ensemble_model_v2","deep_keras"]
 ## normalization_choices           = ['svd', 'svdn', 'svdne']

+ 16 - 15
find_best_attributes_26.py

@@ -40,9 +40,6 @@ from macop.callbacks.UCBCheckpoint import UCBCheckpoint
 
 # variables and parameters
 models_list         = cfg.models_names_list
-number_of_values    = 26
-ils_iteration       = 4000
-ls_iteration        = 10
 
 # default validator
 def validator(solution):
@@ -96,16 +93,17 @@ def main():
 
     parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
     parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list, required=True)
-    parser.add_argument('--length', type=str, help='max data length (need to be specify for evaluator)', required=True)
+    parser.add_argument('--length', type=int, help='max data length (need to be specify for evaluator)', required=True)
+    parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
+    parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
 
     args = parser.parse_args()
 
     p_data_file = args.data
     p_choice    = args.choice
     p_length    = args.length
-
-    global number_of_values
-    number_of_values = p_length
+    p_ils_iteration = args.ils
+    p_ls_iteration  = args.ls
 
     print(p_data_file)
 
@@ -116,11 +114,12 @@ def main():
     if not os.path.exists(cfg.output_logs_folder):
         os.makedirs(cfg.output_logs_folder)
 
-    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/%s.log' % p_data_file.split('/')[-1], level=logging.DEBUG)
+    _, data_file_name = os.path.split(p_data_file)
+    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(data_file_name), level=logging.DEBUG)
 
     # init solution (`n` attributes)
     def init():
-        return BinarySolution([], 26
+        return BinarySolution([], p_length
         ).random(validator)
 
     # define evaluate function here (need of data information)
@@ -153,11 +152,13 @@ def main():
 
         return test_roc_auc
 
-    if not os.path.exists(cfg.output_backup_folder):
-        os.makedirs(cfg.output_backup_folder)
+    backup_model_folder = os.path.join(cfg.output_backup_folder, data_file_name)
+
+    if not os.path.exists(backup_model_folder):
+        os.makedirs(backup_model_folder)
 
-    backup_file_path = os.path.join(cfg.output_backup_folder, p_data_file.split('/')[-1] + '.csv')
-    ucb_backup_file_path = os.path.join(cfg.output_backup_folder, p_data_file.split('/')[-1] + '_ucbPolicy.csv')
+    backup_file_path = os.path.join(backup_model_folder, data_file_name + '.csv')
+    ucb_backup_file_path = os.path.join(backup_model_folder, data_file_name + '_ucbPolicy.csv')
 
     # prepare optimization algorithm
     operators = [SimpleBinaryMutation(), SimpleMutation(), SimpleCrossover(), RandomSplitCrossover()]
@@ -168,7 +169,7 @@ def main():
     algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))
     algo.addCallback(UCBCheckpoint(_every=1, _filepath=ucb_backup_file_path))
 
-    bestSol = algo.run(ils_iteration, ls_iteration)
+    bestSol = algo.run(p_ils_iteration, p_ls_iteration)
 
     # print best solution found
     print("Found ", bestSol)
@@ -189,7 +190,7 @@ def main():
                 filters_counter += 1
 
 
-    line_info = p_data_file + ';' + str(ils_iteration) + ';' + str(ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness())
+    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness())
     with open(filename_path, 'a') as f:
         f.write(line_info + '\n')
     

+ 39 - 20
find_best_attributes_30.py

@@ -6,6 +6,7 @@ import pandas as pd
 import numpy as np
 import logging
 import datetime
+import random
 
 # model imports
 from sklearn.model_selection import train_test_split
@@ -25,7 +26,7 @@ sys.path.insert(0, '') # trick to enable import of main folder module
 import custom_config as cfg
 import models as mdl
 
-from macop.algorithms.mono.IteratedLocalSearch import IteratedLocalSearch as ILS
+from optimization.ILSSurrogate import ILSSurrogate
 from macop.solutions.BinarySolution import BinarySolution
 
 from macop.operators.mutators.SimpleMutation import SimpleMutation
@@ -40,13 +41,11 @@ from macop.callbacks.UCBCheckpoint import UCBCheckpoint
 
 # variables and parameters
 models_list         = cfg.models_names_list
-number_of_values    = 30
-ils_iteration       = 4000
-ls_iteration        = 10
 
 # default validator
 def validator(solution):
 
+    # at least 5 attributes
     if list(solution.data).count(1) < 5:
         return False
 
@@ -96,16 +95,21 @@ def main():
 
     parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
     parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list, required=True)
-    parser.add_argument('--length', type=str, help='max data length (need to be specify for evaluator)', required=True)
+    parser.add_argument('--length', type=int, help='max data length (need to be specify for evaluator)', required=True)
+    parser.add_argument('--surrogate', type=str, help='surrogate .joblib model to approximate fitness', required=True)
+    parser.add_argument('--solutions', type=str, help='solutions files required to find surrogate model', required=True)
+    parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
+    parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
 
     args = parser.parse_args()
 
     p_data_file = args.data
     p_choice    = args.choice
     p_length    = args.length
-
-    global number_of_values
-    number_of_values = p_length
+    p_surrogate = args.surrogate
+    p_solutions = args.solutions
+    p_ils_iteration = args.ils
+    p_ls_iteration  = args.ls
 
     print(p_data_file)
 
@@ -116,17 +120,19 @@ def main():
     if not os.path.exists(cfg.output_logs_folder):
         os.makedirs(cfg.output_logs_folder)
 
-    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/%s.log' % p_data_file.split('/')[-1], level=logging.DEBUG)
+    _, data_file_name = os.path.split(p_data_file)
+    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(data_file_name), level=logging.DEBUG)
 
     # init solution (`n` attributes)
     def init():
-        return BinarySolution([], 30
+        return BinarySolution([], p_length
         ).random(validator)
 
     # define evaluate function here (need of data information)
-    def evaluate(solution):
+    def evaluate(solution, use_surrogate=True):
 
         start = datetime.datetime.now()
+
         # get indices of filters data to use (filters selection from solution)
         indices = []
 
@@ -138,7 +144,7 @@ def main():
         x_train_filters = x_train.iloc[:, indices]
         y_train_filters = y_train
         x_test_filters = x_test.iloc[:, indices]
-
+        
         # TODO : use of GPU implementation of SVM
         model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
         
@@ -149,26 +155,39 @@ def main():
 
         diff = end - start
 
-        print("Evaluation took :", divmod(diff.days * 86400 + diff.seconds, 60))
+        print("Real evaluation took: {}, score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))
 
         return test_roc_auc
 
-    if not os.path.exists(cfg.output_backup_folder):
-        os.makedirs(cfg.output_backup_folder)
 
-    backup_file_path = os.path.join(cfg.output_backup_folder, p_data_file.split('/')[-1] + '.csv')
-    ucb_backup_file_path = os.path.join(cfg.output_backup_folder, p_data_file.split('/')[-1] + '_ucbPolicy.csv')
+    backup_model_folder = os.path.join(cfg.output_backup_folder, data_file_name)
+
+    if not os.path.exists(backup_model_folder):
+        os.makedirs(backup_model_folder)
+
+    backup_file_path = os.path.join(backup_model_folder, data_file_name + '.csv')
+    ucb_backup_file_path = os.path.join(backup_model_folder, data_file_name + '_ucbPolicy.csv')
 
     # prepare optimization algorithm
     operators = [SimpleBinaryMutation(), SimpleMutation(), SimpleCrossover(), RandomSplitCrossover()]
     policy = UCBPolicy(operators)
 
-    algo = ILS(init, evaluate, operators, policy, validator, True)
+    # custom ILS for surrogate use
+    algo = ILSSurrogate(_initalizer=init, 
+                        _evaluator=None, # by default no evaluator, as we will use the surrogate function
+                        _operators=operators, 
+                        _policy=policy, 
+                        _validator=validator,
+                        _surrogate_file_path=p_surrogate,
+                        _solutions_file=p_solutions,
+                        _ls_train_surrogate=1,
+                        _real_evaluator=evaluate,
+                        _maximise=True)
     
     algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))
     algo.addCallback(UCBCheckpoint(_every=1, _filepath=ucb_backup_file_path))
 
-    bestSol = algo.run(ils_iteration, ls_iteration)
+    bestSol = algo.run(p_ils_iteration, p_ls_iteration)
 
     # print best solution found
     print("Found ", bestSol)
@@ -189,7 +208,7 @@ def main():
                 filters_counter += 1
 
 
-    line_info = p_data_file + ';' + str(ils_iteration) + ';' + str(ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness())
+    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness())
     with open(filename_path, 'a') as f:
         f.write(line_info + '\n')
     

+ 16 - 16
models.py

@@ -9,12 +9,12 @@ from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.feature_selection import RFECV
 import sklearn.svm as svm
 from sklearn.metrics import accuracy_score
-from thundersvm import SVC
+# from thundersvm import SVC
 from sklearn.model_selection import KFold, cross_val_score
             
 
 # variables and parameters
-n_predict = 0
+# n_predict = 0
 
 # def my_accuracy_scorer(*args):
 #         global n_predict
@@ -44,25 +44,25 @@ def svm_model(X_train, y_train):
     return _get_best_model(X_train, y_train)
 
 
-def _get_best_gpu_model(X_train, y_train):
+# def _get_best_gpu_model(X_train, y_train):
 
-    Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
-    gammas = [0.001, 0.01, 0.1, 5, 10, 100]
-    param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
+#     Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
+#     gammas = [0.001, 0.01, 0.1, 5, 10, 100]
+#     param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
 
-    svc = SVC(probability=True, class_weight='balanced')
-    #clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, scoring=my_accuracy_scorer, n_jobs=-1)
-    clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, n_jobs=-1)
+#     svc = svm.SVC(probability=True, class_weight='balanced')
+#     #clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, scoring=my_accuracy_scorer, n_jobs=-1)
+#     clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, n_jobs=-1)
 
-    clf.fit(X_train, y_train)
+#     clf.fit(X_train, y_train)
 
-    model = clf.best_estimator_
+#     model = clf.best_estimator_
 
-    return model
+#     return model
 
-def svm_gpu(X_train, y_train):
+# def svm_gpu(X_train, y_train):
 
-    return _get_best_gpu_model(X_train, y_train)
+#     return _get_best_gpu_model(X_train, y_train)
 
 
 def ensemble_model(X_train, y_train):
@@ -107,8 +107,8 @@ def get_trained_model(choice, X_train, y_train):
     if choice == 'svm_model':
         return svm_model(X_train, y_train)
 
-    if choice == 'svm_gpu':
-        return svm_gpu(X_train, y_train)
+    # if choice == 'svm_gpu':
+        # return svm_gpu(X_train, y_train)
 
     if choice == 'ensemble_model':
         return ensemble_model(X_train, y_train)

+ 199 - 0
optimization/ILSSurrogate.py

@@ -0,0 +1,199 @@
+"""Iterated Local Search Algorithm implementation using surrogate as fitness approximation
+"""
+
+# main imports
+import os
+import logging
+import joblib
+
+# module imports
+from macop.algorithms.Algorithm import Algorithm
+from macop.algorithms.mono.LocalSearch import LocalSearch
+
+from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
+                                    LassoCV, ElasticNet)
+
+from wsao.sao.problems.nd3dproblem import ND3DProblem
+from wsao.sao.surrogates.walsh import WalshSurrogate
+from wsao.sao.algos.fitter import FitterAlgo
+from wsao.sao.utils.analysis import SamplerAnalysis, FitterAnalysis, OptimizerAnalysis
+
+class ILSSurrogate(Algorithm):
+    """Iterated Local Search used to avoid local optima and increave EvE (Exploration vs Exploitation) compromise using surrogate
+
+
+    Attributes:
+        initializer: {function} -- basic function strategy to initialize solution
+        evaluator: {function} -- basic function in order to obtained fitness (mono or multiple objectives)
+        operators: {[Operator]} -- list of operator to use when launching algorithm
+        policy: {Policy} -- Policy class implementation strategy to select operators
+        validator: {function} -- basic function to check if solution is valid or not under some constraints
+        maximise: {bool} -- specify kind of optimization problem 
+        currentSolution: {Solution} -- current solution managed for current evaluation
+        bestSolution: {Solution} -- best solution found so far during running algorithm
+        ls_iteration: {int} -- number of evaluation for each local search algorithm
+        surrogate_file: {str} -- Surrogate model file to load (model trained using https://gitlab.com/florianlprt/wsao)
+        surrogate: {Surrogate} -- Surrogate model instance loaded
+        ls_train_surrogate: {int} -- Specify if we need to retrain our surrogate model (every Local Search)
+        solutions_file: {str} -- Path where real evaluated solutions are saved in order to train surrogate again
+        real_evaluator: {function} -- real expected evaluation to use
+        callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
+    """
+    def __init__(self,
+                 _initalizer,
+                 _evaluator,
+                 _operators,
+                 _policy,
+                 _validator,
+                 _surrogate_file_path,
+                 _ls_train_surrogate,
+                 _solutions_file,
+                 _real_evaluator,
+                 _maximise=True,
+                 _parent=None):
+
+        super().__init__(_initalizer, _evaluator, _operators, _policy,
+                _validator, _maximise, _parent)
+
+        self.n_local_search = 0
+
+        self.surrogate_file_path = _surrogate_file_path
+        self.load_surrogate()
+
+        self.real_evaluator = _real_evaluator
+
+        self.ls_train_surrogate = _ls_train_surrogate
+        self.solutions_file = _solutions_file
+
+    def train_surrogate(self):
+        """etrain if necessary the whole surrogate fitness approximation function
+        """
+        # Following https://gitlab.com/florianlprt/wsao, we re-train the model
+        # ---------------------------------------------------------------------------
+        # cli_restart.py problem=nd3d,size=30,filename="data/statistics_extended_svdn" \
+        #        model=lasso,alpha=1e-5 \
+        #        surrogate=walsh,order=3 \
+        #        algo=fitter,algo_restarts=10,samplefile=stats_extended.csv \
+        #        sample=1000,step=10 \
+        #        analysis=fitter,logfile=out_fit.csv
+
+        problem = ND3DProblem(size=len(self.bestSolution.data)) # problem size based on best solution size (need to improve...)
+        model = Lasso(alpha=1e-5)
+        surrogate = WalshSurrogate(order=3, size=problem.size, model=model)
+        analysis = FitterAnalysis(logfile="train_surrogate.log", problem=problem)
+
+        algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
+
+        print("Start fitting again the surrogate model")
+        for r in range(10):
+            print("Iteration n°{0}: for fitting surrogate".format(r))
+            algo.run(samplefile=self.solutions_file, sample=100, step=10)
+
+        joblib.dump(algo, self.surrogate_file_path)
+
+
+    def load_surrogate(self):
+        """Load algorithm with surrogate model and create lambda evaluator function
+        """
+
+        # need to first train surrogate if not exist
+        if not os.path.exists(self.surrogate_file_path):
+            self.train_surrogate()
+
+        self.surrogate = joblib.load(self.surrogate_file_path)
+
+        # update evaluator function
+        self.evaluator = lambda s: self.surrogate.surrogate.predict([s.data])[0]
+
+
+    def run(self, _evaluations, _ls_evaluations=100):
+        """
+        Run the iterated local search algorithm using local search (EvE compromise)
+
+        Args:
+            _evaluations: {int} -- number of global evaluations for ILS
+            _ls_evaluations: {int} -- number of Local search evaluations (default: 100)
+
+        Returns:
+            {Solution} -- best solution found
+        """
+
+        # by default use of mother method to initialize variables
+        super().run(_evaluations)
+
+        # enable resuming for ILS
+        self.resume()
+
+        # initialize current solution
+        self.initRun()
+
+        # local search algorithm implementation
+        while not self.stop():
+
+            # create new local search instance
+            # passing global evaluation param from ILS
+            ls = LocalSearch(self.initializer,
+                         self.evaluator,
+                         self.operators,
+                         self.policy,
+                         self.validator,
+                         self.maximise,
+                         _parent=self)
+
+            # add same callbacks
+            for callback in self.callbacks:
+                ls.addCallback(callback)
+
+            # create and search solution from local search
+            newSolution = ls.run(_ls_evaluations)
+
+            # if better solution than currently, replace it
+            if self.isBetter(newSolution):
+
+                # if better solution found from local search, retrained the found solution and test again
+                # without use of surrogate
+                fitness_score = self.real_evaluator(newSolution)
+                self.increaseEvaluation()
+
+                newSolution.score = fitness_score
+
+                # if solution is really better after real evaluation, then we replace
+                if self.isBetter(newSolution):
+                    self.bestSolution = newSolution
+
+                # save real evaluated solution into specific file for surrogate
+                with open(self.solutions_file, 'a') as f:
+
+                    line = ""
+
+                    for index, e in enumerate(newSolution.data):
+
+                        line += str(e)
+                        
+                        if index < len(newSolution.data) - 1:
+                            line += ","
+
+                    line += ";"
+                    line += str(newSolution.score)
+
+                    f.write(line + "\n")
+
+            # check if necessary or not to train again surrogate
+            if self.n_local_search % self.ls_train_surrogate == 0:
+
+                # train again surrogate on real evaluated solutions file
+                self.train_surrogate()
+
+                # reload new surrogate function
+                self.load_surrogate()
+
+            # increase number of local search done
+            self.n_local_search += 1
+
+            self.information()
+
+        logging.info("End of %s, best solution found %s" %
+                     (type(self).__name__, self.bestSolution))
+
+        self.end()
+        return self.bestSolution

+ 0 - 0
optimization/__init__.py


extract_solution_log.py → utils/extract_solution_log.py


+ 1 - 0
wsao

@@ -0,0 +1 @@
+Subproject commit 875bbdcee600f911958dbc47e319afbb8796f49d