Browse source

use of sub-surrogate models

Jérôme BUISINE 3 years ago
Parent
commit
eb9a51980a

+ 257 - 0
find_best_attributes_surrogate_openML_multi.py

@@ -0,0 +1,257 @@
+# main imports
+import os
+import sys
+import argparse
+import pandas as pd
+import numpy as np
+import logging
+import datetime
+import random
+
+# model imports
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import GridSearchCV
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier, VotingClassifier
+
+import joblib
+import sklearn
+import sklearn.svm as svm
+from sklearn.utils import shuffle
+from sklearn.metrics import roc_auc_score
+from sklearn.model_selection import cross_val_score
+from sklearn.preprocessing import MinMaxScaler
+
+# modules and config imports
+sys.path.insert(0, '') # trick to enable import of main folder module
+
+import custom_config as cfg
+import models as mdl
+
+from optimization.ILSMultiSurrogate import ILSMultiSurrogate
+from macop.solutions.BinarySolution import BinarySolution
+
+from macop.operators.mutators.SimpleMutation import SimpleMutation
+from macop.operators.mutators.SimpleBinaryMutation import SimpleBinaryMutation
+from macop.operators.crossovers.SimpleCrossover import SimpleCrossover
+from macop.operators.crossovers.RandomSplitCrossover import RandomSplitCrossover
+
+from macop.operators.policies.UCBPolicy import UCBPolicy
+
+from macop.callbacks.BasicCheckpoint import BasicCheckpoint
+from macop.callbacks.UCBCheckpoint import UCBCheckpoint
+from optimization.callbacks.SurrogateCheckpoint import SurrogateCheckpoint
+from optimization.callbacks.MultiSurrogateCheckpoint import MultiSurrogateCheckpoint
+
+
+# default validator
+def validator(solution):
+
+    # at least 2 attributes must be selected
+    if list(solution._data).count(1) < 2:
+        return False
+
+    return True
+
+def train_model(X_train, y_train):
+
+    #print ('Creating model...')
+    # here use of SVM with grid search CV
+    Cs = [0.001, 0.01, 0.1, 1, 10, 100]
+    gammas = [0.001, 0.01, 0.1, 10, 100]
+    param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
+
+    svc = svm.SVC(probability=True, class_weight='balanced')
+    #clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, scoring=my_accuracy_scorer, n_jobs=-1)
+    clf = GridSearchCV(svc, param_grid, cv=4, verbose=0, n_jobs=-1)
+
+    clf.fit(X_train, y_train)
+
+    model = clf.best_estimator_
+
+    return model
+
+def loadDataset(filename):
+
+    ########################
+    # 1. Get and prepare data
+    ########################
+    dataset = pd.read_csv(filename, sep=',')
+
+    # map labels to a common binary encoding (0/1)
+    min_label_value = min(dataset.iloc[:, -1])
+    max_label_value = max(dataset.iloc[:, -1])
+
+    dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(min_label_value, 0)
+    dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(max_label_value, 1)
+
+    X_dataset = dataset.iloc[:, :-1]
+    y_dataset = dataset.iloc[:, -1]
+
+    problem_size = len(X_dataset.columns)
+
+    # min/max normalisation over features
+    # create a scaler object
+    scaler = MinMaxScaler()
+    # fit and transform the data
+    X_dataset = np.array(pd.DataFrame(scaler.fit_transform(X_dataset), columns=X_dataset.columns))
+
+    # prepare train and test datasets
+    X_train, X_test, y_train, y_test = train_test_split(X_dataset, y_dataset, test_size=0.3, shuffle=True)
+
+    return X_train, y_train, X_test, y_test, problem_size
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
+
+    parser.add_argument('--data', type=str, help='OpenML dataset file to use', required=True)
+    parser.add_argument('--every_ls', type=int, help='base number of local searches between surrogate retrainings', default=50) # default value
+    parser.add_argument('--k_division', type=int, help='number of expected sub surrogate model', required=True)
+    parser.add_argument('--k_dynamic', type=int, help='specify if indices for each sub surrogate model are changed or not for each training', default=0, choices=[0, 1])
+    parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
+    parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
+    parser.add_argument('--output', type=str, help='output surrogate model name')
+
+    args = parser.parse_args()
+
+    p_data_file = args.data
+    p_every_ls   = args.every_ls
+    p_k_division = args.k_division
+    p_k_dynamic = bool(args.k_dynamic)
+    p_ils_iteration = args.ils
+    p_ls_iteration  = args.ls
+    p_output = args.output
+
+    # load data from file and get problem size
+    X_train, y_train, X_test, y_test, problem_size = loadDataset(p_data_file)
+
+    # create `logs` folder if necessary
+    if not os.path.exists(cfg.output_logs_folder):
+        os.makedirs(cfg.output_logs_folder)
+
+    logging.basicConfig(format='%(asctime)s %(message)s', filename=os.path.join(cfg.output_logs_folder, '{0}.log'.format(p_output)), level=logging.DEBUG)
+
+    # init solution (`n` attributes)
+    def init():
+        return BinarySolution([], problem_size).random(validator)
+
+    # define evaluate function here (need of data information)
+    def evaluate(solution):
+
+        start = datetime.datetime.now()
+
+        # get indices of selected attributes (attribute selection from solution)
+        indices = []
+
+        for index, value in enumerate(solution._data): 
+            if value == 1: 
+                indices.append(index) 
+
+        print(f'Training SVM with {len(indices)} of {len(solution._data)} available features')
+
+        # keep only selected filters from solution
+        x_train_filters = X_train[:, indices]
+        x_test_filters = X_test[ :, indices]
+        
+        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
+        model = train_model(x_train_filters, y_train)
+
+        y_test_model = model.predict(x_test_filters)
+        y_test_predict = [ 1 if x > 0.5 else 0 for x in y_test_model ]
+        test_roc_auc = roc_auc_score(y_test, y_test_predict)
+
+        end = datetime.datetime.now()
+
+        diff = end - start
+
+        print("Real evaluation took: {}, score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))
+
+        return test_roc_auc
+
+
+    # build all output folder and files based on `output` name
+    backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
+    surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
+    surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)
+
+    if not os.path.exists(backup_model_folder):
+        os.makedirs(backup_model_folder)
+
+    if not os.path.exists(cfg.output_surrogates_model_folder):
+        os.makedirs(cfg.output_surrogates_model_folder)
+
+    if not os.path.exists(cfg.output_surrogates_data_folder):
+        os.makedirs(cfg.output_surrogates_data_folder)
+
+    backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
+    ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
+    surrogate_backup_file_path = os.path.join(cfg.output_surrogates_data_folder, p_output + '_train.csv')
+    surrogate_k_indices_backup_file_path = os.path.join(cfg.output_surrogates_data_folder, p_output + '_k_indices.csv')
+
+    # prepare optimization algorithm (only mutation operators are used here: ILS with local search only needs local permutations)
+    operators = [SimpleBinaryMutation(), SimpleMutation()]
+    policy = UCBPolicy(operators)
+
+    # define first line if necessary
+    if not os.path.exists(surrogate_output_data):
+        folder, _ = os.path.split(surrogate_output_data)
+
+        if not os.path.exists(folder):
+            os.makedirs(folder)
+
+        with open(surrogate_output_data, 'w') as f:
+            f.write('x;y\n')
+
+
+    # custom start surrogate variable based on problem size
+    p_start = int(0.5 * problem_size)
+
+    # fixed minimal number of real evaluations
+    if p_start < 50:
+        p_start = 50
+
+    print(f'Starting to use the surrogate after {p_start} real evaluations')
+
+    # custom ILS for surrogate use
+    algo = ILSMultiSurrogate(initalizer=init,
+                        evaluator=evaluate, # same evaluator by default, as we will use the surrogate function
+                        operators=operators,
+                        policy=policy,
+                        validator=validator,
+                        surrogate_file_path=surrogate_output_model,
+                        start_train_surrogate=p_start, # start learning and using surrogate after `p_start` real evaluations
+                        solutions_file=surrogate_output_data,
+                        ls_train_surrogate=p_every_ls, # retrain surrogate every `x` local searches
+                        k_division=p_k_division,
+                        k_dynamic=p_k_dynamic,
+                        maximise=True)
+    
+    algo.addCallback(BasicCheckpoint(every=1, filepath=backup_file_path))
+    algo.addCallback(UCBCheckpoint(every=1, filepath=ucb_backup_file_path))
+    algo.addCallback(SurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_backup_file_path)) # try every LS like this
+    algo.addCallback(MultiSurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_k_indices_backup_file_path)) # try every LS like this
+
+    bestSol = algo.run(p_ils_iteration, p_ls_iteration)
+
+    # print best solution found
+    print("Found ", bestSol)
+
+    # save model information into .csv file
+    if not os.path.exists(cfg.results_information_folder):
+        os.makedirs(cfg.results_information_folder)
+
+    filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)
+
+    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol._data) + ';' + str(list(bestSol._data).count(1)) + ';' + str(bestSol.fitness())
+    with open(filename_path, 'a') as f:
+        f.write(line_info + '\n')
+    
+    print('Result saved into %s' % filename_path)
+
+
+if __name__ == "__main__":
+    main()
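
For context, the `evaluate` function above reduces to: take the columns flagged by the binary solution, train a classifier on them, and score it with ROC AUC on a held-out split. A minimal self-contained sketch of that idea (synthetic data and a plain RandomForest stand-in for the grid-searched SVM, so it runs in seconds; names and values are illustrative only):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    rng = np.random.RandomState(0)
    X = rng.rand(200, 20)                      # 200 samples, 20 candidate attributes
    y = (X[:, 0] + X[:, 3] > 1).astype(int)    # label depends on attributes 0 and 3

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

    mask = np.zeros(20, dtype=int)             # binary solution: which attributes are kept
    mask[[0, 3, 7]] = 1
    indices = np.flatnonzero(mask)

    model = RandomForestClassifier(n_estimators=50, random_state=0)
    model.fit(X_train[:, indices], y_train)
    score = roc_auc_score(y_test, model.predict(X_test[:, indices]))
    print(f'{len(indices)} attributes kept, ROC AUC = {score:.3f}')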

+ 375 - 0
optimization/ILSMultiSurrogate.py

@@ -0,0 +1,375 @@
+"""Iterated Local Search Algorithm implementation using multiple-surrogate (weighted sum surrogate) as fitness approximation
+"""
+
+# main imports
+import os
+import logging
+import joblib
+import time
+import math
+import numpy as np
+import pandas as pd
+
+# module imports
+from macop.algorithms.Algorithm import Algorithm
+from .LSSurrogate import LocalSearchSurrogate
+from .utils.SurrogateAnalysis import SurrogateAnalysis
+
+from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
+                                    LassoCV, ElasticNet)
+
+from wsao.sao.problems.nd3dproblem import ND3DProblem
+from wsao.sao.surrogates.walsh import WalshSurrogate
+from wsao.sao.algos.fitter import FitterAlgo
+from wsao.sao.utils.analysis import SamplerAnalysis, FitterAnalysis, OptimizerAnalysis
+
+class ILSMultiSurrogate(Algorithm):
+    """Iterated Local Search used to avoid local optima and increave EvE (Exploration vs Exploitation) compromise using multiple-surrogate
+
+
+    Attributes:
+        initalizer: {function} -- basic function strategy to initialize solution
+        evaluator: {function} -- basic function in order to obtain fitness (mono or multiple objectives)
+        operators: {[Operator]} -- list of operators to use when launching algorithm
+        policy: {Policy} -- Policy class implementation strategy to select operators
+        validator: {function} -- basic function to check if solution is valid or not under some constraints
+        maximise: {bool} -- specify kind of optimization problem 
+        currentSolution: {Solution} -- current solution managed for current evaluation
+        bestSolution: {Solution} -- best solution found so far during running algorithm
+        ls_iteration: {int} -- number of evaluations for each local search algorithm
+        surrogate_file_path: {str} -- folder where sub-surrogate models are saved and loaded (models trained using https://gitlab.com/florianlprt/wsao)
+        start_train_surrogate: {int} -- number of evaluations expected before starting to train and use the surrogates
+        surrogates: [{Surrogate}] -- loaded sub-surrogate model instances
+        ls_train_surrogate: {int} -- base number of local searches between two surrogate retrainings
+        k_division: {int} -- number of sub-surrogate models (divisions of the feature indices)
+        k_dynamic: {bool} -- specify if indices are re-generated each time the surrogates are retrained
+        solutions_file: {str} -- Path where real evaluated solutions are saved in order to train surrogate again
+        callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
+    """
+    def __init__(self,
+                 initalizer,
+                 evaluator,
+                 operators,
+                 policy,
+                 validator,
+                 surrogate_file_path,
+                 start_train_surrogate,
+                 ls_train_surrogate,
+                 k_division,
+                 solutions_file,
+                 k_dynamic=False,
+                 maximise=True,
+                 parent=None):
+
+        # set real evaluator as default
+        super().__init__(initalizer, evaluator, operators, policy,
+                validator, maximise, parent)
+
+        self._n_local_search = 0
+        self._main_evaluator = evaluator
+
+        self._surrogate_file_path = surrogate_file_path
+        self._start_train_surrogate = start_train_surrogate
+
+        self._surrogate_evaluator = None
+        self._surrogate_analyser = None
+
+        self._ls_train_surrogate = ls_train_surrogate
+        self._solutions_file = solutions_file
+
+        self._k_division = k_division
+        self._k_dynamic = k_dynamic
+
+    def init_k_split_indices(self):
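+        """Split the attribute indices of the current best solution into `k_division` contiguous chunks
+
+        Example (illustrative): with a solution of size 10 and k_division=3,
+        n_elements = ceil(10/3) = 4, giving [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
+        (the last chunk may be shorter).
+
+        Returns:
+            {[[int]]} -- list of indices lists, one per sub-surrogate model
+        """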
+        a = list(range(self._bestSolution._size))
+        n_elements = int(math.ceil(self._bestSolution._size / self._k_division)) # use of ceil to avoid loss of data
+        splitted_indices = [a[x:x+n_elements] for x in range(0, len(a), n_elements)]
+
+        return splitted_indices
+        
+
+    def train_surrogates(self):
+        """Retrain if necessary the whole surrogate fitness approximation function
+        """
+        # Following https://gitlab.com/florianlprt/wsao, we re-train the model
+        # ---------------------------------------------------------------------------
+        # cli_restart.py problem=nd3d,size=30,filename="data/statistics_extended_svdn" \
+        #        model=lasso,alpha=1e-5 \
+        #        surrogate=walsh,order=3 \
+        #        algo=fitter,algo_restarts=10,samplefile=stats_extended.csv \
+        #        sample=1000,step=10 \
+        #        analysis=fitter,logfile=out_fit.csv
+
+        # TODO : pass run samples directly using train and test
+        # TODO : use of multiprocessing commands for each surrogate
+        # TODO : save each surrogate model into specific folder
+
+        # 1. Data sets preparation (train and test)
+        
+        # dynamic number of samples based on dataset real evaluations
+        nsamples = None
+        with open(self._solutions_file, 'r') as f:
+            nsamples = len(f.readlines()) - 1 # avoid header
+
+        training_samples = int(0.7 * nsamples) # 70% used for learning part at each iteration
+        
+        df = pd.read_csv(self._solutions_file, sep=';')
+        # learning set and test set
+        learn = df.sample(training_samples)
+        test = df.drop(learn.index)
+
+        print(f'Training all surrogate models using {training_samples} of {nsamples} samples for train dataset')
+
+        # 2. for each sub space indices, learn new surrogate
+
+        if not os.path.exists(self._surrogate_file_path):
+            os.makedirs(self._surrogate_file_path)
+
+        for i, indices in enumerate(self._k_indices):
+
+            current_learn = learn[learn.iloc[indices]]
+
+            problem = ND3DProblem(size=len(indices)) # problem size based on best solution size (need to improve...)
+            model = Lasso(alpha=1e-5)
+            surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
+            analysis = FitterAnalysis(logfile=f"train_surrogate_{i}.log", problem=problem)
+            algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
+
+            print(f"Start fitting again the surrogate model n°{i}")
+            for r in range(10):
+                print(f"Iteration n°{r}: for fitting surrogate n°{i}")
+                algo.run_samples(learn=current_learn, test=test, step=10)
+
+            # keep well ordered surrogate into file manager
+            str_index = str(i)
+
+            while len(str_index) < 6:
+                str_index = "0" + str_index
+
+            joblib.dump(algo, os.path.join(self._surrogate_file_path, f'surrogate_{str_index}'))
+
+
+    def load_surrogates(self):
+        """Load algorithm with surrogate model and create lambda evaluator function
+        """
+
+        # need to first train surrogate if not exist
+        if not os.path.exists(self._surrogate_file_path):
+            self.train_surrogates()
+
+        self._surrogates = []
+
+        surrogates_path = sorted(os.listdir(self._surrogate_file_path))
+
+        for surrogate_p in surrogates_path:
+            model_path = os.path.join(self._surrogate_file_path, surrogate_p)
+            surrogate_model = joblib.load(model_path)
+
+            self._surrogates.append(surrogate_model)
+
+    
+    def surrogate_evaluator(self, solution):
+        """Compute mean of each surrogate model using targeted indices
+
+        Args:
+            solution: {Solution} -- current solution to evaluate using multi-surrogate evaluation
+
+        Return:
+            mean: {float} -- mean score of surrogate models
+        """
+        scores = []
+        solution_data = np.array(solution._data)
+
+        # for each indices set, use the corresponding trained surrogate model to make a prediction
+        for i, indices in enumerate(self._k_indices):
+            current_data = solution_data[indices]
+            current_score = self._surrogates[i].surrogate.predict([current_data])[0]
+            scores.append(current_score)
+
+        return sum(scores) / len(scores)
+            
+    def surrogates_coefficient_of_determination(self):
+        """Compute r² for each sub surrogate model
+
+        Return:
+            r_squared: {float} -- mean of the r² scores obtained from the sub surrogate models
+        """
+
+        r_squared_scores = []
+
+        # for each indices set, get the r² score of the corresponding surrogate model
+        for i, _ in enumerate(self._k_indices):
+
+            r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
+            r_squared_scores.append(r_squared)
+
+        return sum(r_squared_scores) / len(r_squared_scores)
+
+
+
+    def add_to_surrogate(self, solution):
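+        """Append a real-evaluated solution to the surrogate training data file
+
+        The line format matches the `x;y` header of the solutions file: comma-separated
+        solution data, a `;` separator, then the real fitness score
+        (e.g. `1,0,1,0,...,1;0.8453`).
+
+        Args:
+            solution: {Solution} -- solution evaluated with the real evaluator
+        """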
+
+        # save real evaluated solution into specific file for surrogate
+        with open(self._solutions_file, 'a') as f:
+
+            line = ""
+
+            for index, e in enumerate(solution._data):
+
+                line += str(e)
+                
+                if index < len(solution._data) - 1:
+                    line += ","
+
+            line += ";"
+            line += str(solution._score)
+
+            f.write(line + "\n")
+
+    def run(self, evaluations, ls_evaluations=100):
+        """
+        Run the iterated local search algorithm using local search (EvE compromise)
+
+        Args:
+            evaluations: {int} -- number of global evaluations for ILS
+            ls_evaluations: {int} -- number of Local search evaluations (default: 100)
+
+        Returns:
+            {Solution} -- best solution found
+        """
+
+        # by default use of mother method to initialize variables
+        super().run(evaluations)
+
+        # initialize current solution
+        self.initRun()
+
+        # based on best solution found, initialize k pool indices
+        self._k_indices = self.init_k_split_indices()
+
+        # enable resuming for ILS
+        self.resume()
+
+        # count number of surrogate obtained and restart using real evaluations done
+        nsamples = None
+        with open(self._solutions_file, 'r') as f:
+            nsamples = len(f.readlines()) - 1 # avoid header
+
+        if self.getGlobalEvaluation() < nsamples:
+            print(f'Restart with {nsamples} real evaluations already done (surrogate training starts after {self._start_train_surrogate})')
+            self._numberOfEvaluations = nsamples
+
+        if self._start_train_surrogate > self.getGlobalEvaluation():
+        
+            # get `self._start_train_surrogate` real evaluations and save them into the surrogate dataset file
+            # using randomly generated solutions (in order to cover the search space)
+            while self._start_train_surrogate > self.getGlobalEvaluation():
+                
+                newSolution = self._initializer()
+
+                # evaluate new solution
+                newSolution.evaluate(self._evaluator)
+
+                # add it to surrogate pool
+                self.add_to_surrogate(newSolution)
+
+                self.increaseEvaluation()
+
+        # train surrogate on real evaluated solutions file
+        self.train_surrogates()
+        self.load_surrogates()
+
+        # local search algorithm implementation
+        while not self.stop():
+
+            # set current evaluator based on used or not of surrogate function
+            self._evaluator = self.surrogate_evaluator if self._start_train_surrogate <= self.getGlobalEvaluation() else self._main_evaluator
+
+            # create new local search instance
+            # passing global evaluation param from ILS
+            ls = LocalSearchSurrogate(self._initializer,
+                         self._evaluator,
+                         self._operators,
+                         self._policy,
+                         self._validator,
+                         self._maximise,
+                         parent=self)
+
+            # add same callbacks
+            for callback in self._callbacks:
+                ls.addCallback(callback)
+
+            # create and search solution from local search
+            newSolution = ls.run(ls_evaluations)
+
+            # if better solution than currently, replace it (solution saved in training pool, only if surrogate process is in a second process step)
+            # Update : always add new solution into surrogate pool, not only if solution is better
+            #if self.isBetter(newSolution) and self.start_train_surrogate < self.getGlobalEvaluation():
+            if self._start_train_surrogate <= self.getGlobalEvaluation():
+
+                # re-evaluate the solution found by the local search with the real evaluator
+                # (without use of surrogate) before deciding whether to keep it
+                fitness_score = self._main_evaluator(newSolution)
+                # self.increaseEvaluation() # dot not add evaluation
+
+                newSolution._score = fitness_score
+
+                # if solution is really better after real evaluation, then we replace
+                if self.isBetter(newSolution):
+                    self._bestSolution = newSolution
+
+                self.add_to_surrogate(newSolution)
+
+                self.progress()
+
+            # check using specific dynamic criteria based on r^2
+            r_squared = self.surrogates_coefficient_of_determination()
+            training_surrogate_every = int(r_squared * self._ls_train_surrogate)
+            print(f"=> R^2 of surrogate is of {r_squared}. Retraining model every {training_surrogate_every} LS")
+
+            # avoid a zero or negative retraining period (would break the modulo check below)
+            if training_surrogate_every <= 0:
+                training_surrogate_every = 1
+
+            # check if necessary or not to train again surrogate
+            if self._n_local_search % training_surrogate_every == 0 and self._start_train_surrogate <= self.getGlobalEvaluation():
+
+                # reinitialization of k_indices for the new training
+                if self._k_dynamic:
+                    print(f"Reinitialization of k_indices using `k={self._k_division} `for the new training")
+                    self.init_k_split_indices()
+
+                # train again surrogate on real evaluated solutions file
+                start_training = time.time()
+                self.train_surrogates()
+                training_time = time.time() - start_training
+
+                self._surrogate_analyser = SurrogateAnalysis(training_time, training_surrogate_every, r_squared, self.getGlobalMaxEvaluation(), self._n_local_search)
+
+                # reload new surrogate function
+                self.load_surrogates()
+
+            # increase number of local search done
+            self._n_local_search += 1
+
+            self.information()
+
+        logging.info(f"End of {type(self).__name__}, best solution found {self._bestSolution}")
+
+        self.end()
+        return self._bestSolution
+
+    def addCallback(self, callback):
+        """Add new callback to algorithm specifying usefull parameters
+
+        Args:
+            callback: {Callback} -- specific Callback instance
+        """
+        # specify current main algorithm reference
+        if self.getParent() is not None:
+            callback.setAlgo(self.getParent())
+        else:
+            callback.setAlgo(self)
+
+        # set as new
+        self._callbacks.append(callback)
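
The surrogate evaluation above is simply the mean of the k sub-surrogate predictions, each sub-model seeing only its own slice of the solution data. A toy sketch of that aggregation, with hypothetical stand-in predictors in place of the fitted Walsh surrogates (not the actual wsao API):

    import math
    import numpy as np

    def k_split_indices(size, k_division):
        # same contiguous chunking as init_k_split_indices above
        n_elements = int(math.ceil(size / k_division))
        return [list(range(size))[x:x + n_elements] for x in range(0, size, n_elements)]

    def make_fake_surrogate(weights):
        # hypothetical stand-in for a fitted sub-surrogate: a simple linear scorer
        return lambda data: float(np.dot(weights, data))

    solution_data = np.array([1, 0, 1, 1, 0, 1, 0, 0, 1, 1])
    k_indices = k_split_indices(len(solution_data), 3)
    surrogates = [make_fake_surrogate(np.random.RandomState(i).rand(len(idx)))
                  for i, idx in enumerate(k_indices)]

    # multi-surrogate evaluation: each sub-model predicts on its own indices, scores are averaged
    scores = [surrogates[i](solution_data[idx]) for i, idx in enumerate(k_indices)]
    approx_fitness = sum(scores) / len(scores)
    print(k_indices, approx_fitness)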

+ 1 - 11
optimization/ILSSurrogate.py

@@ -10,6 +10,7 @@ import time
 # module imports
 from macop.algorithms.Algorithm import Algorithm
 from .LSSurrogate import LocalSearchSurrogate
+from .utils.SurrogateAnalysis import SurrogateAnalysis
 
 from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
                                     LassoCV, ElasticNet)
@@ -20,17 +21,6 @@ from wsao.sao.algos.fitter import FitterAlgo
 from wsao.sao.utils.analysis import SamplerAnalysis, FitterAnalysis, OptimizerAnalysis
 
 
-# quick object for surrogate logging data
-class SurrogateAnalysis():
-
-    def __init__(self, time, every_ls, r2, evaluations, n_local_search):
-        self._time = time
-        self._every_ls = every_ls
-        self._r2 = r2
-        self._evaluations = evaluations
-        self._n_local_search = n_local_search
-
-
 class ILSSurrogate(Algorithm):
     """Iterated Local Search used to avoid local optima and increave EvE (Exploration vs Exploitation) compromise using surrogate
 

+ 96 - 0
optimization/callbacks/MultiSurrogateCheckpoint.py

@@ -0,0 +1,96 @@
+"""Basic Checkpoint class implementation
+"""
+
+# main imports
+import os
+import logging
+import numpy as np
+
+# module imports
+from macop.callbacks.Callback import Callback
+from macop.utils.color import macop_text, macop_line
+
+
+class MultiSurrogateCheckpoint(Callback):
+    """
+    MultiSurrogateCheckpoint is used for logging `k_indices` (sub-surrogate split) information
+
+    Attributes:
+        algo: {Algorithm} -- main algorithm instance reference
+        every: {int} -- checkpoint frequency used (based on number of evaluations)
+        filepath: {str} -- file path where checkpoints will be saved
+    """
+    def run(self):
+        """
+        Check if necessary to do backup based on `every` variable
+        """
+        # get current k_indices used by the multi-surrogate
+        k_indices = self._algo._k_indices
+
+        # do nothing if k_indices do not exist yet
+        if k_indices is None:
+            return
+
+        currentEvaluation = self._algo.getGlobalEvaluation()
+
+        # backup if necessary
+        if currentEvaluation % self._every == 0:
+
+            logging.info(f"Multi surrogate analysis checkpoint is done into {self._filepath}")
+
+            line = str(currentEvaluation) + ';'
+
+            for indices in k_indices:
+                
+                indices_data = ""
+                indices_size = len(indices)
+
+                for index, val in enumerate(indices):
+                    indices_data += str(val)
+
+                    if index < indices_size - 1:
+                        indices_data += ' '
+
+                line += indices_data + ';'
+
+            line += '\n'
+
+            # check if file exists
+            if not os.path.exists(self._filepath):
+                with open(self._filepath, 'w') as f:
+                    f.write(line)
+            else:
+                with open(self._filepath, 'a') as f:
+                    f.write(line)
+
+    def load(self):
+        """
+        Load `k_indices` from the last line of the checkpoint file (if it exists)
+        """
+        if os.path.exists(self._filepath):
+
+            logging.info('Load k_indices from last checkpoint')
+            with open(self._filepath) as f:
+
+                # get last line and read data
+                lastline = f.readlines()[-1]
+                data = lastline.split(';')
+
+                k_indices = data[1:-1] # drop the trailing newline token
+                k_indices_final = []
+
+                for indices in k_indices:
+                    k_indices_final.append(list(map(int, indices.split(' '))))
+
+                # set k_indices into main algorithm
+                self._algo._k_indices = k_indices_final
+
+            print(macop_line())
+            print(macop_text(f' MultiSurrogateCheckpoint found from `{self._filepath}` file.'))
+
+        else:
+            print(macop_text('No backup found... Start running using new `k_indices` values'))
+            logging.info("Can't load MultiSurrogate backup... Backup filepath not valid in  MultiSurrogateCheckpoint")
+
+        print(macop_line())
+
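
The checkpoint above serializes `k_indices` as `evaluation;i i i;i i i;...` per line, and loading takes the last line apart again. A minimal round-trip sketch of that format (standalone, with a hypothetical file path):

    k_indices = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
    current_evaluation = 120

    # write: evaluation counter first, then one space-separated indices group per sub-surrogate
    line = str(current_evaluation) + ';'
    for indices in k_indices:
        line += ' '.join(str(i) for i in indices) + ';'

    with open('example_k_indices.csv', 'a') as f:  # hypothetical checkpoint path
        f.write(line + '\n')

    # load: read the last line back and rebuild the indices groups
    with open('example_k_indices.csv') as f:
        data = f.readlines()[-1].split(';')

    restored = [list(map(int, group.split(' '))) for group in data[1:] if group.strip()]
    assert restored == k_indices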

+ 10 - 0
optimization/utils/SurrogateAnalysis.py

@@ -0,0 +1,10 @@
+# quick object for surrogate logging data
+class SurrogateAnalysis():
+
+    def __init__(self, time, every_ls, r2, evaluations, n_local_search):
+        self._time = time
+        self._every_ls = every_ls
+        self._r2 = r2
+        self._evaluations = evaluations
+        self._n_local_search = n_local_search
+
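
As used by ILSMultiSurrogate, this class is a plain value holder describing the last surrogate retraining; a minimal, illustrative instantiation (values are made up, and the import assumes running from the repository root):

    from optimization.utils.SurrogateAnalysis import SurrogateAnalysis

    analysis = SurrogateAnalysis(time=12.4, every_ls=8, r2=0.83, evaluations=1000, n_local_search=42)
    print(analysis._r2, analysis._every_ls)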