Browse source

Merge branch 'release/v0.3.5'

Jérôme BUISINE 3 years ago
Parent commit
0f6308b682

File diff suppressed because it is too large
+ 3280 - 0
OpenML_datasets/internet-advertissment.csv


File diff suppressed because it is too large
+ 0 - 2601
OpenML_datasets/madelon.csv


+ 64 - 0
features_selection/methods.py

@@ -0,0 +1,64 @@
+from sklearn.feature_selection import VarianceThreshold
+from sklearn.feature_selection import SelectKBest
+from sklearn.feature_selection import chi2
+from sklearn.svm import LinearSVC
+from sklearn.feature_selection import SelectFromModel
+from sklearn.svm import SVC
+from sklearn.model_selection import StratifiedKFold
+from sklearn.feature_selection import RFECV
+from sklearn.ensemble import ExtraTreesClassifier
+
+features_selection_list = [
+    "variance_threshold",
+    "kbest",
+    "linearSVC",
+    "tree",
+    "rfecv"
+]
+
+def features_selection_method(name, params, X_train, y_train, problem_size):
+
+    indices = []
+
+    if name == "variance_threshold":
+        percent_to_keep = float(params)
+        #sel = VarianceThreshold(threshold=(percent_to_keep * (1 - percent_to_keep)))
+        sel = VarianceThreshold(threshold=(percent_to_keep))
+        sel.fit_transform(X_train)
+
+        indices = sel.get_support(indices=True)
+
+    if name == "kbest":
+        k_param = int(float(params) * problem_size) # `params` is a percentage of the total number of features
+        model = SelectKBest(chi2, k=k_param).fit(X_train, y_train)
+
+        indices = model.get_support(indices=True)
+
+    if name == "linearSVC":
+        C_param = float(params)
+        lsvc = LinearSVC(C=C_param, penalty="l1", dual=False).fit(X_train, y_train)
+        model = SelectFromModel(lsvc, prefit=True)
+
+        indices = model.get_support(indices=True)
+
+    if name == "tree":
+        n_estimators_param = int(params)
+        clf = ExtraTreesClassifier(n_estimators=n_estimators_param)
+        clf = clf.fit(X_train, y_train)
+        model = SelectFromModel(clf, prefit=True)
+
+        indices = model.get_support(indices=True)
+
+    if name == "rfecv":
+        cv_param = int(params)
+        # Create the RFE object and compute a cross-validated score
+        svc = SVC(kernel="linear")
+        # scoring here uses ROC AUC, consistent with how the final model is evaluated
+        rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(cv_param),
+                    scoring='roc_auc')
+        rfecv.fit(X_train, y_train)
+
+        indices = rfecv.get_support(indices=True)
+
+    return indices
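A quick sanity check for the new module (not part of the commit; a minimal sketch assuming scikit-learn's make_classification and that the file is importable as features_selection.methods):

from sklearn.datasets import make_classification
from features_selection.methods import features_selection_method

# synthetic binary classification problem with 20 features
X, y = make_classification(n_samples=200, n_features=20, random_state=0)

# chi2 requires non-negative inputs, so shift each feature,
# as the real pipeline does with MinMaxScaler
X = X - X.min(axis=0)

# "kbest" interprets params as a percentage of the feature count: keep 60%
indices = features_selection_method('kbest', '0.6', X, y, X.shape[1])
print(indices)  # column indices of the selected features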

+ 71 - 0
features_selection/run_all_openML.py

@@ -0,0 +1,71 @@
+import os, argparse
+
+params = {
+    "variance_threshold": [
+        "0.001",
+        "0.01",
+        "0.05",
+        "0.1",
+    ],
+    "kbest": [
+        "0.9",
+        "0.8",
+        "0.7",
+        "0.6",
+    ],
+    "linearSVC": [
+        "0.1",
+        "1",
+        "10",
+        "100"
+    ],
+    "tree": [
+        "10",
+        "50",
+        "100",
+        "200",
+    ],
+    "rfecv": [
+        "3",
+        "4",
+        "5"
+    ]
+}
+
+open_ml_problems_folder = 'OpenML_datasets'
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Run feature selection over all OpenML datasets for each method and parameter value")
+
+    parser.add_argument('--ntrain', type=int, help='number of training in order to keep mean of score', default=1)
+    parser.add_argument('--output', type=str, help='output features selection results', required=True)
+
+    args = parser.parse_args()
+
+    p_ntrain    = args.ntrain
+    p_output    = args.output
+
+    open_ml_problems = sorted(os.listdir(open_ml_problems_folder))
+
+    for ml_problem in open_ml_problems:
+
+        ml_problem_name = ml_problem.replace('.csv', '')
+        ml_problem_path = os.path.join(open_ml_problems_folder, ml_problem)
+
+        for key, values in params.items():
+
+            for param in values:
+
+                print(f'Run features selection for OpenML `{ml_problem_name}` problem with {{method: {key}, params: {param}, ntrain: {p_ntrain}}}')
+                command_str = f'python features_selection/run_method_openML.py ' \
+                            f'--data {ml_problem_path} ' \
+                            f'--method {key} ' \
+                            f'--params {param} ' \
+                            f'--ntrain {p_ntrain} ' \
+                            f'--output {p_output}'
+                             
+                os.system(command_str)
+
+if __name__ == "__main__":
+    main()
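As a usage note, a hypothetical invocation of this driver (output path assumed): it shells out to run_method_openML.py once per (dataset, method, param) combination, and every run appends one result line to the same output file:

python features_selection/run_all_openML.py --ntrain 5 --output results/features_selection_openml.csv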

+ 147 - 0
features_selection/run_method_openML.py

@@ -0,0 +1,147 @@
+import os, argparse
+
+import numpy as np
+import pandas as pd
+
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.model_selection import GridSearchCV
+from sklearn.metrics import roc_auc_score, accuracy_score
+import sklearn.svm as svm
+
+from methods import features_selection_list, features_selection_method
+
+
+def train_model(X_train, y_train):
+
+    print ('Creating model...')
+    # here use of SVM with grid search CV
+    Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
+    gammas = [0.001, 0.01, 0.1,10, 100, 1000]
+    param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
+
+    svc = svm.SVC(probability=True, class_weight='balanced')
+    clf = GridSearchCV(svc, param_grid, cv=2, verbose=1, n_jobs=-1)
+
+    clf.fit(X_train, y_train)
+
+    model = clf.best_estimator_
+
+    return model
+
+def loadDataset(filename):
+
+    ########################
+    # 1. Get and prepare data
+    ########################
+    dataset = pd.read_csv(filename, sep=',')
+
+    # change label as common
+    min_label_value = min(dataset.iloc[:, -1])
+    max_label_value = max(dataset.iloc[:, -1])
+
+    dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(min_label_value, 0)
+    dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(max_label_value, 1)
+
+    X_dataset = dataset.iloc[:, :-1]
+    y_dataset = dataset.iloc[:, -1]
+
+    problem_size = len(X_dataset.columns)
+
+    # min/max normalisation over feature
+    # create a scaler object
+    scaler = MinMaxScaler()
+    # fit and transform the data
+    X_dataset = np.array(pd.DataFrame(scaler.fit_transform(X_dataset), columns=X_dataset.columns))
+
+    # prepare train, validation and test datasets
+    X_train, X_test, y_train, y_test = train_test_split(X_dataset, y_dataset, test_size=0.3, shuffle=True)
+
+    return X_train, y_train, X_test, y_test, problem_size
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Run one feature selection method on an OpenML dataset and evaluate it")
+
+    parser.add_argument('--data', type=str, help='open ml dataset filename prefix', required=True)
+    parser.add_argument('--method', type=str, help='method name to use', choices=features_selection_list, required=True)
+    parser.add_argument('--params', type=str, help='params used for the current selected method', required=True)
+    parser.add_argument('--ntrain', type=int, help='number of training in order to keep mean of score', default=1)
+    parser.add_argument('--output', type=str, help='output features selection results')
+
+    args = parser.parse_args()
+
+    p_data_file = args.data
+    p_method    = args.method
+    p_params    = args.params
+    p_ntrain    = args.ntrain
+    p_output    = args.output
+
+    # load data from file and get problem size
+    X_train, y_train, X_test, y_test, problem_size = loadDataset(p_data_file)
+
+    # extract indices selected features
+    features_indices = features_selection_method(p_method, p_params, X_train, y_train, problem_size)
+
+    print(f'Selected {len(features_indices)} features out of {problem_size}')
+
+    auc_scores = []
+    acc_scores = []
+    
+    for i in range(p_ntrain):
+
+        # new split of dataset
+        X_train, y_train, X_test, y_test, problem_size = loadDataset(p_data_file)
+
+        # get reduced dataset
+        X_train_reduced = X_train[:, features_indices]
+        X_test_reduced = X_test[:, features_indices]
+
+
+        # get trained model over reduce dataset
+        model = train_model(X_train_reduced, y_train)
+
+        # get predicted labels over test dataset
+        y_test_model = model.predict(X_test_reduced)
+        y_test_predict = [ 1 if x > 0.5 else 0 for x in y_test_model ]
+        test_roc_auc = roc_auc_score(y_test, y_test_predict)
+        test_acc = accuracy_score(y_test, y_test_predict)
+
+        print(f'Run n°{i}: {test_roc_auc} (AUC ROC)')
+
+        # append score into list of run
+        auc_scores.append(test_roc_auc)
+        acc_scores.append(test_acc)
+
+    mean_auc_score = sum(auc_scores) / len(auc_scores)
+    mean_acc_score = sum(acc_scores) / len(acc_scores)
+
+    var_acc_score = np.var(acc_scores)
+    var_auc_score = np.var(auc_scores)
+
+    std_acc_score = np.std(acc_scores)
+    std_auc_score = np.std(auc_scores)
+
+    print(f'Model performance using {p_method} (params: {p_params}) is of {mean_auc_score:.2f}')
+
+    # now save trained model and params obtained
+    header_line = 'dataset;method;params;ntrain;n_features;acc_test;auc_test;var_acc_test;var_auc_test;std_acc_test;std_auc_test;features_indices\n'
+    data_line = f'{p_data_file};{p_method};{p_params};{p_ntrain};{len(features_indices)};{mean_acc_score};{mean_auc_score};{var_acc_score};{var_auc_score};{std_acc_score};{std_auc_score};{" ".join(list(map(str, features_indices)))}\n'
+
+    output_folder, _ = os.path.split(p_output)
+
+    if len(output_folder) > 0:
+        if not os.path.exists(output_folder):
+            os.makedirs(output_folder)
+
+    if not os.path.exists(p_output):
+        with open(p_output, 'w') as f:
+            f.write(header_line)
+
+    with open(p_output, 'a') as f:
+        f.write(data_line)
+    
+
+if __name__ == "__main__":
+    main()
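The results file is a semicolon-separated CSV with the header defined above, so it can be inspected afterwards with pandas; a minimal sketch (path assumed, matching the hypothetical driver invocation earlier):

import pandas as pd

# columns include dataset, method, params, n_features, acc_test, auc_test, ...
df = pd.read_csv('results/features_selection_openml.csv', sep=';')

# mean test AUC per feature selection method, across datasets and params
print(df.groupby('method')['auc_test'].mean())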

+ 1 - 1
find_best_attributes.py

@@ -190,7 +190,7 @@ def main():
                 filters_counter += 1


-    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness())
+    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness)
     with open(filename_path, 'a') as f:
         f.write(line_info + '\n')
     

+ 91 - 63
find_best_attributes_surrogate.py

@@ -26,20 +26,23 @@ sys.path.insert(0, '') # trick to enable import of main folder module
 import custom_config as cfg
 import models as mdl
 
-from optimization.ILSSurrogate import ILSSurrogate
-from macop.solutions.BinarySolution import BinarySolution
+from optimization.ILSPopSurrogate import ILSPopSurrogate
+from macop.solutions.discrete import BinarySolution
+from macop.evaluators.base import Evaluator
 
-from macop.operators.mutators.SimpleMutation import SimpleMutation
-from macop.operators.mutators.SimpleBinaryMutation import SimpleBinaryMutation
-from macop.operators.crossovers.SimpleCrossover import SimpleCrossover
-from macop.operators.crossovers.RandomSplitCrossover import RandomSplitCrossover
+from macop.operators.discrete.mutators import SimpleMutation
+from macop.operators.discrete.mutators import SimpleBinaryMutation
+from macop.operators.discrete.crossovers import SimpleCrossover
+from macop.operators.discrete.crossovers import RandomSplitCrossover
+from optimization.operators.SimplePopCrossover import SimplePopCrossover, RandomPopCrossover
 
-from macop.operators.policies.UCBPolicy import UCBPolicy
+from macop.policies.reinforcement import UCBPolicy
 
-from macop.callbacks.BasicCheckpoint import BasicCheckpoint
-from macop.callbacks.UCBCheckpoint import UCBCheckpoint
+from macop.callbacks.classicals import BasicCheckpoint
+from macop.callbacks.policies import UCBCheckpoint
+from optimization.callbacks.MultiPopCheckpoint import MultiPopCheckpoint
 
-from sklearn.ensemble import RandomForestClassifier
+#from sklearn.ensemble import RandomForestClassifier
 
 # variables and parameters
 models_list         = cfg.models_names_list
@@ -58,6 +61,8 @@ def loadDataset(filename):
     ########################
     # 1. Get and prepare data
     ########################
+    # scene_name; zone_id; image_index_end; label; data
+
     dataset_train = pd.read_csv(filename + '.train', header=None, sep=";")
     dataset_test = pd.read_csv(filename + '.test', header=None, sep=";")
 
@@ -66,12 +71,12 @@ def loadDataset(filename):
     dataset_test = shuffle(dataset_test)
 
     # get dataset with equal number of classes occurences
-    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
-    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
+    noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 1]
+    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 3] == 0]
     #nb_noisy_train = len(noisy_df_train.index)
 
-    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
-    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
+    noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 1]
+    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 3] == 0]
     #nb_noisy_test = len(noisy_df_test.index)
 
     # use of all data
@@ -83,22 +88,40 @@ def loadDataset(filename):
     final_df_test = shuffle(final_df_test)
 
     # use of the whole data set for training
-    x_dataset_train = final_df_train.iloc[:,1:]
-    x_dataset_test = final_df_test.iloc[:,1:]
+    x_dataset_train = final_df_train.iloc[:, 4:]
+    x_dataset_test = final_df_test.iloc[:, 4:]
 
-    y_dataset_train = final_df_train.iloc[:,0]
-    y_dataset_test = final_df_test.iloc[:,0]
+    y_dataset_train = final_df_train.iloc[:, 3]
+    y_dataset_test = final_df_test.iloc[:, 3]
 
     return x_dataset_train, y_dataset_train, x_dataset_test, y_dataset_test
 
+def _get_best_model(X_train, y_train):
+
+    Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
+    gammas = [0.001, 0.01, 0.1, 5, 10, 100]
+    param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
+
+    svc = svm.SVC(probability=True, class_weight='balanced')
+    #clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, scoring=my_accuracy_scorer, n_jobs=-1)
+    clf = GridSearchCV(svc, param_grid, cv=5, verbose=0, n_jobs=-1)
+
+    clf.fit(X_train, y_train)
+
+    model = clf.best_estimator_
+
+    return model
+
 def main():
 
     parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
 
     parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
-    parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list, default=models_list[0], required=False)
-    parser.add_argument('--start_surrogate', type=int, help='number of evalution before starting surrogare model', default=1000)
+    parser.add_argument('--start_surrogate', type=int, help='number of evaluations before starting the surrogate model', required=True)
+    parser.add_argument('--train_every', type=int, help='max number of evaluations before retraining the surrogate model', required=True)
     parser.add_argument('--length', type=int, help='max data length (need to be specify for evaluator)', required=True)
+    parser.add_argument('--pop', type=int, help='pop size', required=True)
+    parser.add_argument('--order', type=int, help='walsh order function', required=True)
     parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
     parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
     parser.add_argument('--output', type=str, help='output surrogate model name')
@@ -106,9 +129,11 @@ def main():
     args = parser.parse_args()
 
     p_data_file = args.data
-    p_choice    = args.choice
     p_length    = args.length
+    p_pop       = args.pop
+    p_order     = args.order
     p_start     = args.start_surrogate
+    p_retrain   = args.train_every
     p_ils_iteration = args.ils
     p_ls_iteration  = args.ls
     p_output = args.output
@@ -126,42 +151,41 @@ def main():
 
     # init solution (`n` attributes)
     def init():
-        return BinarySolution([], p_length
-        ).random(validator)
+        return BinarySolution.random(p_length, validator)
 
-    # define evaluate function here (need of data information)
-    def evaluate(solution):
 
-        start = datetime.datetime.now()
+    class SVMEvaluator(Evaluator):
 
-        # get indices of filters data to use (filters selection from solution)
-        indices = []
+        # define evaluate function here (need of data information)
+        def compute(self, solution):
+            start = datetime.datetime.now()
 
-        for index, value in enumerate(solution.data): 
-            if value == 1: 
-                indices.append(index) 
+            # get indices of filters data to use (filters selection from solution)
+            indices = []
 
-        # keep only selected filters from solution
-        x_train_filters = x_train.iloc[:, indices]
-        y_train_filters = y_train
-        x_test_filters = x_test.iloc[:, indices]
-        
-        # TODO : use of GPU implementation of SVM
-        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
+            for index, value in enumerate(solution.data): 
+                if value == 1: 
+                    indices.append(index) 
 
-        model = RandomForestClassifier(n_estimators=10)
-        model = model.fit(x_train_filters, y_train_filters)
-        
-        y_test_model = model.predict(x_test_filters)
-        test_roc_auc = roc_auc_score(y_test, y_test_model)
+            # keep only selected filters from solution
+            x_train_filters = self._data['x_train'].iloc[:, indices]
+            y_train_filters = self._data['y_train']
+            x_test_filters = self._data['x_test'].iloc[:, indices]
+            
+            model = _get_best_model(x_train_filters, y_train_filters)
+            #model = RandomForestClassifier(n_estimators=10)
+            #model = model.fit(x_train_filters, y_train_filters)
+            
+            y_test_model = model.predict(x_test_filters)
+            test_roc_auc = roc_auc_score(self._data['y_test'], y_test_model)
 
-        end = datetime.datetime.now()
+            end = datetime.datetime.now()
 
-        diff = end - start
+            diff = end - start
 
-        print("Real evaluation took: {}, score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))
+            #print("Real evaluation took: {}, score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))
 
-        return test_roc_auc
+            return test_roc_auc
 
 
     # build all output folder and files based on `output` name
@@ -182,28 +206,31 @@ def main():
     ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
 
     # prepare optimization algorithm (only use of mutation as only ILS are used here, and local search need only local permutation)
-    operators = [SimpleBinaryMutation(), SimpleMutation()]
-    policy = UCBPolicy(operators)
+    operators = [SimpleBinaryMutation(), SimpleMutation(), RandomPopCrossover(), SimplePopCrossover()]
+    policy = UCBPolicy(operators, C=100, exp_rate=0.1)
 
     # define first line if necessary
     if not os.path.exists(surrogate_output_data):
-        with open(surrogate_output_data) as f:
+        with open(surrogate_output_data, 'w') as f:
             f.write('x;y\n')
 
     # custom ILS for surrogate use
-    algo = ILSSurrogate(_initalizer=init, 
-                        _evaluator=evaluate, # same evaluator by defadefaultult, as we will use the surrogate function
-                        _operators=operators, 
-                        _policy=policy, 
-                        _validator=validator,
-                        _surrogate_file_path=surrogate_output_model,
-                        _start_train_surrogate=p_start, # start learning and using surrogate after 1000 real evaluation
-                        _solutions_file=surrogate_output_data,
-                        _ls_train_surrogate=1,
-                        _maximise=True)
+    algo = ILSPopSurrogate(initalizer=init, 
+                        evaluator=SVMEvaluator(data={'x_train': x_train, 'y_train': y_train, 'x_test': x_test, 'y_test': y_test}), # same evaluator by default, as we will use the surrogate function
+                        operators=operators, 
+                        policy=policy, 
+                        validator=validator,
+                        population_size=p_pop,
+                        surrogate_file_path=surrogate_output_model,
+                        start_train_surrogate=p_start, # start learning and using surrogate after `p_start` real evaluations
+                        solutions_file=surrogate_output_data,
+                        walsh_order=p_order,
+                        inter_policy_ls_file=os.path.join(backup_model_folder, p_output + '_ls_ucbPolicy.csv'),
+                        ls_train_surrogate=p_retrain,
+                        maximise=True)
     
-    algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))
-    algo.addCallback(UCBCheckpoint(_every=1, _filepath=ucb_backup_file_path))
+    algo.addCallback(MultiPopCheckpoint(every=1, filepath=backup_file_path))
+    algo.addCallback(UCBCheckpoint(every=1, filepath=ucb_backup_file_path))
 
     bestSol = algo.run(p_ils_iteration, p_ls_iteration)
 
@@ -217,6 +244,7 @@ def main():
     filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)
 
     filters_counter = 0
+
     # count number of filters
     for index, item in enumerate(bestSol.data):
         if index != 0 and index % 2 == 1:
@@ -226,7 +254,7 @@ def main():
                 filters_counter += 1


-    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness())
+    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness)
     with open(filename_path, 'a') as f:
         f.write(line_info + '\n')
     

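For reference, a hypothetical invocation of the reworked script with the new population and Walsh surrogate arguments (dataset prefix and values assumed, not taken from the commit):

python find_best_attributes_surrogate.py --data data/datasets/my_dataset --start_surrogate 100 --train_every 20 --length 32 --pop 20 --order 2 --ils 1000 --ls 50 --output my_dataset_pop_surrogate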
+ 20 - 15
find_best_attributes_surrogate_dl.py

@@ -79,7 +79,7 @@ def build_input(df):
 def validator(solution):
 
     # at least 5 attributes
-    if list(solution.data).count(1) < 5:
+    if list(solution._data).count(1) < 5:
         return False
 
     return True
@@ -168,6 +168,7 @@ def main():
     parser.add_argument('--length', type=int, help='max data length (need to be specify for evaluator)', required=True)
     parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
     parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
+    parser.add_argument('--every_ls', type=int, help='number of max iteration for retraining surrogate model', required=True)
     parser.add_argument('--output', type=str, help='output surrogate model name')
 
     args = parser.parse_args()
@@ -177,6 +178,7 @@ def main():
     p_start     = args.start_surrogate
     p_ils_iteration = args.ils
     p_ls_iteration  = args.ls
+    p_every_ls      = args.every_ls
     p_output = args.output
 
     print(p_data_file)
@@ -202,7 +204,7 @@ def main():
         # get indices of filters data to use (filters selection from solution)
         indices = []
 
-        for index, value in enumerate(solution.data): 
+        for index, value in enumerate(solution._data): 
             if value == 1: 
                 indices.append(index) 
 
@@ -230,6 +232,7 @@ def main():
         test_roc_auc = roc_auc_score(y_test, y_test_predict)
 
         end = datetime.datetime.now()
+        del model
 
         diff = end - start
 
@@ -254,6 +257,7 @@ def main():
 
     backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
     ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
+    surrogate_backup_file_path = os.path.join(cfg.output_surrogates_data_folder, p_output + '_train.csv')
 
     # prepare optimization algorithm (only use of mutation as only ILS are used here, and local search need only local permutation)
     operators = [SimpleBinaryMutation(), SimpleMutation()]
@@ -270,19 +274,20 @@ def main():
             f.write('x;y\n')
 
     # custom ILS for surrogate use
-    algo = ILSSurrogate(_initalizer=init, 
-                        _evaluator=evaluate, # same evaluator by defadefaultult, as we will use the surrogate function
-                        _operators=operators, 
-                        _policy=policy, 
-                        _validator=validator,
-                        _surrogate_file_path=surrogate_output_model,
-                        _start_train_surrogate=p_start, # start learning and using surrogate after 1000 real evaluation
-                        _solutions_file=surrogate_output_data,
-                        _ls_train_surrogate=1,
-                        _maximise=True)
+    algo = ILSSurrogate(initalizer=init, 
+                        evaluator=evaluate, # same evaluator by default, as we will use the surrogate function
+                        operators=operators, 
+                        policy=policy, 
+                        validator=validator,
+                        surrogate_file_path=surrogate_output_model,
+                        start_train_surrogate=p_start, # start learning and using surrogate after `p_start` real evaluations
+                        solutions_file=surrogate_output_data,
+                        ls_train_surrogate=p_every_ls,
+                        maximise=True)
     
-    algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))
-    algo.addCallback(UCBCheckpoint(_every=1, _filepath=ucb_backup_file_path))
+    algo.addCallback(BasicCheckpoint(every=1, filepath=backup_file_path))
+    algo.addCallback(UCBCheckpoint(every=1, filepath=ucb_backup_file_path))
+    algo.addCallback(SurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_backup_file_path)) # try every LS like this
 
     bestSol = algo.run(p_ils_iteration, p_ls_iteration)
 
@@ -305,7 +310,7 @@ def main():
                 filters_counter += 1


-    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness())
+    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(filters_counter) + ';' + str(bestSol.fitness)
     with open(filename_path, 'a') as f:
         f.write(line_info + '\n')
     

+ 32 - 28
find_best_attributes_surrogate_openML.py

@@ -14,10 +14,6 @@ from sklearn.model_selection import GridSearchCV
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier, VotingClassifier
 
-from keras.layers import Dense, Dropout, LSTM, Embedding, GRU, BatchNormalization
-from keras.preprocessing.sequence import pad_sequences
-from keras.models import Sequential
-
 import joblib
 import sklearn
 import sklearn.svm as svm
@@ -44,6 +40,7 @@ from macop.operators.policies.UCBPolicy import UCBPolicy
 
 from macop.callbacks.BasicCheckpoint import BasicCheckpoint
 from macop.callbacks.UCBCheckpoint import UCBCheckpoint
+from optimization.callbacks.SurrogateCheckpoint import SurrogateCheckpoint
 
 from sklearn.ensemble import RandomForestClassifier
 
@@ -52,22 +49,22 @@ from sklearn.ensemble import RandomForestClassifier
 def validator(solution):
 
     # at least 5 attributes
-    if list(solution.data).count(1) < 5:
+    if list(solution._data).count(1) < 2:
         return False
 
     return True
 
 def train_model(X_train, y_train):
 
-    print ('Creating model...')
+    #print ('Creating model...')
     # here use of SVM with grid search CV
-    Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
-    gammas = [0.001, 0.01, 0.1, 5, 10, 100]
+    Cs = [0.001, 0.01, 0.1, 1, 10, 100]
+    gammas = [0.001, 0.01, 0.1,10, 100]
     param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
 
     svc = svm.SVC(probability=True, class_weight='balanced')
     #clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, scoring=my_accuracy_scorer, n_jobs=-1)
-    clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, n_jobs=-1)
+    clf = GridSearchCV(svc, param_grid, cv=4, verbose=0, n_jobs=-1)
 
     clf.fit(X_train, y_train)
 
@@ -77,8 +74,6 @@ def train_model(X_train, y_train):
 
 def loadDataset(filename):
 
-    # TODO : load data using DL RNN 
-
     ########################
     # 1. Get and prepare data
     ########################
@@ -113,7 +108,7 @@ def main():
     parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
 
     parser.add_argument('--data', type=str, help='open ml dataset filename prefix', required=True)
-    #parser.add_argument('--start_surrogate', type=int, help='number of evalution before starting surrogare model', default=100)
+    parser.add_argument('--every_ls', type=int, help='train every ls surrogate model', default=50) # default value
     parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
     parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
     parser.add_argument('--output', type=str, help='output surrogate model name')
@@ -121,7 +116,7 @@ def main():
     args = parser.parse_args()
 
     p_data_file = args.data
-    #p_start     = args.start_surrogate
+    p_every_ls   = args.every_ls
     p_ils_iteration = args.ils
     p_ls_iteration  = args.ls
     p_output = args.output
@@ -147,10 +142,12 @@ def main():
         # get indices of filters data to use (filters selection from solution)
         indices = []
 
-        for index, value in enumerate(solution.data): 
+        for index, value in enumerate(solution._data): 
             if value == 1: 
                 indices.append(index) 
 
+        print(f'Training SVM with {len(indices)} from {len(solution._data)} available features')
+
         # keep only selected filters from solution
         x_train_filters = X_train[:, indices]
         x_test_filters = X_test[ :, indices]
@@ -187,6 +184,7 @@ def main():
 
     backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
     ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
+    surrogate_backup_file_path = os.path.join(cfg.output_surrogates_data_folder, p_output + '_train.csv')
 
     # prepare optimization algorithm (only use of mutation as only ILS are used here, and local search need only local permutation)
     operators = [SimpleBinaryMutation(), SimpleMutation()]
@@ -204,23 +202,29 @@ def main():
 
 
     # custom start surrogate variable based on problem size
-    p_start = int(problem_size)
+    p_start = int(0.5 * problem_size)
+
+    # fixed limit
+    if p_start < 50:
+        p_start = 50
+
     print(f'Starting using surrogate after {p_start} reals training')
 
     # custom ILS for surrogate use
-    algo = ILSSurrogate(_initalizer=init, 
-                        _evaluator=evaluate, # same evaluator by defadefaultult, as we will use the surrogate function
-                        _operators=operators, 
-                        _policy=policy, 
-                        _validator=validator,
-                        _surrogate_file_path=surrogate_output_model,
-                        _start_train_surrogate=p_start, # start learning and using surrogate after 1000 real evaluation
-                        _solutions_file=surrogate_output_data,
-                        _ls_train_surrogate=1,
-                        _maximise=True)
+    algo = ILSSurrogate(initalizer=init, 
+                        evaluator=evaluate, # same evaluator by default, as we will use the surrogate function
+                        operators=operators, 
+                        policy=policy, 
+                        validator=validator,
+                        surrogate_file_path=surrogate_output_model,
+                        start_train_surrogate=p_start, # start learning and using surrogate after `p_start` real evaluations
+                        solutions_file=surrogate_output_data,
+                        ls_train_surrogate=p_every_ls, # retrain surrogate every `every_ls` iterations
+                        maximise=True)
     
-    algo.addCallback(BasicCheckpoint(_every=1, _filepath=backup_file_path))
-    algo.addCallback(UCBCheckpoint(_every=1, _filepath=ucb_backup_file_path))
+    algo.addCallback(BasicCheckpoint(every=1, filepath=backup_file_path))
+    algo.addCallback(UCBCheckpoint(every=1, filepath=ucb_backup_file_path))
+    algo.addCallback(SurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_backup_file_path)) # try every LS like this
 
     bestSol = algo.run(p_ils_iteration, p_ls_iteration)
 
@@ -233,7 +237,7 @@ def main():
 
     filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)
 
-    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(bestSol.fitness())
+    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(bestSol.fitness)
     with open(filename_path, 'a') as f:
         f.write(line_info + '\n')
     

+ 271 - 0
find_best_attributes_surrogate_openML_multi.py

@@ -0,0 +1,271 @@
+# main imports
+import os
+import sys
+import argparse
+import pandas as pd
+import numpy as np
+import logging
+import datetime
+import random
+
+# model imports
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import GridSearchCV
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier, VotingClassifier
+
+import joblib
+import sklearn
+import sklearn.svm as svm
+from sklearn.utils import shuffle
+from sklearn.metrics import roc_auc_score
+from sklearn.model_selection import cross_val_score
+from sklearn.preprocessing import MinMaxScaler
+
+# modules and config imports
+sys.path.insert(0, '') # trick to enable import of main folder module
+
+import custom_config as cfg
+import models as mdl
+
+from optimization.ILSMultiSurrogate import ILSMultiSurrogate
+from macop.solutions.BinarySolution import BinarySolution
+
+from macop.operators.mutators.SimpleMutation import SimpleMutation
+from macop.operators.mutators.SimpleBinaryMutation import SimpleBinaryMutation
+from macop.operators.crossovers.SimpleCrossover import SimpleCrossover
+from macop.operators.crossovers.RandomSplitCrossover import RandomSplitCrossover
+
+from macop.operators.policies.UCBPolicy import UCBPolicy
+
+from macop.callbacks.BasicCheckpoint import BasicCheckpoint
+from macop.callbacks.UCBCheckpoint import UCBCheckpoint
+from optimization.callbacks.SurrogateCheckpoint import SurrogateCheckpoint
+from optimization.callbacks.MultiSurrogateCheckpoint import MultiSurrogateCheckpoint
+
+from sklearn.ensemble import RandomForestClassifier
+
+# avoid display of warning
+def warn(*args, **kwargs):
+    pass
+
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.warn = warn
+
+# default validator
+def validator(solution):
+
+    # at least 2 attributes
+    if list(solution._data).count(1) < 2:
+        return False
+
+    return True
+
+def train_model(X_train, y_train):
+
+    #print ('Creating model...')
+    # here use of SVM with grid search CV
+    Cs = [0.001, 0.01, 0.1, 1, 10, 100]
+    gammas = [0.001, 0.01, 0.1,10, 100]
+    param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
+
+    svc = svm.SVC(probability=True, class_weight='balanced')
+    #clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, scoring=my_accuracy_scorer, n_jobs=-1)
+    clf = GridSearchCV(svc, param_grid, cv=4, verbose=0, n_jobs=-1)
+
+    clf.fit(X_train, y_train)
+
+    model = clf.best_estimator_
+
+    return model
+
+def loadDataset(filename):
+
+    ########################
+    # 1. Get and prepare data
+    ########################
+    dataset = pd.read_csv(filename, sep=',')
+
+    # change label as common
+    min_label_value = min(dataset.iloc[:, -1])
+    max_label_value = max(dataset.iloc[:, -1])
+
+    dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(min_label_value, 0)
+    dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(max_label_value, 1)
+
+    X_dataset = dataset.iloc[:, :-1]
+    y_dataset = dataset.iloc[:, -1]
+
+    problem_size = len(X_dataset.columns)
+
+    # min/max normalisation over feature
+    # create a scaler object
+    scaler = MinMaxScaler()
+    # fit and transform the data
+    X_dataset = np.array(pd.DataFrame(scaler.fit_transform(X_dataset), columns=X_dataset.columns))
+
+    # prepare train, validation and test datasets
+    X_train, X_test, y_train, y_test = train_test_split(X_dataset, y_dataset, test_size=0.3, shuffle=True)
+
+    return X_train, y_train, X_test, y_test, problem_size
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
+
+    parser.add_argument('--data', type=str, help='open ml dataset filename prefix', required=True)
+    parser.add_argument('--every_ls', type=int, help='train every ls surrogate model', default=50) # default value
+    parser.add_argument('--k_division', type=int, help='number of expected sub surrogate model', default=20)
+    parser.add_argument('--k_dynamic', type=int, help='specify if indices for each sub surrogate model are changed or not for each training', default=0, choices=[0, 1])
+    parser.add_argument('--k_random', type=int, help='specify if split is random or not', default=1, choices=[0, 1])
+    parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
+    parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
+    parser.add_argument('--generate_only', type=int, help='only generate surrogate training data, without running the search', default=0, choices=[0, 1])
+    parser.add_argument('--output', type=str, help='output surrogate model name')
+
+    args = parser.parse_args()
+
+    p_data_file = args.data
+    p_every_ls   = args.every_ls
+    p_k_division = args.k_division
+    p_k_dynamic = bool(args.k_dynamic)
+    p_k_random = bool(args.k_random)
+    p_ils_iteration = args.ils
+    p_ls_iteration  = args.ls
+    p_generate_only = bool(args.generate_only)
+    p_output = args.output
+
+    # load data from file and get problem size
+    X_train, y_train, X_test, y_test, problem_size = loadDataset(p_data_file)
+
+    # create `logs` folder if necessary
+    if not os.path.exists(cfg.output_logs_folder):
+        os.makedirs(cfg.output_logs_folder)
+
+    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(p_output), level=logging.DEBUG)
+
+    # init solution (`n` attributes)
+    def init():
+        return BinarySolution([], problem_size).random(validator)
+
+    # define evaluate function here (need of data information)
+    def evaluate(solution):
+
+        start = datetime.datetime.now()
+
+        # get indices of filters data to use (filters selection from solution)
+        indices = []
+
+        for index, value in enumerate(solution._data): 
+            if value == 1: 
+                indices.append(index) 
+
+        print(f'Training SVM with {len(indices)} from {len(solution._data)} available features')
+
+        # keep only selected filters from solution
+        x_train_filters = X_train[:, indices]
+        x_test_filters = X_test[ :, indices]
+        
+        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
+        model = train_model(x_train_filters, y_train)
+
+        y_test_model = model.predict(x_test_filters)
+        y_test_predict = [ 1 if x > 0.5 else 0 for x in y_test_model ]
+        test_roc_auc = roc_auc_score(y_test, y_test_predict)
+
+        end = datetime.datetime.now()
+
+        diff = end - start
+
+        print("Real evaluation took: {}, score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))
+
+        return test_roc_auc
+
+
+    # build all output folder and files based on `output` name
+    backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
+    surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
+    surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)
+
+    if not os.path.exists(backup_model_folder):
+        os.makedirs(backup_model_folder)
+
+    if not os.path.exists(cfg.output_surrogates_model_folder):
+        os.makedirs(cfg.output_surrogates_model_folder)
+
+    if not os.path.exists(cfg.output_surrogates_data_folder):
+        os.makedirs(cfg.output_surrogates_data_folder)
+
+    backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
+    ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
+    surrogate_backup_file_path = os.path.join(cfg.output_surrogates_data_folder, p_output + '_train.csv')
+    surrogate_k_indices_backup_file_path = os.path.join(cfg.output_surrogates_data_folder, p_output + '_k_indices.csv')
+
+    # prepare optimization algorithm (only use of mutation as only ILS are used here, and local search need only local permutation)
+    operators = [SimpleBinaryMutation(), SimpleMutation()]
+    policy = UCBPolicy(operators)
+
+    # define first line if necessary
+    if not os.path.exists(surrogate_output_data):
+        folder, _ = os.path.split(surrogate_output_data)
+
+        if not os.path.exists(folder):
+            os.makedirs(folder)
+
+        with open(surrogate_output_data, 'w') as f:
+            f.write('x;y\n')
+
+
+    # custom start surrogate variable based on problem size
+    p_start = int(0.5 * problem_size)
+
+    # fixed minimal number of real evaluations
+    if p_start < 50:
+        p_start = 50
+
+    print(f'Starting using surrogate after {p_start} reals training')
+
+    # custom ILS for surrogate use
+    algo = ILSMultiSurrogate(initalizer=init, 
+                        evaluator=evaluate, # same evaluator by default, as we will use the surrogate function
+                        operators=operators, 
+                        policy=policy, 
+                        validator=validator,
+                        output_log_surrogates=os.path.join(cfg.output_surrogates_data_folder, 'logs', p_output),
+                        surrogates_file_path=surrogate_output_model,
+                        start_train_surrogates=p_start, # start learning and using surrogates after `p_start` real evaluations
+                        solutions_file=surrogate_output_data,
+                        ls_train_surrogates=p_every_ls, # retrain surrogate every `x` iteration
+                        k_division=p_k_division,
+                        k_dynamic=p_k_dynamic,
+                        k_random=p_k_random,
+                        generate_only=p_generate_only,
+                        maximise=True)
+    
+    algo.addCallback(BasicCheckpoint(every=1, filepath=backup_file_path))
+    algo.addCallback(UCBCheckpoint(every=1, filepath=ucb_backup_file_path))
+    algo.addCallback(SurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_backup_file_path)) # try every LS like this
+    algo.addCallback(MultiSurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_k_indices_backup_file_path)) # try every LS like this
+
+    bestSol = algo.run(p_ils_iteration, p_ls_iteration)
+
+    # print best solution found
+    print("Found ", bestSol)
+
+    # save model information into .csv file
+    if not os.path.exists(cfg.results_information_folder):
+        os.makedirs(cfg.results_information_folder)
+
+    filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)
+
+    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol._data) + ';' + str(list(bestSol._data).count(1)) + ';' + str(bestSol.fitness)
+    with open(filename_path, 'a') as f:
+        f.write(line_info + '\n')
+    
+    print('Result saved into %s' % filename_path)
+
+
+if __name__ == "__main__":
+    main()
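A hypothetical invocation of this multi-surrogate variant (parameter values assumed; the dataset is the OpenML CSV added by this release):

python find_best_attributes_surrogate_openML_multi.py --data OpenML_datasets/internet-advertissment.csv --every_ls 50 --k_division 20 --k_dynamic 0 --k_random 1 --ils 1000 --ls 50 --output internet_ads_multi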

+ 299 - 0
find_best_attributes_surrogate_openML_multi_specific.py

@@ -0,0 +1,299 @@
+# main imports
+import os
+import sys
+import argparse
+import pandas as pd
+import numpy as np
+import logging
+import datetime
+import random
+
+# model imports
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import GridSearchCV
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier, VotingClassifier
+
+import joblib
+import sklearn
+import sklearn.svm as svm
+from sklearn.utils import shuffle
+from sklearn.metrics import roc_auc_score
+from sklearn.model_selection import cross_val_score
+from sklearn.preprocessing import MinMaxScaler
+
+# modules and config imports
+sys.path.insert(0, '') # trick to enable import of main folder module
+
+import custom_config as cfg
+import models as mdl
+
+from optimization.ILSMultiSpecificSurrogate import ILSMultiSpecificSurrogate
+from macop.solutions.BinarySolution import BinarySolution
+
+from macop.operators.mutators.SimpleMutation import SimpleMutation
+from macop.operators.mutators.SimpleBinaryMutation import SimpleBinaryMutation
+from macop.operators.crossovers.SimpleCrossover import SimpleCrossover
+from macop.operators.crossovers.RandomSplitCrossover import RandomSplitCrossover
+
+from macop.operators.policies.UCBPolicy import UCBPolicy
+from macop.operators.policies.RandomPolicy import RandomPolicy
+
+from macop.callbacks.BasicCheckpoint import BasicCheckpoint
+from macop.callbacks.UCBCheckpoint import UCBCheckpoint
+from optimization.callbacks.SurrogateCheckpoint import SurrogateCheckpoint
+from optimization.callbacks.MultiSurrogateCheckpoint import MultiSurrogateCheckpoint
+from optimization.callbacks.MultiSurrogateSpecificCheckpoint import MultiSurrogateSpecificCheckpoint
+
+from sklearn.ensemble import RandomForestClassifier
+
+# avoid display of warning
+def warn(*args, **kwargs):
+    pass
+
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.warn = warn
+
+# default validator
+def validator(solution):
+
+    # at least 2 attributes
+    if list(solution._data).count(1) < 2:
+        return False
+
+    return True
+
+def train_model(X_train, y_train):
+
+    #print ('Creating model...')
+    # here use of SVM with grid search CV
+    Cs = [0.001, 0.01, 0.1, 1, 10, 100]
+    gammas = [0.001, 0.01, 0.1,10, 100]
+    param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
+
+    svc = svm.SVC(probability=True, class_weight='balanced')
+    #clf = GridSearchCV(svc, param_grid, cv=5, verbose=1, scoring=my_accuracy_scorer, n_jobs=-1)
+    clf = GridSearchCV(svc, param_grid, cv=4, verbose=0, n_jobs=-1)
+
+    clf.fit(X_train, y_train)
+
+    model = clf.best_estimator_
+
+    return model
+
+def loadDataset(filename):
+
+    ########################
+    # 1. Get and prepare data
+    ########################
+    dataset = pd.read_csv(filename, sep=',')
+
+    # change label as common
+    min_label_value = min(dataset.iloc[:, -1])
+    max_label_value = max(dataset.iloc[:, -1])
+
+    dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(min_label_value, 0)
+    dataset.iloc[:, -1] = dataset.iloc[:, -1].replace(max_label_value, 1)
+
+    X_dataset = dataset.iloc[:, :-1]
+    y_dataset = dataset.iloc[:, -1]
+
+    problem_size = len(X_dataset.columns)
+
+    # min/max normalisation over feature
+    # create a scaler object
+    scaler = MinMaxScaler()
+    # fit and transform the data
+    X_dataset = np.array(pd.DataFrame(scaler.fit_transform(X_dataset), columns=X_dataset.columns))
+
+    # prepare train, validation and test datasets
+    X_train, X_test, y_train, y_test = train_test_split(X_dataset, y_dataset, test_size=0.3, shuffle=True)
+
+    return X_train, y_train, X_test, y_test, problem_size
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Train and find best filters to use for model")
+
+    parser.add_argument('--data', type=str, help='open ml dataset filename prefix', required=True)
+    parser.add_argument('--every_ls', type=int, help='train every ls surrogate model', default=50) # default value
+    parser.add_argument('--k_division', type=int, help='number of expected sub surrogate model', default=20)
+    parser.add_argument('--k_dynamic', type=int, help='specify if indices for each sub surrogate model are changed or not for each training', default=0, choices=[0, 1])
+    parser.add_argument('--k_random', type=int, help='specify if split is random or not', default=1, choices=[0, 1])
+    parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
+    parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
+    parser.add_argument('--generate_only', type=int, help='only generate the expected number of real evaluated solutions, then stop', default=0, choices=[0, 1])
+    parser.add_argument('--output', type=str, help='output surrogate model name')
+
+    args = parser.parse_args()
+
+    p_data_file = args.data
+    p_every_ls   = args.every_ls
+    p_k_division = args.k_division
+    p_k_dynamic = bool(args.k_dynamic)
+    p_k_random = bool(args.k_random)
+    p_ils_iteration = args.ils
+    p_ls_iteration  = args.ls
+    p_generate_only = bool(args.generate_only)
+    p_output = args.output
+
+    # load data from file and get problem size
+    X_train, y_train, X_test, y_test, problem_size = loadDataset(p_data_file)
+
+    # create `logs` folder if necessary
+    if not os.path.exists(cfg.output_logs_folder):
+        os.makedirs(cfg.output_logs_folder)
+
+    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/{0}.log'.format(p_output), level=logging.DEBUG)
+
+    # init solution (`n` attributes)
+    def init():
+        return BinarySolution([], problem_size).random(validator)
+
+    # define evaluate function here (need of data information)
+    def evaluate(solution):
+
+        start = datetime.datetime.now()
+
+        # get indices of filters data to use (filters selection from solution)
+        indices = []
+
+        for index, value in enumerate(solution._data): 
+            if value == 1: 
+                indices.append(index) 
+
+        print(f'Training SVM with {len(indices)} from {len(solution._data)} available features')
+
+        # keep only selected filters from solution
+        x_train_filters = X_train[:, indices]
+        x_test_filters = X_test[:, indices]
+        
+        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
+        model = train_model(x_train_filters, y_train)
+
+        y_test_model = model.predict(x_test_filters)
+        y_test_predict = [ 1 if x > 0.5 else 0 for x in y_test_model ]
+        test_roc_auc = roc_auc_score(y_test, y_test_predict)
+
+        end = datetime.datetime.now()
+
+        diff = end - start
+
+        print("Real evaluation took: {}, score found: {}".format(divmod(diff.days * 86400 + diff.seconds, 60), test_roc_auc))
+
+        return test_roc_auc
+
+    def sub_evaluate(solution, index_number, targeted_indices):
+
+        start = datetime.datetime.now()
+
+        # get indices of filters data to use (filters selection from solution)
+        indices = []
+
+        for index, value in enumerate(solution._data): 
+            if value == 1: 
+                indices.append(targeted_indices[index]) 
+
+        print(f'Training sub-model SVM n°{index_number} with {len(indices)} from {len(solution._data)} available features')
+
+        # keep only selected filters from solution
+        x_train_filters = X_train[:, indices]
+        x_test_filters = X_test[:, indices]
+        
+        # model = mdl.get_trained_model(p_choice, x_train_filters, y_train_filters)
+        model = train_model(x_train_filters, y_train)
+
+        y_test_model = model.predict(x_test_filters)
+        y_test_predict = [ 1 if x > 0.5 else 0 for x in y_test_model ]
+        test_roc_auc = roc_auc_score(y_test, y_test_predict)
+
+        end = datetime.datetime.now()
+
+        diff = end - start
+
+        print(f"Real sub-evaluation n°{index_number} took: {divmod(diff.days * 86400 + diff.seconds, 60)}, score found: {test_roc_auc}")
+
+        return test_roc_auc
+
+
+
+    # build all output folder and files based on `output` name
+    backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
+    surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
+    surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)
+
+    if not os.path.exists(backup_model_folder):
+        os.makedirs(backup_model_folder)
+
+    if not os.path.exists(cfg.output_surrogates_model_folder):
+        os.makedirs(cfg.output_surrogates_model_folder)
+
+    if not os.path.exists(cfg.output_surrogates_data_folder):
+        os.makedirs(cfg.output_surrogates_data_folder)
+
+    backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
+    ucb_backup_file_path = os.path.join(backup_model_folder, p_output + '_ucbPolicy.csv')
+    surrogate_backup_file_path = os.path.join(backup_model_folder, p_output + '_train.csv')
+    surrogate_k_indices_backup_file_path = os.path.join(backup_model_folder, p_output + '_k_indices.csv')
+    surrogate_population_backup_file_path = os.path.join(backup_model_folder, p_output + '_population.csv')
+
+    # prepare optimization algorithm (only mutation operators are used, as the ILS local searches only need local permutations)
+    operators = [SimpleBinaryMutation(), SimpleMutation()]
+    #policy = UCBPolicy(operators)
+    policy = RandomPolicy(operators)
+
+    # custom start surrogate variable based on problem size
+    p_start = int(problem_size / p_k_division * 2) # twice the number of features of each sub-model
+
+    # fixed minimal number of real evaluations
+    if p_start < 50:
+        p_start = 50
+
+    print(f'Starting to use the surrogate after {p_start} real evaluations')
+
+    # custom ILS for surrogate use
+    algo = ILSMultiSpecificSurrogate(initalizer=init, 
+                        evaluator=evaluate, # same evaluator by default, as we will use the surrogate function
+                        sub_evaluator=sub_evaluate,
+                        operators=operators, 
+                        policy=policy, 
+                        validator=validator,
+                        output_log_surrogates=os.path.join(cfg.output_surrogates_data_folder, 'logs', p_output),
+                        surrogates_file_path=surrogate_output_model,
+                        start_train_surrogates=p_start, # start learning and using the surrogate after `p_start` real evaluations
+                        solutions_folder=surrogate_output_data,
+                        ls_train_surrogates=p_every_ls, # retrain surrogate every `x` iteration
+                        k_division=p_k_division,
+                        k_dynamic=p_k_dynamic,
+                        k_random=p_k_random,
+                        generate_only=p_generate_only,
+                        maximise=True)
+    
+    algo.addCallback(BasicCheckpoint(every=1, filepath=backup_file_path))
+    #algo.addCallback(UCBCheckpoint(every=1, filepath=ucb_backup_file_path))
+    algo.addCallback(SurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_backup_file_path)) # try every LS like this
+    algo.addCallback(MultiSurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_k_indices_backup_file_path)) # try every LS like this
+    algo.addCallback(MultiSurrogateSpecificCheckpoint(every=p_ls_iteration, filepath=surrogate_population_backup_file_path)) # try every LS like this
+
+    bestSol = algo.run(p_ils_iteration, p_ls_iteration)
+
+    # print best solution found
+    print("Found ", bestSol)
+
+    # save model information into .csv file
+    if not os.path.exists(cfg.results_information_folder):
+        os.makedirs(cfg.results_information_folder)
+
+    filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)
+
+    line_info = p_data_file + ';' + str(p_ils_iteration) + ';' + str(p_ls_iteration) + ';' + str(bestSol._data) + ';' + str(list(bestSol._data).count(1)) + ';' + str(bestSol.fitness)
+    with open(filename_path, 'a') as f:
+        f.write(line_info + '\n')
+    
+    print('Result saved into %s' % filename_path)
+
+
+if __name__ == "__main__":
+    main()

+ 1 - 1
find_best_filters.py

@@ -161,7 +161,7 @@ def main():
 
 
     filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_filters_result_filename)
 
-    line_info = p_data_file + ';' + str(ils_iteration) + ';' + str(ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(bestSol.fitness())
+    line_info = p_data_file + ';' + str(ils_iteration) + ';' + str(ls_iteration) + ';' + str(bestSol.data) + ';' + str(list(bestSol.data).count(1)) + ';' + str(bestSol.fitness)
     with open(filename_path, 'a') as f:
         f.write(line_info + '\n')
     

+ 589 - 0
optimization/ILSMultiSpecificSurrogate.py

@@ -0,0 +1,589 @@
+"""Iterated Local Search Algorithm implementation using multiple-surrogate (weighted sum surrogate) as fitness approximation
+"""
+
+# main imports
+import os
+import logging
+import joblib
+import time
+import math
+import numpy as np
+import pandas as pd
+import random
+
+# parallel imports
+from joblib import Parallel, delayed
+import multiprocessing
+
+# module imports
+from macop.algorithms.base import Algorithm
+from macop.solutions.discrete import BinarySolution
+
+from .LSSurrogate import LocalSearchSurrogate
+from .utils.SurrogateAnalysis import SurrogateAnalysis
+
+from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
+                                    LassoCV, ElasticNet)
+
+from wsao.sao.problems.nd3dproblem import ND3DProblem
+from wsao.sao.surrogates.walsh import WalshSurrogate
+from wsao.sao.algos.fitter import FitterAlgo
+from wsao.sao.utils.analysis import SamplerAnalysis, FitterAnalysis, OptimizerAnalysis
+
+class ILSMultiSpecificSurrogate(Algorithm):
+    """Iterated Local Search used to avoid local optima and increase the EvE (Exploration vs Exploitation) compromise using multiple surrogates, where each sub-surrogate learns from a specific dataset
+
+
+    Attributes:
+        initalizer: {function} -- basic function strategy to initialize solution
+        evaluator: {function} -- basic function in order to obtain fitness (mono or multiple objectives)
+        sub_evaluator: {function} -- evaluator function in order to obtain the fitness of a sub-model
+        operators: {[Operator]} -- list of operator to use when launching algorithm
+        policy: {Policy} -- Policy class implementation strategy to select operators
+        validator: {function} -- basic function to check if solution is valid or not under some constraints
+        maximise: {bool} -- specify kind of optimization problem 
+        currentSolution: {Solution} -- current solution managed for current evaluation
+        bestSolution: {Solution} -- best solution found so far during running algorithm
+        ls_iteration: {int} -- number of evaluation for each local search algorithm
+        surrogates_file_path: {str} -- Surrogates model folder to load (models trained using https://gitlab.com/florianlprt/wsao)
+        output_log_surrogates: {str} -- Log folder for surrogates training model
+        start_train_surrogates: {int} -- number of evaluation expected before start training and use surrogate
+        surrogates: [{Surrogate}] -- Surrogates model instance loaded
+        ls_train_surrogates: {int} -- number of local searches between two surrogate retrainings
+        k_division: {int} -- number of expected division for current features problem
+        k_dynamic: {bool} -- specify if indices are changed for each time we train a new surrogate model
+        k_random: {bool} -- random initialization of k_indices for each surrogate features model data
+        generate_only: {bool} -- generate only a specific number of expected real solutions evaluated
+        solutions_folder: {str} -- Path where real evaluated solutions on subset are saved
+        callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
+    """
+    def __init__(self,
+                 initalizer,
+                 evaluator,
+                 sub_evaluator,
+                 operators,
+                 policy,
+                 validator,
+                 surrogates_file_path,
+                 output_log_surrogates,
+                 start_train_surrogates,
+                 ls_train_surrogates,
+                 k_division,
+                 solutions_folder,
+                 k_random=True,
+                 k_dynamic=False,
+                 generate_only=False,
+                 maximise=True,
+                 parent=None):
+
+        # set real evaluator as default
+        super().__init__(initalizer, evaluator, operators, policy,
+                validator, maximise, parent)
+
+        self._n_local_search = 0
+        self._total_n_local_search = 0
+        self._main_evaluator = evaluator
+        self._sub_evaluator = sub_evaluator
+
+        self._surrogates_file_path = surrogates_file_path
+        self._start_train_surrogates = start_train_surrogates
+        self._output_log_surrogates = output_log_surrogates
+
+        self._surrogate_evaluator = None
+        self._surrogate_analyser = None
+
+        self._ls_train_surrogates = ls_train_surrogates
+
+        self._k_division = k_division
+        self._k_dynamic = k_dynamic
+        self._k_random = k_random
+        self._k_indices = None
+        self._surrogates = None
+        self._population = None
+
+        self._generate_only = generate_only
+        self._solutions_folder = solutions_folder
+        
+
+    def init_solutions_files(self):
+        self._solutions_files = []
+
+        if not os.path.exists(self._solutions_folder):
+            os.makedirs(self._solutions_folder)
+
+        # for each sub surrogate, associate its own surrogate file
+        for i in range(len(self._k_indices)):
+            index_str = str(i)
+
+            while len(index_str) < 3:
+                index_str = "0" + index_str
+
+            solutions_path = os.path.join(self._solutions_folder, f'surrogate_data_{index_str}')
+
+            # initialize solutions file if not exist
+            if not os.path.exists(solutions_path):
+                with open(solutions_path, 'w') as f:
+                    f.write('x;y\n')
+
+            self._solutions_files.append(solutions_path)
+
+
+    def define_sub_evaluators(self): 
+        self._sub_evaluators = []
+
+        for i in range(len(self._k_indices)):
+
+            # need to pass as default argument indices
+            current_evaluator = lambda s, number=i, indices=self._k_indices[i]: self._sub_evaluator(s, number, indices)
+            self._sub_evaluators.append(current_evaluator)
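+
+        # minimal sketch of the closure pitfall avoided above (standard Python semantics, illustrative):
+        #     fs = [lambda: i for i in range(3)]       # every call returns 2
+        #     fs = [lambda i=i: i for i in range(3)]   # each lambda keeps its own i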
+
+
+    def init_population(self):
+
+        self._population = []
+
+        # initialize the population
+        for i in range(len(self._k_indices)):
+            
+            current_solution = self.pop_initializer(i)
+
+            # compute fitness using sub-problem evaluator
+            fitness_score = self._sub_evaluators[i](current_solution)
+            current_solution._score = fitness_score
+            
+            self._population.append(current_solution)
+
+
+    def pop_initializer(self, index):
+        problem_size = len(self._k_indices[index])
+        return BinarySolution([], problem_size).random(self._validator)
+
+
+    def init_k_split_indices(self):
+        """Initialize the `k_indices` feature index subsets (one per sub-surrogate) for the next surrogate training;
+        the result is stored into `self._k_indices`
+        """
+        a = list(range(self._bestSolution._size))
+        n_elements = int(math.ceil(self._bestSolution._size / self._k_division)) # use of ceil to avoid loss of data
+
+        # TODO : (check) if random is possible or not
+        # if self._k_random:
+        #     random.shuffle(a) # random subset
+
+        splitted_indices = [a[x:x+n_elements] for x in range(0, len(a), n_elements)]
+
+        self._k_division = len(splitted_indices) # update size of k if necessary
+        self._k_indices = splitted_indices
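+
+        # worked example (illustrative): with 10 features and k_division=3,
+        # n_elements = ceil(10 / 3) = 4, giving splits [0..3], [4..7], [8, 9]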
+
+
+    def train_surrogate(self, index, indices):
+        
+        # 1. Data sets preparation (train and test); each surrogate now uses its own specific dataset
+        
+        # dynamic number of samples based on dataset real evaluations
+        nsamples = None
+        with open(self._solutions_files[index], 'r') as f:
+            nsamples = len(f.readlines()) - 1 # avoid header
+
+        training_samples = int(0.7 * nsamples) # 70% used for learning part at each iteration
+        
+        df = pd.read_csv(self._solutions_files[index], sep=';')
+        # learning set and test set
+        current_learn = df.sample(training_samples)
+        current_test = df.drop(current_learn.index)
+
+        problem = ND3DProblem(size=len(indices)) # problem size based on best solution size (need to improve...)
+        model = Lasso(alpha=1e-5)
+        surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
+        analysis = FitterAnalysis(logfile=os.path.join(self._output_log_surrogates, f"train_surrogate_{index}.log"), problem=problem)
+        algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
+
+        print(f"Start fitting again the surrogate model n°{index}, using {training_samples} of {nsamples} samples for train dataset")
+        for r in range(10):
+            print(f"Iteration n°{r}: for fitting surrogate n°{index}")
+            algo.run_samples(learn=current_learn, test=current_test, step=10)
+
+        # zero-pad the index so surrogate files stay well ordered on disk
+        str_index = str(index)
+
+        while len(str_index) < 6:
+            str_index = "0" + str_index
+
+        joblib.dump(algo, os.path.join(self._surrogates_file_path, f'surrogate_{str_index}'))
+
+        return str_index
+        
+
+    def train_surrogates(self):
+        """Retrain if necessary the whole surrogate fitness approximation function
+        """
+        # Following https://gitlab.com/florianlprt/wsao, we re-train the model
+        # ---------------------------------------------------------------------------
+        # cli_restart.py problem=nd3d,size=30,filename="data/statistics_extended_svdn" \
+        #        model=lasso,alpha=1e-5 \
+        #        surrogate=walsh,order=3 \
+        #        algo=fitter,algo_restarts=10,samplefile=stats_extended.csv \
+        #        sample=1000,step=10 \
+        #        analysis=fitter,logfile=out_fit.csv
+        
+        # 1. for each sub space indices, learn new surrogate
+        if not os.path.exists(self._surrogates_file_path):
+            os.makedirs(self._surrogates_file_path)
+
+        num_cores = multiprocessing.cpu_count()
+
+        if not os.path.exists(self._output_log_surrogates):
+            os.makedirs(self._output_log_surrogates)
+
+        Parallel(n_jobs=num_cores)(delayed(self.train_surrogate)(index, indices) for index, indices in enumerate(self._k_indices))
+
+
+    def load_surrogates(self):
+        """Load algorithm with surrogate model and create lambda evaluator function
+        """
+
+        # need to first train surrogate if not exist
+        if not os.path.exists(self._surrogates_file_path):
+            self.train_surrogates()
+
+        self._surrogates = []
+
+        surrogates_path = sorted(os.listdir(self._surrogates_file_path))
+
+        for surrogate_p in surrogates_path:
+            model_path = os.path.join(self._surrogates_file_path, surrogate_p)
+            surrogate_model = joblib.load(model_path)
+
+            self._surrogates.append(surrogate_model)
+
+    
+    def surrogate_evaluator(self, solution):
+        """Compute the mean prediction of the surrogate models, each over its targeted indices
+
+        Args:
+            solution: {Solution} -- current solution to evaluate using multi-surrogate evaluation
+
+        Return:
+            mean: {float} -- mean score of surrogate models
+        """
+        scores = []
+        solution_data = np.array(solution._data)
+
+        # for each indices set, get the trained surrogate model and make a prediction
+        for i, indices in enumerate(self._k_indices):
+            current_data = solution_data[indices]
+            current_score = self._surrogates[i].surrogate.predict([current_data])[0]
+            scores.append(current_score)
+
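+        # worked example (illustrative): two sub-surrogates predicting 0.80 and 0.70
+        # on their respective index subsets yield a fitness of (0.80 + 0.70) / 2 = 0.75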
+        return sum(scores) / len(scores)
+            
+    def surrogates_coefficient_of_determination(self):
+        """Compute r² for each sub surrogate model
+
+        Return:
+            r_squared_scores: [{float}] -- r² scores, one per sub-surrogate model
+        """
+
+        # compute r² of each sub-surrogate model in parallel
+        num_cores = multiprocessing.cpu_count()
+
+        r_squared_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.coefficient_of_determination)(s_model.surrogate) for s_model in self._surrogates)
+
+        return r_squared_scores
+
+    def surrogates_mae(self):
+        """Compute mae for each sub surrogate model
+
+        Return:
+            mae_scores: [{float}] -- mae scores from model
+        """
+
+        # compute the MAE of each sub-surrogate model in parallel
+        num_cores = multiprocessing.cpu_count()
+
+        mae_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.mae)(s_model.surrogate) for s_model in self._surrogates)
+
+
+        return mae_scores
+
+    def add_to_surrogate(self, solution, index):
+
+        # save real evaluated solution into specific file for surrogate
+        with open(self._solutions_files[index], 'a') as f:
+
+            line = ""
+
+            for i, e in enumerate(solution._data):
+
+                line += str(e)
+
+                if i < len(solution._data) - 1:
+                    line += ","
+
+            line += ";"
+            line += str(solution._score)
+
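+            # the resulting line looks like "1,0,1,...,0;0.7321" (illustrative values):
+            # comma-separated binary attributes, then ';', then the real fitness score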
+            f.write(line + "\n")
+
+    def run(self, evaluations, ls_evaluations=100):
+        """
+        Run the iterated local search algorithm using local search (EvE compromise)
+
+        Args:
+            evaluations: {int} -- number of global evaluations for ILS
+            ls_evaluations: {int} -- number of Local search evaluations (default: 100)
+
+        Returns:
+            {Solution} -- best solution found
+        """
+
+        # by default use of mother method to initialize variables
+        super().run(evaluations)
+
+        # initialize current solution
+        self.initRun()
+
+        self.init_k_split_indices()
+
+        # add a zero-padded index to identify each sub-problem data file
+        self.init_solutions_files()
+
+        # here we define each surrogate sub-evaluator
+        self.define_sub_evaluators()
+        self.init_population()
+
+        # enable resuming for ILS
+        self.resume()
+
+        # count the real evaluations already obtained for each surrogate (sub-model) and resume from them
+        if (self._start_train_surrogates * self._k_division) > self.getGlobalEvaluation():
+
+            # for each sub problem (surrogate)
+            for i in range(self._k_division):
+
+                nsamples = None
+                with open(self._solutions_files[i], 'r') as f:
+                    nsamples = len(f.readlines()) - 1 # avoid header
+
+                if nsamples is None:
+                    nsamples = 0
+
+                # get `self._start_train_surrogates` real evaluations and save them into the surrogate dataset file,
+                # using randomly generated solutions (in order to cover the search space)
+                while self._start_train_surrogates > nsamples:
+
+                    print(f'Real solutions extraction for surrogate n°{i}: {nsamples} of {self._start_train_surrogates}')
+                    
+                    newSolution = self.pop_initializer(i)
+
+                    # evaluate new solution
+                    newSolution.evaluate(self._sub_evaluators[i])
+
+                    # add it to surrogate pool
+                    self.add_to_surrogate(newSolution, i)
+
+                    nsamples += 1
+
+                    # increase number of evaluation
+                    self.increaseEvaluation()
+                
+        # stop this process after generating solution
+        if self._generate_only:
+            return self._bestSolution
+
+        # train surrogate on real evaluated solutions file
+        self.train_surrogates()
+        self.load_surrogates()
+
+        # local search algorithm implementation
+        while not self.stop():
+
+            # set current evaluator depending on whether the surrogate is used or not
+            self._evaluator = self.surrogate_evaluator if self._start_train_surrogates <= self.getGlobalEvaluation() else self._main_evaluator
+
+
+            local_search_list = [] 
+
+            for i in range(self._k_division):
+
+                # use the specific pop_initializer and the specific
+                # surrogate evaluator of this sub-problem for the local search
+                ls = LocalSearchSurrogate(lambda index=i: self.pop_initializer(index),
+                            lambda s, index=i: self._surrogates[index].surrogate.predict([s._data])[0], # bind `i` as a default argument (late-binding pitfall)
+                            self._operators,
+                            self._policy,
+                            self._validator,
+                            self._maximise,
+                            parent=self)
+
+                # add same callbacks
+                for callback in self._callbacks:
+                    ls.addCallback(callback)
+
+                local_search_list.append(ls)
+
+            # parallel run of each local search
+            num_cores = multiprocessing.cpu_count()
+            ls_solutions = Parallel(n_jobs=num_cores)(delayed(ls.run)(ls_evaluations) for ls in local_search_list)
+
+            # account for the evaluations spent by the parallel local searches
+            self._numberOfEvaluations += ls_evaluations * self._k_division
+
+            # for each sub problem, update population
+            for i, sub_problem_solution in enumerate(ls_solutions):
+
+                # if better solution than currently, replace it (solution saved in training pool, only if surrogate process is in a second process step)
+                # Update : always add new solution into surrogate pool, not only if solution is better
+                #if self.isBetter(newSolution) and self.start_train_surrogate < self.getGlobalEvaluation():
+                if self._start_train_surrogates <= self.getGlobalEvaluation():
+
+                    # re-evaluate the solution found by local search with the real evaluator
+                    # (without the surrogate)
+                    fitness_score = self._sub_evaluators[i](sub_problem_solution)
+                    # self.increaseEvaluation() # do not add evaluation
+
+                    sub_problem_solution._score = fitness_score
+
+                    # if the solution is really better after real evaluation, then we replace it (depending on the problem nature (minimizing / maximizing))
+                    if self._maximise:
+                        if sub_problem_solution.fitness > self._population[i].fitness:
+                            self._population[i] = sub_problem_solution
+                    else:
+                        if sub_problem_solution.fitness < self._population[i].fitness:
+                            self._population[i] = sub_problem_solution
+
+                    self.add_to_surrogate(sub_problem_solution, i)
+            
+            # main best solution update (from the maintained population)
+            if self._start_train_surrogates <= self.getGlobalEvaluation():
+
+                # need to create virtual solution from current population
+                obtained_solution_data = [ v for s in self._population for v in s._data ] # plain flatten, also safe for unequal sub-sizes
+
+                if list(obtained_solution_data) == list(self._bestSolution._data):
+                    print(f'-- No updates found from sub-model surrogates LS (best solution score: {self._bestSolution._score})')
+                else:
+                    print(f'-- Updates found into population from sub-model surrogates LS')
+                    # init random solution 
+                    current_solution = self._initializer()
+                    current_solution.data = obtained_solution_data
+
+                    fitness_score = self._main_evaluator(current_solution)
+
+                    # new computed solution score
+                    current_solution._score = fitness_score
+
+                    # if solution is really better after real evaluation, then we replace
+                    if self.isBetter(current_solution):
+                        self._bestSolution = current_solution
+
+                    print(f'-- Current main solution from population is {current_solution._score} vs. {self._bestSolution._score}')
+                    self.progress()
+
+            # main best solution update (this time from the raw LS solutions)
+            if self._start_train_surrogates <= self.getGlobalEvaluation():
+
+                # need to create virtual solution from current population
+                obtained_solution_data = [ v for s in ls_solutions for v in s._data ] # plain flatten, also safe for unequal sub-sizes
+
+                if list(obtained_solution_data) == list(self._bestSolution._data):
+                    print(f'-- No updates found from sub-model surrogates LS (best solution score: {self._bestSolution._score})')
+                else:
+                    print(f'-- Updates found from sub-model surrogates LS')
+                    # init random solution 
+                    current_solution = self._initializer()
+                    current_solution.data = obtained_solution_data
+
+                    fitness_score = self._main_evaluator(current_solution)
+
+                    # new computed solution score
+                    current_solution._score = fitness_score
+
+                    # if solution is really better after real evaluation, then we replace
+                    if self.isBetter(current_solution):
+
+                        print(f'Exploration solution obtained from LS surrogates enables improvement of main solution')
+                        self._bestSolution = current_solution
+
+                        # also update the whole population as restarting process if main solution is better
+                        for i, sub_problem_solution in enumerate(ls_solutions):
+
+                            # already evaluated sub solution
+                            self._population[i] = sub_problem_solution
+
+                    print(f'-- Current main solution obtained from `LS solutions` is {current_solution._score} vs. {self._bestSolution._score}')
+                    logging.info(f'-- Current main solution obtained from `LS solutions` is {current_solution._score} vs. {self._bestSolution._score}')
+                    self.progress()
+    
+            print(f'State of current population for surrogates ({len(self._population)} members)')
+            for i, s in enumerate(self._population):
+                print(f'Population[{i}]: best solution fitness is {s.fitness}')
+
+            # check using specific dynamic criteria based on r^2
+            r_squared_scores = self.surrogates_coefficient_of_determination()
+            r_squared = sum(r_squared_scores) / len(r_squared_scores)
+
+            mae_scores = self.surrogates_mae()
+            mae_score = sum(mae_scores) / len(mae_scores)
+
+            r_squared_value = 0 if r_squared < 0 else r_squared
+
+            training_surrogate_every = int(r_squared_value * self._ls_train_surrogates) # negative r² is clamped to 0 above
+
+            # avoid a zero interval (modulo by zero); a value of 1 retrains after every local search
+            if training_surrogate_every <= 0:
+                training_surrogate_every = 1
+                
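+            # worked example (illustrative): with ls_train_surrogates=50 and a mean r² of 0.8,
+            # the surrogates are retrained every int(0.8 * 50) = 40 local searches;
+            # a poor fit (r² close to 0) forces retraining after every local search
+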
+            logging.info(f"=> R² of surrogate is of {r_squared} | MAE is of {mae_score} -- [Retraining model after {self._n_local_search % training_surrogate_every} of {training_surrogate_every} LS]")
+            print(f"=> R² of surrogate is of {r_squared} | MAE is of {mae_score} -- [Retraining model after {self._n_local_search % training_surrogate_every} of {training_surrogate_every} LS]")
+            
+            # check if necessary or not to train again surrogate
+            if self._n_local_search % training_surrogate_every == 0 and self._start_train_surrogates <= self.getGlobalEvaluation():
+
+                # reinitialization of k_indices for the new training
+                # TODO : this part is temporarily disabled
+                # if self._k_dynamic:
+                #     print(f"Reinitialization of k_indices using `k={self._k_division} `for the new training")
+                #     self.init_k_split_indices()
+
+                # train again surrogate on real evaluated solutions file
+                start_training = time.time()
+                self.train_surrogates()
+                training_time = time.time() - start_training
+
+                self._surrogate_analyser = SurrogateAnalysis(training_time, training_surrogate_every, r_squared_scores, r_squared, mae_scores, mae_score, self.getGlobalMaxEvaluation(), self._total_n_local_search)
+
+                # reload new surrogate function
+                self.load_surrogates()
+
+                # reinitialize number of local search
+                self._n_local_search = 0
+
+            # increase number of local search done
+            self._n_local_search += 1
+            self._total_n_local_search += 1
+
+            self.information()
+
+        logging.info(f"End of {type(self).__name__}, best solution found {self._bestSolution}")
+
+        self.end()
+        return self._bestSolution
+
+    def addCallback(self, callback):
+        """Add a new callback to the algorithm, specifying useful parameters
+
+        Args:
+            callback: {Callback} -- specific Callback instance
+        """
+        # specify current main algorithm reference
+        if self.getParent() is not None:
+            callback.setAlgo(self.getParent())
+        else:
+            callback.setAlgo(self)
+
+        # set as new
+        self._callbacks.append(callback)

+ 454 - 0
optimization/ILSMultiSurrogate.py

@@ -0,0 +1,454 @@
+"""Iterated Local Search Algorithm implementation using multiple-surrogate (weighted sum surrogate) as fitness approximation
+"""
+
+# main imports
+import os
+import logging
+import joblib
+import time
+import math
+import numpy as np
+import pandas as pd
+import random
+
+# parallel imports
+from joblib import Parallel, delayed
+import multiprocessing
+
+# module imports
+from macop.algorithms.base import Algorithm
+from .LSSurrogate import LocalSearchSurrogate
+from .utils.SurrogateAnalysis import SurrogateAnalysis
+
+from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
+                                    LassoCV, ElasticNet)
+
+from wsao.sao.problems.nd3dproblem import ND3DProblem
+from wsao.sao.surrogates.walsh import WalshSurrogate
+from wsao.sao.algos.fitter import FitterAlgo
+from wsao.sao.utils.analysis import SamplerAnalysis, FitterAnalysis, OptimizerAnalysis
+
+class ILSMultiSurrogate(Algorithm):
+    """Iterated Local Search used to avoid local optima and increase the EvE (Exploration vs Exploitation) compromise using multiple surrogates
+
+
+    Attributes:
+        initalizer: {function} -- basic function strategy to initialize solution
+        evaluator: {function} -- basic function in order to obtain fitness (mono or multiple objectives)
+        operators: {[Operator]} -- list of operator to use when launching algorithm
+        policy: {Policy} -- Policy class implementation strategy to select operators
+        validator: {function} -- basic function to check if solution is valid or not under some constraints
+        maximise: {bool} -- specify kind of optimization problem 
+        currentSolution: {Solution} -- current solution managed for current evaluation
+        bestSolution: {Solution} -- best solution found so far during running algorithm
+        ls_iteration: {int} -- number of evaluation for each local search algorithm
+        surrogates_file_path: {str} -- Surrogates model folder to load (models trained using https://gitlab.com/florianlprt/wsao)
+        output_log_surrogates: {str} -- Log folder for surrogates training model
+        start_train_surrogates: {int} -- number of evaluation expected before start training and use surrogate
+        surrogates: [{Surrogate}] -- Surrogates model instance loaded
+        ls_train_surrogates: {int} -- number of local searches between two surrogate retrainings
+        k_division: {int} -- number of expected division for current features problem
+        k_dynamic: {bool} -- specify if indices are changed for each time we train a new surrogate model
+        k_random: {bool} -- random initialization of k_indices for each surrogate features model data
+        generate_only: {bool} -- generate only a specific number of expected real solutions evaluated
+        solutions_file: {str} -- Path where real evaluated solutions are saved in order to train surrogate again
+        callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
+    """
+    def __init__(self,
+                 initalizer,
+                 evaluator,
+                 operators,
+                 policy,
+                 validator,
+                 surrogates_file_path,
+                 output_log_surrogates,
+                 start_train_surrogates,
+                 ls_train_surrogates,
+                 k_division,
+                 solutions_file,
+                 k_random=True,
+                 k_dynamic=False,
+                 generate_only=False,
+                 maximise=True,
+                 parent=None):
+
+        # set real evaluator as default
+        super().__init__(initalizer, evaluator, operators, policy,
+                validator, maximise, parent)
+
+        self._n_local_search = 0
+        self._total_n_local_search = 0
+        self._main_evaluator = evaluator
+
+        self._surrogates_file_path = surrogates_file_path
+        self._start_train_surrogates = start_train_surrogates
+        self._output_log_surrogates = output_log_surrogates
+
+        self._surrogate_evaluator = None
+        self._surrogate_analyser = None
+
+        self._ls_train_surrogates = ls_train_surrogates
+        self._solutions_file = solutions_file
+
+        self._k_division = k_division
+        self._k_dynamic = k_dynamic
+        self._k_random = k_random
+        self._k_indices = None
+        self._surrogates = None
+
+        self._generate_only = generate_only
+
+    def init_k_split_indices(self):
+        """Initialize k_indices for the new training of the surrogates
+
+        Returns:
+            k_indices: {[[int]]} -- feature index subsets, one list per sub-surrogate
+        """
+        a = list(range(self._bestSolution._size))
+        n_elements = int(math.ceil(self._bestSolution._size / self._k_division)) # use of ceil to avoid loss of data
+
+        if self._k_random:
+            random.shuffle(a) # random subset
+
+        splitted_indices = [a[x:x+n_elements] for x in range(0, len(a), n_elements)]
+
+        return splitted_indices
+
+
+    def train_surrogate(self, index, learn, test, indices):
+
+        current_learn = learn.copy()
+        current_learn.x = current_learn.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
+
+        current_test = test.copy()
+        current_test.x = current_test.x.apply(lambda x: ','.join(list(map(str, np.fromstring(x, dtype=int, sep=',')[indices]))))
+
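+        # worked example of the projection above (illustrative): a stored row with
+        # x = "1,0,1,0" and indices = [0, 2] is rewritten as x = "1,1", keeping only
+        # the features owned by this sub-surrogate
+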
+        problem = ND3DProblem(size=len(indices)) # problem size based on best solution size (need to improve...)
+        model = Lasso(alpha=1e-5)
+        surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
+        analysis = FitterAnalysis(logfile=os.path.join(self._output_log_surrogates, f"train_surrogate_{index}.log"), problem=problem)
+        algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
+
+        print(f"Start fitting again the surrogate model n°{index}")
+        for r in range(10):
+            print(f"Iteration n°{r}: for fitting surrogate n°{index}")
+            algo.run_samples(learn=current_learn, test=current_test, step=10)
+
+        # zero-pad the index so surrogate files stay well ordered on disk
+        str_index = str(index)
+
+        while len(str_index) < 6:
+            str_index = "0" + str_index
+
+        joblib.dump(algo, os.path.join(self._surrogates_file_path, f'surrogate_{str_index}'))
+
+        return str_index
+        
+
+    def train_surrogates(self):
+        """Retrain if necessary the whole surrogate fitness approximation function
+        """
+        # Following https://gitlab.com/florianlprt/wsao, we re-train the model
+        # ---------------------------------------------------------------------------
+        # cli_restart.py problem=nd3d,size=30,filename="data/statistics_extended_svdn" \
+        #        model=lasso,alpha=1e-5 \
+        #        surrogate=walsh,order=3 \
+        #        algo=fitter,algo_restarts=10,samplefile=stats_extended.csv \
+        #        sample=1000,step=10 \
+        #        analysis=fitter,logfile=out_fit.csv
+
+
+        # 1. Data sets preparation (train and test)
+        
+        # dynamic number of samples based on dataset real evaluations
+        nsamples = None
+        with open(self._solutions_file, 'r') as f:
+            nsamples = len(f.readlines()) - 1 # avoid header
+
+        training_samples = int(0.7 * nsamples) # 70% used for learning part at each iteration
+        
+        df = pd.read_csv(self._solutions_file, sep=';')
+        # learning set and test set
+        learn = df.sample(training_samples)
+        test = df.drop(learn.index)
+
+        print(f'Training all surrogate models using {training_samples} of {nsamples} samples for train dataset')
+        
+        # 2. for each sub space indices, learn new surrogate
+        if not os.path.exists(self._surrogates_file_path):
+            os.makedirs(self._surrogates_file_path)
+
+        num_cores = multiprocessing.cpu_count()
+
+        if not os.path.exists(self._output_log_surrogates):
+            os.makedirs(self._output_log_surrogates)
+
+        Parallel(n_jobs=num_cores)(delayed(self.train_surrogate)(index, learn, test, indices) for index, indices in enumerate(self._k_indices))
+
+
+    def load_surrogates(self):
+        """Load algorithm with surrogate model and create lambda evaluator function
+        """
+
+        # need to first train surrogate if not exist
+        if not os.path.exists(self._surrogates_file_path):
+            self.train_surrogates()
+
+        self._surrogates = []
+
+        surrogates_path = sorted(os.listdir(self._surrogates_file_path))
+
+        for surrogate_p in surrogates_path:
+            model_path = os.path.join(self._surrogates_file_path, surrogate_p)
+            surrogate_model = joblib.load(model_path)
+
+            self._surrogates.append(surrogate_model)
+
+    
+    def surrogate_evaluator(self, solution):
+        """Compute the mean prediction of the surrogate models, each over its targeted indices
+
+        Args:
+            solution: {Solution} -- current solution to evaluate using multi-surrogate evaluation
+
+        Return:
+            mean: {float} -- mean score of surrogate models
+        """
+        scores = []
+        solution_data = np.array(solution._data)
+
+        # for each indices set, get the trained surrogate model and make a prediction
+        for i, indices in enumerate(self._k_indices):
+            current_data = solution_data[indices]
+            current_score = self._surrogates[i].surrogate.predict([current_data])[0]
+            scores.append(current_score)
+
+        return sum(scores) / len(scores)
+            
+    def surrogates_coefficient_of_determination(self):
+        """Compute r² for each sub surrogate model
+
+        Return:
+            r_squared_scores: [{float}] -- r² scores, one per sub-surrogate model
+        """
+
+        # compute r² of each sub-surrogate model in parallel
+
+        num_cores = multiprocessing.cpu_count()
+
+        r_squared_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.coefficient_of_determination)(s_model.surrogate) for s_model in self._surrogates)
+
+        # for i, _ in enumerate(self._k_indices):
+        #     r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
+        #     r_squared_scores.append(r_squared)
+
+        #print(r_squared_scores)
+
+        return r_squared_scores
+
+    def surrogates_mae(self):
+        """Compute mae for each sub surrogate model
+
+        Return:
+            mae_scores: [{float}] -- mae scores from model
+        """
+
+        # compute the MAE of each sub-surrogate model in parallel
+
+        num_cores = multiprocessing.cpu_count()
+
+        mae_scores = Parallel(n_jobs=num_cores)(delayed(s_model.analysis.mae)(s_model.surrogate) for s_model in self._surrogates)
+
+        # for i, _ in enumerate(self._k_indices):
+        #     r_squared = self._surrogates[i].analysis.coefficient_of_determination(self._surrogates[i].surrogate)
+        #     r_squared_scores.append(r_squared)
+
+        #print(mae_scores)
+
+        return mae_scores
+
+    def add_to_surrogate(self, solution):
+
+        # save real evaluated solution into specific file for surrogate
+        with open(self._solutions_file, 'a') as f:
+
+            line = ""
+
+            for index, e in enumerate(solution._data):
+
+                line += str(e)
+                
+                if index < len(solution._data) - 1:
+                    line += ","
+
+            line += ";"
+            line += str(solution._score)
+
+            f.write(line + "\n")
+
+    def run(self, evaluations, ls_evaluations=100):
+        """
+        Run the iterated local search algorithm using local search (EvE compromise)
+
+        Args:
+            evaluations: {int} -- number of global evaluations for ILS
+            ls_evaluations: {int} -- number of Local search evaluations (default: 100)
+
+        Returns:
+            {Solution} -- best solution found
+        """
+
+        # by default use of mother method to initialize variables
+        super().run(evaluations)
+
+        # initialize current solution
+        self.initRun()
+
+        # based on best solution found, initialize k pool indices
+        if self._k_indices is None:
+            self._k_indices = self.init_k_split_indices()
+
+        # enable resuming for ILS
+        self.resume()
+
+        # count the real evaluations already obtained and resume from them
+        nsamples = None
+        with open(self._solutions_file, 'r') as f:
+            nsamples = len(f.readlines()) - 1 # avoid header
+
+        if self.getGlobalEvaluation() < nsamples:
+            print(f'Restart using {nsamples} of {self._start_train_surrogates} real evaluations obtained')
+            self._numberOfEvaluations = nsamples
+
+        if self._start_train_surrogates > self.getGlobalEvaluation():
+        
+            # get `self._start_train_surrogates` real evaluations and save them into the surrogate dataset file,
+            # using randomly generated solutions (in order to cover the search space)
+            while self._start_train_surrogates > self.getGlobalEvaluation():
+
+                print(f'Real solutions extraction {self.getGlobalEvaluation()} of {self._start_train_surrogates}')
+                
+                newSolution = self._initializer()
+
+                # evaluate new solution
+                newSolution.evaluate(self._evaluator)
+
+                # add it to surrogate pool
+                self.add_to_surrogate(newSolution)
+
+                self.increaseEvaluation()
+
+        # stop this process after generating solution
+        if self._generate_only:
+            return self._bestSolution
+
+        # train surrogate on real evaluated solutions file
+        self.train_surrogates()
+        self.load_surrogates()
+
+        # local search algorithm implementation
+        while not self.stop():
+
+            # set current evaluator depending on whether the surrogate is used or not
+            self._evaluator = self.surrogate_evaluator if self._start_train_surrogates <= self.getGlobalEvaluation() else self._main_evaluator
+
+            # create new local search instance
+            # passing global evaluation param from ILS
+            ls = LocalSearchSurrogate(self._initializer,
+                         self._evaluator,
+                         self._operators,
+                         self._policy,
+                         self._validator,
+                         self._maximise,
+                         parent=self)
+
+            # add same callbacks
+            for callback in self._callbacks:
+                ls.addCallback(callback)
+
+            # create and search solution from local search
+            newSolution = ls.run(ls_evaluations)
+
+            # if better solution than currently, replace it (solution saved in training pool, only if surrogate process is in a second process step)
+            # Update : always add new solution into surrogate pool, not only if solution is better
+            #if self.isBetter(newSolution) and self.start_train_surrogate < self.getGlobalEvaluation():
+            if self._start_train_surrogates <= self.getGlobalEvaluation():
+
+                # re-evaluate the solution found by local search with the real evaluator
+                # (without the surrogate)
+                fitness_score = self._main_evaluator(newSolution)
+                # self.increaseEvaluation() # do not add evaluation
+
+                newSolution._score = fitness_score
+
+                # if solution is really better after real evaluation, then we replace
+                if self.isBetter(newSolution):
+                    self._bestSolution = newSolution
+
+                self.add_to_surrogate(newSolution)
+
+                self.progress()
+
+            # check using specific dynamic criteria based on r^2
+            r_squared_scores = self.surrogates_coefficient_of_determination()
+            r_squared = sum(r_squared_scores) / len(r_squared_scores)
+
+            mae_scores = self.surrogates_mae()
+            mae_score = sum(mae_scores) / len(mae_scores)
+
+            r_squared_value = 0 if r_squared < 0 else r_squared
+
+            training_surrogate_every = int(r_squared_value * self._ls_train_surrogates) # negative r² is clamped to 0 above
+
+            # avoid a zero interval (modulo by zero); a value of 1 retrains after every local search
+            if training_surrogate_every <= 0:
+                training_surrogate_every = 1
+                
+            print(f"=> R² of surrogate is of {r_squared} | MAE is of {mae_score} -- [Retraining model after {self._n_local_search % training_surrogate_every} of {training_surrogate_every} LS]")
+
+            # check if necessary or not to train again surrogate
+            if self._n_local_search % training_surrogate_every == 0 and self._start_train_surrogates <= self.getGlobalEvaluation():
+
+                # reinitialization of k_indices for the new training
+                if self._k_dynamic:
+                    print(f"Reinitialization of k_indices using `k={self._k_division}` for the new training")
+                    self.init_k_split_indices()
+
+                # train again surrogate on real evaluated solutions file
+                start_training = time.time()
+                self.train_surrogates()
+                training_time = time.time() - start_training
+
+                self._surrogate_analyser = SurrogateAnalysis(training_time, training_surrogate_every, r_squared_scores, r_squared, mae_scores, mae_score, self.getGlobalMaxEvaluation(), self._total_n_local_search)
+
+                # reload new surrogate function
+                self.load_surrogates()
+
+                # reinitialize number of local search
+                self._n_local_search = 0
+
+            # increase number of local search done
+            self._n_local_search += 1
+            self._total_n_local_search += 1
+
+            self.information()
+
+        logging.info(f"End of {type(self).__name__}, best solution found {self._bestSolution}")
+
+        self.end()
+        return self._bestSolution
+
+    def addCallback(self, callback):
+        """Add a new callback to the algorithm, specifying useful parameters
+
+        Args:
+            callback: {Callback} -- specific Callback instance
+        """
+        # specify current main algorithm reference
+        if self.getParent() is not None:
+            callback.setAlgo(self.getParent())
+        else:
+            callback.setAlgo(self)
+
+        # set as new
+        self._callbacks.append(callback)

+ 348 - 0
optimization/ILSPopSurrogate.py

@@ -0,0 +1,348 @@
+"""Iterated Local Search Algorithm implementation using surrogate as fitness approximation
+"""
+
+# main imports
+import os
+import logging
+import joblib
+import time
+
+# module imports
+from macop.algorithms.base import Algorithm
+from macop.evaluators.base import Evaluator
+from macop.operators.base import KindOperator
+from macop.policies.reinforcement import UCBPolicy
+
+from macop.callbacks.policies import UCBCheckpoint
+
+from .LSSurrogate import LocalSearchSurrogate
+from .utils.SurrogateAnalysis import SurrogateAnalysisMono
+
+from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
+                                    LassoCV, ElasticNet)
+
+from wsao.sao.problems.nd3dproblem import ND3DProblem
+from wsao.sao.surrogates.walsh import WalshSurrogate
+from wsao.sao.algos.fitter import FitterAlgo
+from wsao.sao.utils.analysis import SamplerAnalysis, FitterAnalysis, OptimizerAnalysis
+
+
+class LSSurrogateEvaluator(Evaluator):
+
+    # use of surrogate in order to evaluate solution
+    def compute(self, solution):
+        return self._data['surrogate'].surrogate.predict([solution.data])[0]
+        
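+# usage sketch (illustrative): once a surrogate model has been loaded (e.g. via joblib),
+#     evaluator = LSSurrogateEvaluator(data={'surrogate': surrogate_model})
+#     score = evaluator.compute(solution)
+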
+
+class ILSPopSurrogate(Algorithm):
+    """Iterated Local Search used to avoid local optima and increase the EvE (Exploration vs Exploitation) compromise using a surrogate
+
+
+    Attributes:
+        initalizer: {function} -- basic function strategy to initialize solution
+        evaluator: {function} -- basic function in order to obtain fitness (mono or multiple objectives)
+        operators: {[Operator]} -- list of operator to use when launching algorithm
+        policy: {Policy} -- Policy class implementation strategy to select operators
+        validator: {function} -- basic function to check if solution is valid or not under some constraints
+        maximise: {bool} -- specify kind of optimization problem 
+        currentSolution: {Solution} -- current solution managed for current evaluation
+        bestSolution: {Solution} -- best solution found so far during running algorithm
+        ls_iteration: {int} -- number of evaluation for each local search algorithm
+        population_size: {int} -- size of the population to manage
+        surrogate_file: {str} -- Surrogate model file to load (model trained using https://gitlab.com/florianlprt/wsao)
+        start_train_surrogate: {int} -- number of evaluation expected before start training and use surrogate
+        surrogate: {Surrogate} -- Surrogate model instance loaded
+        ls_train_surrogate: {int} -- number of local searches between two surrogate retrainings
+        solutions_file: {str} -- Path where real evaluated solutions are saved in order to train surrogate again
+        callbacks: {[Callback]} -- list of Callback class implementation to do some instructions every number of evaluations and `load` when initializing algorithm
+    """
+    def __init__(self,
+                 initalizer,
+                 evaluator,
+                 operators,
+                 policy,
+                 validator,
+                 population_size,
+                 surrogate_file_path,
+                 start_train_surrogate,
+                 ls_train_surrogate,
+                 walsh_order,
+                 inter_policy_ls_file,
+                 solutions_file,
+                 maximise=True,
+                 parent=None):
+
+        # set real evaluator as default
+        super().__init__(initalizer, evaluator, operators, policy,
+                validator, maximise, parent)
+
+        self._n_local_search = 0
+        self._main_evaluator = evaluator
+
+        self._surrogate_file_path = surrogate_file_path
+        self._start_train_surrogate = start_train_surrogate
+
+        self._surrogate_evaluator = None
+        self._surrogate_analyser = None
+
+        self._ls_train_surrogate = ls_train_surrogate
+        self._solutions_file = solutions_file
+
+        self._walsh_order = walsh_order
+        self._inter_policy_ls_file = inter_policy_ls_file
+
+        # default population values
+        self.population_size = population_size
+        self.population = []
+
+        for _ in range(self.population_size):
+            self.population.append(None)
+
+    def train_surrogate(self):
+        """Retrain if necessary the whole surrogate fitness approximation function
+        """
+        # Following https://gitlab.com/florianlprt/wsao, we re-train the model
+        # ---------------------------------------------------------------------------
+        # cli_restart.py problem=nd3d,size=30,filename="data/statistics_extended_svdn" \
+        #        model=lasso,alpha=1e-5 \
+        #        surrogate=walsh,order=3 \
+        #        algo=fitter,algo_restarts=10,samplefile=stats_extended.csv \
+        #        sample=1000,step=10 \
+        #        analysis=fitter,logfile=out_fit.csv
+
+        problem = ND3DProblem(size=len(self._bestSolution.data)) # problem size based on best solution size (need to improve...)
+        model = Lasso(alpha=1e-5)
+        surrogate = WalshSurrogate(order=self._walsh_order, size=problem.size, model=model)
+        analysis = FitterAnalysis(logfile="train_surrogate.log", problem=problem)
+        algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
+
+        # dynamic number of samples based on dataset real evaluations
+        nsamples = None
+        with open(self._solutions_file, 'r') as f:
+            nsamples = len(f.readlines()) - 1 # avoid header
+
+        training_samples = int(0.7 * nsamples) # 70% used for learning part at each iteration
+        
+        print("Start fitting again the surrogate model")
+        print(f'Using {training_samples} of {nsamples} samples for train dataset')
+        for r in range(10):
+            print(f"Iteration n°{r}: for fitting surrogate")
+            algo.run(samplefile=self._solutions_file, sample=training_samples, step=10)
+
+        joblib.dump(algo, self._surrogate_file_path)
+
+
+    def load_surrogate(self):
+        """Load algorithm with surrogate model and create lambda evaluator function
+        """
+
+        # need to first train surrogate if not exist
+        if not os.path.exists(self._surrogate_file_path):
+            self.train_surrogate()
+
+        self._surrogate = joblib.load(self._surrogate_file_path)
+
+        # update evaluator function
+        self._surrogate_evaluator = LSSurrogateEvaluator(data={'surrogate': self._surrogate})
+
+    def add_to_surrogate(self, solution):
+
+        # save real evaluated solution into specific file for surrogate
+        with open(self._solutions_file, 'a') as f:
+
+            line = ""
+
+            for index, e in enumerate(solution._data):
+
+                line += str(e)
+                
+                if index < len(solution._data) - 1:
+                    line += ","
+
+            line += ";"
+            line += str(solution._score)
+
+            f.write(line + "\n")
+
+    def initRun(self):
+
+        fitness_scores = []
+        print('Initialisation of the population')
+        for i in range(len(self.population)):
+
+            print(f'  - solution [{(i+1)}] of {len(self.population)}')
+            if self.population[i] is None:
+                solution = self.initialiser()
+                solution.evaluate(self.evaluator)
+
+                self.population[i] = solution
+                self.add_to_surrogate(solution)
+
+            self.increaseEvaluation()
+
+            fitness_scores.append(self.population[i].fitness)
+
+        print('Best solution at initialisation')
+        self._bestSolution = self.population[fitness_scores.index(max(fitness_scores))]
+
+
+    def run(self, evaluations, ls_evaluations=100):
+        """
+        Run the iterated local search algorithm using local search (EvE compromise)
+
+        Args:
+            evaluations: {int} -- number of global evaluations for ILS
+            ls_evaluations: {int} -- number of Local search evaluations (default: 100)
+
+        Returns:
+            {Solution} -- best solution found
+        """
+
+        # by default use of mother method to initialize variables
+        super().run(evaluations)
+
+        # enable resuming for ILS
+        self.resume()
+
+        # initialize current solution
+        self.initRun()
+
+        # count number of real evaluations already saved for the surrogate and restart from them
+        nsamples = None
+        with open(self._solutions_file, 'r') as f:
+            nsamples = len(f.readlines()) - 1 # avoid header
+
+        if self.getGlobalEvaluation() < nsamples:
+            print(f'Restart using {nsamples} of {self._start_train_surrogate} real evaluations obtained')
+            self._numberOfEvaluations = nsamples
+
+        if self._start_train_surrogate > self.getGlobalEvaluation():
+        
+            # get `self.start_train_surrogate` real evaluations and save them into the surrogate dataset file
+            # using randomly generated solutions (in order to cover the search space)
+            while self._start_train_surrogate > self.getGlobalEvaluation():
+                
+                newSolution = self.initialiser()
+
+                # evaluate new solution
+                newSolution.evaluate(self.evaluator)
+
+                # add it to surrogate pool
+                self.add_to_surrogate(newSolution)
+
+                self.increaseEvaluation()
+
+        # train surrogate on real evaluated solutions file
+        self.train_surrogate()
+        self.load_surrogate()
+
+        # local search algorithm implementation
+        while not self.stop():
+
+            # set current evaluator based on use or not of the surrogate function
+            self.evaluator = self._surrogate_evaluator if self._start_train_surrogate <= self.getGlobalEvaluation() else self._main_evaluator
+
+            for i in range(len(self.population)):
+
+                # pass only mutator operators to the local search
+                selected_operators = [ op for op in self._operators if op._kind == KindOperator.MUTATOR ]
+
+                ls_policy = UCBPolicy(selected_operators, C=100, exp_rate=0.1)
+                # create new local search instance
+                # passing global evaluation param from ILS
+                ls = LocalSearchSurrogate(self.initialiser,
+                            self.evaluator,
+                            selected_operators,
+                            ls_policy,
+                            self.validator,
+                            self._maximise,
+                            parent=None,
+                            verbose=False)
+
+                ls.addCallback(UCBCheckpoint(every=1, filepath=self._inter_policy_ls_file))
+
+                # create current new solution using policy and custom algorithm init
+                ls._currentSolution = self.policy.apply(self.population[i])
+                ls.result = ls._currentSolution
+
+                # add same callbacks
+                #for callback in self._callbacks:
+                #    ls.addCallback(callback)
+
+                # create and search solution from local search
+                newSolution = ls.run(ls_evaluations)
+
+                # if better solution than currently, replace it (solution saved in training pool, only if surrogate process is in a second process step)
+                # Update : always add new solution into surrogate pool, not only if solution is better
+                #if self.isBetter(newSolution) and self.start_train_surrogate < self.getGlobalEvaluation():
+                if self._start_train_surrogate <= self.getGlobalEvaluation():
+
+                    # if a better solution is found by local search, re-evaluate it
+                    # without the surrogate before accepting it
+                    fitness_score = self._main_evaluator.compute(newSolution)
+                    # self.increaseEvaluation() # do not add evaluation
+
+                    newSolution.fitness = fitness_score
+
+                    # if solution is really better after real evaluation, then we replace
+                    if self.isBetter(newSolution):
+                        self.result = newSolution
+
+                    # update population
+                    if self.population[i].fitness < newSolution.fitness:
+                        self.population[i] = newSolution
+
+                    self.add_to_surrogate(newSolution)
+
+                    self.progress()
+
+                print(f'Best solution found so far: {self.result.fitness}')
+
+                # check using specific dynamic criteria based on r^2
+                r_squared = self._surrogate.analysis.coefficient_of_determination(self._surrogate.surrogate)
+                mae = self._surrogate.analysis.mae(self._surrogate.surrogate)
+                training_surrogate_every = int(r_squared * self._ls_train_surrogate)
+                print(f"=> R^2 of surrogate is of {r_squared}. Retraining model every {training_surrogate_every} LS")
+                print(f"=> MAE of surrogate is of {mae}. Retraining model every {training_surrogate_every} LS")
+
+                # avoid a zero interval (modulo by zero) when r^2 is very low
+                if training_surrogate_every <= 0:
+                    training_surrogate_every = 1
+
+                # check if necessary or not to train again surrogate
+                if self._n_local_search % training_surrogate_every == 0 and self._start_train_surrogate <= self.getGlobalEvaluation():
+
+                    # train again surrogate on real evaluated solutions file
+                    start_training = time.time()
+                    self.train_surrogate()
+                    training_time = time.time() - start_training
+
+                    self._surrogate_analyser = SurrogateAnalysisMono(training_time, training_surrogate_every, r_squared, mae, self.getGlobalMaxEvaluation(), self._n_local_search)
+
+                    # reload new surrogate function
+                    self.load_surrogate()
+
+                # increase number of local search done
+                self._n_local_search += 1
+
+                self.information()
+
+        logging.info(f"End of {type(self).__name__}, best solution found {self._bestSolution}")
+
+        self.end()
+        return self._bestSolution
+
+    def addCallback(self, callback):
+        """Add new callback to algorithm specifying useful parameters
+
+        Args:
+            callback: {Callback} -- specific Callback instance
+        """
+        # specify current main algorithm reference
+        if self.getParent() is not None:
+            callback.setAlgo(self.getParent())
+        else:
+            callback.setAlgo(self)
+
+        # set as new
+        self._callbacks.append(callback)
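
For orientation, here is a minimal usage sketch of this new class. The initialiser, evaluator, operators, policy and validator names are placeholders (hypothetical, not part of this commit), and the parameter values are illustrative only:

    # minimal sketch, assuming macop-compatible components are already defined
    algo = ILSPopSurrogate(my_initialiser, my_real_evaluator, my_operators, my_policy,
                           my_validator,
                           population_size=20,
                           surrogate_file_path='data/surrogate/my_model.obj',
                           start_train_surrogate=1000,
                           ls_train_surrogate=10,
                           walsh_order=2,
                           inter_policy_ls_file='data/ucb_policy.csv',
                           solutions_file='data/real_solutions.csv',
                           maximise=True)
    best = algo.run(evaluations=100000, ls_evaluations=100)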

+ 96 - 64
optimization/ILSSurrogate.py

@@ -5,9 +5,10 @@
 import os
 import logging
 import joblib
+import time
 
 # module imports
-from macop.algorithms.Algorithm import Algorithm
+from macop.algorithms.base import Algorithm
 from .LSSurrogate import LocalSearchSurrogate
+from .utils.SurrogateAnalysis import SurrogateAnalysisMono
 
 from sklearn.linear_model import (LinearRegression, Lasso, Lars, LassoLars,
@@ -18,6 +19,7 @@ from wsao.sao.surrogates.walsh import WalshSurrogate
 from wsao.sao.algos.fitter import FitterAlgo
 from wsao.sao.utils.analysis import SamplerAnalysis, FitterAnalysis, OptimizerAnalysis
 
+
 class ILSSurrogate(Algorithm):
     """Iterated Local Search used to avoid local optima and improve the EvE (Exploration vs Exploitation) trade-off using a surrogate
 
@@ -40,34 +42,36 @@ class ILSSurrogate(Algorithm):
         callbacks: {[Callback]} -- list of Callback class implementations executed every given number of evaluations and loaded when initializing the algorithm
     """
     def __init__(self,
-                 _initalizer,
-                 _evaluator,
-                 _operators,
-                 _policy,
-                 _validator,
-                 _surrogate_file_path,
-                 _start_train_surrogate,
-                 _ls_train_surrogate,
-                 _solutions_file,
-                 _maximise=True,
-                 _parent=None):
+                 initalizer,
+                 evaluator,
+                 operators,
+                 policy,
+                 validator,
+                 surrogate_file_path,
+                 start_train_surrogate,
+                 ls_train_surrogate,
+                 solutions_file,
+                 maximise=True,
+                 parent=None):
 
         # set real evaluator as default
-        super().__init__(_initalizer, _evaluator, _operators, _policy,
-                _validator, _maximise, _parent)
+        super().__init__(initalizer, evaluator, operators, policy,
+                validator, maximise, parent)
 
-        self.n_local_search = 0
+        self._n_local_search = 0
+        self._main_evaluator = evaluator
 
-        self.surrogate_file_path = _surrogate_file_path
-        self.start_train_surrogate = _start_train_surrogate
+        self._surrogate_file_path = surrogate_file_path
+        self._start_train_surrogate = start_train_surrogate
 
-        self.surrogate_evaluator = None
+        self._surrogate_evaluator = None
+        self._surrogate_analyser = None
 
-        self.ls_train_surrogate = _ls_train_surrogate
-        self.solutions_file = _solutions_file
+        self._ls_train_surrogate = ls_train_surrogate
+        self._solutions_file = solutions_file
 
     def train_surrogate(self):
-        """etrain if necessary the whole surrogate fitness approximation function
+        """Retrain if necessary the whole surrogate fitness approximation function
         """
         # Following https://gitlab.com/florianlprt/wsao, we re-train the model
         # ---------------------------------------------------------------------------
@@ -78,19 +82,26 @@ class ILSSurrogate(Algorithm):
         #        sample=1000,step=10 \
         #        analysis=fitter,logfile=out_fit.csv
 
-        problem = ND3DProblem(size=len(self.bestSolution.data)) # problem size based on best solution size (need to improve...)
+        problem = ND3DProblem(size=len(self._bestSolution._data)) # problem size based on best solution size (need to improve...)
         model = Lasso(alpha=1e-5)
-        surrogate = WalshSurrogate(order=3, size=problem.size, model=model)
+        surrogate = WalshSurrogate(order=2, size=problem.size, model=model)
         analysis = FitterAnalysis(logfile="train_surrogate.log", problem=problem)
-
         algo = FitterAlgo(problem=problem, surrogate=surrogate, analysis=analysis, seed=problem.seed)
 
+        # dynamic number of samples based on dataset real evaluations
+        nsamples = None
+        with open(self._solutions_file, 'r') as f:
+            nsamples = len(f.readlines()) - 1 # avoid header
+
+        training_samples = int(0.7 * nsamples) # 70% used for learning part at each iteration
+
         print("Start fitting again the surrogate model")
+        print(f'Using {training_samples} of {nsamples} samples for train dataset')
         for r in range(10):
-            print("Iteration n°{0}: for fitting surrogate".format(r))
-            algo.run(samplefile=self.solutions_file, sample=100, step=10)
+            print(f"Iteration n°{r}: for fitting surrogate")
+            algo.run(samplefile=self._solutions_file, sample=training_samples, step=10)
 
-        joblib.dump(algo, self.surrogate_file_path)
+        joblib.dump(algo, self._surrogate_file_path)
 
 
     def load_surrogate(self):
@@ -98,47 +109,47 @@ class ILSSurrogate(Algorithm):
         """
 
         # need to first train surrogate if not exist
-        if not os.path.exists(self.surrogate_file_path):
+        if not os.path.exists(self._surrogate_file_path):
            self.train_surrogate()
 
-        self.surrogate = joblib.load(self.surrogate_file_path)
+        self._surrogate = joblib.load(self._surrogate_file_path)
 
         # update evaluator function
-        self.surrogate_evaluator = lambda s: self.surrogate.surrogate.predict([s.data])[0]
+        self._surrogate_evaluator = lambda s: self._surrogate.surrogate.predict([s._data])[0]
 
     def add_to_surrogate(self, solution):
 
         # save real evaluated solution into specific file for surrogate
-        with open(self.solutions_file, 'a') as f:
+        with open(self._solutions_file, 'a') as f:
 
             line = ""
 
-            for index, e in enumerate(solution.data):
+            for index, e in enumerate(solution._data):
 
                 line += str(e)
 
-                if index < len(solution.data) - 1:
+                if index < len(solution._data) - 1:
                     line += ","
 
             line += ";"
-            line += str(solution.score)
+            line += str(solution._score)
 
             f.write(line + "\n")
 
-    def run(self, _evaluations, _ls_evaluations=100):
+    def run(self, evaluations, ls_evaluations=100):
         """
         Run the iterated local search algorithm using local search (EvE compromise)
 
         Args:
-            _evaluations: {int} -- number of global evaluations for ILS
-            _ls_evaluations: {int} -- number of Local search evaluations (default: 100)
+            evaluations: {int} -- number of global evaluations for ILS
+            ls_evaluations: {int} -- number of Local search evaluations (default: 100)
 
         Returns:
             {Solution} -- best solution found
         """
 
         # by default use of mother method to initialize variables
-        super().run(_evaluations)
+        super().run(evaluations)
 
         # initialize current solution
         self.initRun()
@@ -146,13 +157,22 @@ class ILSSurrogate(Algorithm):
         # enable resuming for ILS
         self.resume()
 
-        if self.start_train_surrogate > self.getGlobalEvaluation():
+        # count number of real evaluations already saved for the surrogate and restart from them
+        nsamples = None
+        with open(self._solutions_file, 'r') as f:
+            nsamples = len(f.readlines()) - 1 # avoid header
+
+        if self.getGlobalEvaluation() < nsamples:
+            print(f'Restart using {nsamples} of {self._start_train_surrogate} real evaluations obtained')
+            self._numberOfEvaluations = nsamples
+
+        if self._start_train_surrogate > self.getGlobalEvaluation():
 
             # get `self.start_train_surrogate` real evaluations and save them into the surrogate dataset file
             # using randomly generated solutions (in order to cover the search space)
-            while self.start_train_surrogate > self.getGlobalEvaluation():
+            while self._start_train_surrogate > self.getGlobalEvaluation():
 
-                newSolution = self.initializer()
+                newSolution = self.initialiser()
 
                 # evaluate new solution
                 newSolution.evaluate(self.evaluator)
@@ -168,78 +188,90 @@ class ILSSurrogate(Algorithm):
 
         # local search algorithm implementation
         while not self.stop():
-            
+
             # set current evaluator based on use or not of the surrogate function
-            current_evaluator = self.surrogate_evaluator if self.start_train_surrogate <= self.getGlobalEvaluation() else self.evaluator
+            self.evaluator = self._surrogate_evaluator if self._start_train_surrogate <= self.getGlobalEvaluation() else self._main_evaluator
 
             # create new local search instance
             # passing global evaluation param from ILS
-            ls = LocalSearchSurrogate(self.initializer,
-                         current_evaluator,
-                         self.operators,
+            ls = LocalSearchSurrogate(self.initialiser,
+                         self.evaluator,
+                         self._operators,
                          self.policy,
                          self.validator,
-                         self.maximise,
-                         _parent=self)
+                         self._maximise,
+                         parent=self)
 
             # add same callbacks
-            for callback in self.callbacks:
+            for callback in self._callbacks:
                 ls.addCallback(callback)
 
             # create and search solution from local search
-            newSolution = ls.run(_ls_evaluations)
+            newSolution = ls.run(ls_evaluations)
 
             # if better solution than currently, replace it (solution saved in training pool, only if surrogate process is in a second process step)
             # Update : always add new solution into surrogate pool, not only if solution is better
             #if self.isBetter(newSolution) and self.start_train_surrogate < self.getGlobalEvaluation():
-            if self.start_train_surrogate <= self.getGlobalEvaluation():
+            if self._start_train_surrogate <= self.getGlobalEvaluation():
 
                 # if a better solution is found by local search, re-evaluate it
                 # without the surrogate before accepting it
-                fitness_score = self.evaluator(newSolution)
+                fitness_score = self._main_evaluator(newSolution)
                 # self.increaseEvaluation() # do not add evaluation
 
                 newSolution.score = fitness_score
 
                 # if solution is really better after real evaluation, then we replace
                 if self.isBetter(newSolution):
-                    self.bestSolution = newSolution
+                    self.result = newSolution
 
                 self.add_to_surrogate(newSolution)
 
                 self.progress()
 
+            # check using specific dynamic criteria based on r^2
+            r_squared = self._surrogate.analysis.coefficient_of_determination(self._surrogate.surrogate)
+            mae = self._surrogate.analysis.mae(self._surrogate.surrogate)
+            training_surrogate_every = int(r_squared * self._ls_train_surrogate)
+            print(f"=> R^2 of surrogate is of {r_squared}. Retraining model every {training_surrogate_every} LS")
+
+            # avoid a zero interval (modulo by zero) when r^2 is very low
+            if training_surrogate_every <= 0:
+                training_surrogate_every = 1
+
             # check if necessary or not to train again surrogate
-            if self.n_local_search % self.ls_train_surrogate == 0 and self.start_train_surrogate <= self.getGlobalEvaluation():
+            if self._n_local_search % training_surrogate_every == 0 and self._start_train_surrogate <= self.getGlobalEvaluation():
 
                 # train again surrogate on real evaluated solutions file
+                start_training = time.time()
                 self.train_surrogate()
+                training_time = time.time() - start_training
+
+                self._surrogate_analyser = SurrogateAnalysisMono(training_time, training_surrogate_every, r_squared, mae, self.getGlobalMaxEvaluation(), self._n_local_search)
 
                 # reload new surrogate function
                 self.load_surrogate()
 
             # increase number of local search done
-            self.n_local_search += 1
+            self._n_local_search += 1
 
             self.information()
 
-        logging.info("End of %s, best solution found %s" %
-                     (type(self).__name__, self.bestSolution))
+        logging.info(f"End of {type(self).__name__}, best solution found {self._bestSolution}")
 
         self.end()
-        return self.bestSolution
+        return self._bestSolution
 
-    def addCallback(self, _callback):
+    def addCallback(self, callback):
         """Add new callback to algorithm specifying useful parameters
 
         Args:
-            _callback: {Callback} -- specific Callback instance
+            callback: {Callback} -- specific Callback instance
         """
         # specify current main algorithm reference
-        if self.parent is not None:
-            _callback.setAlgo(self.parent)
+        if self.getParent() is not None:
+            callback.setAlgo(self.getParent())
         else:
-            _callback.setAlgo(self)
+            callback.setAlgo(self)
 
         # set as new
-        self.callbacks.append(_callback)
+        self._callbacks.append(callback)
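
The retraining frequency above is dynamic: it scales with the surrogate's coefficient of determination, so a well-fitting model is retrained less often. A small sketch of the rule used by both ILSSurrogate and ILSPopSurrogate:

    def retrain_interval(r_squared, ls_train_surrogate):
        # interval grows with surrogate quality; clamped to at least 1
        # to avoid a modulo-by-zero in the main loop
        interval = int(r_squared * ls_train_surrogate)
        return interval if interval > 0 else 1

    # e.g. with ls_train_surrogate=10: r^2=0.95 -> every 9 LS, r^2=0.05 -> every LS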

+ 18 - 15
optimization/LSSurrogate.py

@@ -5,7 +5,7 @@
 import logging
 
 # module imports
-from macop.algorithms.Algorithm import Algorithm
+from macop.algorithms.base import Algorithm
 
 
 class LocalSearchSurrogate(Algorithm):
@@ -41,9 +41,12 @@ class LocalSearchSurrogate(Algorithm):
         #     self.bestSolution = self.parent.bestSolution
 
         # initialize current solution
-        self.initRun()
+        # self.initRun()
 
-        solutionSize = self.currentSolution.size
+        for callback in self._callbacks:
+            callback.load()
+
+        solutionSize = self._currentSolution.size
 
         # local search algorithm implementation
         while not self.stop():
@@ -51,19 +54,20 @@ class LocalSearchSurrogate(Algorithm):
             for _ in range(solutionSize):
 
                 # update current solution using policy
-                newSolution = self.update(self.currentSolution)
+                newSolution = self.update(self._currentSolution)
 
                 # if better solution than currently, replace it
                 if self.isBetter(newSolution):
-                    self.bestSolution = newSolution
+                    self._bestSolution = newSolution
 
                 # increase number of evaluations
                 self.increaseEvaluation()
 
-                self.progress()
+                # self.progress()
+                for callback in self._callbacks:
+                    callback.run()
 
-                logging.info("---- Current %s - SCORE %s" %
-                             (newSolution, newSolution.fitness()))
+                logging.info(f"---- Current {newSolution} - SCORE {newSolution.fitness}")
 
                 # add to surrogate pool file if necessary (using ILS parent reference)
                 # if self.parent.start_train_surrogate >= self.getGlobalEvaluation():
@@ -74,12 +78,11 @@ class LocalSearchSurrogate(Algorithm):
                     break
 
             # after applying local search on currentSolution, we switch into new local area using known current bestSolution
-            self.currentSolution = self.bestSolution
+            self._currentSolution = self._bestSolution
 
-        logging.info("End of %s, best solution found %s" %
-                     (type(self).__name__, self.bestSolution))
+        logging.info(f"End of {type(self).__name__}, best solution found {self._bestSolution}")
 
-        return self.bestSolution
+        return self._bestSolution
 
     def addCallback(self, callback):
         """Add new callback to algorithm specifying useful parameters
@@ -88,10 +91,10 @@ class LocalSearchSurrogate(Algorithm):
             callback: {Callback} -- specific Callback instance
         """
         # specify current main algorithm reference
-        if self.parent is not None:
-            callback.setAlgo(self.parent)
+        if self._parent is not None:
+            callback.setAlgo(self._parent)
         else:
             callback.setAlgo(self)
 
         # set as new
-        self.callbacks.append(callback)
+        self._callbacks.append(callback)
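
The local search now drives callbacks manually (load() once before the loop, run() after each evaluation) instead of calling initRun()/progress(), so the inner search does not reset or advance the parent ILS state. A minimal sketch of a callback matching the interface used here (the class name is hypothetical):

    from macop.callbacks.base import Callback

    class EvaluationLogger(Callback):
        def load(self):
            pass  # nothing to restore for a pure logger

        def run(self):
            # called after each local search evaluation
            if self._algo.getGlobalEvaluation() % self._every == 0:
                print(f'{self._algo.getGlobalEvaluation()} evaluations done')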

+ 121 - 0
optimization/callbacks/MultiPopCheckpoint.py

@@ -0,0 +1,121 @@
+# main imports
+import os
+import logging
+import numpy as np
+
+# module imports
+from macop.callbacks.base import Callback
+from macop.utils.progress import macop_text, macop_line
+
+
+class MultiPopCheckpoint(Callback):
+    """
+    MultiPopCheckpoint is used for loading previous population computations and starting again from the last checkpoint
+
+    Attributes:
+        algo: {:class:`~macop.algorithms.base.Algorithm`} -- main algorithm instance reference
+        every: {int} -- checkpoint frequency used (based on number of evaluations)
+        filepath: {str} -- file path where checkpoints will be saved
+    """
+    def run(self):
+        """
+        Check if necessary to do backup based on `every` variable
+        """
+        # get current population
+        population = self._algo.population
+
+        currentEvaluation = self._algo.getGlobalEvaluation()
+
+        # backup if necessary
+        if currentEvaluation % self._every == 0:
+
+            logging.info("Checkpoint is done into " + self._filepath)
+
+            with open(self._filepath, 'a') as f:
+                
+                pop_line = str(currentEvaluation) + ';'
+
+                scores = []
+                pop_data = []
+
+                for solution in population:
+                    solution_data = ""
+                    solutionSize = len(solution.data)
+
+                    for index, val in enumerate(solution.data):
+                        solution_data += str(val)
+
+                        if index < solutionSize - 1:
+                            solution_data += ' '
+                    
+                    scores.append(solution.fitness)
+                    pop_data.append(solution_data)
+
+                for score in scores:
+                    pop_line += str(score) + ';'
+
+                for data in pop_data:
+                    pop_line += data + ';'
+
+                pop_line += '\n'
+
+                f.write(pop_line)
+
+    def load(self):
+        """
+        Load backup lines as population and set algorithm state (evaluations and population) at this backup
+        """
+        if os.path.exists(self._filepath):
+
+            logging.info('Load best solution from last checkpoint')
+            with open(self._filepath, 'r') as f:
+
+                # read data from the last backup line
+                data_line = f.readlines()[-1]
+                
+                data = data_line.replace(';\n', '').split(';')
+          
+                # get evaluation information
+                globalEvaluation = int(data[0])
+
+                if self._algo.getParent() is not None:
+                    self._algo.getParent(
+                    )._numberOfEvaluations = globalEvaluation
+                else:
+                    self._algo._numberOfEvaluations = globalEvaluation
+
+                nSolutions = len(self._algo.population)
+                scores = list(map(float, data[1:nSolutions + 1]))
+
+                # get best solution data information
+                pop_str_data = data[nSolutions + 1:]
+                pop_data = []
+
+                for sol_data in pop_str_data:
+                    current_data = list(map(int, sol_data.split(' ')))
+                    pop_data.append(current_data)
+
+                for i, sol_data in enumerate(pop_data):
+
+                    # initialise and fill with data
+                    self._algo.population[i] = self._algo.initialiser()
+                    self._algo.population[i].data = np.array(sol_data)
+                    self._algo.population[i].fitness = scores[i]
+
+            macop_line(self._algo)
+            macop_text(
+                self._algo,
+                f'Load of available population from `{self._filepath}`')
+            macop_text(
+                self._algo,
+                f'Restart algorithm from evaluation {self._algo._numberOfEvaluations}.'
+            )
+        else:
+            macop_text(
+                self._algo,
+                'No backup found... Start running algorithm from evaluation 0.'
+            )
+            logging.info(
+                "Can't load backup... Backup filepath not valid in Checkpoint")
+
+        macop_line(self._algo)

+ 96 - 0
optimization/callbacks/MultiSurrogateCheckpoint.py

@@ -0,0 +1,96 @@
+"""Basic Checkpoint class implementation
+"""
+
+# main imports
+import os
+import logging
+import numpy as np
+
+# module imports
+from macop.callbacks.base import Callback
+from macop.utils.progress import macop_text, macop_line
+
+
+class MultiSurrogateCheckpoint(Callback):
+    """
+    MultiSurrogateCheckpoint is used to keep track of sub-surrogate problem indices
+
+    Attributes:
+        algo: {Algorithm} -- main algorithm instance reference
+        every: {int} -- checkpoint frequency used (based on number of evaluations)
+        filepath: {str} -- file path where checkpoints will be saved
+    """
+    def run(self):
+        """
+        Check if necessary to do backup based on `every` variable
+        """
+        # get current k_indices
+        k_indices = self._algo._k_indices
+
+        # do nothing if k_indices do not exist yet
+        if k_indices is None:
+            return
+
+        currentEvaluation = self._algo.getGlobalEvaluation()
+
+        # backup if necessary
+        if currentEvaluation % self._every == 0:
+
+            logging.info(f"Multi surrogate analysis checkpoint is done into {self._filepath}")
+
+            line = str(currentEvaluation) + ';'
+
+            for indices in k_indices:
+                
+                indices_data = ""
+                indices_size = len(indices)
+
+                for index, val in enumerate(indices):
+                    indices_data += str(val)
+
+                    if index < indices_size - 1:
+                        indices_data += ' '
+
+                line += indices_data + ';'
+
+            line += '\n'
+
+            # check if file exists
+            if not os.path.exists(self._filepath):
+                with open(self._filepath, 'w') as f:
+                    f.write(line)
+            else:
+                with open(self._filepath, 'a') as f:
+                    f.write(line)
+
+    def load(self):
+        """
+        Load `k_indices` from the last checkpoint line
+        """
+        if os.path.exists(self._filepath):
+
+            logging.info('Load best solution from last checkpoint')
+            with open(self._filepath) as f:
+
+                # get last line and read data
+                lastline = f.readlines()[-1].replace(';\n', '')
+                data = lastline.split(';')
+
+                k_indices = data[1:]
+                k_indices_final = []
+
+                for indices in k_indices:
+                    k_indices_final.append(list(map(int, indices.split(' '))))
+
+                # set k_indices into main algorithm
+                self._algo._k_indices = k_indices_final
+
+            macop_line(self._algo)
+            macop_text(self._algo, f' MultiSurrogateCheckpoint found from `{self._filepath}` file.')
+
+        else:
+            macop_text(self._algo, 'No backup found... Start running using new `k_indices` values')
+            logging.info("Can't load MultiSurrogate backup... Backup filepath not valid in MultiSurrogateCheckpoint")
+
+        macop_line(self._algo)
+
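
The lines written here have the format `evaluation;i1 i2 ...;j1 j2 ...;` with one space-separated index group per sub-surrogate. A parsing sketch mirroring load() (hypothetical helper name; the empty-group filter is only defensive):

    def parse_k_indices_line(line):
        data = line.replace(';\n', '').split(';')
        evaluation = int(data[0])
        k_indices = [list(map(int, group.split(' '))) for group in data[1:] if group]
        return evaluation, k_indices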

+ 92 - 0
optimization/callbacks/MultiSurrogateSpecificCheckpoint.py

@@ -0,0 +1,92 @@
+"""Basic Checkpoint class implementation
+"""
+
+# main imports
+import os
+import logging
+import numpy as np
+
+# module imports
+from macop.callbacks.base import Callback
+from macop.utils.progress import macop_text, macop_line
+
+
+class MultiSurrogateSpecificCheckpoint(Callback):
+    """
+    MultiSurrogateSpecificCheckpoint is used to keep track of population solutions and their fitness scores
+
+    Attributes:
+        algo: {Algorithm} -- main algorithm instance reference
+        every: {int} -- checkpoint frequency used (based on number of evaluations)
+        filepath: {str} -- file path where checkpoints will be saved
+    """
+    def run(self):
+        """
+        Check if necessary to do backup based on `every` variable
+        """
+        # get current population
+        population = self._algo._population
+
+        # do nothing if the population does not exist yet
+        if population is None:
+            return
+
+        currentEvaluation = self._algo.getGlobalEvaluation()
+
+        # backup if necessary
+        if currentEvaluation % self._every == 0:
+
+            logging.info(f"Multi surrogate specific analysis checkpoint is done into {self._filepath}")
+
+            line = ''
+
+            fitness_list = [ s.fitness for s in population ]
+            fitness_data = ' '.join(list(map(str, fitness_list)))
+
+            for s in population:
+                s_data = ' '.join(list(map(str, s._data)))
+                line += s_data + ';'
+
+            line += fitness_data
+
+            line += '\n'
+
+            # check if file exists
+            if not os.path.exists(self._filepath):
+                with open(self._filepath, 'w') as f:
+                    f.write(line)
+            else:
+                with open(self._filepath, 'a') as f:
+                    f.write(line)
+
+    def load(self):
+        """
+        Load previous population
+        """
+        if os.path.exists(self._filepath):
+
+            logging.info('Load population solutions from last checkpoint')
+            with open(self._filepath) as f:
+
+                # get last line and read data
+                lastline = f.readlines()[-1].replace('\n', '')
+                data = lastline.split(';')
+
+                fitness_scores = list(map(float, data[-1].split(' ')))
+
+                for i, solution_data in enumerate(data[:-1]):
+                    self._algo._population[i]._data = list(map(int, solution_data.split(' ')))
+                    self._algo._population[i]._score = fitness_scores[i]
+
+            macop_line(self._algo)
+            macop_text(self._algo, f' MultiSurrogateSpecificCheckpoint found from `{self._filepath}` file. Start running using previous `population` values')
+
+            for i, s in enumerate(self._algo._population):
+                print(f'Population[{i}]: best solution fitness is {s.fitness}')
+
+        else:
+            macop_text(self._algo, 'No backup found... Start running using new `population` values')
+            logging.info("Can't load MultiSurrogateSpecific backup... Backup filepath not valid in MultiSurrogateSpecificCheckpoint")
+
+        macop_line(self._algo)
+

+ 93 - 0
optimization/callbacks/SurrogateCheckpoint.py

@@ -0,0 +1,93 @@
+"""Basic Checkpoint class implementation
+"""
+
+# main imports
+import os
+import logging
+import numpy as np
+
+# module imports
+from macop.callbacks.Callback import Callback
+from macop.utils.color import macop_text, macop_line
+
+
+class SurrogateCheckpoint(Callback):
+    """
+    SurrogateCheckpoint is used for logging training data information about surrogate
+
+    Attributes:
+        algo: {Algorithm} -- main algorithm instance reference
+        every: {int} -- checkpoint frequency used (based on number of evaluations)
+        filepath: {str} -- file path where checkpoints will be saved
+    """
+    def run(self):
+        """
+        Check if necessary to do backup based on `every` variable
+        """
+        # get current best solution
+        solution = self._algo._bestSolution
+        surrogate_analyser = self._algo._surrogate_analyser
+
+        # do nothing if the surrogate analyser does not exist yet
+        if surrogate_analyser is None:
+            return
+
+        currentEvaluation = self._algo.getGlobalEvaluation()
+
+        # backup if necessary
+        if currentEvaluation % self._every == 0:
+
+            logging.info(f"Surrogate analysis checkpoint is done into {self._filepath}")
+
+            solutionData = ""
+            solutionSize = len(solution._data)
+
+            for index, val in enumerate(solution._data):
+                solutionData += str(val)
+
+                if index < solutionSize - 1:
+                    solutionData += ' '
+
+            # get score of r² and mae
+            r2_data = ' '.join(list(map(str, surrogate_analyser._r2_scores)))
+            mae_data = ' '.join(list(map(str, surrogate_analyser._mae_scores)))
+
+            line = str(currentEvaluation) + ';' + str(surrogate_analyser._n_local_search) + ';' + str(surrogate_analyser._every_ls) + ';' + str(surrogate_analyser._time) + ';' + r2_data + ';' + str(surrogate_analyser._r2) \
+                + ';' + mae_data + ';' + str(surrogate_analyser._mae) \
+                + ';' + solutionData + ';' + str(solution.fitness) + ';\n'
+
+            # check if file exists
+            if not os.path.exists(self._filepath):
+                with open(self._filepath, 'w') as f:
+                    f.write(line)
+            else:
+                with open(self._filepath, 'a') as f:
+                    f.write(line)
+
+    def load(self):
+        """
+        only load global n local search
+        """
+
+        if os.path.exists(self._filepath):
+
+            logging.info('Load n local search')
+            with open(self._filepath) as f:
+
+                # get last line and read data
+                lastline = f.readlines()[-1].replace(';\n', '')
+                data = lastline.split(';')
+
+                n_local_search = int(data[1])
+
+                # set total number of local searches into main algorithm
+                self._algo._total_n_local_search = n_local_search
+
+            macop_line(self._algo)
+            macop_text(self._algo, f'SurrogateCheckpoint found from `{self._filepath}` file.')
+
+        else:
+            macop_text(self._algo, 'No backup found...')
+            logging.info("Can't load Surrogate backup... Backup filepath not valid in SurrogateCheckpoint")
+
+        macop_line(self._algo)
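
A backup line written by this callback concatenates, in order: evaluation, number of local searches, retraining interval, training time, per-sub-surrogate r² scores, global r², per-sub-surrogate MAE scores, global MAE, the solution and its fitness. A parsing sketch (hypothetical helper name):

    def parse_surrogate_checkpoint_line(line):
        data = line.replace(';\n', '').split(';')
        return {
            'evaluation': int(data[0]),
            'n_local_search': int(data[1]),
            'every_ls': int(data[2]),
            'training_time': float(data[3]),
            'r2_scores': list(map(float, data[4].split(' '))),
            'r2': float(data[5]),
            'mae_scores': list(map(float, data[6].split(' '))),
            'mae': float(data[7]),
            'solution': list(map(int, data[8].split(' '))),
            'fitness': float(data[9]),
        }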

+ 107 - 0
optimization/operators/SimplePopCrossover.py

@@ -0,0 +1,107 @@
+from macop.operators.base import Crossover
+import random
+
+class SimplePopCrossover(Crossover):
+
+    def apply(self, solution1, solution2=None):
+        """Create new solution based on a randomly chosen population solution and the solution passed as parameter
+
+        Args:
+            solution1: {:class:`~macop.solutions.base.Solution`} -- the first solution to use for generating new solution
+            solution2: {:class:`~macop.solutions.base.Solution`} -- the second solution to use for generating new solution (using population)
+
+        Returns:
+            {:class:`~macop.solutions.base.Solution`}: new generated solution
+        """
+
+        size = solution1._size
+
+        # copy data of solution
+        firstData = solution1.data.copy()
+
+        population = self._algo.population if self._algo.population is not None else self._algo.getParent().population
+
+        # copy of a population solution as output solution
+        valid = False
+        copy_solution = None
+
+        # use of different random population solution
+        ncounter = 0
+
+        while not valid:
+
+            chosen_solution = population[random.randint(0, len(population) - 1)]
+            
+            if list(chosen_solution.data) != list(firstData) or ncounter > 10:
+                valid = True
+                copy_solution = chosen_solution.clone()
+
+            # add security
+            ncounter += 1
+
+        # default empty solution
+        if copy_solution is None:
+            copy_solution = self._algo.initialiser()
+
+        # fixed middle split index
+        splitIndex = int(size / 2)
+
+        if random.uniform(0, 1) > 0.5:
+            copy_solution.data[splitIndex:] = firstData[splitIndex:]
+        else:
+            copy_solution.data[:splitIndex] = firstData[:splitIndex]
+
+        return copy_solution
+
+
+class RandomPopCrossover(Crossover):
+
+    def apply(self, solution1, solution2=None):
+        """Create new solution based on a randomly chosen population solution and the solution passed as parameter
+
+        Args:
+            solution1: {:class:`~macop.solutions.base.Solution`} -- the first solution to use for generating new solution
+            solution2: {:class:`~macop.solutions.base.Solution`} -- the second solution to use for generating new solution (using population)
+
+        Returns:
+            {:class:`~macop.solutions.base.Solution`}: new generated solution
+        """
+
+        size = solution1._size
+
+        # copy data of solution
+        firstData = solution1.data.copy()
+
+        population = self._algo.population if self._algo.population is not None else self._algo.getParent().population
+
+        # copy of a population solution as output solution
+        valid = False
+        copy_solution = None
+
+        # use of different random population solution
+        ncounter = 0
+
+        while not valid:
+
+            chosen_solution = population[random.randint(0, len(population) - 1)]
+            
+            if list(chosen_solution.data) != list(firstData) or ncounter > 10:
+                valid = True
+                copy_solution = chosen_solution.clone()
+
+            # add security
+            ncounter += 1
+
+        # default empty solution
+        if copy_solution is None:
+            copy_solution = self._algo.initialiser()
+
+        # random split index over the solution size
+        splitIndex = random.randint(0, size - 1)
+
+        if random.uniform(0, 1) > 0.5:
+            copy_solution.data[splitIndex:] = firstData[splitIndex:]
+        else:
+            copy_solution.data[:splitIndex] = firstData[:splitIndex]
+
+        return copy_solution
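
These population-level crossovers read the population from the algorithm (or its parent) rather than from solution2, so they can be mixed with mutators under one selection policy. A wiring sketch (mutator names are placeholders, and the import path assumes the repository root is on the Python path; the exact wiring lives in the runner scripts, not in this commit):

    from macop.policies.reinforcement import UCBPolicy
    from optimization.operators.SimplePopCrossover import SimplePopCrossover, RandomPopCrossover

    operators = [my_mutator_1, my_mutator_2, SimplePopCrossover(), RandomPopCrossover()]
    policy = UCBPolicy(operators, C=100, exp_rate=0.1)  # same parameters as the inner LS policy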

+ 24 - 0
optimization/utils/SurrogateAnalysis.py

@@ -0,0 +1,24 @@
+# quick object for surrogate logging data
+class SurrogateAnalysisMono():
+
+    def __init__(self, time, every_ls, r2, mae, evaluations, n_local_search):
+        self._time = time
+        self._every_ls = every_ls
+        self._r2 = r2
+        self._mae = mae
+        self._evaluations = evaluations
+        self._n_local_search = n_local_search
+
+
+class SurrogateAnalysisMulti():
+
+    def __init__(self, time, every_ls, r2_scores, r2, mae_scores, mae, evaluations, n_local_search):
+        self._time = time
+        self._every_ls = every_ls
+        self._r2_scores = r2_scores
+        self._r2 = r2
+        self._mae_scores = mae_scores
+        self._mae = mae
+        self._evaluations = evaluations
+        self._n_local_search = n_local_search
+
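
Both analysers are plain data holders: the ILS loop fills one after each retraining and SurrogateCheckpoint serialises it. Illustrative values only:

    analyser = SurrogateAnalysisMono(time=12.3, every_ls=8, r2=0.91, mae=0.02,
                                     evaluations=100000, n_local_search=42)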

+ 1 - 1
rnn

@@ -1 +1 @@
-Subproject commit c4acf38ab3816725faa0bb84c68cb18fdd5ebb32
+Subproject commit 8c2fc8888c190be9829e3f3a4d5d320014b5b96f

+ 1 - 1
run_openML_surrogate.py

@@ -14,7 +14,7 @@ def main():
     p_ils = args.ils
     p_ls  = args.ls
 
-    open_ml_problems = os.listdir(open_ml_problems_folder)
+    open_ml_problems = sorted(os.listdir(open_ml_problems_folder))
 
     for ml_problem in open_ml_problems:
 

+ 102 - 0
run_openML_surrogate_multi.py

@@ -0,0 +1,102 @@
+import os, argparse
+import shutil
+
+open_ml_problems_folder = 'OpenML_datasets'
+surrogate_data_path = 'data/surrogate/data/'
+
+k_params = [100, 150, 200]
+k_random = [0, 1]
+k_reinit = [0, 1]
+every_ls = 50
+
+n_times = 5
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Find best features for each OpenML problems")
+
+    parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
+    parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
+
+    args = parser.parse_args()
+
+    p_ils = args.ils
+    p_ls  = args.ls
+
+    open_ml_problems = sorted(os.listdir(open_ml_problems_folder))
+
+    for ml_problem in open_ml_problems:
+
+        # for each problem prepare specific pre-computed real solution file
+        ml_problem_name = ml_problem.replace('.csv', '')
+        ml_problem_path = os.path.join(open_ml_problems_folder, ml_problem)
+
+        ml_surrogate_command = f"python find_best_attributes_surrogate_openML_multi.py " \
+                               f"--data {ml_problem_path} " \
+                               f"--ils {p_ils} " \
+                               f"--ls {p_ls} " \
+                               f"--output {ml_problem_name} " \
+                               f"--generate_only 1"
+        print(f'Running extraction real evaluations data for {ml_problem_name}')
+        os.system(ml_surrogate_command)
+
+        real_evaluation_data_file_path = os.path.join(surrogate_data_path, ml_problem_name)
+
+        # for each multi param:
+        # - copy precomputed real_evaluation_data_file
+        # - run new instance using specific data
+        for k in k_params:
+            for k_r in k_random:
+                for k_init in k_reinit:
+
+                    # if k_reinit is not used but random is, run this instance multiple times in order to average later
+                    if k_init == 0 and k_r == 1:
+
+                        for i in range(n_times):
+
+                            str_index = str(i)
+
+                            while len(str_index) < 3:
+                                str_index = "0" + str_index
+
+                            output_problem_name = f'{ml_problem_name}_everyLS_{every_ls}_k{k}_random{k_r}_reinit{k_init}_{str_index}'
+
+                            # copy pre-computed real evaluation data for this instance
+                            current_output_real_eval_path = os.path.join(surrogate_data_path, output_problem_name)
+                            shutil.copy2(real_evaluation_data_file_path, current_output_real_eval_path)
+
+                            ml_surrogate_multi_command = f"python find_best_attributes_surrogate_openML_multi.py " \
+                                            f"--data {ml_problem_path} " \
+                                            f"--ils {p_ils} " \
+                                            f"--ls {p_ls} " \
+                                            f"--every_ls {every_ls} " \
+                                            f"--k_division {k} " \
+                                            f"--k_random {k_r} " \
+                                            f"--k_dynamic {k_init} " \
+                                            f"--output {output_problem_name}"
+                            print(f'Running extraction data for {ml_problem_name} with [ils: {p_ils}, ls: {p_ls}, k: {k}, k_r: {k_r}, k_reinit: {k_init}, i: {i}]')
+                            os.system(ml_surrogate_multi_command)
+
+                    else:
+                        output_problem_name = f'{ml_problem_name}_everyLS_{every_ls}_k{k}_random{k_r}_reinit{k_init}'
+
+                        # copy pre-computed real evaluation data for this instance
+                        current_output_real_eval_path = os.path.join(surrogate_data_path, output_problem_name)
+                        shutil.copy2(real_evaluation_data_file_path, current_output_real_eval_path)
+
+                        ml_surrogate_multi_command = f"python find_best_attributes_surrogate_openML_multi.py " \
+                                        f"--data {ml_problem_path} " \
+                                        f"--ils {p_ils} " \
+                                        f"--ls {p_ls} " \
+                                        f"--every_ls {every_ls} " \
+                                        f"--k_division {k} " \
+                                        f"--k_random {k_r} " \
+                                        f"--k_dynamic {k_init} " \
+                                        f"--output {output_problem_name}"
+                        print(f'Running data extraction for {ml_problem_name} with [ils: {p_ils}, ls: {p_ls}, k: {k}, k_r: {k_r}, k_reinit: {k_init}]')
+                        os.system(ml_surrogate_multi_command)
+
+
+
+if __name__ == "__main__":
+    main()
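Note: both branches above launch each instance through os.system with a single f-string command. A minimal sketch (not part of this commit) of the same call built with subprocess.run and an argument list, which sidesteps shell quoting and exposes the exit code; the script name and flags are copied from the commands above:

import subprocess

def run_instance(data_path, p_ils, p_ls, every_ls, k, k_r, k_init, output_name):
    # mirrors the find_best_attributes_surrogate_openML_multi.py invocation above
    cmd = ["python", "find_best_attributes_surrogate_openML_multi.py",
           "--data", data_path,
           "--ils", str(p_ils), "--ls", str(p_ls),
           "--every_ls", str(every_ls),
           "--k_division", str(k), "--k_random", str(k_r), "--k_dynamic", str(k_init),
           "--output", output_name]
    completed = subprocess.run(cmd)
    if completed.returncode != 0:
        print(f"run failed for {output_name} (exit code {completed.returncode})")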

+ 104 - 0
run_openML_surrogate_multi_specific.py

@@ -0,0 +1,104 @@
+import os, argparse
+import shutil  # only needed if the commented-out copy steps below are re-enabled
+
+open_ml_problems_folder = 'OpenML_datasets'
+surrogate_data_path = 'data/surrogate/data/'
+
+# fixed test parameters for this first experiment phase
+k_params = [30, 50, 100] # 100, 150, 200
+k_random = [0] # 0, 1
+k_reinit = [0] # 0, 1
+every_ls = 5
+
+n_times = 5
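+# n_times repetitions only apply when random selection is enabled (k_r == 1);
+# with k_random = [0] above, each instance currently runs once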
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Find the best features for each OpenML problem")
+
+    parser.add_argument('--ils', type=int, help='total number of iterations for the ILS algorithm', required=True)
+    parser.add_argument('--ls', type=int, help='number of iterations for the Local Search algorithm', required=True)
+
+    args = parser.parse_args()
+
+    p_ils = args.ils
+    p_ls  = args.ls
+
+    open_ml_problems = sorted(os.listdir(open_ml_problems_folder))
+
+    for ml_problem in open_ml_problems:
+
+        # for each problem, prepare its pre-computed real evaluation data file
+        ml_problem_name = ml_problem.replace('.csv', '')
+        ml_problem_path = os.path.join(open_ml_problems_folder, ml_problem)
+
+        # ml_surrogate_command = f"python find_best_attributes_surrogate_openML_multi_specific.py " \
+        #                        f"--data {ml_problem_path} " \
+        #                        f"--ils {p_ils} " \
+        #                        f"--ls {p_ls} " \
+        #                        f"--output {ml_problem_name} " \
+        #                        f"--generate_only 1"
+        # print(f'Running extraction real evaluations data for {ml_problem_name}')
+        # os.system(ml_surrogate_command)
+
+        # real_evaluation_data_file_path = os.path.join(surrogate_data_path, ml_problem_name)
+
+        # for each combination of multi parameters:
+        # - copy the pre-computed real_evaluation_data_file
+        # - run a new instance using this specific data
+        for k in k_params:
+            for k_r in k_random:
+                for k_init in k_reinit:
+
+                    # if k_reinit is disabled and random selection is enabled, run this instance several times so the scores can be averaged later
+                    if k_init == 0 and k_r == 1:
+                        for i in range(n_times):
+
+                            # zero-pad the run index to three digits (e.g. 7 -> "007")
+                            str_index = str(i).zfill(3)
+
+                            output_problem_name = f'{ml_problem_name}_everyLS_{every_ls}_k{k}_random{k_r}_reinit{k_init}_{str_index}'
+
+                            # copy pre-computed real evaluation data for this instance
+                            # current_output_real_eval_path = os.path.join(surrogate_data_path, output_problem_name)
+                            # shutil.copy2(real_evaluation_data_file_path, current_output_real_eval_path)
+
+                            ml_surrogate_multi_command = f"python find_best_attributes_surrogate_openML_multi_specific.py " \
+                                            f"--data {ml_problem_path} " \
+                                            f"--ils {p_ils} " \
+                                            f"--ls {p_ls} " \
+                                            f"--every_ls {every_ls} " \
+                                            f"--k_division {k} " \
+                                            f"--k_random {k_r} " \
+                                            f"--k_dynamic {k_init} " \
+                                            f"--output {output_problem_name}"
+                            print(f'Running data extraction for {ml_problem_name} with [ils: {p_ils}, ls: {p_ls}, k: {k}, k_r: {k_r}, i: {i}]')
+                            os.system(ml_surrogate_multi_command)
+
+                    else:
+                        output_problem_name = f'{ml_problem_name}_everyLS_{every_ls}_k{k}_random{k_r}_reinit{k_init}'
+
+                        # copy pre-computed real evaluation data for this instance
+                        # current_output_real_eval_path = os.path.join(surrogate_data_path, output_problem_name)
+                        # shutil.copy2(real_evaluation_data_file_path, current_output_real_eval_path)
+
+                        ml_surrogate_multi_command = f"python find_best_attributes_surrogate_openML_multi_specific.py " \
+                                        f"--data {ml_problem_path} " \
+                                        f"--ils {p_ils} " \
+                                        f"--ls {p_ls} " \
+                                        f"--every_ls {every_ls} " \
+                                        f"--k_division {k} " \
+                                        f"--k_random {k_r} " \
+                                        f"--k_dynamic {k_init} " \
+                                        f"--output {output_problem_name}"
+                        print(f'Running data extraction for {ml_problem_name} with [ils: {p_ils}, ls: {p_ls}, k: {k}, k_r: {k_r}]')
+                        os.system(ml_surrogate_multi_command)
+
+
+
+if __name__ == "__main__":
+    main()
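Note: the three-digit zero-padded run index on output_problem_name exists so the n_times repeated random runs sort cleanly and can be averaged afterwards. A sketch of that post-processing, under the assumption (not shown in this diff) that each run leaves a single final score in a file named after its output_problem_name:

import os

def mean_score(results_dir, base_name, n_times=5):
    # base_name is the output_problem_name without the _000-style run suffix;
    # the one-score-per-file layout is an assumption, not shown in this diff
    scores = []
    for i in range(n_times):
        path = os.path.join(results_dir, f"{base_name}_{str(i).zfill(3)}")
        with open(path) as f:
            scores.append(float(f.read().strip()))
    return sum(scores) / len(scores)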

+ 28 - 0
run_surrogate_rendering.sh

@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# default parameters (POP, ORDER and LS are overridden by the loops below)
+ILS=100000
+LS=100
+SS=50
+LENGTH=30
+POP=100
+ORDER=2
+TRAIN_EVERY=20
+
+DATASET="rnn/data/datasets/features-selection-rendering-scaled/features-selection-rendering-scaled"
+
+for POP in {20,60,100};
+do
+    for ORDER in {2,3};
+    do
+        for LS in {100,500,1000};
+        do
+            output="rendering-attributes-ILS_${ILS}-POP_${POP}-LS_${LS}-SS_${SS}-SO_${ORDER}-SE_${TRAIN_EVERY}"
+            echo "Run optim attributes using: ${output}"
+            python find_best_attributes_surrogate.py --data ${DATASET} --start_surrogate ${SS} --length ${LENGTH} --ils ${ILS} --ls ${LS} --pop ${POP} --order ${ORDER} --train_every ${TRAIN_EVERY} --output ${output}
+        done
+    done
+done
+
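Note: the script above sweeps 3 population sizes x 2 surrogate orders x 3 local-search budgets, i.e. 18 runs with ILS, SS, LENGTH and TRAIN_EVERY held fixed. For reference, the same sweep as a Python sketch using itertools.product, with flag names and fixed values copied from the script:

import itertools, os

ILS, SS, TRAIN_EVERY, LENGTH = 100000, 50, 20, 30
DATASET = "rnn/data/datasets/features-selection-rendering-scaled/features-selection-rendering-scaled"

for pop, order, ls in itertools.product((20, 60, 100), (2, 3), (100, 500, 1000)):
    output = f"rendering-attributes-ILS_{ILS}-POP_{pop}-LS_{ls}-SS_{SS}-SO_{order}-SE_{TRAIN_EVERY}"
    print(f"Run optim attributes using: {output}")
    os.system(f"python find_best_attributes_surrogate.py --data {DATASET} "
              f"--start_surrogate {SS} --length {LENGTH} --ils {ILS} --ls {ls} "
              f"--pop {pop} --order {order} --train_every {TRAIN_EVERY} --output {output}")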

+ 1 - 1
wsao

@@ -1 +1 @@
-Subproject commit 875bbdcee600f911958dbc47e319afbb8796f49d
+Subproject commit a92ca5a285c6530498a68db5c08d89d2dfe246ec