123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299 |
- # main imports
- import os
- import sys
- import argparse
- import pandas as pd
- import numpy as np
- import logging
- import datetime
- import random
- # model imports
- from sklearn.model_selection import train_test_split
- from sklearn.model_selection import GridSearchCV
- from sklearn.linear_model import LogisticRegression
- from sklearn.ensemble import RandomForestClassifier, VotingClassifier
- import joblib
- import sklearn
- import sklearn.svm as svm
- from sklearn.utils import shuffle
- from sklearn.metrics import roc_auc_score
- from sklearn.model_selection import cross_val_score
- from sklearn.preprocessing import MinMaxScaler
- # modules and config imports
- sys.path.insert(0, '') # trick to enable import of main folder module
- import custom_config as cfg
- import models as mdl
- from optimization.ILSMultiSpecificSurrogate import ILSMultiSpecificSurrogate
- from macop.solutions.BinarySolution import BinarySolution
- from macop.operators.mutators.SimpleMutation import SimpleMutation
- from macop.operators.mutators.SimpleBinaryMutation import SimpleBinaryMutation
- from macop.operators.crossovers.SimpleCrossover import SimpleCrossover
- from macop.operators.crossovers.RandomSplitCrossover import RandomSplitCrossover
- from macop.operators.policies.UCBPolicy import UCBPolicy
- from macop.operators.policies.RandomPolicy import RandomPolicy
- from macop.callbacks.BasicCheckpoint import BasicCheckpoint
- from macop.callbacks.UCBCheckpoint import UCBCheckpoint
- from optimization.callbacks.SurrogateCheckpoint import SurrogateCheckpoint
- from optimization.callbacks.MultiSurrogateCheckpoint import MultiSurrogateCheckpoint
- from optimization.callbacks.MultiSurrogateSpecificCheckpoint import MultiSurrogateSpecificCheckpoint
- from sklearn.ensemble import RandomForestClassifier
- # avoid display of warning
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Installed below as a replacement for ``warnings.warn`` so that calls
    made through ``warnings.warn`` produce no output.
    """
    return None


# Ignore DeprecationWarning through the regular filter mechanism, then
# replace `warnings.warn` itself with the no-op defined above.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.warn = warn
- # default validator
def validator(solution):
    """Tell whether a solution selects enough attributes to be usable.

    Args:
        solution: object exposing a `_data` sequence; entries equal to 1
            are counted as selected.

    Returns:
        True when at least two entries of `solution._data` equal 1,
        False otherwise.
    """
    selected_count = sum(1 for flag in solution._data if flag == 1)
    return selected_count >= 2
def train_model(X_train, y_train):
    """Fit an RBF-kernel SVM chosen by grid search and return it.

    A `GridSearchCV` (cv=4, n_jobs=-1, silent) explores the `C` and `gamma`
    values listed below for an `SVC` created with `probability=True` and
    `class_weight='balanced'`.

    Args:
        X_train: training features passed to `GridSearchCV.fit`.
        y_train: training labels passed to `GridSearchCV.fit`.

    Returns:
        The `best_estimator_` found by the grid search.
    """
    search_space = {
        'kernel': ['rbf'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'gamma': [0.001, 0.01, 0.1, 10, 100],
    }
    base_estimator = svm.SVC(probability=True, class_weight='balanced')

    grid_search = GridSearchCV(base_estimator, search_space, cv=4, verbose=0, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_
def loadDataset(filename):
    """Load a CSV dataset, binarise its labels, scale its features and split it.

    The last column of the file is used as the label and every other column
    as a feature. The smallest label value is mapped to 0 and the largest to
    1; any other label value is left unchanged. When the column holds a
    single distinct value, every label becomes 0.

    Args:
        filename: path of a comma-separated file readable by `pandas.read_csv`.

    Returns:
        A tuple `(X_train, y_train, X_test, y_test, problem_size)` where the
        feature sets are min/max-scaled arrays, the label sets come from the
        last column, the test part is 30% of the rows (shuffled split), and
        `problem_size` is the number of feature columns.
    """
    ########################
    # 1. Get and prepare data
    ########################
    dataset = pd.read_csv(filename, sep=',')

    features = dataset.iloc[:, :-1]
    labels = dataset.iloc[:, -1]
    problem_size = len(features.columns)

    # Both masks are computed on the ORIGINAL labels before anything is
    # rewritten. Replacing min -> 0 and then max -> 1 in sequence would turn
    # every label into 1 whenever the maximum label is itself 0
    # (e.g. labels {-1, 0}).
    is_min_label = labels == labels.min()
    is_max_label = labels == labels.max()
    # `infer_objects` restores a numeric dtype when the labels were stored
    # as objects and now only hold integers.
    y_dataset = labels.mask(is_max_label, 1).mask(is_min_label, 0).infer_objects()

    # min/max normalisation over each feature
    # NOTE(review): the scaler is fitted on the whole dataset before the
    # split, so test rows influence the scaling of the training rows. Kept
    # as-is to preserve existing results; consider fitting on the training
    # part only.
    scaler = MinMaxScaler()
    X_dataset = np.asarray(scaler.fit_transform(features))

    # prepare train and test datasets
    X_train, X_test, y_train, y_test = train_test_split(
        X_dataset, y_dataset, test_size=0.3, shuffle=True)

    return X_train, y_train, X_test, y_test, problem_size
def main():
    """Command-line entry point: surrogate-assisted ILS feature selection.

    Parses the command-line arguments, loads the dataset, prepares the log
    and output folders, builds an `ILSMultiSpecificSurrogate` whose real
    evaluation trains an SVM on the selected feature columns, runs it and
    appends a one-line summary of the best solution to the results file.
    """
    parser = argparse.ArgumentParser(description="Train and find best filters to use for model")

    parser.add_argument('--data', type=str, help='open ml dataset filename prefix', required=True)
    parser.add_argument('--every_ls', type=int, help='train every ls surrogate model', default=50)
    parser.add_argument('--k_division', type=int, help='number of expected sub surrogate model', default=20)
    parser.add_argument('--k_dynamic', type=int, help='specify if indices for each sub surrogate model are changed or not for each training', default=0, choices=[0, 1])
    parser.add_argument('--k_random', type=int, help='specify if split is random or not', default=1, choices=[0, 1])
    parser.add_argument('--ils', type=int, help='number of total iteration for ils algorithm', required=True)
    parser.add_argument('--ls', type=int, help='number of iteration for Local Search algorithm', required=True)
    # help text previously duplicated the one of `--ls`
    parser.add_argument('--generate_only', type=int, help='enable the generate only mode of the surrogate ILS', default=0, choices=[0, 1])
    # required: the name is used to build every output path below, so a
    # missing value used to fail later with a TypeError in `os.path.join`
    parser.add_argument('--output', type=str, help='output surrogate model name', required=True)

    args = parser.parse_args()

    p_data_file = args.data
    p_every_ls = args.every_ls
    p_k_division = args.k_division
    p_k_dynamic = bool(args.k_dynamic)
    p_k_random = bool(args.k_random)
    p_ils_iteration = args.ils
    p_ls_iteration = args.ls
    p_generate_only = bool(args.generate_only)
    p_output = args.output

    # load data from file and get problem size
    X_train, y_train, X_test, y_test, problem_size = loadDataset(p_data_file)

    # create `logs` folder if necessary and write the log file inside that
    # same folder (the path was previously hard-coded to `data/logs/`)
    os.makedirs(cfg.output_logs_folder, exist_ok=True)
    log_file_path = os.path.join(cfg.output_logs_folder, '{0}.log'.format(p_output))
    logging.basicConfig(format='%(asctime)s %(message)s', filename=log_file_path, level=logging.DEBUG)

    def init():
        """Return a random binary solution of `problem_size` entries accepted by `validator`."""
        return BinarySolution([], problem_size).random(validator)

    def elapsed_since(start):
        """Return the whole seconds elapsed since `start` as a (minutes, seconds) tuple."""
        diff = datetime.datetime.now() - start
        return divmod(diff.days * 86400 + diff.seconds, 60)

    def fit_and_score(indices):
        """Train an SVM on the given feature columns and return its ROC AUC on the test set."""
        # keep only selected columns
        x_train_filters = X_train[:, indices]
        x_test_filters = X_test[:, indices]

        model = train_model(x_train_filters, y_train)

        # the score is computed from hard 0/1 predictions
        y_test_model = model.predict(x_test_filters)
        y_test_predict = [1 if x > 0.5 else 0 for x in y_test_model]

        return roc_auc_score(y_test, y_test_predict)

    def evaluate(solution):
        """Real evaluation: score the features whose entry in `solution._data` equals 1."""
        start = datetime.datetime.now()

        indices = [index for index, value in enumerate(solution._data) if value == 1]
        print(f'Training SVM with {len(indices)} from {len(solution._data)} available features')

        test_roc_auc = fit_and_score(indices)

        print("Real evaluation took: {}, score found: {}".format(elapsed_since(start), test_roc_auc))
        return test_roc_auc

    def sub_evaluate(solution, index_number, targeted_indices):
        """Real evaluation of a sub-problem.

        Each position of `solution._data` is translated into a dataset column
        through `targeted_indices` before training; `index_number` is only
        used in the printed messages.
        """
        start = datetime.datetime.now()

        indices = [targeted_indices[index] for index, value in enumerate(solution._data) if value == 1]
        print(f'Training sub-model SVM n°{index_number} with {len(indices)} from {len(solution._data)} available features')

        test_roc_auc = fit_and_score(indices)

        print(f"Real sub-evaluation n°{index_number} took: {elapsed_since(start)}, score found: {test_roc_auc}")
        return test_roc_auc

    # build all output folders and files based on `output` name
    backup_model_folder = os.path.join(cfg.output_backup_folder, p_output)
    surrogate_output_model = os.path.join(cfg.output_surrogates_model_folder, p_output)
    surrogate_output_data = os.path.join(cfg.output_surrogates_data_folder, p_output)

    for folder in (backup_model_folder,
                   cfg.output_surrogates_model_folder,
                   cfg.output_surrogates_data_folder):
        os.makedirs(folder, exist_ok=True)

    backup_file_path = os.path.join(backup_model_folder, p_output + '.csv')
    surrogate_backup_file_path = os.path.join(backup_model_folder, p_output + '_train.csv')
    surrogate_k_indices_backup_file_path = os.path.join(backup_model_folder, p_output + '_k_indices.csv')
    surrogate_population_backup_file_path = os.path.join(backup_model_folder, p_output + '_population.csv')

    # only mutation operators are given to the algorithm, picked by a random policy
    operators = [SimpleBinaryMutation(), SimpleMutation()]
    policy = RandomPolicy(operators)

    # number of real evaluations before the surrogates are used:
    # twice the number of features per sub-model, with a minimum of 50
    p_start = max(int(problem_size / p_k_division * 2), 50)

    print(f'Starting using surrogate after {p_start} reals training')

    # custom ILS for surrogate use
    # NOTE: `initalizer` is spelled as in the original call; it is the
    # keyword passed to the algorithm class and is kept unchanged.
    algo = ILSMultiSpecificSurrogate(initalizer=init,
                        evaluator=evaluate,
                        sub_evaluator=sub_evaluate,
                        operators=operators,
                        policy=policy,
                        validator=validator,
                        output_log_surrogates=os.path.join(cfg.output_surrogates_data_folder, 'logs', p_output),
                        surrogates_file_path=surrogate_output_model,
                        start_train_surrogates=p_start,
                        solutions_folder=surrogate_output_data,
                        ls_train_surrogates=p_every_ls,
                        k_division=p_k_division,
                        k_dynamic=p_k_dynamic,
                        k_random=p_k_random,
                        generate_only=p_generate_only,
                        maximise=True)

    # checkpoints: basic one at every step, surrogate ones every `p_ls_iteration`
    algo.addCallback(BasicCheckpoint(every=1, filepath=backup_file_path))
    algo.addCallback(SurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_backup_file_path))
    algo.addCallback(MultiSurrogateCheckpoint(every=p_ls_iteration, filepath=surrogate_k_indices_backup_file_path))
    algo.addCallback(MultiSurrogateSpecificCheckpoint(every=p_ls_iteration, filepath=surrogate_population_backup_file_path))

    bestSol = algo.run(p_ils_iteration, p_ls_iteration)

    # print best solution found
    print("Found ", bestSol)

    # append model information to the results file (`;` separated fields)
    os.makedirs(cfg.results_information_folder, exist_ok=True)

    filename_path = os.path.join(cfg.results_information_folder, cfg.optimization_attributes_result_filename)

    line_info = ';'.join([
        p_data_file,
        str(p_ils_iteration),
        str(p_ls_iteration),
        str(bestSol._data),
        str(list(bestSol._data).count(1)),
        str(bestSol.fitness()),
    ])

    with open(filename_path, 'a') as f:
        f.write(line_info + '\n')

    print('Result saved into %s' % filename_path)


if __name__ == "__main__":
    main()
|