
Merge branch 'release/v0.0.3'

Jérôme BUISINE, 4 years ago
Parent commit: f5062a0387
File diff suppressed because it is too large
+ 174 - 0
analyses/Untitled.ipynb


File diff suppressed because it is too large
+ 271 - 0
analyses/data_augmentation_analysis.ipynb


File diff suppressed because it is too large
+ 1774 - 0
analyses/edge_detection_analysis.ipynb


+ 23 - 6
models.py

@@ -4,14 +4,14 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier, VotingClassifier
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.ensemble import GradientBoostingClassifier
-from sklearn.feature_selection import RFECV
+from sklearn.feature_selection import RFECV, RFE
 from sklearn.metrics import roc_auc_score
 import sklearn.svm as svm
 
 def _roc_auc_scorer(estimator, X, y):
-    
+
     y_pred = estimator.predict(X)
-    
+
     return roc_auc_score(y, y_pred)
 
 def _get_best_model(X_train, y_train):
@@ -33,7 +33,7 @@ def svm_model(X_train, y_train):
 
     return _get_best_model(X_train, y_train)
 
-def rfe_svm_model(X_train, y_train, n_components=1):
+def rfecv_svm_model(X_train, y_train, n_components=1):
 
     Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
     gammas = [0.001, 0.01, 0.1, 1, 5, 10, 100]
@@ -47,10 +47,27 @@ def rfe_svm_model(X_train, y_train, n_components=1):
     return clf.best_estimator_
 
 
-def get_trained_model(choice, X_train, y_train):
+def rfe_svm_model(X_train, y_train, n_components=1):
+
+    Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
+    gammas = [0.001, 0.01, 0.1, 1, 5, 10, 100]
+    param_grid = [{'estimator__C': Cs, 'estimator__gamma' : gammas}]
+
+    estimator = svm.SVC(kernel="linear")
+    selector = RFE(estimator, step=1, n_features_to_select=n_components, verbose=1)
+    clf = GridSearchCV(selector, param_grid, cv=5, verbose=1, scoring=_roc_auc_scorer)
+    clf.fit(X_train, y_train)
+
+    return clf.best_estimator_
+
+
+def get_trained_model(choice, X_train, y_train, n_components=1):
 
     if choice == 'svm_model':
         return svm_model(X_train, y_train)
 
     if choice == 'rfe_svm_model':
-        return rfe_svm_model(X_train, y_train)
+        return rfe_svm_model(X_train, y_train, n_components)
+
+    if choice == 'rfecv_svm_model':
+        return rfecv_svm_model(X_train, y_train, n_components)
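
For context, a minimal usage sketch (not part of the commit) of the new n_components path through get_trained_model; it assumes models.py is importable as models and that GridSearchCV is already imported there, as the existing functions imply.

    # Usage sketch: toy binary dataset, RFE keeps 2 of 6 features
    import numpy as np
    import models as mdl

    rng = np.random.RandomState(0)
    X_train = rng.rand(40, 6)                      # 40 samples, 6 features
    y_train = (X_train[:, 0] > 0.5).astype(int)    # toy binary labels

    # the new signature forwards n_components to the RFE selector
    model = mdl.get_trained_model('rfe_svm_model', X_train, y_train, n_components=2)
    print(model.support_)                          # boolean mask of the selected features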

+ 3 - 1
train_model.py

@@ -36,12 +36,14 @@ def main():
     parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)')
     parser.add_argument('--output', type=str, help='output file name desired for model (without .joblib extension)')
     parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list)
+    parser.add_argument('--components', type=int, help='number of components expected by model', default=1)
 
     args = parser.parse_args()
 
     p_data_file = args.data
     p_output    = args.output
     p_choice    = args.choice
+    p_components = args.components
 
     if not os.path.exists(output_model_folder):
         os.makedirs(output_model_folder)
@@ -86,7 +88,7 @@ def main():
     # 2. Construction of the model : Ensemble model structure
     #######################
 
-    model = mdl.get_trained_model(p_choice, x_dataset_train, y_dataset_train)
+    model = mdl.get_trained_model(p_choice, x_dataset_train, y_dataset_train, p_components)
     indices = model.support_
 
     selected_indices = [(i+1) for i in np.arange(len(indices)) if indices[i] == True]
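
A possible invocation of the updated script with the new flag, assuming 'rfe_svm_model' appears in models_list (the dataset prefix, output name, and component count below are placeholders):

    python train_model.py --data data/my_dataset --output rfe_svm_2 --choice rfe_svm_model --components 2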