
Update of model outputs

Jerome Buisine, 5 years ago
Parent commit: c2a23627c2

+ 1 - 1
generateAndTrain_maxwell.sh

@@ -57,7 +57,7 @@ for counter in {0..4}; do
                     python generate_data_model_random_maxwell.py --output ${FILENAME} --interval "${start},${end}" --kind ${mode} --metric ${metric} --scenes "${scenes}" --nb_zones "${nb_zones}" --percent 1 --sep ';' --rowindex '0'
                     python models/${model}_train.py --data ${FILENAME} --output ${MODEL_NAME}
 
-                    python predict_seuil_expe_maxwell.py --interval "${start},${end}" --model "saved_models/${MODEL_NAME}.joblib" --mode "${mode}" --metric ${metric} --limit_detection '2'
+                    #python predict_seuil_expe_maxwell.py --interval "${start},${end}" --model "saved_models/${MODEL_NAME}.joblib" --mode "${mode}" --metric ${metric} --limit_detection '2'
                     python save_model_result_in_md_maxwell.py --interval "${start},${end}" --model "saved_models/${MODEL_NAME}.joblib" --mode "${mode}" --metric ${metric}
                 fi
             done
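
In this script, the per-image threshold prediction step (predict_seuil_expe_maxwell.py) is commented out, so each iteration now only generates the data file, trains the model, and exports its scores to markdown. As a quick way to sanity-check one of the resulting saved_models/${MODEL_NAME}.joblib files outside the loop, a minimal sketch could look like the following; the quick_check helper, the example paths, and the ';'-separated layout with the label in the first column are assumptions, not part of the commit:

    # Hypothetical helper: reload a trained model and score it on a generated
    # data file. Assumes a ';'-separated file with the label in column 0.
    import pandas as pd
    from sklearn.externals import joblib   # joblib import style used in this project
    from sklearn.metrics import accuracy_score

    def quick_check(model_path, data_path):
        dataset = pd.read_csv(data_path, header=None, sep=';')
        y = dataset.iloc[:, 0]
        x = dataset.iloc[:, 1:]
        model = joblib.load(model_path)
        return accuracy_score(y, model.predict(x))

    # e.g. (hypothetical paths)
    # quick_check('saved_models/my_model.joblib', 'data/my_data.train')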

+ 1 - 1
models/ensemble_model_train_v2.py

@@ -100,7 +100,7 @@ def main():
     # 2. Construction of the model : Ensemble model structure
     #######################
 
-    svm_model = get_best_model(y_dataset_train, y_dataset_train)
+    svm_model = get_best_model(x_dataset_train, y_dataset_train)
     knc_model = KNeighborsClassifier(n_neighbors=2)
     gbc_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
     lr_model = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=1)
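
The one-line fix above passes the training features x_dataset_train (instead of the labels twice) to get_best_model, which selects the SVM component. The hunk only shows the base estimators; a minimal sketch of how they could be combined into the ensemble follows, assuming a soft-voting scheme. VotingClassifier and the ensemble_model name are assumptions, and SVC stands in for the result of get_best_model:

    # Sketch only: one plausible way to assemble the listed estimators into
    # an ensemble classifier; not taken verbatim from this repository.
    from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC

    svm_model = SVC(probability=True)  # stand-in for get_best_model(x_dataset_train, y_dataset_train)
    knc_model = KNeighborsClassifier(n_neighbors=2)
    gbc_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    lr_model = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=1)

    ensemble_model = VotingClassifier(
        estimators=[('svm', svm_model), ('knc', knc_model), ('gbc', gbc_model), ('lr', lr_model)],
        voting='soft')
    # ensemble_model.fit(x_dataset_train, y_dataset_train)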

+ 10 - 4
runAll_maxwell.sh

@@ -3,15 +3,21 @@
 # erase "models_info/models_comparisons.csv" file and write new header
 file_path='models_info/models_comparisons.csv'
 rm ${file_path}
-mkdir -p models_info
-touch ${file_path}
+
+erased=$1
+
+if [ "$erased" == "Y" ]; then
+    echo "Previous data file erased..."
+    mkdir -p models_info
+    touch ${file_path}
+fi
 
 # add of header
-echo 'model_name; vector_size; start; end; nb_zones; metric; mode; train; val; test; F1_train; F1_val; F1_test' >> ${file_path}
+echo 'model_name; vector_size; start; end; nb_zones; metric; mode; train_size; val_size; test_size; train_acc; val_acc; test_acc; mean_acc; F1_train; F1_val; F1_test; F1_mean' >> ${file_path}
 
 for size in {"4","8","16","26","32","40"}; do
 
     for metric in {"lab","mscn","mscn_revisited","low_bits_2","low_bits_3","low_bits_4"}; do
-        bash generateAndTrain_maxwell.sh ${size} ${metric} &
+        bash generateAndTrain_maxwell.sh ${size} ${metric}
     done
 done
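
runAll_maxwell.sh now only recreates models_info/models_comparisons.csv when the script is called with Y as its first argument, runs the per-metric jobs sequentially instead of in the background, and writes a richer header with dataset-size ratios, per-split accuracies, and F1 scores plus their whole-dataset means. A short sketch of reading the resulting file back with pandas; the column selection is illustrative, and skipinitialspace handles the '; ' separator:

    # Read the comparison file written by the scripts above and rank models
    # by their whole-dataset F1 score.
    import pandas as pd

    comparisons = pd.read_csv('models_info/models_comparisons.csv',
                              sep=';', skipinitialspace=True)
    best = comparisons.sort_values('F1_mean', ascending=False)
    print(best[['model_name', 'metric', 'mode', 'test_acc', 'F1_test', 'F1_mean']].head())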

+ 24 - 4
save_model_result_in_md_maxwell.py

@@ -1,5 +1,8 @@
+from sklearn.utils import shuffle
 from sklearn.externals import joblib
-from sklearn.metrics import accuracy_score
+from sklearn.metrics import accuracy_score, f1_score
+from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import train_test_split
 
 import numpy as np
 import pandas as pd
@@ -180,8 +183,8 @@ def main():
 
     X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)
 
-    y_test_model = ensemble_model.predict(X_test)
-    y_val_model = ensemble_model.predict(X_val)
+    y_test_model = model.predict(X_test)
+    y_val_model = model.predict(X_val)
 
     val_accuracy = accuracy_score(y_val, y_val_model)
     test_accuracy = accuracy_score(y_test, y_test_model)
@@ -191,18 +194,35 @@ def main():
     val_f1 = f1_score(y_val, y_val_model)
     test_f1 = f1_score(y_test, y_test_model)
 
+    # stats of all dataset
+    all_x_data = pd.concat([x_dataset_train, X_test, X_val])
+    all_y_data = pd.concat([y_dataset_train, y_test, y_val])
+
+    all_y_model = model.predict(all_x_data)
+    all_accuracy = accuracy_score(all_y_data, all_y_model)
+    all_f1_score = f1_score(all_y_data, all_y_model)
+
+    # stats of dataset sizes
+    total_samples = final_df_train_size + val_set_size + test_set_size
+
+    model_scores.append(final_df_train_size / total_samples)
+    model_scores.append(val_set_size / total_samples)
+    model_scores.append(test_set_size / total_samples)
+
     # add of scores
     model_scores.append(val_scores.mean())
     model_scores.append(val_accuracy)
     model_scores.append(test_accuracy)
+    model_scores.append(all_accuracy)
 
     model_scores.append(train_f1)
     model_scores.append(val_f1)
     model_scores.append(test_f1)
+    model_scores.append(all_f1_score)
 
     # TODO : improve...
     # check if it's always the case...
-    nb_zones = data_filenames[0].split('_')[7]
+    nb_zones = current_data_file_path.split('_')[7]
 
     final_file_line = current_model_name + '; ' + str(end - begin) + '; ' + str(begin) + '; ' + str(end) + '; ' + str(nb_zones) + '; ' + p_metric + '; ' + p_mode
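
Taken together, the additions to save_model_result_in_md_maxwell.py split the held-out data into validation and test halves, score the already-trained model on each split, concatenate all splits to get whole-dataset accuracy and F1, and record the relative size of each split. A self-contained sketch of that evaluation pattern, using synthetic data and a plain LogisticRegression in place of the project's real model and dataset:

    # Evaluation pattern mirrored from the hunk above, on synthetic data.
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, f1_score
    from sklearn.model_selection import train_test_split

    x, y = make_classification(n_samples=600, random_state=1)
    x, y = pd.DataFrame(x), pd.Series(y)

    # train set, then the remainder split into equal test and validation halves
    x_train, x_rest, y_train, y_rest = train_test_split(x, y, test_size=0.4, random_state=1)
    X_test, X_val, y_test, y_val = train_test_split(x_rest, y_rest, test_size=0.5, random_state=1)

    model = LogisticRegression(solver='liblinear').fit(x_train, y_train)

    scores = {
        'train_size': len(x_train) / len(x),
        'val_size':   len(X_val) / len(x),
        'test_size':  len(X_test) / len(x),
        'val_acc':    accuracy_score(y_val, model.predict(X_val)),
        'test_acc':   accuracy_score(y_test, model.predict(X_test)),
        'F1_val':     f1_score(y_val, model.predict(X_val)),
        'F1_test':    f1_score(y_test, model.predict(X_test)),
    }

    # whole-dataset statistics, as in the added lines above
    all_x = pd.concat([x_train, X_test, X_val])
    all_y = pd.concat([y_train, y_test, y_val])
    scores['mean_acc'] = accuracy_score(all_y, model.predict(all_x))
    scores['F1_mean'] = f1_score(all_y, model.predict(all_x))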