
Updates of output model comparisons

Jerome Buisine 6 years ago
parent
commit
815c5c8601

+ 0 - 63
generateAndTrainEnsemble_random.sh

@@ -1,63 +0,0 @@
-#! bin/bash
-
-if [ -z "$1" ]
-  then
-    echo "No argument supplied"
-    echo "Need of vector size"
-    exit 1
-fi
-
-if [ -z "$2" ]
-  then
-    echo "No argument supplied"
-    echo "Need of model output name"
-    exit 1
-fi
-
-VECTOR_SIZE=$1
-INPUT_MODEL_NAME=$2
-
-# selection of six scenes
-scenes="A, B, C, D, E, F, G, H, I"
-
-for size in {"4","8","16","26","32","40"}; do
-
-  start=0
-  for counter in {0..4}; do
-    end=$(($start+$size))
-
-    if [ "$end" -gt "$VECTOR_SIZE" ]; then
-        start=$(($VECTOR_SIZE-$size))
-        end=$(($VECTOR_SIZE))
-    fi
-
-    for nb_zones in {2,3,4,5,6,7,8,9,10}; do
-
-        for metric in {"lab","mscn"}; do
-    
-            for mode in {"svd","svdn","svdne"}; do
-
-                FILENAME="data/data_${mode}_${metric}_N${size}_B${start}_E${end}_nb_zones_${nb_zones}_random"
-                MODEL_NAME="${INPUT_MODEL_NAME}_${mode}_${metric}_N${size}_B${start}_E${end}_nb_zones_${nb_zones}"
-
-                echo $FILENAME
-                python generate_data_model_random.py --output ${FILENAME} --interval "${start},${end}" --kind ${mode} --metric ${metric} --scenes "${scenes}" --nb_zones "${nb_zones}" --percent 1 --sep ';' --rowindex '0'
-                python ensemble_model_train.py --data ${FILENAME}.train --output ${MODEL_NAME}
-                bash testModelByScene.sh "${start}" "${end}" "saved_models/${MODEL_NAME}.joblib" "${mode}" "${metric}" >> "saved_models/${MODEL_NAME}.tex"
-
-            done
-        done
-    done
-if [ -z "$2" ]
-  then
-    echo "No argument supplied"
-    echo "Need of model output name"
-    exit 1
-fi
-
-VECTOR_SIZE=$1
-INPUT_MODEL_NAME=$2
-    start=$(($start+50))
-  done
-
-done
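For reference, the windowing these generate-and-train scripts all share, as a minimal Python sketch (names are illustrative; the clamping mirrors the `if [ "$end" -gt "$VECTOR_SIZE" ]` guard above, and the `start+50` step follows the intact generateAndTrainSVM.sh below):

def svd_intervals(vector_size, sizes=(4, 8, 16, 26, 32, 40), step=50, n_windows=5):
    # For each window size, take five windows stepped by 50 components,
    # clamping the last windows so they never run past the end of the vector.
    for size in sizes:
        start = 0
        for _ in range(n_windows):
            end = start + size
            if end > vector_size:
                start, end = vector_size - size, vector_size
            yield size, start, end
            start += step

# e.g. list(svd_intervals(100)) starts with (4, 0, 4), (4, 50, 54), (4, 96, 100), ...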

+ 0 - 56
generateAndTrainSVM.sh

@@ -1,56 +0,0 @@
-#! bin/bash
-
-if [ -z "$1" ]
-  then
-    echo "No argument supplied"
-    echo "Need of vector size"
-    exit 1
-fi
-
-if [ -z "$2" ]
-  then
-    echo "No argument supplied"
-    echo "Need of model output name"
-    exit 1
-fi
-
-VECTOR_SIZE=$1
-INPUT_MODEL_NAME=$2
-
-# selection of six scenes
-scenes="A, B, C, D, E, G"
-
-for size in {"4","8","16","26","32","40"}; do
-
-  start=0
-  for counter in {0..4}; do
-    end=$(($start+$size))
-
-    if [ "$end" -gt "$VECTOR_SIZE" ]; then
-        start=$(($VECTOR_SIZE-$size))
-        end=$(($VECTOR_SIZE))
-    fi
-
-    for zones in {"1, 3, 7, 9","0, 2, 7, 8, 9","2, 6, 8, 10, 13, 15","1, 2, 4, 7, 9, 10, 13, 15"}; do
-
-        zones_str="${zones//, /-}"
-
-        for metric in {"lab","mscn"}; do
-
-            for mode in {"svd","svdn","svdne"}; do
-
-                FILENAME="data/data_${mode}_${metric}_N${size}_B${start}_E${end}_zones${zones_str}"
-                MODEL_NAME="saved_models/${INPUT_MODEL_NAME}_${mode}_${metric}_N${size}_B${start}_E${end}_zones_${zones_str}"
-
-                echo $FILENAME
-                python generate_data_model.py --output ${FILENAME} --interval "${start},${end}" --kind ${mode} --metric ${metric} --scenes "${scenes}" --zones "${zones}" --percent 1 --sep ';' --rowindex '0'
-                python svm_model_train.py --data ${FILENAME}.train --output ${MODEL_NAME} &
-
-            done
-        done
-    done
-
-    start=$(($start+50))
-  done
-
-done
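The `${zones//, /-}` expansion above is bash pattern substitution: every ", " in the zone list becomes "-", giving a filename-safe token. The same transform in Python, for illustration only:

zones = "1, 3, 7, 9"
zones_str = zones.replace(", ", "-")
assert zones_str == "1-3-7-9"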

+ 0 - 64
generateAndTrainSVM_random.sh

@@ -1,64 +0,0 @@
-#! bin/bash
-
-if [ -z "$1" ]
-  then
-    echo "No argument supplied"
-    echo "Need of vector size"
-    exit 1
-fi
-
-if [ -z "$2" ]
-  then
-    echo "No argument supplied"
-    echo "Need of model output name"
-    exit 1
-fi
-
-VECTOR_SIZE=$1
-INPUT_MODEL_NAME=$2
-
-# selection of six scenes
-scenes="A, B, C, D, E, F, G, H, I"
-
-for size in {"4","8","16","26","32","40"}; do
-
-  start=0
-  for counter in {0..4}; do
-    end=$(($start+$size))
-
-    if [ "$end" -gt "$VECTOR_SIZE" ]; then
-        start=$(($VECTOR_SIZE-$size))
-        end=$(($VECTOR_SIZE))
-    fi
-
-    for nb_zones in {2,3,4,5,6,7,8,9,10}; do
-
-        for metric in {"lab","mscn"}; do
-            
-            for mode in {"svd","svdn","svdne"}; do
-
-                FILENAME="data/data_${mode}_${metric}_N${size}_B${start}_E${end}_nb_zones_${nb_zones}_random"
-                MODEL_NAME="${INPUT_MODEL_NAME}_${mode}_${metric}_N${size}_B${start}_E${end}_nb_zones_${nb_zones}"
-
-                echo $FILENAME
-                python generate_data_model_random.py --output ${FILENAME} --interval "${start},${end}" --kind ${mode} --metric ${metric} --scenes "${scenes}" --nb_zones "${nb_zones}" --percent 1 --sep ';' --rowindex '0'
-                python svm_model_train.py --data ${FILENAME}.train --output ${MODEL_NAME} &
-
-                # add computation of scenes score and LaTeX display of its
-
-            done
-        done
-    done
-if [ -z "$2" ]
-  then
-    echo "No argument supplied"
-    echo "Need of model output name"
-    exit 1
-fi
-
-VECTOR_SIZE=$1
-INPUT_MODEL_NAME=$2
-    start=$(($start+50))
-  done
-
-done

+ 5 - 5
generateAndTrain_maxwell.sh

@@ -48,15 +48,15 @@ for counter in {0..4}; do
                 MODEL_NAME="${model}_N${size}_B${start}_E${end}_nb_zones_${nb_zones}_${metric}_${mode}"
 
                 echo $FILENAME
-                
+
                 # only compute if necessary (perhaps server will fall.. Just in case)
                 if grep -q "${MODEL_NAME}" "${result_filename}"; then
 
                     echo "${MODEL_NAME} results already generated..."
                 else
                     python generate_data_model_random_maxwell.py --output ${FILENAME} --interval "${start},${end}" --kind ${mode} --metric ${metric} --scenes "${scenes}" --nb_zones "${nb_zones}" --percent 1 --sep ';' --rowindex '0'
-                    python models/${model}_train.py --data ${FILENAME}.train --output ${MODEL_NAME}
-                
+                    python models/${model}_train.py --data ${FILENAME} --output ${MODEL_NAME}
+
                     python predict_seuil_expe_maxwell.py --interval "${start},${end}" --model "saved_models/${MODEL_NAME}.joblib" --mode "${mode}" --metric ${metric} --limit_detection '2'
                     python save_model_result_in_md_maxwell.py --interval "${start},${end}" --model "saved_models/${MODEL_NAME}.joblib" --mode "${mode}" --metric ${metric}
                 fi
@@ -66,8 +66,8 @@ for counter in {0..4}; do
 
     if [ "$counter" -eq "0" ]; then
         start=$(($start+50-$half))
-    else 
+    else
         start=$(($start+50))
     fi
 
-done
+done
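Two behaviours worth noting in this script: the `grep -q` guard makes reruns idempotent (a model already listed in the results file is skipped, so a crashed batch can simply be restarted), and `--data` now passes the bare `${FILENAME}` prefix because the updated train scripts below append `.train`/`.test` themselves. A rough Python equivalent of the guard, with a hypothetical results path standing in for `${result_filename}`:

def already_computed(model_name, result_filename='models_info/models_comparisons.csv'):
    # Skip retraining when the model name already appears in the results file.
    with open(result_filename) as results:
        return any(model_name in line for line in results)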

+ 92 - 37
models/ensemble_model_train.py

@@ -6,12 +6,11 @@ from sklearn.ensemble import RandomForestClassifier, VotingClassifier
 import sklearn.svm as svm
 from sklearn.utils import shuffle
 from sklearn.externals import joblib
+from sklearn.metrics import accuracy_score, f1_score
+from sklearn.model_selection import cross_val_score
 
 import numpy as np
-
 import pandas as pd
-from sklearn.metrics import accuracy_score
-
 import sys, os, getopt
 
 saved_models_folder = 'saved_models'
@@ -19,13 +18,13 @@ current_dirpath = os.getcwd()
 output_model_folder = os.path.join(current_dirpath, saved_models_folder)
 
 def get_best_model(X_train, y_train):
-    Cs = [0.001, 0.01, 0.1, 1, 10]
-    gammas = [0.001, 0.01, 0.1, 1]
+
+    Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
+    gammas = [0.001, 0.01, 0.1, 1, 5, 10, 100]
     param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
-    
-    parameters = {'kernel':['rbf'], 'C': np.arange(1, 20)}
-    svc = svm.SVC(gamma="scale", probability=True)
-    clf = GridSearchCV(svc, parameters, cv=5, scoring='accuracy', verbose=10)
+
+    svc = svm.SVC(probability=True)
+    clf = GridSearchCV(svc, param_grid, cv=10, scoring='accuracy', verbose=10)
 
     clf.fit(X_train, y_train)
 
@@ -60,53 +59,109 @@ def main():
     if not os.path.exists(output_model_folder):
         os.makedirs(output_model_folder)
 
-    # get and split data
-    dataset = pd.read_csv(p_data_file, header=None, sep=";")
+    ########################
+    # 1. Get and prepare data
+    ########################
+    dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
+    dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")
+
+    # default first shuffle of data
+    dataset_train = shuffle(dataset_train)
+    dataset_test = shuffle(dataset_test)
 
-     # default first shuffle of data
-    dataset = shuffle(dataset)
-    
     # get dataset with equal number of classes occurences
-    noisy_df = dataset[dataset.ix[:, 0] == 1]
-    not_noisy_df = dataset[dataset.ix[:, 0] == 0]
-    nb_noisy = len(noisy_df.index)
-    
-    final_df = pd.concat([not_noisy_df[0:nb_noisy], noisy_df])
-    #final_df = pd.concat([not_noisy_df, noisy_df])
-    
+    noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1]
+    not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0]
+    nb_noisy_train = len(noisy_df_train.index)
+
+    noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 1]
+    not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0]
+    nb_noisy_test = len(noisy_df_test.index)
+
+    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
+    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
+
     # shuffle data another time
-    final_df = shuffle(final_df)
-    
-    print(len(final_df.index))
+    final_df_train = shuffle(final_df_train)
+    final_df_test = shuffle(final_df_test)
 
-    y_dataset = final_df.ix[:,0]
-    x_dataset = final_df.ix[:,1:]
+    final_df_train_size = len(final_df_train.index)
+    final_df_test_size = len(final_df_test.index)
 
     # use of the whole data set for training
-    X_train, X_test, y_train, y_test = train_test_split(x_dataset, y_dataset, test_size=0., random_state=42)
+    x_dataset_train = final_df_train.ix[:,1:]
+    x_dataset_test = final_df_test.ix[:,1:]
+
+    y_dataset_train = final_df_train.ix[:,0]
+    y_dataset_test = final_df_test.ix[:,0]
 
-    svm_model = get_best_model(X_train, y_train)
+    #######################
+    # 2. Construction of the model : Ensemble model structure
+    #######################
+
+    svm_model = get_best_model(x_dataset_train, y_dataset_train)
 
     lr_model = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=1)
     rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
 
     ensemble_model = VotingClassifier(estimators=[
-       ('svm', svm_model), ('lr', lr_model), ('rf', rf_model)],
-       voting='soft', weights=[1,1,1])
+       ('svm', svm_model), ('lr', lr_model), ('rf', rf_model)], voting='soft', weights=[1,1,1])
+
+    #######################
+    # 3. Fit model : estimate accuracy with cross validation
+    #######################
+    print("-------------------------------------------")
+    print("Train dataset size: ", final_df_train_size)
+    ensemble_model.fit(x_dataset_train, y_dataset_train)
+    val_scores = cross_val_score(ensemble_model, x_dataset_train, y_dataset_train, cv=5)
+    print("Accuracy: %0.2f (+/- %0.2f)" % (val_scores.mean(), val_scores.std() * 2))
+
+    ######################
+    # 4. Test : Validation and test dataset from .test dataset
+    ######################
+
+    # validation size is set to 20% of the whole dataset (a third of the train set)
+    val_set_size = int(final_df_train_size/3)
+    test_set_size = val_set_size
+
+    total_validation_size = val_set_size + test_set_size
+
+    if final_df_test_size > total_validation_size:
+        x_dataset_test = x_dataset_test[0:total_validation_size]
+        y_dataset_test = y_dataset_test[0:total_validation_size]
+
+    X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)
+
+    y_test_model = ensemble_model.predict(X_test)
+    y_val_model = ensemble_model.predict(X_val)
+
+    val_accuracy = accuracy_score(y_val, y_val_model)
+    test_accuracy = accuracy_score(y_test, y_test_model)
+
+    val_f1 = f1_score(y_val, y_val_model)
+    test_f1 = f1_score(y_test, y_test_model)
+
+
+    ###################
+    # 5. Output : print scores
+    ###################
 
-    ensemble_model.fit(X_train, y_train)
+    print("Validation dataset size ", val_set_size)
+    print("Validation: ", val_accuracy)
+    print("Validation F1: ", val_f1)
+    print("Test dataset size ", test_set_size)
+    print("Test: ", val_accuracy)
+    print("Test F1: ", test_f1)
 
-    y_train_model = ensemble_model.predict(X_train)
-    print("**Train :** " + str(accuracy_score(y_train, y_train_model)))
 
-    #y_pred = ensemble_model.predict(X_test)
-    #print("**Test :** " + str(accuracy_score(y_test, y_pred)))
+    ##################
+    # 6. Save model : create path if not exists
+    ##################
 
-    # create path if not exists
     if not os.path.exists(saved_models_folder):
         os.makedirs(saved_models_folder)
 
-    joblib.dump(ensemble_model, output_model_folder + '/' + p_output + '.joblib') 
+    joblib.dump(ensemble_model, output_model_folder + '/' + p_output + '.joblib')
 
 if __name__== "__main__":
     main()
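The balancing step in part 1 undersamples the majority class so both labels occur equally often. A standalone sketch (using `.iloc`; the `.ix` indexer used above has since been removed from pandas):

import pandas as pd
from sklearn.utils import shuffle

def balance_classes(df):
    # Column 0 holds the label: 1 = noisy, 0 = not noisy.
    noisy = df[df.iloc[:, 0] == 1]
    not_noisy = df[df.iloc[:, 0] == 0]
    # Truncate the (assumed larger) not-noisy class to the noisy count, then reshuffle.
    return shuffle(pd.concat([not_noisy[0:len(noisy.index)], noisy]))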

+ 94 - 39
models/ensemble_model_train_v2.py

@@ -8,27 +8,25 @@ from sklearn.ensemble import GradientBoostingClassifier
 import sklearn.svm as svm
 from sklearn.utils import shuffle
 from sklearn.externals import joblib
+from sklearn.metrics import accuracy_score, f1_score
 
-import numpy as np
+from sklearn.model_selection import cross_val_score
 
+import numpy as np
 import pandas as pd
-from sklearn.metrics import accuracy_score
-
 import sys, os, getopt
 
-
 saved_models_folder = 'saved_models'
 current_dirpath = os.getcwd()
 output_model_folder = os.path.join(current_dirpath, saved_models_folder)
 
 def get_best_model(X_train, y_train):
-    Cs = [0.001, 0.01, 0.1, 1, 10, 20, 30]
-    gammas = [0.001, 0.01, 0.1, 1, 5, 10]
+    Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
+    gammas = [0.001, 0.01, 0.1, 1, 5, 10, 100]
     param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
-    
-    parameters = {'kernel':['rbf'], 'C': np.arange(1, 20)}
-    svc = svm.SVC(gamma="scale", probability=True, max_iter=10000)
-    clf = GridSearchCV(svc, parameters, cv=5, scoring='accuracy', verbose=10)
+
+    svc = svm.SVC(probability=True)
+    clf = GridSearchCV(svc, param_grid, cv=10, scoring='accuracy', verbose=10)
 
     clf.fit(X_train, y_train)
 
@@ -63,57 +61,114 @@ def main():
     if not os.path.exists(output_model_folder):
         os.makedirs(output_model_folder)
 
-    # get and split data
-    dataset = pd.read_csv(p_data_file, header=None, sep=";")
+    # 1. Get and prepare data
+    dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
+    dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")
+
+    # default first shuffle of data
+    dataset_train = shuffle(dataset_train)
+    dataset_test = shuffle(dataset_test)
 
-     # default first shuffle of data
-    dataset = shuffle(dataset)
-    
     # get dataset with equal number of classes occurences
-    noisy_df = dataset[dataset.ix[:, 0] == 1]
-    not_noisy_df = dataset[dataset.ix[:, 0] == 0]
-    nb_noisy = len(noisy_df.index)
-    
-    final_df = pd.concat([not_noisy_df[0:nb_noisy], noisy_df[:]])
-    #final_df = pd.concat([not_noisy_df, noisy_df])
-    
+    noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1]
+    not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0]
+    nb_noisy_train = len(noisy_df_train.index)
+
+    noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 1]
+    not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0]
+    nb_noisy_test = len(noisy_df_test.index)
+
+    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
+    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
+
     # shuffle data another time
-    final_df = shuffle(final_df)
-    
-    print(len(final_df.index))
+    final_df_train = shuffle(final_df_train)
+    final_df_test = shuffle(final_df_test)
+
+    final_df_train_size = len(final_df_train.index)
+    final_df_test_size = len(final_df_test.index)
+
+    # use of the whole data set for training
+    x_dataset_train = final_df_train.ix[:,1:]
+    x_dataset_test = final_df_test.ix[:,1:]
+
+    y_dataset_train = final_df_train.ix[:,0]
+    y_dataset_test = final_df_test.ix[:,0]
 
-    y_dataset = final_df.ix[:,0]
-    x_dataset = final_df.ix[:,1:]
-    
-    X_train, X_test, y_train, y_test = train_test_split(x_dataset, y_dataset, test_size=0., random_state=42)
 
-    svm_model = get_best_model(X_train, y_train)
+    #######################
+    # 2. Construction of the model : Ensemble model structure
+    #######################
+
+    svm_model = get_best_model(x_dataset_train, y_dataset_train)
     knc_model = KNeighborsClassifier(n_neighbors=2)
     gbc_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
     lr_model = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=1)
     rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
 
     ensemble_model = VotingClassifier(estimators=[
-       ('lr', lr_model), 
+       ('lr', lr_model),
        ('knc', knc_model),
        ('gbc', gbc_model),
-       ('svm', svm_model), 
-       ('rf', rf_model)], 
+       ('svm', svm_model),
+       ('rf', rf_model)],
        voting='soft', weights=[1, 1, 1, 1, 1])
 
-    ensemble_model.fit(X_train, y_train)
 
-    y_train_model = ensemble_model.predict(X_train)
-    print("**Train :** " + str(accuracy_score(y_train, y_train_model)))
+    #######################
+    # 3. Fit model : estimate accuracy with cross validation
+    #######################
+    print("-------------------------------------------")
+    print("Train dataset size: ", final_df_train_size)
+    ensemble_model.fit(x_dataset_train, y_dataset_train)
+    val_scores = cross_val_score(ensemble_model, x_dataset_train, y_dataset_train, cv=5)
+    print("Accuracy: %0.2f (+/- %0.2f)" % (val_scores.mean(), val_scores.std() * 2))
+
+    ######################
+    # 4. Test : Validation and test dataset from .test dataset
+    ######################
+
+    # validation size is set to 20% of the whole dataset (a third of the train set)
+    val_set_size = int(final_df_train_size/3)
+    test_set_size = val_set_size
+
+    total_validation_size = val_set_size + test_set_size
+
+    if final_df_test_size > total_validation_size:
+        x_dataset_test = x_dataset_test[0:total_validation_size]
+        y_dataset_test = y_dataset_test[0:total_validation_size]
+
+    X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)
+
+    y_test_model = ensemble_model.predict(X_test)
+    y_val_model = ensemble_model.predict(X_val)
+
+    val_accuracy = accuracy_score(y_val, y_val_model)
+    test_accuracy = accuracy_score(y_test, y_test_model)
+
+    val_f1 = f1_score(y_val, y_val_model)
+    test_f1 = f1_score(y_test, y_test_model)
+
+    ###################
+    # 5. Output : print scores
+    ###################
+
+    print("Validation dataset size ", val_set_size)
+    print("Validation: ", val_accuracy)
+    print("Validation F1: ", val_f1)
+    print("Test dataset size ", test_set_size)
+    print("Test: ", val_accuracy)
+    print("Test F1: ", test_f1)
 
-    #y_pred = ensemble_model.predict(X_test)
-    #print("**Test :** " + str(accuracy_score(y_test, y_pred)))
+    ##################
+    # 6. Save model : create path if not exists
+    ##################
 
     # create path if not exists
     if not os.path.exists(saved_models_folder):
         os.makedirs(saved_models_folder)
 
-    joblib.dump(ensemble_model, output_model_folder + '/' +  p_output + '.joblib') 
+    joblib.dump(ensemble_model, output_model_folder + '/' + p_output + '.joblib')
 
 if __name__== "__main__":
     main()
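`voting='soft'` with equal weights means the VotingClassifier averages the estimators' predicted class probabilities and takes the argmax; schematically (this sketch returns indices into `classes_`, not labels):

import numpy as np

def soft_vote(fitted_estimators, X, weights=None):
    # Average per-class probabilities across estimators, pick the best class.
    probas = np.asarray([est.predict_proba(X) for est in fitted_estimators])
    return np.argmax(np.average(probas, axis=0, weights=weights), axis=1)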

+ 94 - 29
models/svm_model_train.py

@@ -1,16 +1,16 @@
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import GridSearchCV
-
-from sklearn.utils import shuffle
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier, VotingClassifier
 
 import sklearn.svm as svm
+from sklearn.utils import shuffle
 from sklearn.externals import joblib
+from sklearn.metrics import accuracy_score, f1_score
+from sklearn.model_selection import cross_val_score
 
 import numpy as np
-
 import pandas as pd
-from sklearn.metrics import accuracy_score
-
 import sys, os, getopt
 
 saved_models_folder = 'saved_models'
@@ -18,10 +18,13 @@ current_dirpath = os.getcwd()
 output_model_folder = os.path.join(current_dirpath, saved_models_folder)
 
 def get_best_model(X_train, y_train):
-    
-    parameters = {'kernel':['rbf'], 'C': np.arange(1, 20)}
-    svc = svm.SVC(gamma="scale")
-    clf = GridSearchCV(svc, parameters, cv=5, scoring='accuracy', verbose=10)
+
+    Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
+    gammas = [0.001, 0.01, 0.1, 1, 5, 10, 100]
+    param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
+
+    svc = svm.SVC(probability=True)
+    clf = GridSearchCV(svc, param_grid, cv=10, scoring='accuracy', verbose=10)
 
     clf.fit(X_train, y_train)
 
@@ -56,41 +59,103 @@ def main():
     if not os.path.exists(output_model_folder):
         os.makedirs(output_model_folder)
 
-    dataset = pd.read_csv(p_data_file, header=None, sep=";")
+    ########################
+    # 1. Get and prepare data
+    ########################
+    dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
+    dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")
 
     # default first shuffle of data
-    dataset = shuffle(dataset)
-    
+    dataset_train = shuffle(dataset_train)
+    dataset_test = shuffle(dataset_test)
+
     # get dataset with equal number of classes occurences
-    noisy_df = dataset[dataset.ix[:, 0] == 1]
-    not_noisy_df = dataset[dataset.ix[:, 0] == 0]
-    nb_noisy = len(noisy_df.index)
+    noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1]
+    not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0]
+    nb_noisy_train = len(noisy_df_train.index)
+
+    noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 1]
+    not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0]
+    nb_noisy_test = len(noisy_df_test.index)
+
+    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
+    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
 
-    final_df = pd.concat([not_noisy_df[0:nb_noisy], noisy_df])
-    #final_df = pd.concat([not_noisy_df, noisy_df])
-  
     # shuffle data another time
-    final_df = shuffle(final_df)
+    final_df_train = shuffle(final_df_train)
+    final_df_test = shuffle(final_df_test)
 
-    y_dataset = final_df.ix[:,0]
-    x_dataset = final_df.ix[:,1:]
+    final_df_train_size = len(final_df_train.index)
+    final_df_test_size = len(final_df_test.index)
 
     # use of the whole data set for training
-    X_train, X_test, y_train, y_test = train_test_split(x_dataset, y_dataset, test_size=0., random_state=42)
+    x_dataset_train = final_df_train.ix[:,1:]
+    x_dataset_test = final_df_test.ix[:,1:]
+
+    y_dataset_train = final_df_train.ix[:,0]
+    y_dataset_test = final_df_test.ix[:,0]
+
+    #######################
+    # 2. Construction of the model : SVM model
+    #######################
 
-    svm_model = get_best_model(X_train, y_train)
+    svm_model = get_best_model(x_dataset_train, y_dataset_train)
 
-    y_train_model = svm_model.predict(X_train)
-    print("**Train :** " + str(accuracy_score(y_train, y_train_model)))
+    #######################
+    # 3. Fit model : estimate accuracy with cross validation
+    #######################
+    print("-------------------------------------------")
+    print("Train dataset size: ", final_df_train_size)
+    svm_model.fit(x_dataset_train, y_dataset_train)
+    val_scores = cross_val_score(svm_model, x_dataset_train, y_dataset_train, cv=5)
+    print("Accuracy: %0.2f (+/- %0.2f)" % (val_scores.mean(), val_scores.std() * 2))
 
-    #y_pred = svm_model.predict(X_test)
-    #print("**Test :** " + str(accuracy_score(y_test, y_pred)))
+    ######################
+    # 4. Test : Validation and test dataset from .test dataset
+    ######################
+
+    # validation size is set to 20% of the whole dataset (a third of the train set)
+    val_set_size = int(final_df_train_size/3)
+    test_set_size = val_set_size
+
+    total_validation_size = val_set_size + test_set_size
+
+    if final_df_test_size > total_validation_size:
+        x_dataset_test = x_dataset_test[0:total_validation_size]
+        y_dataset_test = y_dataset_test[0:total_validation_size]
+
+    X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)
+
+    y_test_model = svm_model.predict(X_test)
+    y_val_model = svm_model.predict(X_val)
+
+    val_accuracy = accuracy_score(y_val, y_val_model)
+    test_accuracy = accuracy_score(y_test, y_test_model)
+
+    val_f1 = f1_score(y_val, y_val_model)
+    test_f1 = f1_score(y_test, y_test_model)
+
+
+    ###################
+    # 5. Output : print scores
+    ###################
+
+    print("Validation dataset size ", val_set_size)
+    print("Validation: ", val_accuracy)
+    print("Validation F1: ", val_f1)
+    print("Test dataset size ", test_set_size)
+    print("Test: ", val_accuracy)
+    print("Test F1: ", test_f1)
+
+    ##################
+    # 6. Save model : create path if not exists
+    ##################
 
     # create path if not exists
     if not os.path.exists(saved_models_folder):
         os.makedirs(saved_models_folder)
-        
-    joblib.dump(svm_model, output_model_folder + '/' + p_output + '.joblib') 
+
+    joblib.dump(svm_model, output_model_folder + '/' + p_output + '.joblib')
 
 if __name__== "__main__":
     main()
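The widened grid is 7 C values x 7 gamma values = 49 candidates, so with `cv=10` each `get_best_model` call performs 490 cross-validation fits plus one refit (dropping `gamma="scale"` also means gamma now comes only from the grid). Standalone, the search amounts to the sketch below, with toy data for illustration only:

import sklearn.svm as svm
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['rbf'],
              'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'gamma': [0.001, 0.01, 0.1, 1, 5, 10, 100]}
clf = GridSearchCV(svm.SVC(probability=True), param_grid, cv=10, scoring='accuracy')

X, y = make_classification(n_samples=100, random_state=0)  # toy data
clf.fit(X, y)
print(clf.best_estimator_)  # what get_best_model returns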

+ 1 - 1
runAll_maxwell.sh

@@ -7,7 +7,7 @@ mkdir -p models_info
 touch ${file_path}
 
 # add of header
-echo 'model_name; vector_size; start; end; nb_zones; metric; mode; train; test; global' >> ${file_path}
+echo 'model_name; vector_size; start; end; nb_zones; metric; mode; train; val; test; F1_train; F1_val; F1_test' >> ${file_path}
 
 for size in {"4","8","16","26","32","40"}; do
 

+ 91 - 30
save_model_result_in_md_maxwell.py

@@ -52,25 +52,25 @@ def main():
         else:
             assert False, "unhandled option"
 
-    
+
     # call model and get global result in scenes
 
     begin, end = p_interval
 
-    bash_cmd = "bash testModelByScene_maxwell.sh '" + str(begin) + "' '" + str(end) + "' '" + p_model_file + "' '" + p_mode + "' '" + p_metric + "'" 
+    bash_cmd = "bash testModelByScene_maxwell.sh '" + str(begin) + "' '" + str(end) + "' '" + p_model_file + "' '" + p_mode + "' '" + p_metric + "'"
     print(bash_cmd)
-     
+
     ## call command ##
     p = subprocess.Popen(bash_cmd, stdout=subprocess.PIPE, shell=True)
-    
+
     (output, err) = p.communicate()
-    
+
     ## Wait for result ##
     p_status = p.wait()
 
     if not os.path.exists(markdowns_folder):
         os.makedirs(markdowns_folder)
-        
+
     # get model name to construct model
     md_model_path = os.path.join(markdowns_folder, p_model_file.split('/')[-1].replace('.joblib', '.md'))
 
@@ -87,7 +87,7 @@ def main():
 
             # get all map information
             for t_map_file in maps_files:
-                
+
                 file_path = os.path.join(model_map_info_path, t_map_file)
                 with open(file_path, 'r') as map_file:
 
@@ -100,56 +100,117 @@ def main():
                         f.write(line)
 
         f.close()
-    
+
     # Keep model information to compare
     current_model_name = p_model_file.split('/')[-1].replace('.joblib', '')
 
+    # Prepare writing in .csv file
     output_final_file_path = os.path.join(markdowns_folder, final_csv_model_comparisons)
     output_final_file = open(output_final_file_path, "a")
 
     print(current_model_name)
-    # reconstruct data filename 
+    # reconstruct data filename
     for name in models_name:
         if name in current_model_name:
             current_data_file_path = os.path.join('data', current_model_name.replace(name, 'data_maxwell'))
-    
-    data_filenames = [current_data_file_path + '.train', current_data_file_path + '.test', 'all']
 
-    accuracy_scores = []
+    model_scores = []
 
-    # go ahead each file
-    for data_file in data_filenames:
+    ########################
+    # 1. Get and prepare data
+    ########################
+    dataset_train = pd.read_csv(current_data_file_path + '.train', header=None, sep=";")
+    dataset_test = pd.read_csv(current_data_file_path + '.test', header=None, sep=";")
 
-        if data_file == 'all':
+    # default first shuffle of data
+    dataset_train = shuffle(dataset_train)
+    dataset_test = shuffle(dataset_test)
 
-            dataset_train = pd.read_csv(data_filenames[0], header=None, sep=";")
-            dataset_test = pd.read_csv(data_filenames[1], header=None, sep=";")
-        
-            dataset = pd.concat([dataset_train, dataset_test])
-        else:
-            dataset = pd.read_csv(data_file, header=None, sep=";")
+    # get dataset with equal number of classes occurences
+    noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1]
+    not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0]
+    nb_noisy_train = len(noisy_df_train.index)
+
+    noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 1]
+    not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0]
+    nb_noisy_test = len(noisy_df_test.index)
+
+    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
+    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
+
+    # shuffle data another time
+    final_df_train = shuffle(final_df_train)
+    final_df_test = shuffle(final_df_test)
+
+    final_df_train_size = len(final_df_train.index)
+    final_df_test_size = len(final_df_test.index)
 
-        y_dataset = dataset.ix[:,0]
-        x_dataset = dataset.ix[:,1:]
+    # use of the whole data set for training
+    x_dataset_train = final_df_train.ix[:,1:]
+    x_dataset_test = final_df_test.ix[:,1:]
 
-        model = joblib.load(p_model_file)
+    y_dataset_train = final_df_train.ix[:,0]
+    y_dataset_test = final_df_test.ix[:,0]
 
-        y_pred = model.predict(x_dataset)   
+    #######################
+    # 2. Getting model
+    #######################
 
-        # add of score obtained
-        accuracy_scores.append(accuracy_score(y_dataset, y_pred))
+    model = joblib.load(p_model_file)
+
+    #######################
+    # 3. Fit model : estimate accuracy with cross validation
+    #######################
+    model.fit(x_dataset_train, y_dataset_train)
+    val_scores = cross_val_score(model, x_dataset_train, y_dataset_train, cv=5)
+
+    ######################
+    # 4. Test : Validation and test dataset from .test dataset
+    ######################
+
+    # validation size is set to 20% of the whole dataset (a third of the train set)
+    val_set_size = int(final_df_train_size/3)
+    test_set_size = val_set_size
+
+    total_validation_size = val_set_size + test_set_size
+
+    if final_df_test_size > total_validation_size:
+        x_dataset_test = x_dataset_test[0:total_validation_size]
+        y_dataset_test = y_dataset_test[0:total_validation_size]
+
+    X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)
+
+    y_test_model = model.predict(X_test)
+    y_val_model = model.predict(X_val)
+
+    val_accuracy = accuracy_score(y_val, y_val_model)
+    test_accuracy = accuracy_score(y_test, y_test_model)
+
+    y_train_model = model.predict(x_dataset_train)
+    train_f1 = f1_score(y_dataset_train, y_train_model)
+    val_f1 = f1_score(y_val, y_val_model)
+    test_f1 = f1_score(y_test, y_test_model)
+
+    # add of scores
+    model_scores.append(val_scores.mean())
+    model_scores.append(val_accuracy)
+    model_scores.append(test_accuracy)
+
+    model_scores.append(train_f1)
+    model_scores.append(val_f1)
+    model_scores.append(test_f1)
 
     # TODO : improve...
     # check if it's always the case...
-    nb_zones = data_filenames[0].split('_')[7]
+    nb_zones = current_data_file_path.split('_')[7]
 
     final_file_line = current_model_name + '; ' + str(end - begin) + '; ' + str(begin) + '; ' + str(end) + '; ' + str(nb_zones) + '; ' + p_metric + '; ' + p_mode
-    
-    for s in accuracy_scores:
+
+    for s in model_scores:
         final_file_line += '; ' + str(s)
 
     output_final_file.write(final_file_line + '\n')
 
 
 if __name__== "__main__":
-    main()
+    main()
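The evaluation protocol the train scripts and this file now share, in outline: cap the balanced `.test` data at two thirds of the train-set size, then split it evenly into a test half and a validation half. A minimal sketch, with illustrative names:

from sklearn.model_selection import train_test_split

def split_eval_sets(x_test, y_test, train_size):
    # Each half ends up about a third of the train-set size; slicing past the
    # end is harmless when the .test set is already smaller than the cap.
    cap = 2 * (train_size // 3)
    x_test, y_test = x_test[0:cap], y_test[0:cap]
    # Returns X_test, X_val, y_test, y_val, matching the scripts above.
    return train_test_split(x_test, y_test, test_size=0.5, random_state=1)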