
Update of project and script

Jérôme BUISINE, 4 years ago
Parent commit: 19d5f63345
7 changed files with 95 additions and 142 deletions
  1. .gitignore (+1 / -0)
  2. README.md (+6 / -20)
  3. TODO.md (+0 / -25)
  4. modules/models.py (+0 / -75)
  5. modules/utils/config.py (+3 / -2)
  6. run.sh (+22 / -3)
  7. train_model.py (+63 / -17)

+ 1 - 0
.gitignore

@@ -7,6 +7,7 @@ __pycache__
 # by default avoid model files and png files
 *.h5
 *.png
+saved_models
 !saved_models/*.h5
 !saved_models/*.png
 

+ 6 - 20
README.md

@@ -8,37 +8,23 @@ pip install -r requirements.txt
 
 ## How to use
 
-Generate dataset (run only once time or clean data folder before):
+Generate reconstructed data from a specific reconstruction method (run only once, or clean the data folder first):
 ```
-python generate_dataset.py
+python generate_reconstructed_data.py -h
 ```
 
-It will split scenes and generate all data you need for your neural network.
-You can specify the number of sub images you want in the script by modifying **_NUMBER_SUB_IMAGES_** variable or using parameter.
-
+Generate a custom dataset from one reconstruction method or several (support for multiple methods to be implemented later):
 ```
-python generate_dataset.py --nb xxxx
+python generate_dataset.py -h
 ```
 
-There are 3 kinds of Neural Networks:
-- **classification_cnn_keras.py**: *based on cropped images and do convolution*
-- **classification_cnn_keras_cross_validation.py**: *based on cropped images and do convolution. Data are randomly split for training*
-- **classification_cnn_keras_svd.py**: *based on svd metrics of image*
-
-
-After your built your neural network in classification_cnn_keras.py, you just have to run it:
-
-```
-python classification_cnn_keras_svd.py --directory xxxx --output xxxxx --batch_size xx --epochs xx --img xx (or --image_width xx --img_height xx)
-```
 
-A config file in json is available and keeps in memory all image sizes available.
 
 ## Modules
 
 This project contains modules:
-- **modules/image_metrics**: *where all computed metrics function are developed*
-- **modules/model_helper**: *contains helpful function to save or display model information and performance*
+- **modules/utils/config.py**: *stores all configuration information about the project and the dataset*
+- **modules/utils/data.py**: *useful methods used for data generation*
 
 All these modules will be enhanced during development of the project
 

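For reference, the `modules/utils/config.py` module described above is consumed as a plain Python namespace by the rest of the project (see the `train_model.py` diff below). A minimal usage sketch, based only on the values visible in this commit's config diff:

```
# minimal sketch: reading project settings from modules/utils/config.py,
# the way train_model.py does after this commit
from modules.utils import config as cfg

print(cfg.metric_choices_labels)   # ['all', 'svd_reconstruction']
print(cfg.keras_epochs)            # 100 after this commit (was 500)
print(cfg.keras_batch)             # 32
print(cfg.val_dataset_size)        # 0.2, default value for --val_size
```
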
+ 0 - 25
TODO.md

@@ -1,25 +0,0 @@
-# TODO:
-
-## Data preparation
-
-- Split images into 2 folders (noisy, not noisy)
-  - By scene
-  - By zone
-  - By metric [scene, zone]
-
-- Transform each image as desired (here SVD reconstruction with the 110 lowest components)
-- For each image, add its 4 rotated variants (to augment the amount of data)
-
-## Data loading
-- Load all images (association: "path", "label")
-- Set up class balancing
-- Shuffle the whole dataset
-- Split the data (train, validation, test)
-
-## Model design
-- Set up a CNN model
-- Use BatchNormalization / Dropout
-
-
-## If it does not work
-- Use a transfer learning approach

+ 0 - 75
modules/models.py

@@ -1,75 +0,0 @@
-from sklearn.model_selection import GridSearchCV
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier, VotingClassifier
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.ensemble import GradientBoostingClassifier
-import sklearn.svm as svm
-
-
-def _get_best_model(X_train, y_train):
-
-    Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
-    gammas = [0.001, 0.01, 0.1, 1, 5, 10, 100]
-    param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
-
-    svc = svm.SVC(probability=True)
-    clf = GridSearchCV(svc, param_grid, cv=10, scoring='accuracy', verbose=10)
-
-    clf.fit(X_train, y_train)
-
-    model = clf.best_estimator_
-
-    return model
-
-def svm_model(X_train, y_train):
-
-    return _get_best_model(X_train, y_train)
-
-
-def ensemble_model(X_train, y_train):
-
-    svm_model = _get_best_model(X_train, y_train)
-
-    lr_model = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=1)
-    rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
-
-    ensemble_model = VotingClassifier(estimators=[
-       ('svm', svm_model), ('lr', lr_model), ('rf', rf_model)], voting='soft', weights=[1,1,1])
-
-    ensemble_model.fit(X_train, y_train)
-
-    return ensemble_model
-
-
-def ensemble_model_v2(X_train, y_train):
-
-    svm_model = _get_best_model(X_train, y_train)
-    knc_model = KNeighborsClassifier(n_neighbors=2)
-    gbc_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
-    lr_model = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=1)
-    rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
-
-    ensemble_model = VotingClassifier(estimators=[
-       ('lr', lr_model),
-       ('knc', knc_model),
-       ('gbc', gbc_model),
-       ('svm', svm_model),
-       ('rf', rf_model)],
-       voting='soft', weights=[1, 1, 1, 1, 1])
-
-    ensemble_model.fit(X_train, y_train)
-
-    return ensemble_model
-
-def get_trained_model(choice, X_train, y_train):
-
-    if choice == 'svm_model':
-        return svm_model(X_train, y_train)
-
-    if choice == 'ensemble_model':
-        return ensemble_model(X_train, y_train)
-
-    if choice == 'ensemble_model_v2':
-        return ensemble_model_v2(X_train, y_train)
-
-

+ 3 - 2
modules/utils/config.py

@@ -40,5 +40,6 @@ zones_indices                   = np.arange(16)
 
 metric_choices_labels           = ['all', 'svd_reconstruction']
 
-keras_epochs                    = 500
-keras_batch                     = 32
+keras_epochs                    = 100
+keras_batch                     = 32
+val_dataset_size                = 0.2

+ 22 - 3
run.sh

@@ -1,7 +1,22 @@
 #!/bin/bash
 
+erased=$1
+
+# file which contains the model names we want to use for simulation
+comparisons_models="models_info/models_comparisons.csv"
+
+if [ "${erased}" == "Y" ]; then
+    echo "Previous data file erased..."
+    rm ${comparisons_models}
+    mkdir -p models_info
+    touch ${comparisons_models}
+
+    # add header line
+    echo 'model_name; global_train_size; global_test_size; filtered_train_size; filtered_test_size; f1_train; f1_test; recall_train; recall_test; precision_train; precision_test; acc_train; acc_test; roc_auc_train; roc_auc_test' >> ${comparisons_models}
+fi
+
 metric="svd_reconstruction"
 
 for begin in {80,85,90,95,100,105,110}; do
   for end in {150,160,170,180,190,200}; do
 
@@ -10,9 +25,14 @@ for begin in {80,85,90,95,100,105,110}; do
    for zone in {6,8,10,12}; do
      OUTPUT_DATA_FILE="${metric}_nb_zones_${zone}_B${begin}_E${end}"
 
-      python generate_dataset.py --output data/${OUTPUT_DATA_FILE} --metric ${metric} --renderer "maxwell" --scenes "A, D, G, H" --interval "${begin}, ${end}" --nb_zones ${zone} --random 1
-      
-      python train_model.py --data data/${OUTPUT_DATA_FILE} --output ${OUTPUT_DATA_FILE}
+      if grep -xq "${OUTPUT_DATA_FILE}" "${comparisons_models}"; then
+
+        echo "Run simulation for model ${OUTPUT_DATA_FILE}"
+
+        python generate_dataset.py --output data/${OUTPUT_DATA_FILE} --metric ${metric} --renderer "maxwell" --scenes "A, D, G, H" --interval "${begin}, ${end}" --nb_zones ${zone} --random 1
+
+        python train_model.py --data data/${OUTPUT_DATA_FILE} --output ${OUTPUT_DATA_FILE}
+      fi
    done
   done
 done
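
The updated `run.sh` now takes an optional first argument that controls whether previous results are erased. A usage sketch, assuming the script is launched from the repository root:

```
# reset models_info/models_comparisons.csv to a fresh header, then run the simulations
bash run.sh Y

# keep the existing comparisons file
bash run.sh
```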

+ 63 - 17
train_model.py

@@ -12,10 +12,12 @@ from keras.models import Sequential
 from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
 from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization
 from keras import backend as K
+import tensorflow as tf
+
 from keras.utils import plot_model
 
 from modules.utils import config as cfg
-from sklearn.metrics import roc_auc_score
+from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
 
 img_width, img_height = 200, 200
 batch_size = 32
@@ -26,6 +28,11 @@ if K.image_data_format() == 'channels_first':
 else:
     input_shape = (img_width, img_height, 1)
 
+def auc(y_true, y_pred):
+    # use the TensorFlow streaming AUC op as a Keras metric
+    auc = tf.metrics.auc(y_true, y_pred)[1]
+    K.get_session().run(tf.local_variables_initializer())
+    return auc
 
 def generate_model(_input_shape):
 
@@ -74,8 +81,8 @@ def generate_model(_input_shape):
     model.add(Activation('sigmoid'))
 
     model.compile(loss='binary_crossentropy',
-                  optimizer='rmsprop',
-                  metrics=['accuracy'])
+                  optimizer='adam',
+                  metrics=['accuracy', auc])
 
     return model
 
@@ -84,13 +91,19 @@ def main():
 
     parser = argparse.ArgumentParser(description="Train Keras model and save it into .json file")
 
-    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)')
-    parser.add_argument('--output', type=str, help='output file name desired for model (without .json extension)')
+    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
+    parser.add_argument('--output', type=str, help='output file name desired for model (without .json extension)', required=True)
+    parser.add_argument('--batch_size', type=int, help='batch size used as model input', default=cfg.keras_batch)
+    parser.add_argument('--epochs', type=int, help='number of epochs used for training model', default=cfg.keras_epochs)
+    parser.add_argument('--val_size', type=float, help='fraction of the training data used for validation during the training process', default=cfg.val_dataset_size)
 
     args = parser.parse_args()
 
-    p_data_file = args.data
-    p_output    = args.output
+    p_data_file  = args.data
+    p_output     = args.output
+    p_batch_size = args.batch_size
+    p_epochs     = args.epochs
+    p_val_size   = args.val_size
 
     ########################
     # 1. Get and prepare data
@@ -140,10 +153,16 @@ def main():
         x_data_train.append(item[0])
 
     x_data_train = np.array(x_data_train)
-    print("End of loading data..")
 
-    print(x_data_train.shape)
-    print(x_data_train[0])
+    x_data_test = []
+    for item in x_dataset_test.values:
+        #print("Item is here", item)
+        x_data_test.append(item[0])
+
+    x_data_test = np.array(x_data_test)
+
+
+    print("End of loading data..")
 
     #######################
     # 2. Getting model
@@ -152,9 +171,9 @@ def main():
     model = generate_model(input_shape)
     model.summary()
 
-    model.fit(x_data_train, y_dataset_train.values, validation_split=0.20, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch)
+    model.fit(x_data_train, y_dataset_train.values, validation_split=p_val_size, epochs=p_epochs, batch_size=p_batch_size)
 
-    score = model.evaluate(x_dataset_test, y_dataset_test, batch_size=cfg.keras_batch)
+    score = model.evaluate(x_data_test, y_dataset_test, batch_size=p_batch_size)
 
     if not os.path.exists(cfg.saved_models_folder):
         os.makedirs(cfg.saved_models_folder)
@@ -169,11 +188,38 @@ def main():
 
     model.save_weights(model_output_path.replace('.json', '.h5'))
 
-    # Save results obtained from model
-    y_test_prediction = model.predict(x_dataset_test)
-    print("Metrics : ", model.metrics_names)
-    print("Prediction : ", score)
-    print("ROC AUC : ", roc_auc_score(y_dataset_test, y_test_prediction))
+    # Get predictions from model (sigmoid outputs rounded to binary labels for the classification metrics)
+    y_train_prediction = model.predict(x_data_train).round()
+    y_test_prediction = model.predict(x_data_test).round()
+
+    acc_train_score = accuracy_score(y_dataset_train, y_train_prediction)
+    acc_test_score = accuracy_score(y_dataset_test, y_test_prediction)
+
+    f1_train_score = f1_score(y_dataset_train, y_train_prediction)
+    f1_test_score = f1_score(y_dataset_test, y_test_prediction)
+
+    recall_train_score = recall_score(y_dataset_train, y_train_prediction)
+    recall_test_score = recall_score(y_dataset_test, y_test_prediction)
+
+    pres_train_score = precision_score(y_dataset_train, y_train_prediction)
+    pres_test_score = precision_score(y_dataset_test, y_test_prediction)
+
+    roc_train_score = roc_auc_score(y_dataset_train, y_train_prediction)
+    roc_test_score = roc_auc_score(y_dataset_test, y_test_prediction)
+
+    # save model performance
+    if not os.path.exists(cfg.models_information_folder):
+        os.makedirs(cfg.models_information_folder)
+
+    perf_file_path = os.path.join(cfg.models_information_folder, cfg.csv_model_comparisons_filename)
+
+    with open(perf_file_path, 'a') as f:
+        line = p_output + ';' + str(len(dataset_train)) + ';' + str(len(dataset_test)) + ';' + str(final_df_train_size) + ';' + str(final_df_test_size) + ';' + str(f1_train_score) + ';' + str(f1_test_score) + ';' \
+                        + str(recall_train_score) + ';' + str(recall_test_score) + ';' \
+                        + str(pres_train_score) + ';' + str(pres_test_score) + ';' \
+                        + str(acc_train_score) + ';' + str(acc_test_score) + ';' \
+                        + str(roc_train_score) + ';' + str(roc_test_score) + '\n'
+        f.write(line)
 
 if __name__== "__main__":
     main()
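
Each training run appends one line of scores to `models_info/models_comparisons.csv`, so the begin/end/zone configurations explored by `run.sh` can be compared afterwards. A minimal sketch for inspecting that file, assuming pandas is available and the file has been populated by the code above:

```
# minimal sketch: load the per-model performance file written by train_model.py
import pandas as pd

df = pd.read_csv('models_info/models_comparisons.csv', sep=';', skipinitialspace=True)

# rank trained models by test ROC AUC to spot the best configuration
print(df.sort_values('roc_auc_test', ascending=False).head())
```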