
Update of project and script

Jérôme BUISINE, 4 years ago
Parent commit 19d5f63345
7 changed files with 95 additions and 142 deletions
  1. .gitignore (+1, -0)
  2. README.md (+6, -20)
  3. TODO.md (+0, -25)
  4. modules/models.py (+0, -75)
  5. modules/utils/config.py (+3, -2)
  6. run.sh (+22, -3)
  7. train_model.py (+63, -17)

+ 1 - 0
.gitignore

@@ -7,6 +7,7 @@ __pycache__
 # by default avoid model files and png files
 *.h5
 *.png
+saved_models
 !saved_models/*.h5
 !saved_models/*.png
 

+ 6 - 20
README.md

@@ -8,37 +8,23 @@ pip install -r requirements.txt
 
 ## How to use
 
-Generate dataset (run only once time or clean data folder before):
+Generate reconstructed data from a specific reconstruction method (run it only once, or clean the data folder first):
 ```
-python generate_dataset.py
+python generate_reconstructed_data.py -h
 ```
 
-It will split scenes and generate all data you need for your neural network.
-You can specify the number of sub images you want in the script by modifying **_NUMBER_SUB_IMAGES_** variable or using parameter.
-
+Generate a custom dataset from one reconstruction method or several (multiple methods to be implemented later):
 ```
-python generate_dataset.py --nb xxxx
+python generate_dataset.py -h
 ```
 
-There are 3 kinds of Neural Networks:
-- **classification_cnn_keras.py**: *based on cropped images and do convolution*
-- **classification_cnn_keras_cross_validation.py**: *based on cropped images and do convolution. Data are randomly split for training*
-- **classification_cnn_keras_svd.py**: *based on svd metrics of image*
-
-
-After your built your neural network in classification_cnn_keras.py, you just have to run it:
-
-```
-python classification_cnn_keras_svd.py --directory xxxx --output xxxxx --batch_size xx --epochs xx --img xx (or --image_width xx --img_height xx)
-```
 
-A config file in json is available and keeps in memory all image sizes available.
 
 ## Modules
 
 This project contains modules:
-- **modules/image_metrics**: *where all computed metrics function are developed*
-- **modules/model_helper**: *contains helpful function to save or display model information and performance*
+- **modules/utils/config.py**: *stores all configuration information about the project and the dataset*
+- **modules/utils/data.py**: *useful methods used for data generation*
 
 All these modules will be enhanced during development of the project
 

+ 0 - 25
TODO.md

@@ -1,25 +0,0 @@
-# TODO:
-
-## Data preparation
-
-- Split the images into 2 folders (noisy, not noisy)
-  - By scene
-  - By zone
-  - By metric [scene, zone]
-
-- Transform each image as desired (here, SVD reconstruction with 110 weak components)
-- For each image, add its form under 4 rotations (to augment the amount of data)
-
-## Data loading
-- Load all images (association: "path", "label")
-- Set up class balancing
-- Shuffle the whole dataset
-- Split the data (train, validation, test)
-
-## Model design
-- Set up a CNN model
-- Use BatchNormalization / Dropout
-
-
-## If this does not work
-- Use a transfer learning approach

+ 0 - 75
modules/models.py

@@ -1,75 +0,0 @@
-from sklearn.model_selection import GridSearchCV
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier, VotingClassifier
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.ensemble import GradientBoostingClassifier
-import sklearn.svm as svm
-
-
-def _get_best_model(X_train, y_train):
-
-    Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
-    gammas = [0.001, 0.01, 0.1, 1, 5, 10, 100]
-    param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
-
-    svc = svm.SVC(probability=True)
-    clf = GridSearchCV(svc, param_grid, cv=10, scoring='accuracy', verbose=10)
-
-    clf.fit(X_train, y_train)
-
-    model = clf.best_estimator_
-
-    return model
-
-def svm_model(X_train, y_train):
-
-    return _get_best_model(X_train, y_train)
-
-
-def ensemble_model(X_train, y_train):
-
-    svm_model = _get_best_model(X_train, y_train)
-
-    lr_model = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=1)
-    rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
-
-    ensemble_model = VotingClassifier(estimators=[
-       ('svm', svm_model), ('lr', lr_model), ('rf', rf_model)], voting='soft', weights=[1,1,1])
-
-    ensemble_model.fit(X_train, y_train)
-
-    return ensemble_model
-
-
-def ensemble_model_v2(X_train, y_train):
-
-    svm_model = _get_best_model(X_train, y_train)
-    knc_model = KNeighborsClassifier(n_neighbors=2)
-    gbc_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
-    lr_model = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=1)
-    rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
-
-    ensemble_model = VotingClassifier(estimators=[
-       ('lr', lr_model),
-       ('knc', knc_model),
-       ('gbc', gbc_model),
-       ('svm', svm_model),
-       ('rf', rf_model)],
-       voting='soft', weights=[1, 1, 1, 1, 1])
-
-    ensemble_model.fit(X_train, y_train)
-
-    return ensemble_model
-
-def get_trained_model(choice, X_train, y_train):
-
-    if choice == 'svm_model':
-        return svm_model(X_train, y_train)
-
-    if choice == 'ensemble_model':
-        return ensemble_model(X_train, y_train)
-
-    if choice == 'ensemble_model_v2':
-        return ensemble_model_v2(X_train, y_train)
-
-
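For reference, the entry point of the removed module was called roughly as sketched below; the training arrays are illustrative placeholders (not data from this commit) and the import refers to modules/models.py as it existed before this deletion.

```
import numpy as np

# modules/models.py as it existed before this commit (deleted above)
from modules.models import get_trained_model

# illustrative placeholder data: 100 samples, 10 features, binary labels
X_train = np.random.rand(100, 10)
y_train = np.random.randint(0, 2, size=100)

# 'choice' was one of 'svm_model', 'ensemble_model' or 'ensemble_model_v2'
model = get_trained_model('ensemble_model', X_train, y_train)
print(model.score(X_train, y_train))
```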

+ 3 - 2
modules/utils/config.py

@@ -40,5 +40,6 @@ zones_indices                   = np.arange(16)
 
 metric_choices_labels           = ['all', 'svd_reconstruction']
 
-keras_epochs                    = 500
-keras_batch                     = 32
+keras_epochs                    = 100
+keras_batch                     = 32
+val_dataset_size                = 0.2

+ 22 - 3
run.sh

@@ -1,7 +1,22 @@
 #!/bin/bash
 
+erased=$1
+
+# file which gathers model results; also read below to select the models to run
+file_path="models_info/models_comparisons.csv"
+
+if [ "${erased}" == "Y" ]; then
+    echo "Previous data file erased..."
+    rm ${file_path}
+    mkdir -p models_info
+    touch ${file_path}
+
+    # add of header
+    echo 'model_name; global_train_size; global_test_size; filtered_train_size; filtered_test_size; acc_train; acc_test; f1_train; f1_test; recall_train; recall_test; precision_train; precision_test; roc_auc_train; roc_auc_test;' >> ${file_path}
+fi
+
 metric="svd_reconstruction"
 
 for begin in {80,85,90,95,100,105,110}; do
   for end in {150,160,170,180,190,200}; do
 
@@ -10,9 +25,13 @@ for begin in {80,85,90,95,100,105,110}; do
     for zone in {6,8,10,12}; do
       OUTPUT_DATA_FILE="${metric}_nb_zones_${zone}_B${begin}_E${end}"
 
-      python generate_dataset.py --output data/${OUTPUT_DATA_FILE} --metric ${metric} --renderer "maxwell" --scenes "A, D, G, H" --interval "${begin}, ${end}" --nb_zones ${zone} --random 1
-
-      python train_model.py --data data/${OUTPUT_DATA_FILE} --output ${OUTPUT_DATA_FILE}
+      if grep -xq "${OUTPUT_DATA_FILE}" "${file_path}"; then
+        echo "Run simulation for model ${OUTPUT_DATA_FILE}"
+
+        python generate_dataset.py --output data/${OUTPUT_DATA_FILE} --metric ${metric} --renderer "maxwell" --scenes "A, D, G, H" --interval "${begin}, ${end}" --nb_zones ${zone} --random 1
+
+        python train_model.py --data data/${OUTPUT_DATA_FILE} --output ${OUTPUT_DATA_FILE}
+      fi
     done
   done
 done
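The loop above produces one dataset/model name per (interval begin, interval end, number of zones) combination. Below is a minimal Python sketch of the same naming scheme and of the membership check performed by `grep -xq`, assuming the same `models_info/models_comparisons.csv` path used in run.sh.

```
import itertools

metric = "svd_reconstruction"
begins = [80, 85, 90, 95, 100, 105, 110]
ends = [150, 160, 170, 180, 190, 200]
zones = [6, 8, 10, 12]

# model names already listed in the comparisons file (same path as in run.sh)
with open("models_info/models_comparisons.csv") as f:
    known_models = {line.strip() for line in f}

for begin, end, zone in itertools.product(begins, ends, zones):
    output_data_file = "{}_nb_zones_{}_B{}_E{}".format(metric, zone, begin, end)

    # mirrors `grep -xq`: only launch a simulation if the exact name is listed
    if output_data_file in known_models:
        print("Run simulation for model", output_data_file)
```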

+ 63 - 17
train_model.py

@@ -12,10 +12,12 @@ from keras.models import Sequential
 from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
 from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization
 from keras import backend as K
+import tensorflow as tf
+
 from keras.utils import plot_model
 
 from modules.utils import config as cfg
-from sklearn.metrics import roc_auc_score
+from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
 
 img_width, img_height = 200, 200
 batch_size = 32
@@ -26,6 +28,11 @@ if K.image_data_format() == 'channels_first':
 else:
     input_shape = (img_width, img_height, 1)
 
+def auc(y_true, y_pred):
+    # streaming AUC from tf.metrics; its local variables must be initialized in the Keras session
+    auc = tf.metrics.auc(y_true, y_pred)[1]
+    K.get_session().run(tf.local_variables_initializer())
+    return auc
 
 def generate_model(_input_shape):
 
@@ -74,8 +81,8 @@ def generate_model(_input_shape):
     model.add(Activation('sigmoid'))
 
     model.compile(loss='binary_crossentropy',
-                  optimizer='rmsprop',
-                  metrics=['accuracy'])
+                  optimizer='adam',
+                  metrics=['accuracy', auc])
 
     return model
 
@@ -84,13 +91,19 @@ def main():
 
     parser = argparse.ArgumentParser(description="Train Keras model and save it into .json file")
 
-    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)')
-    parser.add_argument('--output', type=str, help='output file name desired for model (without .json extension)')
+    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
+    parser.add_argument('--output', type=str, help='output file name desired for model (without .json extension)', required=True)
+    parser.add_argument('--batch_size', type=int, help='batch size used as model input', default=cfg.keras_batch)
+    parser.add_argument('--epochs', type=int, help='number of epochs used for training the model', default=cfg.keras_epochs)
+    parser.add_argument('--val_size', type=float, help='fraction of the training data used for validation', default=cfg.val_dataset_size)
 
     args = parser.parse_args()
 
-    p_data_file = args.data
-    p_output    = args.output
+    p_data_file  = args.data
+    p_output     = args.output
+    p_batch_size = args.batch_size
+    p_epochs     = args.epochs
+    p_val_size   = args.val_size
 
     ########################
     # 1. Get and prepare data
@@ -140,10 +153,16 @@ def main():
         x_data_train.append(item[0])
 
     x_data_train = np.array(x_data_train)
-    print("End of loading data..")
 
-    print(x_data_train.shape)
-    print(x_data_train[0])
+    x_data_test = []
+    for item in x_dataset_test.values:
+        #print("Item is here", item)
+        x_data_test.append(item[0])
+
+    x_data_test = np.array(x_data_test)
+
+
+    print("End of loading data..")
 
     #######################
     # 2. Getting model
@@ -152,9 +171,9 @@ def main():
     model = generate_model(input_shape)
     model.summary()
 
-    model.fit(x_data_train, y_dataset_train.values, validation_split=0.20, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch)
+    model.fit(x_data_train, y_dataset_train.values, validation_split=p_val_size, epochs=p_epochs, batch_size=p_batch_size)
 
-    score = model.evaluate(x_dataset_test, y_dataset_test, batch_size=cfg.keras_batch)
+    score = model.evaluate(x_data_test, y_dataset_test, batch_size=p_batch_size)
 
     if not os.path.exists(cfg.saved_models_folder):
         os.makedirs(cfg.saved_models_folder)
@@ -169,11 +188,38 @@ def main():
 
     model.save_weights(model_output_path.replace('.json', '.h5'))
 
-    # Save results obtained from model
-    y_test_prediction = model.predict(x_dataset_test)
-    print("Metrics : ", model.metrics_names)
-    print("Prediction : ", score)
-    print("ROC AUC : ", roc_auc_score(y_dataset_test, y_test_prediction))
+    # Get results obtained from model (predictions are rounded to binary labels for the metrics below)
+    y_train_prediction = np.rint(model.predict(x_data_train))
+    y_test_prediction = np.rint(model.predict(x_data_test))
+
+    acc_train_score = accuracy_score(y_dataset_train, y_train_prediction)
+    acc_test_score = accuracy_score(y_dataset_test, y_test_prediction)
+
+    f1_train_score = f1_score(y_dataset_train, y_train_prediction)
+    f1_test_score = f1_score(y_dataset_test, y_test_prediction)
+
+    recall_train_score = recall_score(y_dataset_train, y_train_prediction)
+    recall_test_score = recall_score(y_dataset_test, y_test_prediction)
+
+    pres_train_score = precision_score(y_dataset_train, y_train_prediction)
+    pres_test_score = precision_score(y_dataset_test, y_test_prediction)
+
+    roc_train_score = roc_auc_score(y_dataset_train, y_train_prediction)
+    roc_test_score = roc_auc_score(y_dataset_test, y_test_prediction)
+
+    # save model performance
+    if not os.path.exists(cfg.models_information_folder):
+        os.makedirs(cfg.models_information_folder)
+
+    perf_file_path = os.path.join(cfg.models_information_folder, cfg.csv_model_comparisons_filename)
+
+    with open(perf_file_path, 'a') as f:
+        line = p_output + ';' + str(len(dataset_train)) + ';' + str(len(dataset_test)) + ';' + str(final_df_train_size) + ';' + str(final_df_test_size) + ';' \
+                        + str(acc_train_score) + ';' + str(acc_test_score) + ';' + str(f1_train_score) + ';' + str(f1_test_score) + ';' \
+                        + str(recall_train_score) + ';' + str(recall_test_score) + ';' \
+                        + str(pres_train_score) + ';' + str(pres_test_score) + ';' \
+                        + str(roc_train_score) + ';' + str(roc_test_score) + '\n'
+        f.write(line)
 
 if __name__== "__main__":
     main()
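As a follow-up, here is a minimal sketch of loading back the pair of files written above. The `saved_models/my_model.json` path is a placeholder, and treating the `.json` file as the architecture (with weights in the matching `.h5`) is an assumption based on the `.replace('.json', '.h5')` call.

```
from keras.models import model_from_json

# placeholder path; assumes the .json file holds the architecture written by train_model.py
model_output_path = 'saved_models/my_model.json'

with open(model_output_path, 'r') as f:
    model = model_from_json(f.read())

# weights are saved next to the architecture file with the same prefix
model.load_weights(model_output_path.replace('.json', '.h5'))

# recompile with the same loss/optimizer as in train_model.py before evaluating or predicting
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
```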