Browse Source

Update of modules and training data

Jérôme BUISINE 1 year ago
parent
commit
f923d55c3d
5 changed files with 81 additions and 84 deletions
  1. 21 8
      cnn_models.py
  2. 0 40
      generate_symlinks.sh
  3. 1 1
      modules
  4. 3 3
      prediction_model.py
  5. 56 32
      train_model.py

+ 21 - 8
cnn_models.py

@@ -16,7 +16,8 @@ sys.path.insert(0, '') # trick to enable import of main folder module
 import custom_config as cfg
 from models import metrics
 
-def generate_model_2D(_input_shape):
+
+def generate_model_2D(_input_shape, _weights_file=None):
 
     model = Sequential()
 
@@ -62,6 +63,10 @@ def generate_model_2D(_input_shape):
     model.add(Dense(1))
     model.add(Activation('sigmoid'))
 
+    # reload model weights if a backup file was provided
+    if _weights_file is not None:
+        model.load_weights(_weights_file)
+
     model.compile(loss='binary_crossentropy',
                   optimizer='rmsprop',
                   metrics=['accuracy', metrics.auc])
@@ -69,7 +74,7 @@ def generate_model_2D(_input_shape):
     return model
 
 
-def generate_model_3D(_input_shape):
+def generate_model_3D(_input_shape, _weights_file=None):
 
     model = Sequential()
 
@@ -117,6 +122,10 @@ def generate_model_3D(_input_shape):
     model.add(Dense(1))
     model.add(Activation('sigmoid'))
 
+    # reload model weights if a backup file was provided
+    if _weights_file is not None:
+        model.load_weights(_weights_file)
+
     model.compile(loss='binary_crossentropy',
                   optimizer='rmsprop',
                   metrics=['accuracy', metrics.auc])
@@ -125,7 +134,7 @@ def generate_model_3D(_input_shape):
 
 
 # using transfer learning (VGG19)
-def generate_model_3D_TL(_input_shape):
+def generate_model_3D_TL(_input_shape, _weights_file=None):
 
     # load pre-trained model
     model = VGG19(weights='imagenet', include_top=False, input_shape=_input_shape)
@@ -188,6 +197,10 @@ def generate_model_3D_TL(_input_shape):
 
     model_final.summary()
 
+    # reload model weights if a backup file was provided
+    if _weights_file is not None:
+        model.load_weights(_weights_file)
+
     model_final.compile(loss='binary_crossentropy',
                   optimizer='rmsprop',
                   metrics=['accuracy', metrics.auc])
@@ -195,16 +208,16 @@ def generate_model_3D_TL(_input_shape):
     return model_final
 
 
-def get_model(n_channels, _input_shape, tl=False):
+def get_model(n_channels, _input_shape, _tl=False, _weights_file=None):
     
-    if tl:
+    if _tl:
         if n_channels == 3:
-            return generate_model_3D_TL(_input_shape)
+            return generate_model_3D_TL(_input_shape, _weights_file)
         else:
             print("Can't use transfer learning with only 1 channel")
 
     if n_channels == 1:
-        return generate_model_2D(_input_shape)
+        return generate_model_2D(_input_shape, _weights_file)
 
     if n_channels == 3:
-        return generate_model_3D(_input_shape)
+        return generate_model_3D(_input_shape, _weights_file)

+ 0 - 40
generate_symlinks.sh

@@ -1,40 +0,0 @@
-#! /bin/bash
-
-
-if [ -z "$1" ]
-  then
-    echo "No argument supplied"
-    echo "Need to specify orval you want to use (in /scratch folder)"
-    exit 1
-fi
-
-if [ -z "$2" ]
-  then
-    echo "No argument supplied"
-    echo "Need to specify where you want to store data"
-    exit 1
-fi
-
-
-echo "Creating links into /scratch folder"
-
-scratch="/scratch"
-orval=$1
-path=$2
-
-
-for link in {"data","results","saved_models","models_info","models_backup","threshold_map","learned_zones","custom_norm"}; do
-    
-    if [ -L ${link} ]; then
-        rm ${link}
-    fi
-    
-    fullpath=${scratch}/${orval}/${path}/${link}
-
-    if [ ! -d "${fullpath}" ]; then
-        mkdir -p ${fullpath}
-    fi
-    
-    # remove `orval` name for running part
-    ln -s ${scratch}/${path}/${link} ${link}
-done

+ 1 - 1
modules

@@ -1 +1 @@
-Subproject commit 283a56becc8ca9103ac3d9f1cdaa6f9daa2dcb5c
+Subproject commit 5f7d01f3c5eaca0abf3950663ed7e8db9f92634b

+ 3 - 3
prediction_model.py

@@ -77,8 +77,8 @@ def main():
     dataset[1] = dataset[1].apply(lambda x: np.array(x).reshape(input_shape))
 
     # use of the whole data set for training
-    x_dataset = dataset.ix[:,1:]
-    y_dataset = dataset.ix[:,0]
+    x_dataset = dataset.iloc[:,1:]
+    y_dataset = dataset.iloc[:,0]
 
     x_data = []
     for item in x_dataset.values:
@@ -125,7 +125,7 @@ def main():
 
     # add information into file
     with open(perf_file_path, 'a') as f:
-        line = p_data_file + ';' + p_model_file + ';' + str(acc_score) + ';' + str(f1_data_score) + ';' + str(recall_data_score) + ';' + str(pres_score) + ';' + str(roc_score)
+        line = p_data_file + ';' + str(len(dataset)) + ';' + p_model_file + ';' + str(acc_score) + ';' + str(f1_data_score) + ';' + str(recall_data_score) + ';' + str(pres_score) + ';' + str(roc_score) + ';\n'
         f.write(line)
 
 if __name__== "__main__":

+ 56 - 32
train_model.py

@@ -31,6 +31,7 @@ def main():
     parser.add_argument('--tl', type=int, help='use or not of transfer learning (`VGG network`)', default=0, choices=[0, 1])
     parser.add_argument('--batch_size', type=int, help='batch size used as model input', default=cfg.keras_batch)
     parser.add_argument('--epochs', type=int, help='number of epochs used for training model', default=cfg.keras_epochs)
+    parser.add_argument('--balancing', type=int, help='specify if balancing of classes is done or not', default=1)
     #parser.add_argument('--val_size', type=float, help='percent of validation data during training process', default=cfg.val_dataset_size)
 
 
@@ -41,6 +42,8 @@ def main():
     p_tl          = args.tl
     p_batch_size  = args.batch_size
     p_epochs      = args.epochs
+    p_balancing   = bool(args.balancing)
+
     #p_val_size    = args.val_size
     initial_epoch = 0
         
@@ -79,29 +82,35 @@ def main():
         else:
             input_shape = (img_width, img_height, n_channels)
 
-    # `:` is the separator used for getting each img path
-    if n_channels > 1:
-        dataset_train[1] = dataset_train[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
-        dataset_val[1] = dataset_val[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
-    else:
-        dataset_train[1] = dataset_train[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))
-        dataset_val[1] = dataset_val[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))
+    # get dataset with an equal number of class occurrences, if desired
+    if p_balancing:
+        print("Balancing of data")
+        noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
+        not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
+        nb_noisy_train = len(noisy_df_train.index)
 
-    # reshape array data
-    dataset_train[1] = dataset_train[1].apply(lambda x: np.array(x).reshape(input_shape))
-    dataset_val[1] = dataset_val[1].apply(lambda x: np.array(x).reshape(input_shape))
+        noisy_df_val = dataset_val[dataset_val.iloc[:, 0] == 1]
+        not_noisy_df_val = dataset_val[dataset_val.iloc[:, 0] == 0]
+        nb_noisy_val = len(noisy_df_val.index)
 
-    # get dataset with equal number of classes occurences
-    noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1]
-    not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0]
-    nb_noisy_train = len(noisy_df_train.index)
+        final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
+        final_df_val = pd.concat([not_noisy_df_val[0:nb_noisy_val], noisy_df_val])
+    else:
+        print("No balancing of data")
+        final_df_train = dataset_train
+        final_df_val = dataset_val
 
-    noisy_df_val = dataset_val[dataset_val.ix[:, 0] == 1]
-    not_noisy_df_val = dataset_val[dataset_val.ix[:, 0] == 0]
-    nb_noisy_val = len(noisy_df_val.index)
+        # `:` is the separator used for getting each img path
+    if n_channels > 1:
+        final_df_train[1] = final_df_train[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
+        final_df_val[1] = final_df_val[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
+    else:
+        final_df_train[1] = final_df_train[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))
+        final_df_val[1] = final_df_val[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))
 
-    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
-    final_df_val = pd.concat([not_noisy_df_val[0:nb_noisy_val], noisy_df_val])
+    # reshape array data
+    final_df_train[1] = final_df_train[1].apply(lambda x: np.array(x).reshape(input_shape))
+    final_df_val[1] = final_df_val[1].apply(lambda x: np.array(x).reshape(input_shape))
 
     # shuffle data another time
     final_df_train = shuffle(final_df_train)
@@ -117,11 +126,11 @@ def main():
     print("----------------------------------------------------------")
 
     # use of the whole data set for training
-    x_dataset_train = final_df_train.ix[:,1:]
-    x_dataset_val = final_df_val.ix[:,1:]
+    x_dataset_train = final_df_train.iloc[:,1:]
+    x_dataset_val = final_df_val.iloc[:,1:]
 
-    y_dataset_train = final_df_train.ix[:,0]
-    y_dataset_val = final_df_val.ix[:,0]
+    y_dataset_train = final_df_train.iloc[:,0]
+    y_dataset_val = final_df_val.iloc[:,0]
 
     x_data_train = []
     for item in x_dataset_train.values:
@@ -156,22 +165,37 @@ def main():
     checkpoint = ModelCheckpoint(filepath, monitor='val_auc', verbose=1, save_best_only=True, mode='max')
     callbacks_list = [checkpoint]
 
-    model = models.get_model(n_channels, input_shape, p_tl)
-    model.summary()
-
+    
     # check if backup already exists
+    weights_filepath = None
     backups = sorted(os.listdir(model_backup_folder))
 
     if len(backups) > 0:
-        # TODO : check of initial epoch
-        last_backup = backups[-1]
-        last_epoch = int(last_backup.split('__')[1].replace('.hdf5', ''))
-        initial_epoch = last_epoch
+
+        # retrieve last backup epoch of model 
+        last_model_backup = None
+        max_last_epoch = 0
+
+        for backup in backups:
+
+            last_epoch = int(backup.split('__')[1].replace('.hdf5', ''))
+
+            if last_epoch > max_last_epoch and last_epoch < p_epochs:
+                max_last_epoch = last_epoch
+                last_model_backup = backup
+
+        initial_epoch = max_last_epoch
         print("-------------------------------------------------")
-        print("Previous backup model found with already", last_epoch, "done...")
-        print("Resuming from epoch", str(last_epoch + 1))
+        print("Previous backup model found",  last_model_backup, "with already", initial_epoch, "done...")
+        print("Resuming from epoch", str(initial_epoch + 1))
         print("-------------------------------------------------")
 
+        # load weights
+        weights_filepath = os.path.join(model_backup_folder, last_model_backup)
+
+    model = models.get_model(n_channels, input_shape, p_tl, weights_filepath)
+    model.summary()
+
     # concatenate train and validation data (`validation_split` param will do the separation into keras model)
     y_data = np.concatenate([y_dataset_train.values, y_dataset_val.values])
     x_data = np.concatenate([x_data_train, x_data_val])