Browse Source

- Add model backups for LSTM
- Give the user more information when loading data

Jérôme BUISINE 3 months ago
parent
commit
70f5d74832
4 changed files with 190 additions and 107 deletions
  1. 16 33
      cnn_models.py
  2. 1 1
      generate/generate_dataset.py
  3. 85 11
      train_lstm_weighted.py
  4. 88 62
      train_model.py

+ 16 - 33
cnn_models.py

@@ -17,19 +17,19 @@ import custom_config as cfg
 #from models import metrics
 
 
-def generate_model_2D(_input_shape, _weights_file=None):
+def generate_model_2D(_input_shape):
 
     model = Sequential()
 
-    model.add(Conv2D(60, (2, 2), input_shape=_input_shape))
+    model.add(Conv2D(140, (3, 3), input_shape=_input_shape))
     model.add(Activation('relu'))
     model.add(MaxPooling2D(pool_size=(2, 2)))
 
-    model.add(Conv2D(40, (2, 2)))
+    model.add(Conv2D(70, (3, 3)))
     model.add(Activation('relu'))
     model.add(MaxPooling2D(pool_size=(2, 2)))
 
-    model.add(Conv2D(20, (2, 2)))
+    model.add(Conv2D(20, (3, 3)))
     model.add(Activation('relu'))
     model.add(MaxPooling2D(pool_size=(2, 2)))
 
@@ -63,10 +63,6 @@ def generate_model_2D(_input_shape, _weights_file=None):
     model.add(Dense(2))
     model.add(Activation('softmax'))
 
-    # reload weights if exists
-    if _weights_file is not None:
-        model.load_weights(_weights_file)
-
     model.compile(loss='categorical_crossentropy',
                   optimizer='adam',
                   #metrics=['accuracy', metrics.auc])
@@ -75,42 +71,37 @@ def generate_model_2D(_input_shape, _weights_file=None):
     return model
 
 
-def generate_model_3D(_input_shape, _weights_file=None):
+def generate_model_3D(_input_shape):
 
     model = Sequential()
 
     print(_input_shape)
 
-    model.add(Conv3D(60, (1, 2, 2), input_shape=_input_shape))
+    model.add(Conv3D(200, (1, 3, 3), input_shape=_input_shape))
     model.add(Activation('relu'))
     model.add(MaxPooling3D(pool_size=(1, 2, 2)))
 
-    model.add(Conv3D(40, (1, 2, 2)))
+    model.add(Conv3D(100, (1, 3, 3)))
     model.add(Activation('relu'))
     model.add(MaxPooling3D(pool_size=(1, 2, 2)))
 
-    model.add(Conv3D(20, (1, 2, 2)))
+    model.add(Conv3D(40, (1, 3, 3)))
     model.add(Activation('relu'))
     model.add(MaxPooling3D(pool_size=(1, 2, 2)))
 
     model.add(Flatten())
 
-    model.add(Dense(140))
-    model.add(Activation('relu'))
-    model.add(BatchNormalization())
-    model.add(Dropout(0.5))
-
-    model.add(Dense(120))
+    model.add(Dense(256))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
     model.add(Dropout(0.5))
 
-    model.add(Dense(80))
+    model.add(Dense(128))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
     model.add(Dropout(0.5))
 
-    model.add(Dense(40))
+    model.add(Dense(64))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
     model.add(Dropout(0.5))
@@ -123,10 +114,6 @@ def generate_model_3D(_input_shape, _weights_file=None):
     model.add(Dense(2))
     model.add(Activation('sigmoid'))
 
-    # reload weights if exists
-    if _weights_file is not None:
-        model.load_weights(_weights_file)
-
     model.compile(loss='categorical_crossentropy',
                   optimizer='rmsprop',
                   #metrics=['accuracy', metrics.auc])
@@ -136,7 +123,7 @@ def generate_model_3D(_input_shape, _weights_file=None):
 
 
 # using transfer learning (VGG19)
-def generate_model_3D_TL(_input_shape, _weights_file=None):
+def generate_model_3D_TL(_input_shape):
 
     # load pre-trained model
     model = VGG19(weights='imagenet', include_top=False, input_shape=_input_shape)
@@ -199,10 +186,6 @@ def generate_model_3D_TL(_input_shape, _weights_file=None):
 
     model_final.summary()
 
-    # reload weights if exists
-    if _weights_file is not None:
-        model.load_weights(_weights_file)
-
     model_final.compile(loss='binary_crossentropy',
                   optimizer='rmsprop',
                 #   metrics=['accuracy', metrics.auc])
@@ -211,16 +194,16 @@ def generate_model_3D_TL(_input_shape, _weights_file=None):
     return model_final
 
 
-def get_model(n_channels, _input_shape, _tl=False, _weights_file=None):
+def get_model(n_channels, _input_shape, _tl=False):
     
     if _tl:
         if n_channels == 3:
-            return generate_model_3D_TL(_input_shape, _weights_file)
+            return generate_model_3D_TL(_input_shape)
         else:
             print("Can't use transfer learning with only 1 channel")
 
     if n_channels == 1:
-        return generate_model_2D(_input_shape, _weights_file)
+        return generate_model_2D(_input_shape)
 
     if n_channels >= 2:
-        return generate_model_3D(_input_shape, _weights_file)
+        return generate_model_3D(_input_shape)

+ 1 - 1
generate/generate_dataset.py

@@ -42,7 +42,7 @@ generic_output_file_svd = '_random.csv'
 def generate_data_model(_filename, _transformations, _scenes_list, _nb_zones = 4, _random=0):
 
     output_train_filename = _filename + ".train"
-    output_test_filename = _filename + ".val"
+    output_test_filename = _filename + ".test"
 
     if not '/' in output_train_filename:
         raise Exception("Please select filename with directory path to save data. Example : data/dataset")

+ 85 - 11
train_lstm_weighted.py

@@ -1,5 +1,5 @@
 # main imports
-import argparse
+import argparse, sys
 import numpy as np
 import pandas as pd
 import os
@@ -14,6 +14,8 @@ from ipfml import utils
 from keras.layers import Dense, Dropout, LSTM, Embedding, GRU, BatchNormalization, ConvLSTM2D, Conv3D, Flatten
 from keras.preprocessing.sequence import pad_sequences
 from keras.models import Sequential
+from keras.models import load_model
+from keras.callbacks import ModelCheckpoint
 from sklearn.metrics import roc_auc_score, accuracy_score
 import tensorflow as tf
 from keras import backend as K
@@ -23,6 +25,30 @@ from joblib import dump
 
 import custom_config as cfg
 
+# global variables
+n_counter = 0
+total_samples = 0
+
+def write_progress(progress):
+    '''
+    Display progress information as progress bar
+
+    `progress` is a ratio, clamped to [0, 1]; once the bar is printed the cursor
+    is moved back up one line (ANSI `ESC[F`) so the next call redraws over it
+    '''
+    barWidth = 180
+
+    # the ratio comes from a running counter: keep an overshoot inside the bar
+    progress = min(max(progress, 0.0), 1.0)
+    # integer head position: a float position almost never equals a loop index
+    pos = int(barWidth * progress)
+    # `pos` filled cells, then the `>` head, padded with spaces to barWidth cells
+    bar = ("=" * pos + ">")[:barWidth].ljust(barWidth)
+
+    output_str = "[" + bar + "] " + str(int(progress * 100.0)) + " %\r"
+    print(output_str)
+    sys.stdout.write("\033[F")
+
 
 def build_input(df, seq_norm):
     """Convert dataframe to numpy array input with timesteps as float array
@@ -35,6 +61,8 @@ def build_input(df, seq_norm):
         {np.ndarray} -- input LSTM data as numpy array
     """
 
+    global n_counter
+    global total_samples
     arr = []
 
     # for each input line
@@ -58,6 +86,10 @@ def build_input(df, seq_norm):
             
         arr.append(seq_arr)
 
+        # update progress
+        n_counter += 1
+        write_progress(n_counter / float(total_samples))
+
     arr = np.array(arr)
     print(arr.shape)
 
@@ -129,16 +161,16 @@ def create_model(_input_shape):
     model.add(Dense(1, activation='sigmoid'))
     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
 
-    print ('Compiling...')
-    # model.compile(loss='binary_crossentropy',
-    #               optimizer='rmsprop',
-    #               metrics=['accuracy'])
+    print ('-- Compiling...')
 
     return model
 
 
 def main():
 
+    # get this variable as global
+    global total_samples
+
     parser = argparse.ArgumentParser(description="Read and compute training of LSTM model")
 
     parser.add_argument('--train', type=str, help='input train dataset', required=True)
@@ -157,9 +189,14 @@ def main():
     p_batch_size   = args.batch_size
     p_seq_norm     = bool(args.seq_norm)
 
+    print('-----------------------------')
+    print("----- Preparing data... -----")
     dataset_train = pd.read_csv(p_train, header=None, sep=';')
     dataset_test = pd.read_csv(p_test, header=None, sep=';')
 
+    print("-- Train set size : ", len(dataset_train))
+    print("-- Test set size : ", len(dataset_test))
+
     # getting weighted class over the whole dataset
     noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
     not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
@@ -176,9 +213,12 @@ def main():
 
     total_samples = noisy_samples + not_noisy_samples
 
-    print('noisy', noisy_samples)
-    print('not_noisy', not_noisy_samples)
-    print('total', total_samples)
+    print('-----------------------------')
+    print('---- Dataset information ----')
+    print('-- noisy:', noisy_samples)
+    print('-- not_noisy:', not_noisy_samples)
+    print('-- total:', total_samples)
+    print('-----------------------------')
 
     class_weight = {
         0: noisy_samples / float(total_samples),
@@ -189,6 +229,9 @@ def main():
     final_df_train = sklearn.utils.shuffle(dataset_train)
     final_df_test = sklearn.utils.shuffle(dataset_test)
 
+    print('---- Loading dataset.... ----')
+    print('-----------------------------\n')
+
     # split dataset into X_train, y_train, X_test, y_test
     X_train_all = final_df_train.loc[:, 1:].apply(lambda x: x.astype(str).str.split('::'))
     X_train_all = build_input(X_train_all, p_seq_norm)
@@ -199,14 +242,45 @@ def main():
     y_test = final_df_test.loc[:, 0].astype('int')
 
     input_shape = (X_train_all.shape[1], X_train_all.shape[2], X_train_all.shape[3], X_train_all.shape[4])
-    print('Training data input shape', input_shape)
-    model = create_model(input_shape)
+    
+    
+    print('\n-----------------------------')
+    print('-- Training data input shape', input_shape)
+    print('-----------------------------')
+
+    # create backup folder for current model
+    model_backup_folder = os.path.join(cfg.backup_model_folder, p_output)
+    if not os.path.exists(model_backup_folder):
+        os.makedirs(model_backup_folder)
+
+    # add of callback models
+    filepath = os.path.join(cfg.backup_model_folder, p_output, p_output + "-_{epoch:03d}.h5")
+    checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=0, mode='max')
+    callbacks_list = [checkpoint]
+
+    
+    # check if backup already exists
+    backups = sorted(os.listdir(model_backup_folder))
+
+    if len(backups) > 0:
+        last_backup_file = backups[-1]
+        model = load_model(last_backup_file)
+
+        # get initial epoch
+        initial_epoch = int(last_backup_file.split('_')[-1].replace('.h5', ''))
+        print('-----------------------------')  
+        print('-- Restore model from backup...')
+        print('-- Restart training @epoch:', initial_epoch)
+        print('-----------------------------')
+    else:
+        model = create_model(input_shape)
     model.summary()
 
     # prepare train and validation dataset
     X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.3, shuffle=False)
 
-    print("Fitting model with custom class_weight", class_weight)
+    print("-- Fitting model with custom class_weight", class_weight)
+    print('-----------------------------')
     history = model.fit(X_train, y_train, batch_size=p_batch_size, epochs=p_epochs, validation_data=(X_val, y_val), verbose=1, shuffle=True, class_weight=class_weight)
 
     # list all data in history

+ 88 - 62
train_model.py

@@ -8,6 +8,7 @@ import json
 import cnn_models as models
 import tensorflow as tf
 import keras
+from keras.models import load_model
 from keras import backend as K
 from keras.callbacks import ModelCheckpoint
 from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
@@ -23,12 +24,34 @@ sys.path.insert(0, '') # trick to enable import of main folder module
 
 import custom_config as cfg
 
+# counter param
+n_counter = 0
+
+def write_progress(progress):
+    '''
+    Display progress information as progress bar
+
+    `progress` is a ratio, clamped to [0, 1]; once the bar is printed the cursor
+    is moved back up one line (ANSI `ESC[F`) so the next call redraws over it
+    '''
+    barWidth = 180
+
+    # the ratio comes from a running counter: keep an overshoot inside the bar
+    progress = min(max(progress, 0.0), 1.0)
+    # integer head position: a float position almost never equals a loop index
+    pos = int(barWidth * progress)
+    # `pos` filled cells, then the `>` head, padded with spaces to barWidth cells
+    bar = ("=" * pos + ">")[:barWidth].ljust(barWidth)
+
+    output_str = "[" + bar + "] " + str(int(progress * 100.0)) + " %\r"
+    print(output_str)
+    sys.stdout.write("\033[F")
 
 def main():
 
     parser = argparse.ArgumentParser(description="Train Keras model and save it into .json file")
 
-    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .val)', required=True)
+    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
     parser.add_argument('--output', type=str, help='output file name desired for model (without .json extension)', required=True)
     parser.add_argument('--tl', type=int, help='use or not of transfer learning (`VGG network`)', default=0, choices=[0, 1])
     parser.add_argument('--batch_size', type=int, help='batch size used as model input', default=64)
@@ -55,18 +78,20 @@ def main():
     ########################
     # 1. Get and prepare data
     ########################
-    print("Preparing data...")
+    print('-----------------------------')
+    print("----- Preparing data... -----")
     dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
     dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")
 
-    print("Train set size : ", len(dataset_train))
-    print("Test set size : ", len(dataset_test))
+    print("-- Train set size : ", len(dataset_train))
+    print("-- Test set size : ", len(dataset_test))
 
     # default first shuffle of data
     dataset_train = shuffle(dataset_train)
     dataset_test = shuffle(dataset_test)
 
-    print("Reading all images data...")
+    print('-----------------------------')
+    print("--Reading all images data...")
 
     # getting number of chanel
     if p_chanels == 0:
@@ -74,7 +99,7 @@ def main():
     else:
         n_chanels = p_chanels
 
-    print("Number of chanels : ", n_chanels)
+    print("-- Number of chanels : ", n_chanels)
     img_width, img_height = [ int(s) for s in p_size ]
 
     # specify the number of dimensions
@@ -106,32 +131,58 @@ def main():
 
     total_samples = noisy_samples + not_noisy_samples
 
-    print('noisy', noisy_samples)
-    print('not_noisy', not_noisy_samples)
-    print('total', total_samples)
+    print('-----------------------------')
+    print('---- Dataset information ----')
+    print('-- noisy:', noisy_samples)
+    print('-- not_noisy:', not_noisy_samples)
+    print('-- total:', total_samples)
+    print('-----------------------------')
 
     class_weight = {
         0: (noisy_samples / float(total_samples)),
         1: (not_noisy_samples / float(total_samples)),
     }
 
-
-
     final_df_train = dataset_train
     final_df_test = dataset_test
+    
+    def load_multiple_greyscale(x):
+        # update progress
+        global n_counter
+        n_counter += 1
+        write_progress(n_counter / float(total_samples))
+        return [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')]
+
+    def load_greyscale(x):
+        # update progress
+        global n_counter
+        n_counter += 1
+        write_progress(n_counter / float(total_samples))
+        return cv2.imread(x, cv2.IMREAD_GRAYSCALE)
+
+    def load_rgb(x):
+        # update progress
+        global n_counter
+        n_counter += 1
+        write_progress(n_counter / float(total_samples))
+        return cv2.imread(x)
+
+
+    print('---- Loading dataset.... ----')
+    print('-----------------------------\n')
 
     # check if specific number of chanels is used
     if p_chanels == 0:
         # `::` is the separator used for getting each img path
         if n_chanels > 1:
-            final_df_train[1] = final_df_train[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
-            final_df_test[1] = final_df_test[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
+            final_df_train[1] = final_df_train[1].apply(lambda x: load_multiple_greyscale(x))
+            final_df_test[1] = final_df_test[1].apply(lambda x: load_multiple_greyscale(x))
         else:
-            final_df_train[1] = final_df_train[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))
-            final_df_test[1] = final_df_test[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))
+            final_df_train[1] = final_df_train[1].apply(lambda x: load_greyscale(x))
+            final_df_test[1] = final_df_test[1].apply(lambda x: load_greyscale(x))
     else:
-        final_df_train[1] = final_df_train[1].apply(lambda x: cv2.imread(x))
-        final_df_test[1] = final_df_test[1].apply(lambda x: cv2.imread(x))
+        final_df_train[1] = final_df_train[1].apply(lambda x: load_rgb(x))
+        final_df_test[1] = final_df_test[1].apply(lambda x: load_rgb(x))
 
     # reshape array data
     final_df_train[1] = final_df_train[1].apply(lambda x: np.array(x).reshape(input_shape))
@@ -141,12 +192,9 @@ def main():
     final_df_train = shuffle(final_df_train)
     final_df_test = shuffle(final_df_test)
 
-    final_df_train_size = len(final_df_train.index)
-    final_df_test_size = len(final_df_test.index)
-
-    print("----------------------------------------------------------")
+    print('\n-----------------------------')
     print("Validation split is now set at", p_val_size)
-    print("----------------------------------------------------------")
+    print('-----------------------------')
 
     # use of the whole data set for training
     x_dataset_train = final_df_train.iloc[:,1:]
@@ -169,10 +217,6 @@ def main():
 
     x_data_test = np.array(x_data_test)
 
-    print("End of loading data..")
-
-    print("Train set size (after balancing) : ", final_df_train_size)
-    print("Test set size (after balancing) : ", final_df_test_size)
 
     #######################
     # 2. Getting model
@@ -184,44 +228,27 @@ def main():
         os.makedirs(model_backup_folder)
 
     # add of callback models
-    filepath = os.path.join(cfg.backup_model_folder, p_output, p_output + "-{accuracy:02f}-{val_accuracy:02f}__{epoch:02d}.hdf5")
-    checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
+    filepath = os.path.join(cfg.backup_model_folder, p_output, p_output + "-_{epoch:03d}.h5")
+    checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=0, mode='max')
     callbacks_list = [checkpoint]
 
     
     # check if backup already exists
-    weights_filepath = None
     backups = sorted(os.listdir(model_backup_folder))
 
     if len(backups) > 0:
-
-        # retrieve last backup epoch of model 
-        last_model_backup = None
-        max_last_epoch = 0
-
-        for backup in backups:
-
-            last_epoch = int(backup.split('__')[1].replace('.h5', ''))
-
-            if last_epoch > max_last_epoch and last_epoch < p_epochs:
-                max_last_epoch = last_epoch
-                last_model_backup = backup
-
-        if last_model_backup is None:
-            print("Epochs asked is already computer. Noee")
-            sys.exit(1)
-
-        initial_epoch = max_last_epoch
-        print("-------------------------------------------------")
-        print("Previous backup model found",  last_model_backup, "with already", initial_epoch, " epoch(s) done...")
-        print("Resuming from epoch", str(initial_epoch + 1))
-        print("-------------------------------------------------")
-
-        # load weights
-        weights_filepath = os.path.join(model_backup_folder, last_model_backup)
-
-    print(n_chanels)
-    model = models.get_model(n_chanels, input_shape, p_tl, weights_filepath)
+        last_backup_file = backups[-1]
+        model = load_model(last_backup_file)
+
+        # get initial epoch
+        initial_epoch = int(last_backup_file.split('_')[-1].replace('.h5', ''))
+        print('-----------------------------')  
+        print('-- Restore model from backup...')
+        print('-- Restart training @epoch:', initial_epoch)
+        print('-----------------------------')
+    else:
+        model = models.get_model(n_chanels, input_shape, p_tl)
+        
     model.summary()
 
     # prepare train and validation dataset
@@ -231,7 +258,9 @@ def main():
     y_val = to_categorical(y_val)
     y_test = to_categorical(y_dataset_test)
 
-    print("Fitting model with custom class_weight", class_weight)
+    print('-----------------------------')
+    print("-- Fitting model with custom class_weight", class_weight)
+    print('-----------------------------')
     model.fit(X_train, y_train, 
         validation_data=(X_val, y_val), 
         initial_epoch=initial_epoch, 
@@ -247,7 +276,7 @@ def main():
     if not os.path.exists(cfg.output_models):
         os.makedirs(cfg.output_models)
 
-    # save the model into HDF5 file
+    # save the model into H5 file
     model_output_path = os.path.join(cfg.output_models, p_output + '.h5')
     model.save(model_output_path)
 
@@ -256,9 +285,6 @@ def main():
     y_val_prediction = model.predict(X_val)
     y_test_prediction = model.predict(x_dataset_test)
 
-    # y_train_prediction = [1 if x > 0.5 else 0 for x in y_train_prediction]
-    # y_val_prediction = [1 if x > 0.5 else 0 for x in y_val_prediction]
-
     y_train_prediction = np.argmax(y_train_prediction, axis=1)
     y_val_prediction = np.argmax(y_val_prediction, axis=1)