train_model.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. # main imports
  2. import numpy as np
  3. import pandas as pd
  4. import sys, os, argparse
  5. import json
  6. # model imports
  7. import cnn_models as models
  8. import tensorflow as tf
  9. import keras
  10. from keras import backend as K
  11. from keras.callbacks import ModelCheckpoint
  12. from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
  13. from keras.utils import to_categorical
  14. # image processing imports
  15. import cv2
  16. from sklearn.utils import shuffle
  17. # config imports
  18. sys.path.insert(0, '') # trick to enable import of main folder module
  19. import custom_config as cfg
  20. def main():
  21. parser = argparse.ArgumentParser(description="Train Keras model and save it into .json file")
  22. parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .val)', required=True)
  23. parser.add_argument('--output', type=str, help='output file name desired for model (without .json extension)', required=True)
  24. parser.add_argument('--tl', type=int, help='use or not of transfer learning (`VGG network`)', default=0, choices=[0, 1])
  25. parser.add_argument('--batch_size', type=int, help='batch size used as model input', default=cfg.keras_batch)
  26. parser.add_argument('--epochs', type=int, help='number of epochs used for training model', default=cfg.keras_epochs)
  27. parser.add_argument('--balancing', type=int, help='specify if balacing of classes is done or not', default="1")
  28. parser.add_argument('--chanels', type=int, help="given number of chanels if necessary", default=0)
  29. #parser.add_argument('--val_size', type=float, help='percent of validation data during training process', default=cfg.val_dataset_size)
  30. args = parser.parse_args()
  31. p_data_file = args.data
  32. p_output = args.output
  33. p_tl = args.tl
  34. p_batch_size = args.batch_size
  35. p_epochs = args.epochs
  36. p_balancing = bool(args.balancing)
  37. p_chanels = args.chanels
  38. #p_val_size = args.val_size
  39. initial_epoch = 0
  40. ########################
  41. # 1. Get and prepare data
  42. ########################
  43. print("Preparing data...")
  44. dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
  45. dataset_val = pd.read_csv(p_data_file + '.val', header=None, sep=";")
  46. print("Train set size : ", len(dataset_train))
  47. print("val set size : ", len(dataset_val))
  48. # default first shuffle of data
  49. dataset_train = shuffle(dataset_train)
  50. dataset_val = shuffle(dataset_val)
  51. print("Reading all images data...")
  52. # getting number of chanel
  53. if p_chanels == 0:
  54. n_chanels = len(dataset_train[1][1].split('::'))
  55. else:
  56. n_chanels = p_chanels
  57. print("Number of chanels : ", n_chanels)
  58. img_width, img_height = cfg.keras_img_size
  59. # specify the number of dimensions
  60. if K.image_data_format() == 'chanels_first':
  61. if n_chanels > 1:
  62. input_shape = (1, n_chanels, img_width, img_height)
  63. else:
  64. input_shape = (n_chanels, img_width, img_height)
  65. else:
  66. if n_chanels > 1:
  67. input_shape = (1, img_width, img_height, n_chanels)
  68. else:
  69. input_shape = (img_width, img_height, n_chanels)
  70. # get dataset with equal number of classes occurences if wished
  71. if p_balancing:
  72. print("Balancing of data")
  73. noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
  74. not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
  75. nb_noisy_train = len(noisy_df_train.index)
  76. noisy_df_val = dataset_val[dataset_val.iloc[:, 0] == 1]
  77. not_noisy_df_val = dataset_val[dataset_val.iloc[:, 0] == 0]
  78. nb_noisy_val = len(noisy_df_val.index)
  79. final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
  80. final_df_val = pd.concat([not_noisy_df_val[0:nb_noisy_val], noisy_df_val])
  81. else:
  82. print("No balancing of data")
  83. final_df_train = dataset_train
  84. final_df_val = dataset_val
  85. # check if specific number of chanels is used
  86. if p_chanels == 0:
  87. # `::` is the separator used for getting each img path
  88. if n_chanels > 1:
  89. final_df_train[1] = final_df_train[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
  90. final_df_val[1] = final_df_val[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
  91. else:
  92. final_df_train[1] = final_df_train[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))
  93. final_df_val[1] = final_df_val[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))
  94. else:
  95. final_df_train[1] = final_df_train[1].apply(lambda x: cv2.imread(x))
  96. final_df_val[1] = final_df_val[1].apply(lambda x: cv2.imread(x))
  97. # reshape array data
  98. final_df_train[1] = final_df_train[1].apply(lambda x: np.array(x).reshape(input_shape))
  99. final_df_val[1] = final_df_val[1].apply(lambda x: np.array(x).reshape(input_shape))
  100. # shuffle data another time
  101. final_df_train = shuffle(final_df_train)
  102. final_df_val = shuffle(final_df_val)
  103. final_df_train_size = len(final_df_train.index)
  104. final_df_val_size = len(final_df_val.index)
  105. validation_split = final_df_val_size / (final_df_train_size + final_df_val_size)
  106. print("----------------------------------------------------------")
  107. print("Validation size is based of `.val` content")
  108. print("Validation split is now set at", validation_split)
  109. print("----------------------------------------------------------")
  110. # use of the whole data set for training
  111. x_dataset_train = final_df_train.iloc[:,1:]
  112. x_dataset_val = final_df_val.iloc[:,1:]
  113. y_dataset_train = final_df_train.iloc[:,0]
  114. y_dataset_val = final_df_val.iloc[:,0]
  115. x_data_train = []
  116. for item in x_dataset_train.values:
  117. #print("Item is here", item)
  118. x_data_train.append(item[0])
  119. x_data_train = np.array(x_data_train)
  120. x_data_val = []
  121. for item in x_dataset_val.values:
  122. #print("Item is here", item)
  123. x_data_val.append(item[0])
  124. x_data_val = np.array(x_data_val)
  125. print("End of loading data..")
  126. print("Train set size (after balancing) : ", final_df_train_size)
  127. print("val set size (after balancing) : ", final_df_val_size)
  128. #######################
  129. # 2. Getting model
  130. #######################
  131. # create backup folder for current model
  132. model_backup_folder = os.path.join(cfg.backup_model_folder, p_output)
  133. if not os.path.exists(model_backup_folder):
  134. os.makedirs(model_backup_folder)
  135. # add of callback models
  136. filepath = os.path.join(cfg.backup_model_folder, p_output, p_output + "-{auc:02f}-{val_auc:02f}__{epoch:02d}.hdf5")
  137. checkpoint = ModelCheckpoint(filepath, monitor='val_auc', verbose=1, save_best_only=True, mode='max')
  138. callbacks_list = [checkpoint]
  139. # check if backup already exists
  140. weights_filepath = None
  141. backups = sorted(os.listdir(model_backup_folder))
  142. if len(backups) > 0:
  143. # retrieve last backup epoch of model
  144. last_model_backup = None
  145. max_last_epoch = 0
  146. for backup in backups:
  147. last_epoch = int(backup.split('__')[1].replace('.hdf5', ''))
  148. if last_epoch > max_last_epoch and last_epoch < p_epochs:
  149. max_last_epoch = last_epoch
  150. last_model_backup = backup
  151. if last_model_backup is None:
  152. print("Epochs asked is already computer. Noee")
  153. sys.exit(1)
  154. initial_epoch = max_last_epoch
  155. print("-------------------------------------------------")
  156. print("Previous backup model found", last_model_backup, "with already", initial_epoch, "done...")
  157. print("Resuming from epoch", str(initial_epoch + 1))
  158. print("-------------------------------------------------")
  159. # load weights
  160. weights_filepath = os.path.join(model_backup_folder, last_model_backup)
  161. model = models.get_model(n_chanels, input_shape, p_tl, weights_filepath)
  162. model.summary()
  163. # concatenate train and validation data (`validation_split` param will do the separation into keras model)
  164. y_data = np.concatenate([y_dataset_train.values, y_dataset_val.values])
  165. x_data = np.concatenate([x_data_train, x_data_val])
  166. y_data_categorical = to_categorical(y_data)
  167. #print(y_data_categorical)
  168. # validation split parameter will use the last `%` data, so here, data will really validate our model
  169. model.fit(x_data, y_data_categorical, validation_split=validation_split, initial_epoch=initial_epoch, epochs=p_epochs, batch_size=p_batch_size, callbacks=callbacks_list)
  170. y_dataset_val_categorical = to_categorical(y_dataset_val)
  171. score = model.evaluate(x_data_val, y_dataset_val_categorical, batch_size=p_batch_size)
  172. print("Accuracy score on val dataset ", score)
  173. if not os.path.exists(cfg.saved_models_folder):
  174. os.makedirs(cfg.saved_models_folder)
  175. # save the model into HDF5 file
  176. model_output_path = os.path.join(cfg.saved_models_folder, p_output + '.json')
  177. json_model_content = model.to_json()
  178. with open(model_output_path, 'w') as f:
  179. print("Model saved into ", model_output_path)
  180. json.dump(json_model_content, f, indent=4)
  181. model.save_weights(model_output_path.replace('.json', '.h5'))
  182. # Get results obtained from model
  183. y_train_prediction = model.predict(x_data_train)
  184. y_val_prediction = model.predict(x_data_val)
  185. # y_train_prediction = [1 if x > 0.5 else 0 for x in y_train_prediction]
  186. # y_val_prediction = [1 if x > 0.5 else 0 for x in y_val_prediction]
  187. y_train_prediction = np.argmax(y_train_prediction, axis=1)
  188. y_val_prediction = np.argmax(y_val_prediction, axis=1)
  189. acc_train_score = accuracy_score(y_dataset_train, y_train_prediction)
  190. acc_val_score = accuracy_score(y_dataset_val, y_val_prediction)
  191. f1_train_score = f1_score(y_dataset_train, y_train_prediction)
  192. f1_val_score = f1_score(y_dataset_val, y_val_prediction)
  193. recall_train_score = recall_score(y_dataset_train, y_train_prediction)
  194. recall_val_score = recall_score(y_dataset_val, y_val_prediction)
  195. pres_train_score = precision_score(y_dataset_train, y_train_prediction)
  196. pres_val_score = precision_score(y_dataset_val, y_val_prediction)
  197. roc_train_score = roc_auc_score(y_dataset_train, y_train_prediction)
  198. roc_val_score = roc_auc_score(y_dataset_val, y_val_prediction)
  199. # save model performance
  200. if not os.path.exists(cfg.results_information_folder):
  201. os.makedirs(cfg.results_information_folder)
  202. perf_file_path = os.path.join(cfg.results_information_folder, cfg.csv_model_comparisons_filename)
  203. # write header if necessary
  204. if not os.path.exists(perf_file_path):
  205. with open(perf_file_path, 'w') as f:
  206. f.write(cfg.perf_train_header_file)
  207. # add information into file
  208. with open(perf_file_path, 'a') as f:
  209. line = p_output + ';' + str(len(dataset_train)) + ';' + str(len(dataset_val)) + ';' \
  210. + str(final_df_train_size) + ';' + str(final_df_val_size) + ';' \
  211. + str(acc_train_score) + ';' + str(acc_val_score) + ';' \
  212. + str(f1_train_score) + ';' + str(f1_val_score) + ';' \
  213. + str(recall_train_score) + ';' + str(recall_val_score) + ';' \
  214. + str(pres_train_score) + ';' + str(pres_val_score) + ';' \
  215. + str(roc_train_score) + ';' + str(roc_val_score) + '\n'
  216. f.write(line)
  217. print("You can now run your model with your own `test` dataset")
  218. if __name__== "__main__":
  219. main()