# train_model.py

# main imports
import numpy as np
import pandas as pd
import sys, os, argparse
import json

# model imports
import cnn_models as models
import tensorflow as tf
import keras
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from keras.utils import to_categorical

# image processing imports
import cv2
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# config imports
sys.path.insert(0, '') # trick to enable import of main folder module
import custom_config as cfg
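
# Example invocation (file names and paths below are illustrative, not part of the project):
#   python train_model.py --data data/my_dataset --output my_model --epochs 30 --batch_size 64 --size "100, 100"
# which expects data/my_dataset.train and data/my_dataset.test to exist.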

def main():

    parser = argparse.ArgumentParser(description="Train a Keras model and save it as an .h5 file")

    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
    parser.add_argument('--output', type=str, help='output file name desired for model (without the .h5 extension)', required=True)
    parser.add_argument('--tl', type=int, help='use or not of transfer learning (`VGG network`)', default=0, choices=[0, 1])
    parser.add_argument('--batch_size', type=int, help='batch size used as model input', default=64)
    parser.add_argument('--epochs', type=int, help='number of epochs used for training model', default=30)
    parser.add_argument('--chanels', type=int, help='force a specific number of chanels if necessary', default=0)
    parser.add_argument('--size', type=str, help='size of input images', default="100, 100")
    parser.add_argument('--val_size', type=float, help='percent of validation data during training process', default=0.3)

    args = parser.parse_args()

    p_data_file = args.data
    p_output = args.output
    p_tl = args.tl
    p_batch_size = args.batch_size
    p_epochs = args.epochs
    p_chanels = args.chanels
    p_size = args.size.split(',')
    p_val_size = args.val_size

    initial_epoch = 0

    ########################
    # 1. Get and prepare data
    ########################
    print("Preparing data...")
    dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
    dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")

    print("Train set size :", len(dataset_train))
    print("Test set size :", len(dataset_test))

    # default first shuffle of data
    dataset_train = shuffle(dataset_train)
    dataset_test = shuffle(dataset_test)

    print("Reading all images data...")

    # getting number of chanels
    if p_chanels == 0:
        # column 1 stores the image path(s); multiple paths are joined with '::'
        n_chanels = len(dataset_train[1][1].split('::'))
    else:
        n_chanels = p_chanels

    print("Number of chanels :", n_chanels)

    img_width, img_height = [ int(s) for s in p_size ]

    # specify the number of dimensions
    if K.image_data_format() == 'channels_first':
        if n_chanels > 1:
            input_shape = (1, n_chanels, img_width, img_height)
        else:
            input_shape = (n_chanels, img_width, img_height)
    else:
        if n_chanels > 1:
            input_shape = (1, img_width, img_height, n_chanels)
        else:
            input_shape = (img_width, img_height, n_chanels)
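    # e.g. with --size "100, 100" and 3 chanels in 'channels_last' mode,
    # input_shape is (1, 100, 100, 3); with a single chanel it is (100, 100, 1)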

    # getting weighted class over the whole dataset
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)
    nb_not_noisy_train = len(not_noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)
    nb_not_noisy_test = len(not_noisy_df_test.index)

    noisy_samples = nb_noisy_test + nb_noisy_train
    not_noisy_samples = nb_not_noisy_test + nb_not_noisy_train
    total_samples = noisy_samples + not_noisy_samples

    print('noisy', noisy_samples)
    print('not_noisy', not_noisy_samples)
    print('total', total_samples)

    class_weight = {
        0: (noisy_samples / float(total_samples)),
        1: (not_noisy_samples / float(total_samples)),
    }
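    # each class is weighted by the proportion of the opposite class, so the
    # under-represented class contributes more to the loss; e.g. with 300 noisy
    # and 700 not-noisy samples overall, class_weight is {0: 0.3, 1: 0.7} and the
    # minority noisy class (label 1) gets the larger weight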

    final_df_train = dataset_train
    final_df_test = dataset_test

    # check if specific number of chanels is used
    if p_chanels == 0:
        # `::` is the separator used for getting each img path
        if n_chanels > 1:
            final_df_train[1] = final_df_train[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
            final_df_test[1] = final_df_test[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
        else:
            final_df_train[1] = final_df_train[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))
            final_df_test[1] = final_df_test[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))
    else:
        final_df_train[1] = final_df_train[1].apply(lambda x: cv2.imread(x))
        final_df_test[1] = final_df_test[1].apply(lambda x: cv2.imread(x))

    # reshape array data
    final_df_train[1] = final_df_train[1].apply(lambda x: np.array(x).reshape(input_shape))
    final_df_test[1] = final_df_test[1].apply(lambda x: np.array(x).reshape(input_shape))

    # shuffle data another time
    final_df_train = shuffle(final_df_train)
    final_df_test = shuffle(final_df_test)

    final_df_train_size = len(final_df_train.index)
    final_df_test_size = len(final_df_test.index)

    print("----------------------------------------------------------")
    print("Validation split is now set at", p_val_size)
    print("----------------------------------------------------------")

    # use of the whole data set for training
    x_dataset_train = final_df_train.iloc[:, 1:]
    x_dataset_test = final_df_test.iloc[:, 1:]

    y_dataset_train = final_df_train.iloc[:, 0]
    y_dataset_test = final_df_test.iloc[:, 0]

    x_data_train = []
    for item in x_dataset_train.values:
        # item[0] holds the image array built above
        x_data_train.append(item[0])
    x_data_train = np.array(x_data_train)

    x_data_test = []
    for item in x_dataset_test.values:
        x_data_test.append(item[0])
    x_data_test = np.array(x_data_test)

    print("End of loading data...")
    print("Train set size (after balancing) :", final_df_train_size)
    print("Test set size (after balancing) :", final_df_test_size)
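    # x_data_train and x_data_test are now numpy arrays of shape
    # (n_samples,) + input_shape, ready to be fed to the network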

    #######################
    # 2. Getting model
    #######################

    # create backup folder for current model
    model_backup_folder = os.path.join(cfg.backup_model_folder, p_output)
    if not os.path.exists(model_backup_folder):
        os.makedirs(model_backup_folder)

    # checkpoint callback: keep only models that improve val_accuracy
    filepath = os.path.join(model_backup_folder, p_output + "-{accuracy:02f}-{val_accuracy:02f}__{epoch:02d}.hdf5")
    checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]
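    # each improvement produces a backup file such as (illustrative name):
    #   <output>-0.912345-0.894321__07.hdf5
    # encoding train accuracy, validation accuracy and epoch number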

    # check if backup already exists
    weights_filepath = None
    backups = sorted(os.listdir(model_backup_folder))

    if len(backups) > 0:

        # retrieve last backup epoch of model
        last_model_backup = None
        max_last_epoch = 0

        for backup in backups:

            last_epoch = int(backup.split('__')[1].replace('.hdf5', ''))

            if last_epoch > max_last_epoch and last_epoch < p_epochs:
                max_last_epoch = last_epoch
                last_model_backup = backup

        if last_model_backup is None:
            print("The requested number of epochs has already been computed. Nothing to do.")
            sys.exit(1)

        initial_epoch = max_last_epoch
        print("-------------------------------------------------")
        print("Previous backup model found", last_model_backup, "with already", initial_epoch, "epoch(s) done...")
        print("Resuming from epoch", str(initial_epoch + 1))
        print("-------------------------------------------------")

        # load weights
        weights_filepath = os.path.join(model_backup_folder, last_model_backup)
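        # weights_filepath is handed to models.get_model below, which is expected
        # to restore the saved weights; initial_epoch makes fit() continue the
        # epoch count instead of restarting from zero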

    print(n_chanels)
    model = models.get_model(n_chanels, input_shape, p_tl, weights_filepath)
    model.summary()

    # prepare train and validation dataset
    X_train, X_val, y_train, y_val = train_test_split(x_data_train, y_dataset_train, test_size=p_val_size, shuffle=False)

    y_train = to_categorical(y_train)
    y_val = to_categorical(y_val)
    y_test = to_categorical(y_dataset_test)
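    # labels are one-hot encoded to match a 2-unit softmax output
    # (assuming the architecture built in cnn_models ends with one)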

    print("Fitting model with custom class_weight", class_weight)
    model.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              initial_epoch=initial_epoch,
              epochs=p_epochs,
              batch_size=p_batch_size,
              callbacks=callbacks_list,
              class_weight=class_weight)

    score = model.evaluate(X_val, y_val, batch_size=p_batch_size)
    print("Evaluation score on validation dataset:", score)

    if not os.path.exists(cfg.output_models):
        os.makedirs(cfg.output_models)

    # save the model into HDF5 file
    model_output_path = os.path.join(cfg.output_models, p_output + '.h5')
    model.save(model_output_path)
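    # the saved file can later be reloaded with keras.models.load_model(model_output_path)
    # (plus custom_objects if the architecture uses custom layers)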

    # Get results obtained from model
    y_train_prediction = model.predict(X_train)
    y_val_prediction = model.predict(X_val)
    y_test_prediction = model.predict(x_data_test)

    # convert softmax outputs and one-hot labels back to class indices for scoring
    y_train_prediction = np.argmax(y_train_prediction, axis=1)
    y_val_prediction = np.argmax(y_val_prediction, axis=1)
    y_test_prediction = np.argmax(y_test_prediction, axis=1)

    y_train_labels = np.argmax(y_train, axis=1)
    y_val_labels = np.argmax(y_val, axis=1)
    y_test_labels = np.argmax(y_test, axis=1)

    acc_train_score = accuracy_score(y_train_labels, y_train_prediction)
    acc_val_score = accuracy_score(y_val_labels, y_val_prediction)
    acc_test_score = accuracy_score(y_test_labels, y_test_prediction)

    roc_train_score = roc_auc_score(y_train_labels, y_train_prediction)
    roc_val_score = roc_auc_score(y_val_labels, y_val_prediction)
    roc_test_score = roc_auc_score(y_test_labels, y_test_prediction)

    # save model performance
    if not os.path.exists(cfg.output_results_folder):
        os.makedirs(cfg.output_results_folder)

    perf_file_path = os.path.join(cfg.output_results_folder, cfg.csv_model_comparisons_filename)

    # write header if necessary
    if not os.path.exists(perf_file_path):
        with open(perf_file_path, 'w') as f:
            f.write('name;train_acc;val_acc;test_acc;train_auc;val_auc;test_auc\n')

    # add information into file
    with open(perf_file_path, 'a') as f:
        line = p_output + ';' + str(acc_train_score) + ';' + str(acc_val_score) + ';' \
             + str(acc_test_score) + ';' + str(roc_train_score) + ';' \
             + str(roc_val_score) + ';' + str(roc_test_score) + '\n'
        f.write(line)
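    # the results file is a simple ';'-separated table with one row per trained
    # model; it can be read back with e.g. pd.read_csv(perf_file_path, sep=';')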

    print("You can now run your model on your own `test` dataset")


if __name__ == "__main__":
    main()