# train_model.py
  1. # main imports
  2. import numpy as np
  3. import pandas as pd
  4. import sys, os, argparse
  5. import json
  6. # model imports
  7. import cnn_models as models
  8. import tensorflow as tf
  9. import keras
  10. from keras.models import load_model
  11. from keras import backend as K
  12. from keras.callbacks import ModelCheckpoint
  13. from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
  14. from keras.utils import to_categorical
  15. # image processing imports
  16. import cv2
  17. from sklearn.utils import shuffle
  18. from sklearn.model_selection import train_test_split
  19. # config imports
  20. sys.path.insert(0, '') # trick to enable import of main folder module
  21. import custom_config as cfg
# number of dataset rows already loaded; incremented by `load_images`
# (nested in `main`) and used there to compute the progress bar ratio
n_counter = 0
  24. def write_progress(progress):
  25. '''
  26. Display progress information as progress bar
  27. '''
  28. barWidth = 180
  29. output_str = "["
  30. pos = barWidth * progress
  31. for i in range(barWidth):
  32. if i < pos:
  33. output_str = output_str + "="
  34. elif i == pos:
  35. output_str = output_str + ">"
  36. else:
  37. output_str = output_str + " "
  38. output_str = output_str + "] " + str(int(progress * 100.0)) + " %\r"
  39. print(output_str)
  40. sys.stdout.write("\033[F")
  41. def main():
  42. parser = argparse.ArgumentParser(description="Train Keras model and save it into .json file")
  43. parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
  44. parser.add_argument('--output', type=str, help='output file name desired for model (without .json extension)', required=True)
  45. parser.add_argument('--tl', type=int, help='use or not of transfer learning (`VGG network`)', default=0, choices=[0, 1])
  46. parser.add_argument('--batch_size', type=int, help='batch size used as model input', default=64)
  47. parser.add_argument('--epochs', type=int, help='number of epochs used for training model', default=30)
  48. parser.add_argument('--chanels', type=str, help="given number of ordered chanels for each input images (example: '1,3,3')", required=True)
  49. parser.add_argument('--size', type=str, help="Size of input images", default="100, 100")
  50. parser.add_argument('--val_size', type=float, help='percent of validation data during training process', default=0.3)
  51. args = parser.parse_args()
  52. p_data_file = args.data
  53. p_output = args.output
  54. p_tl = args.tl
  55. p_batch_size = args.batch_size
  56. p_epochs = args.epochs
  57. p_chanels = list(map(int, args.chanels.split(',')))
  58. p_size = args.size.split(',')
  59. p_val_size = args.val_size
  60. #p_val_size = args.val_size
  61. initial_epoch = 0
  62. ########################
  63. # 1. Get and prepare data
  64. ########################
  65. print('-----------------------------')
  66. print("----- Preparing data... -----")
  67. dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
  68. dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")
  69. print("-- Train set size : ", len(dataset_train))
  70. print("-- Test set size : ", len(dataset_test))
  71. # default first shuffle of data
  72. dataset_train = shuffle(dataset_train)
  73. dataset_test = shuffle(dataset_test)
  74. print('-----------------------------')
  75. print("--Reading all images data...")
  76. # getting number of chanel
  77. n_chanels = sum(p_chanels)
  78. print("-- Number of chanels : ", n_chanels)
  79. img_width, img_height = [ int(s) for s in p_size ]
  80. # specify the number of dimensions
  81. if K.image_data_format() == 'chanels_first':
  82. if n_chanels > 1:
  83. input_shape = (1, n_chanels, img_width, img_height)
  84. else:
  85. input_shape = (n_chanels, img_width, img_height)
  86. else:
  87. if n_chanels > 1:
  88. input_shape = (1, img_width, img_height, n_chanels)
  89. else:
  90. input_shape = (img_width, img_height, n_chanels)
  91. # getting weighted class over the whole dataset
  92. noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
  93. not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
  94. nb_noisy_train = len(noisy_df_train.index)
  95. nb_not_noisy_train = len(not_noisy_df_train.index)
  96. noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
  97. not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
  98. nb_noisy_test = len(noisy_df_test.index)
  99. nb_not_noisy_test = len(not_noisy_df_test.index)
  100. noisy_samples = nb_noisy_test + nb_noisy_train
  101. not_noisy_samples = nb_not_noisy_test + nb_not_noisy_train
  102. total_samples = noisy_samples + not_noisy_samples
  103. print('-----------------------------')
  104. print('---- Dataset information ----')
  105. print('-- noisy:', noisy_samples)
  106. print('-- not_noisy:', not_noisy_samples)
  107. print('-- total:', total_samples)
  108. print('-----------------------------')
  109. class_weight = {
  110. 0: (noisy_samples / float(total_samples)),
  111. 1: (not_noisy_samples / float(total_samples)),
  112. }
  113. final_df_train = dataset_train
  114. final_df_test = dataset_test
  115. def load_images(x):
  116. # update progress
  117. global n_counter
  118. n_counter += 1
  119. write_progress(n_counter / float(total_samples))
  120. images = []
  121. for i, path in enumerate(x.split('::')):
  122. if p_chanels[i] > 1:
  123. img = cv2.imread(path)
  124. else:
  125. img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
  126. images.append(img)
  127. return images
  128. print('---- Loading dataset.... ----')
  129. print('-----------------------------\n')
  130. # check if specific number of chanels is used
  131. final_df_train[1] = final_df_train[1].apply(lambda x: load_images(x))
  132. final_df_test[1] = final_df_test[1].apply(lambda x: load_images(x))
  133. # reshape array data
  134. final_df_train[1] = final_df_train[1].apply(lambda x: np.array(x).reshape(input_shape))
  135. final_df_test[1] = final_df_test[1].apply(lambda x: np.array(x).reshape(input_shape))
  136. # shuffle data another time
  137. final_df_train = shuffle(final_df_train)
  138. final_df_test = shuffle(final_df_test)
  139. print('\n-----------------------------')
  140. print("Validation split is now set at", p_val_size)
  141. print('-----------------------------')
  142. # use of the whole data set for training
  143. x_dataset_train = final_df_train.iloc[:,1:]
  144. x_dataset_test = final_df_test.iloc[:,1:]
  145. y_dataset_train = final_df_train.iloc[:,0]
  146. y_dataset_test = final_df_test.iloc[:,0]
  147. x_data_train = []
  148. for item in x_dataset_train.values:
  149. #print("Item is here", item)
  150. x_data_train.append(item[0])
  151. x_data_train = np.array(x_data_train)
  152. x_data_test = []
  153. for item in x_dataset_test.values:
  154. #print("Item is here", item)
  155. x_data_test.append(item[0])
  156. x_data_test = np.array(x_data_test)
  157. #######################
  158. # 2. Getting model
  159. #######################
  160. # create backup folder for current model
  161. model_backup_folder = os.path.join(cfg.backup_model_folder, p_output)
  162. if not os.path.exists(model_backup_folder):
  163. os.makedirs(model_backup_folder)
  164. # add of callback models
  165. filepath = os.path.join(cfg.backup_model_folder, p_output, p_output + "-_{epoch:03d}.h5")
  166. checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=0, mode='max')
  167. callbacks_list = [checkpoint]
  168. # check if backup already exists
  169. backups = sorted(os.listdir(model_backup_folder))
  170. if len(backups) > 0:
  171. last_backup_file = backups[-1]
  172. last_backup_file_path = os.path.join(model_backup_folder, last_backup_file)
  173. model = load_model(last_backup_file_path)
  174. # get initial epoch
  175. initial_epoch = int(last_backup_file.split('_')[-1].replace('.h5', ''))
  176. print('-----------------------------')
  177. print('-- Restore model from backup...')
  178. print('-- Restart training @epoch:', initial_epoch)
  179. print('-----------------------------')
  180. else:
  181. model = models.get_model(n_chanels, input_shape, p_tl)
  182. model.summary()
  183. # prepare train and validation dataset
  184. X_train, X_val, y_train, y_val = train_test_split(x_data_train, y_dataset_train, test_size=p_val_size, shuffle=False)
  185. y_train_cat = to_categorical(y_train)
  186. y_val_cat = to_categorical(y_val)
  187. y_test_cat = to_categorical(y_dataset_test)
  188. print('-----------------------------')
  189. print("-- Fitting model with custom class_weight", class_weight)
  190. print('-----------------------------')
  191. model.fit(X_train, y_train_cat,
  192. validation_data=(X_val, y_val_cat),
  193. initial_epoch=initial_epoch,
  194. epochs=p_epochs,
  195. batch_size=p_batch_size,
  196. callbacks=callbacks_list,
  197. class_weight=class_weight)
  198. score = model.evaluate(X_val, y_val_cat, batch_size=p_batch_size)
  199. print("Accuracy score on val dataset ", score)
  200. if not os.path.exists(cfg.output_models):
  201. os.makedirs(cfg.output_models)
  202. # save the model into H5 file
  203. model_output_path = os.path.join(cfg.output_models, p_output + '.h5')
  204. model.save(model_output_path)
  205. print('Begin of prediction score on the whole dataset:')
  206. # Get results obtained from model
  207. y_train_prediction = model.predict(X_train, verbose=1)
  208. y_val_prediction = model.predict(X_val, verbose=1)
  209. y_test_prediction = model.predict(x_data_test, verbose=1)
  210. y_train_prediction = np.argmax(y_train_prediction, axis=1)
  211. y_val_prediction = np.argmax(y_val_prediction, axis=1)
  212. y_test_prediction = np.argmax(y_test_prediction, axis=1)
  213. acc_train_score = accuracy_score(y_train, y_train_prediction)
  214. acc_val_score = accuracy_score(y_val, y_val_prediction)
  215. acc_test_score = accuracy_score(y_dataset_test, y_test_prediction)
  216. roc_train_score = roc_auc_score(y_train, y_train_prediction)
  217. roc_val_score = roc_auc_score(y_val, y_val_prediction)
  218. roc_test_score = roc_auc_score(y_dataset_test, y_test_prediction)
  219. # save model performance
  220. if not os.path.exists(cfg.output_results_folder):
  221. os.makedirs(cfg.output_results_folder)
  222. perf_file_path = os.path.join(cfg.output_results_folder, cfg.csv_model_comparisons_filename)
  223. # write header if necessary
  224. if not os.path.exists(perf_file_path):
  225. with open(perf_file_path, 'w') as f:
  226. f.write('name;train_acc;val_acc;test_acc;train_auc;val_auc;test_auc;\n')
  227. # add information into file
  228. with open(perf_file_path, 'a') as f:
  229. line = p_output + ';' + str(acc_train_score) + ';' + str(acc_val_score) + ';' \
  230. + str(acc_test_score) + ';' + str(roc_train_score) + ';' \
  231. + str(roc_val_score) + ';' + str(roc_test_score) + '\n'
  232. f.write(line)
  233. print("You can now run your model with your own `test` dataset")
  234. if __name__== "__main__":
  235. main()