train_model.py

import numpy as np
import pandas as pd
import sys, os, argparse
import json

import cv2

from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization
from keras import backend as K
from keras.utils import plot_model
import tensorflow as tf

from modules.utils import config as cfg

img_width, img_height = cfg.keras_img_size
batch_size = 32


def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc
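
# NOTE: the `auc` metric above relies on the TensorFlow 1.x streaming metric
# `tf.metrics.auc`; its update op is returned so that Keras accumulates the AUC
# over batches, and the metric's local variables have to be initialized in the
# current session, which is what the `tf.local_variables_initializer()` call
# does. This is a TF1 / standalone-Keras idiom and would need to be rewritten
# for TF2 (e.g. with `tf.keras.metrics.AUC`).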


def generate_model(_input_shape):

    model = Sequential()

    model.add(Conv2D(60, (2, 2), input_shape=_input_shape))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(40, (2, 2)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(20, (2, 2)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten())

    model.add(Dense(140))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(120))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(80))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(40))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(20))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy', auc])

    return model
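
# Example (hypothetical shapes, not part of the original script): with 100x100
# grayscale inputs in `channels_last` format the model could be built with
# `generate_model((100, 100, 1))`; the compiled network ends with a single
# sigmoid unit, so `model.predict(...)` returns the estimated probability that
# a sample belongs to the "noisy" class (label 1).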


def main():

    parser = argparse.ArgumentParser(description="Train Keras model and save it into .json file")

    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
    parser.add_argument('--output', type=str, help='output file name desired for model (without .json extension)', required=True)
    parser.add_argument('--batch_size', type=int, help='batch size used as model input', default=cfg.keras_batch)
    parser.add_argument('--epochs', type=int, help='number of epochs used for training model', default=cfg.keras_epochs)
    parser.add_argument('--val_size', type=float, help='fraction of the training data (between 0 and 1) kept for validation', default=cfg.val_dataset_size)
    parser.add_argument('--n_channels', type=int, help='number of channels of the 3D input', default=1)

    args = parser.parse_args()
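
    # Example invocation (hypothetical file names): the script would typically be
    # launched as
    #   python train_model.py --data data/my_dataset --output my_cnn_model
    # which expects `data/my_dataset.train` and `data/my_dataset.test` to exist.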

    p_data_file = args.data
    p_output = args.output
    p_batch_size = args.batch_size
    p_epochs = args.epochs
    p_val_size = args.val_size
    p_n_channels = args.n_channels

    ########################
    # 1. Get and prepare data
    ########################

    print("Preparing data...")

    dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
    dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")

    print("Train set size : ", len(dataset_train))
    print("Test set size : ", len(dataset_test))

    # default first shuffle of data
    dataset_train = shuffle(dataset_train)
    dataset_test = shuffle(dataset_test)

    print("Reading all images data...")

    # get the number of channels: `:` is the separator used between the image
    # paths stored in the second column of the dataset
    n_channels = len(dataset_train[1].iloc[0].split(':'))

    # the target input shape must be known before the images are reshaped,
    # so it is computed here from the Keras image data format
    if K.image_data_format() == 'channels_first':
        input_shape = (n_channels, img_width, img_height)
    else:
        input_shape = (img_width, img_height, n_channels)

    if p_n_channels > 1:
        # each cell holds several ':'-separated paths, one grayscale image per channel
        dataset_train[1] = dataset_train[1].apply(lambda x: np.array(
            [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split(':')]).reshape(input_shape))
        dataset_test[1] = dataset_test[1].apply(lambda x: np.array(
            [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split(':')]).reshape(input_shape))
    else:
        dataset_train[1] = dataset_train[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE).reshape(input_shape))
        dataset_test[1] = dataset_test[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE).reshape(input_shape))

    # build a dataset with an equal number of occurrences of each class:
    # the majority "not noisy" class is subsampled down to the number of noisy samples
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)

    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])

    # shuffle data another time
    final_df_train = shuffle(final_df_train)
    final_df_test = shuffle(final_df_test)

    final_df_train_size = len(final_df_train.index)
    final_df_test_size = len(final_df_test.index)

    # use the whole balanced data set for training: the first column is the label,
    # the second column holds the image data
    x_dataset_train = final_df_train.iloc[:, 1:]
    x_dataset_test = final_df_test.iloc[:, 1:]

    y_dataset_train = final_df_train.iloc[:, 0]
    y_dataset_test = final_df_test.iloc[:, 0]

    # each row of the x datasets contains a single cell holding the image array
    x_data_train = []
    for item in x_dataset_train.values:
        x_data_train.append(item[0])
    x_data_train = np.array(x_data_train)

    x_data_test = []
    for item in x_dataset_test.values:
        x_data_test.append(item[0])
    x_data_test = np.array(x_data_test)

    print("End of loading data..")
    print("Train set size (after balancing) : ", final_df_train_size)
    print("Test set size (after balancing) : ", final_df_test_size)

    #######################
    # 2. Getting model
    #######################

    # the input shape was already computed above, before reshaping the images
    model = generate_model(input_shape)
    model.summary()

    model.fit(x_data_train, y_dataset_train.values, validation_split=p_val_size, epochs=p_epochs, batch_size=p_batch_size)

    score = model.evaluate(x_data_test, y_dataset_test, batch_size=p_batch_size)

    if not os.path.exists(cfg.saved_models_folder):
        os.makedirs(cfg.saved_models_folder)

    # save the model architecture into a JSON file and its weights into an HDF5 file
    model_output_path = os.path.join(cfg.saved_models_folder, p_output + '.json')
    json_model_content = model.to_json()

    with open(model_output_path, 'w') as f:
        print("Model saved into ", model_output_path)
        json.dump(json_model_content, f, indent=4)

    model.save_weights(model_output_path.replace('.json', '.h5'))
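
    # Sketch (not part of this script): a model saved this way can presumably be
    # reloaded later with the standard Keras API, e.g.
    #   with open(model_output_path, 'r') as f:
    #       model = keras.models.model_from_json(json.load(f))
    #   model.load_weights(model_output_path.replace('.json', '.h5'))
    # `json.load` is needed here because the architecture string was written with
    # `json.dump` above.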

    # Get results obtained from model
    y_train_prediction = model.predict(x_data_train)
    y_test_prediction = model.predict(x_data_test)

    # threshold the sigmoid outputs at 0.5 to obtain hard class labels
    y_train_prediction = [1 if x > 0.5 else 0 for x in y_train_prediction]
    y_test_prediction = [1 if x > 0.5 else 0 for x in y_test_prediction]

    acc_train_score = accuracy_score(y_dataset_train, y_train_prediction)
    acc_test_score = accuracy_score(y_dataset_test, y_test_prediction)

    f1_train_score = f1_score(y_dataset_train, y_train_prediction)
    f1_test_score = f1_score(y_dataset_test, y_test_prediction)

    recall_train_score = recall_score(y_dataset_train, y_train_prediction)
    recall_test_score = recall_score(y_dataset_test, y_test_prediction)

    pres_train_score = precision_score(y_dataset_train, y_train_prediction)
    pres_test_score = precision_score(y_dataset_test, y_test_prediction)

    roc_train_score = roc_auc_score(y_dataset_train, y_train_prediction)
    roc_test_score = roc_auc_score(y_dataset_test, y_test_prediction)

    # save model performance
    if not os.path.exists(cfg.models_information_folder):
        os.makedirs(cfg.models_information_folder)

    perf_file_path = os.path.join(cfg.models_information_folder, cfg.csv_model_comparisons_filename)

    with open(perf_file_path, 'a') as f:
        line = p_output + ';' + str(len(dataset_train)) + ';' + str(len(dataset_test)) + ';' \
               + str(final_df_train_size) + ';' + str(final_df_test_size) + ';' \
               + str(acc_train_score) + ';' + str(acc_test_score) + ';' \
               + str(f1_train_score) + ';' + str(f1_test_score) + ';' \
               + str(recall_train_score) + ';' + str(recall_test_score) + ';' \
               + str(pres_train_score) + ';' + str(pres_test_score) + ';' \
               + str(roc_train_score) + ';' + str(roc_test_score) + '\n'

        f.write(line)


if __name__ == "__main__":
    main()