# train_model.py
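"""Train a Keras model on a prepared dataset and save it as a .json architecture file
plus a .h5 weights file.

Example invocation (a sketch: the dataset prefix and model name below are placeholders,
not values taken from this repository):

    python train_model.py --data data/my_dataset --output my_model --batch_size 64 --epochs 30

The --data prefix must point to two CSV files, `<prefix>.train` and `<prefix>.test`,
separated with ';', with the class label (0 or 1) in the first column and one or several
image paths (joined with '::') in the second column.
"""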

import os
import argparse
import json

import numpy as np
import pandas as pd
import cv2
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from keras import backend as K

from modules.utils import config as cfg
from modules.models import models


def main():

    parser = argparse.ArgumentParser(description="Train a Keras model and save it into a .json file")

    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)', required=True)
    parser.add_argument('--output', type=str, help='output file name desired for the model (without .json extension)', required=True)
    parser.add_argument('--batch_size', type=int, help='batch size used as model input', default=cfg.keras_batch)
    parser.add_argument('--epochs', type=int, help='number of epochs used for training the model', default=cfg.keras_epochs)
    parser.add_argument('--val_size', type=int, help='percent of the data used for validation during training', default=cfg.val_dataset_size)

    args = parser.parse_args()

    p_data_file = args.data
    p_output = args.output
    p_batch_size = args.batch_size
    p_epochs = args.epochs
    p_val_size = args.val_size

    ########################
    # 1. Get and prepare data
    ########################

    print("Preparing data...")
    dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
    dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")

    print("Train set size : ", len(dataset_train))
    print("Test set size : ", len(dataset_test))

    # default first shuffle of data
    dataset_train = shuffle(dataset_train)
    dataset_test = shuffle(dataset_test)

    print("Reading all images data...")

    # get the number of channels (each sample stores its image paths joined with '::')
    n_channels = len(dataset_train[1][1].split('::'))
    print("Number of channels : ", n_channels)

    img_width, img_height = cfg.keras_img_size

    # specify the number of dimensions depending on the backend image data format
    if K.image_data_format() == 'channels_first':
        if n_channels > 1:
            input_shape = (1, n_channels, img_width, img_height)
        else:
            input_shape = (n_channels, img_width, img_height)
    else:
        if n_channels > 1:
            input_shape = (1, img_width, img_height, n_channels)
        else:
            input_shape = (img_width, img_height, n_channels)

    # '::' is the separator used between the image paths of a sample
    if n_channels > 1:
        dataset_train[1] = dataset_train[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
        dataset_test[1] = dataset_test[1].apply(lambda x: [cv2.imread(path, cv2.IMREAD_GRAYSCALE) for path in x.split('::')])
    else:
        dataset_train[1] = dataset_train[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))
        dataset_test[1] = dataset_test[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE))

    # reshape image data to the expected input shape
    dataset_train[1] = dataset_train[1].apply(lambda x: np.array(x).reshape(input_shape))
    dataset_test[1] = dataset_test[1].apply(lambda x: np.array(x).reshape(input_shape))

    # build datasets with an equal number of occurrences of each class
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)

    final_df_train = pd.concat([not_noisy_df_train.iloc[0:nb_noisy_train], noisy_df_train])
    final_df_test = pd.concat([not_noisy_df_test.iloc[0:nb_noisy_test], noisy_df_test])

    # shuffle data another time
    final_df_train = shuffle(final_df_train)
    final_df_test = shuffle(final_df_test)

    final_df_train_size = len(final_df_train.index)
    final_df_test_size = len(final_df_test.index)

    # use the whole balanced dataset for training: features in column 1, labels in column 0
    x_dataset_train = final_df_train.iloc[:, 1:]
    x_dataset_test = final_df_test.iloc[:, 1:]

    y_dataset_train = final_df_train.iloc[:, 0]
    y_dataset_test = final_df_test.iloc[:, 0]

    # extract the image arrays from the feature column
    x_data_train = []
    for item in x_dataset_train.values:
        x_data_train.append(item[0])
    x_data_train = np.array(x_data_train)

    x_data_test = []
    for item in x_dataset_test.values:
        x_data_test.append(item[0])
    x_data_test = np.array(x_data_test)

    print("End of loading data...")
    print("Train set size (after balancing) : ", final_df_train_size)
    print("Test set size (after balancing) : ", final_df_test_size)

    #######################
    # 2. Getting model
    #######################

    model = models.get_model(n_channels, input_shape)
    model.summary()

    # note: `validation_split` expects a fraction in [0, 1); `cfg.val_dataset_size` (and any
    # value passed through `--val_size`) is assumed to already be compatible with that range
    model.fit(x_data_train, y_dataset_train.values, validation_split=p_val_size, epochs=p_epochs, batch_size=p_batch_size)

    score = model.evaluate(x_data_test, y_dataset_test, batch_size=p_batch_size)
    print("Test evaluation score : ", score)

    if not os.path.exists(cfg.saved_models_folder):
        os.makedirs(cfg.saved_models_folder)

    # save the model architecture into a .json file and its weights into an HDF5 file
    model_output_path = os.path.join(cfg.saved_models_folder, p_output + '.json')
    json_model_content = model.to_json()

    with open(model_output_path, 'w') as f:
        json.dump(json_model_content, f, indent=4)

    print("Model saved into ", model_output_path)

    model.save_weights(model_output_path.replace('.json', '.h5'))
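
    # The saved files can later be reloaded with the usual Keras pattern (a sketch, assuming
    # the same Keras version; `json.load` is used because the architecture string was
    # written with `json.dump` above):
    #
    #   from keras.models import model_from_json
    #   with open(model_output_path, 'r') as json_file:
    #       loaded_model = model_from_json(json.load(json_file))
    #   loaded_model.load_weights(model_output_path.replace('.json', '.h5'))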

    # get predictions from the model and binarize them with a 0.5 threshold
    y_train_prediction = model.predict(x_data_train)
    y_test_prediction = model.predict(x_data_test)

    y_train_prediction = [1 if x > 0.5 else 0 for x in y_train_prediction]
    y_test_prediction = [1 if x > 0.5 else 0 for x in y_test_prediction]

    acc_train_score = accuracy_score(y_dataset_train, y_train_prediction)
    acc_test_score = accuracy_score(y_dataset_test, y_test_prediction)

    f1_train_score = f1_score(y_dataset_train, y_train_prediction)
    f1_test_score = f1_score(y_dataset_test, y_test_prediction)

    recall_train_score = recall_score(y_dataset_train, y_train_prediction)
    recall_test_score = recall_score(y_dataset_test, y_test_prediction)

    pres_train_score = precision_score(y_dataset_train, y_train_prediction)
    pres_test_score = precision_score(y_dataset_test, y_test_prediction)

    roc_train_score = roc_auc_score(y_dataset_train, y_train_prediction)
    roc_test_score = roc_auc_score(y_dataset_test, y_test_prediction)

    # save model performance as one ';'-separated line per trained model:
    # name; train size; test size; balanced train size; balanced test size;
    # accuracy, f1, recall, precision and roc auc (train then test for each)
    if not os.path.exists(cfg.models_information_folder):
        os.makedirs(cfg.models_information_folder)

    perf_file_path = os.path.join(cfg.models_information_folder, cfg.csv_model_comparisons_filename)

    with open(perf_file_path, 'a') as f:
        line = p_output + ';' + str(len(dataset_train)) + ';' + str(len(dataset_test)) + ';' \
               + str(final_df_train_size) + ';' + str(final_df_test_size) + ';' \
               + str(acc_train_score) + ';' + str(acc_test_score) + ';' \
               + str(f1_train_score) + ';' + str(f1_test_score) + ';' \
               + str(recall_train_score) + ';' + str(recall_test_score) + ';' \
               + str(pres_train_score) + ';' + str(pres_test_score) + ';' \
               + str(roc_train_score) + ';' + str(roc_test_score) + '\n'

        f.write(line)


if __name__ == "__main__":
    main()