# train_model.py — trains a Keras CNN noisy/not-noisy image classifier and
# saves the architecture (.json) and weights (.h5) to cfg.saved_models_folder.
  1. import numpy as np
  2. import pandas as pd
  3. import sys, os, argparse
  4. import json
  5. import cv2
  6. from sklearn.utils import shuffle
  7. from keras.preprocessing.image import ImageDataGenerator
  8. from keras.models import Sequential
  9. from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
  10. from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization
  11. from keras import backend as K
  12. from keras.utils import plot_model
  13. from modules.utils import config as cfg
  14. from sklearn.metrics import roc_auc_score
  15. img_width, img_height = 200, 200
  16. batch_size = 32
  17. # 1 because we have 1 color canal
  18. if K.image_data_format() == 'channels_first':
  19. input_shape = (1, img_width, img_height)
  20. else:
  21. input_shape = (img_width, img_height, 1)
  22. def generate_model(_input_shape):
  23. model = Sequential()
  24. model.add(Conv2D(60, (2, 2), input_shape=_input_shape))
  25. model.add(Activation('relu'))
  26. model.add(MaxPooling2D(pool_size=(2, 2)))
  27. model.add(Conv2D(40, (2, 2)))
  28. model.add(Activation('relu'))
  29. model.add(MaxPooling2D(pool_size=(2, 2)))
  30. model.add(Conv2D(20, (2, 2)))
  31. model.add(Activation('relu'))
  32. model.add(MaxPooling2D(pool_size=(2, 2)))
  33. model.add(Flatten())
  34. model.add(Dense(140))
  35. model.add(Activation('relu'))
  36. model.add(BatchNormalization())
  37. model.add(Dropout(0.4))
  38. model.add(Dense(120))
  39. model.add(Activation('relu'))
  40. model.add(BatchNormalization())
  41. model.add(Dropout(0.4))
  42. model.add(Dense(80))
  43. model.add(Activation('relu'))
  44. model.add(BatchNormalization())
  45. model.add(Dropout(0.4))
  46. model.add(Dense(40))
  47. model.add(Activation('relu'))
  48. model.add(BatchNormalization())
  49. model.add(Dropout(0.4))
  50. model.add(Dense(20))
  51. model.add(Activation('relu'))
  52. model.add(BatchNormalization())
  53. model.add(Dropout(0.4))
  54. model.add(Dense(1))
  55. model.add(Activation('sigmoid'))
  56. model.compile(loss='binary_crossentropy',
  57. optimizer='rmsprop',
  58. metrics=['accuracy'])
  59. return model
  60. def main():
  61. parser = argparse.ArgumentParser(description="Train Keras model and save it into .json file")
  62. parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)')
  63. parser.add_argument('--output', type=str, help='output file name desired for model (without .json extension)')
  64. args = parser.parse_args()
  65. p_data_file = args.data
  66. p_output = args.output
  67. ########################
  68. # 1. Get and prepare data
  69. ########################
  70. print("Preparing data...")
  71. dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
  72. dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")
  73. # default first shuffle of data
  74. dataset_train = shuffle(dataset_train)
  75. dataset_test = shuffle(dataset_test)
  76. print("Reading all images data...")
  77. dataset_train[1] = dataset_train[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE).reshape(input_shape))
  78. dataset_test[1] = dataset_test[1].apply(lambda x: cv2.imread(x, cv2.IMREAD_GRAYSCALE).reshape(input_shape))
  79. # get dataset with equal number of classes occurences
  80. noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1]
  81. not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0]
  82. nb_noisy_train = len(noisy_df_train.index)
  83. noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 1]
  84. not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0]
  85. nb_noisy_test = len(noisy_df_test.index)
  86. final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
  87. final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
  88. # shuffle data another time
  89. final_df_train = shuffle(final_df_train)
  90. final_df_test = shuffle(final_df_test)
  91. final_df_train_size = len(final_df_train.index)
  92. final_df_test_size = len(final_df_test.index)
  93. # use of the whole data set for training
  94. x_dataset_train = final_df_train.ix[:,1:]
  95. x_dataset_test = final_df_test.ix[:,1:]
  96. y_dataset_train = final_df_train.ix[:,0]
  97. y_dataset_test = final_df_test.ix[:,0]
  98. x_data_train = []
  99. for item in x_dataset_train.values:
  100. #print("Item is here", item)
  101. x_data_train.append(item[0])
  102. x_data_train = np.array(x_data_train)
  103. print("End of loading data..")
  104. print(x_data_train.shape)
  105. print(x_data_train[0])
  106. #######################
  107. # 2. Getting model
  108. #######################
  109. model = generate_model(input_shape)
  110. model.summary()
  111. model.fit(x_data_train, y_dataset_train.values, validation_split=0.20, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch)
  112. score = model.evaluate(x_dataset_test, y_dataset_test, batch_size=cfg.keras_batch)
  113. if not os.path.exists(cfg.saved_models_folder):
  114. os.makedirs(cfg.saved_models_folder)
  115. # save the model into HDF5 file
  116. model_output_path = os.path.join(cfg.saved_models_folder, p_output + '.json')
  117. json_model_content = model.to_json()
  118. with open(model_output_path, 'w') as f:
  119. print("Model saved into ", model_output_path)
  120. json.dump(json_model_content, f, indent=4)
  121. model.save_weights(model_output_path.replace('.json', '.h5'))
  122. # Save results obtained from model
  123. y_test_prediction = model.predict(x_dataset_test)
  124. print("Metrics : ", model.metrics_names)
  125. print("Prediction : ", score)
  126. print("ROC AUC : ", roc_auc_score(y_dataset_test, y_test_prediction))
  127. if __name__== "__main__":
  128. main()