# train_keras_svd.py
  1. # main imports
  2. import sys, os
  3. import argparse
  4. import json
  5. import numpy as np
  6. import pandas as pd
  7. import logging
  8. # models imports
  9. from keras.preprocessing.image import ImageDataGenerator
  10. from keras.models import Sequential
  11. from keras.layers import Conv1D, MaxPooling1D
  12. from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization
  13. from keras.wrappers.scikit_learn import KerasClassifier
  14. from keras import backend as K
  15. from keras.callbacks import Callback
  16. from sklearn.utils import shuffle
  17. from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
  18. # modules and config imports
  19. import custom_config as cfg
  20. def f1(y_true, y_pred):
  21. def recall(y_true, y_pred):
  22. """Recall metric.
  23. Only computes a batch-wise average of recall.
  24. Computes the recall, a metric for multi-label classification of
  25. how many relevant items are selected.
  26. """
  27. true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  28. possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
  29. recall = true_positives / (possible_positives + K.epsilon())
  30. return recall
  31. def precision(y_true, y_pred):
  32. """Precision metric.
  33. Only computes a batch-wise average of precision.
  34. Computes the precision, a metric for multi-label classification of
  35. how many selected items are relevant.
  36. """
  37. true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  38. predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
  39. precision = true_positives / (predicted_positives + K.epsilon())
  40. return precision
  41. precision = precision(y_true, y_pred)
  42. recall = recall(y_true, y_pred)
  43. return 2*((precision*recall)/(precision+recall+K.epsilon()))
  44. class IntervalEvaluation(Callback):
  45. def __init__(self, validation_data=(), interval=1):
  46. super(Callback, self).__init__()
  47. self.interval = interval
  48. self.X_val, self.y_val = validation_data
  49. def on_epoch_end(self, epoch, logs={}):
  50. if epoch % self.interval == 0:
  51. y_pred = self.model.predict_proba(self.X_val, verbose=0)
  52. y_pred = [ 0 if y < 0.5 else 1 for y in y_pred ]
  53. auc_score = roc_auc_score(self.y_val, y_pred)
  54. acc_score = accuracy_score(self.y_val, y_pred)
  55. f1_test_score = f1_score(self.y_val, y_pred)
  56. print("------------------------------")
  57. print("[test dataset] for epoch {:d}".format(epoch + 1))
  58. print("ROC AUC : {:.6f}".format(auc_score))
  59. print("ACCURACY: {:.6f}".format(acc_score))
  60. print("F1 score: {:.6f}".format(f1_test_score))
  61. print("------------------------------")
  62. def generate_model(input_shape):
  63. model = Sequential()
  64. #model.add(Conv1D(128, (10), input_shape=input_shape))
  65. #model.add(Activation('relu'))
  66. #model.add(Conv1D(128, (10)))
  67. #model.add(Activation('relu'))
  68. #model.add(Conv1D(128, (10)))
  69. #model.add(Activation('relu'))
  70. #model.add(MaxPooling1D(pool_size=(2)))
  71. #model.add(Conv1D(64, (10)))
  72. #model.add(Activation('relu'))
  73. #model.add(Conv1D(64, (10)))
  74. #model.add(Activation('relu'))
  75. #model.add(Conv1D(64, (10)))
  76. #model.add(Activation('relu'))
  77. #model.add(MaxPooling1D(pool_size=(2)))
  78. #model.add(Conv1D(32, (10)))
  79. #model.add(Activation('relu'))
  80. #model.add(Conv1D(32, (10)))
  81. #model.add(Activation('relu'))
  82. #model.add(Conv1D(32, (10)))
  83. #model.add(Activation('relu'))
  84. #model.add(MaxPooling1D(pool_size=(2)))
  85. model.add(Flatten(input_shape=input_shape))
  86. # model.add(Dense(2048))
  87. # model.add(Activation('relu'))
  88. # model.add(BatchNormalization())
  89. # model.add(Dropout(0.2))
  90. model.add(Dense(1024))
  91. model.add(Activation('relu'))
  92. model.add(BatchNormalization())
  93. model.add(Dropout(0.4))
  94. model.add(Dense(512))
  95. model.add(Activation('relu'))
  96. model.add(BatchNormalization())
  97. model.add(Dropout(0.4))
  98. model.add(Dense(256))
  99. model.add(Activation('relu'))
  100. model.add(BatchNormalization())
  101. model.add(Dropout(0.4))
  102. model.add(Dense(128))
  103. model.add(Activation('relu'))
  104. model.add(BatchNormalization())
  105. model.add(Dropout(0.4))
  106. model.add(Dense(20))
  107. model.add(Activation('relu'))
  108. model.add(BatchNormalization())
  109. model.add(Dropout(0.4))
  110. model.add(Dense(1))
  111. model.add(Activation('sigmoid'))
  112. model.compile(loss='binary_crossentropy',
  113. optimizer='rmsprop',
  114. metrics=['accuracy', f1])
  115. return model
  116. def main():
  117. parser = argparse.ArgumentParser(description="Process deep_network_keras_svd.py parameters")
  118. parser.add_argument('--data', type=str, help='Data filename prefix to access train and test dataset')
  119. parser.add_argument('--output', type=str, help='Name of filename to save model into')
  120. parser.add_argument('--size', type=int, help='Size of input data vector')
  121. args = parser.parse_args()
  122. p_datafile = args.data
  123. p_output_filename = args.output
  124. p_vector_size = args.size
  125. epochs = 10
  126. batch_size = cfg.keras_batch
  127. input_shape = (p_vector_size, 1)
  128. ###########################
  129. # 1. Get and prepare data
  130. ###########################
  131. dataset_train = pd.read_csv(p_datafile + '.train', header=None, sep=";")
  132. dataset_test = pd.read_csv(p_datafile + '.test', header=None, sep=";")
  133. # default first shuffle of data
  134. dataset_train = shuffle(dataset_train)
  135. dataset_test = shuffle(dataset_test)
  136. # get dataset with equal number of classes occurences
  137. noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
  138. not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
  139. nb_noisy_train = len(noisy_df_train.index)
  140. nb_not_noisy_train = len(not_noisy_df_train.index)
  141. noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
  142. not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
  143. nb_noisy_test = len(noisy_df_test.index)
  144. nb_not_noisy_test = len(not_noisy_df_test.index)
  145. final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
  146. final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])
  147. # shuffle data another time
  148. final_df_train = shuffle(final_df_train)
  149. final_df_test = shuffle(final_df_test)
  150. # use of the whole data set for training
  151. x_dataset_train = final_df_train.iloc[:,1:]
  152. x_dataset_test = final_df_test.iloc[:,1:]
  153. y_dataset_train = final_df_train.iloc[:,0]
  154. y_dataset_test = final_df_test.iloc[:,0]
  155. noisy_samples = nb_noisy_test + nb_noisy_train
  156. not_noisy_samples = nb_not_noisy_test + nb_not_noisy_train
  157. total_samples = noisy_samples + not_noisy_samples
  158. print('noisy', noisy_samples)
  159. print('not_noisy', not_noisy_samples)
  160. print('total', total_samples)
  161. class_weight = {
  162. 0: noisy_samples / float(total_samples),
  163. 1: not_noisy_samples / float(total_samples)
  164. }
  165. print(class_weight)
  166. #######################
  167. # 2. Getting model
  168. #######################
  169. model = generate_model(input_shape)
  170. model.summary()
  171. #model = KerasClassifier(build_fn=model, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch, verbose=0)
  172. #######################
  173. # 3. Fit model : use of cross validation to fit model
  174. #######################
  175. # reshape input data
  176. x_dataset_train = np.array(x_dataset_train).reshape(len(x_dataset_train), p_vector_size, 1)
  177. x_dataset_test = np.array(x_dataset_test).reshape(len(x_dataset_test), p_vector_size, 1)
  178. ival = IntervalEvaluation(validation_data=(x_dataset_test, y_dataset_test), interval=1)
  179. model.fit(x_dataset_train, y_dataset_train, validation_split=0.20, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch, callbacks=[ival], class_weight=class_weight)
  180. score = model.evaluate(x_dataset_test, y_dataset_test, batch_size=batch_size)
  181. if not os.path.exists(cfg.saved_models_folder):
  182. os.makedirs(cfg.saved_models_folder)
  183. # save the model into HDF5 file
  184. model_output_path = os.path.join(cfg.saved_models_folder, p_output_filename + '.json')
  185. json_model_content = model.to_json()
  186. with open(model_output_path, 'w') as f:
  187. print("Model saved into ", model_output_path)
  188. json.dump(json_model_content, f, indent=4)
  189. model.save_weights(model_output_path.replace('.json', '.h5'))
  190. # Save results obtained from model
  191. y_test_prediction = model.predict(x_dataset_test)
  192. y_test_prediction = [ 0 if y < 0.5 else 1 for y in y_test_prediction ]
  193. print("Metrics : ", model.metrics_names)
  194. print("ACC score : ", accuracy_score(y_dataset_test, y_test_prediction))
  195. print("F1 score : ", f1_score(y_dataset_test, y_test_prediction))
  196. print("ROC AUC : ", roc_auc_score(y_dataset_test, y_test_prediction))
  197. if __name__== "__main__":
  198. main()