# train_lstm_weighted.py

# main imports
import argparse
import os

import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt

from ipfml import utils

# dl imports
from keras.layers import Dense, Dropout, Embedding, GRU, BatchNormalization, ConvLSTM2D, Conv3D, Flatten
from keras.models import Sequential
import sklearn
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from joblib import dump

import custom_config as cfg
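
# Note: this script assumes the standalone Keras 2.x API (Sequential.predict_classes,
# 'accuracy'/'val_accuracy' history keys); with older Keras versions the history
# keys may be 'acc'/'val_acc' instead.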


def build_input(df, seq_norm):
    """Convert dataframe to numpy array input with timesteps as float array

    Arguments:
        df: {pd.DataFrame} -- Dataframe input
        seq_norm: {bool} -- whether to normalize sequence input data feature-wise

    Returns:
        {np.ndarray} -- LSTM input data as numpy array
    """

    arr = []

    # for each input line
    for _, row in df.iterrows():
        seq_arr = []

        # for each sequence data input (column)
        for column in row:
            seq_elems = []

            # for each element in sequence data
            for img_path in column:
                img = Image.open(img_path)
                # scale pixel values into [0, 1]
                seq_elems.append(np.array(img) / 255.)

            seq_arr.append(np.array(seq_elems))

        arr.append(seq_arr)

    final_arr = np.array(arr, 'float32')
    print(final_arr.shape)

    # sequence normalization only applies to flattened (n_samples, n_timesteps,
    # n_features) inputs; image sequences (ndim > 3) are left as-is
    if seq_norm and final_arr.ndim == 3:
        _, _, f = final_arr.shape
        for index, seq in enumerate(final_arr):
            for i in range(f):
                final_arr[index][:, i] = utils.normalize_arr_with_range(seq[:, i])

    return final_arr
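
# A minimal sketch of the expected input, assuming hypothetical grayscale image
# paths 'a.png', 'b.png', 'c.png' of identical size. After main() splits each
# CSV cell on '::', every dataframe cell holds one list of paths per timestep:
#
#   df.iloc[0, 0] == ['a.png', 'b.png', 'c.png']
#
# build_input(df, seq_norm=False) then stacks the decoded images into a float32
# array of shape (n_rows, n_columns, n_timesteps, height, width[, channels]).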


def create_model(_input_shape):
    print('Creating model...')
    model = Sequential()

    # earlier architecture attempts, kept for reference:
    # model.add(Conv3D(60, (1, 2, 2), input_shape=_input_shape))
    # model.add(Activation('relu'))
    # model.add(MaxPooling3D(pool_size=(1, 2, 2)))
    # model.add(Embedding(input_dim=1000, output_dim=50, input_length=input_length))
    # model.add(ConvLSTM2D(filters=40, kernel_size=(3, 3), input_shape=_input_shape, activation='sigmoid', recurrent_activation='hard_sigmoid'))
    # model.add(Dropout(0.4))
    # model.add(GRU(units=128, activation='sigmoid', recurrent_activation='hard_sigmoid'))
    # model.add(Dropout(0.4))
    # model.add(Dense(1, activation='sigmoid'))

    model.add(ConvLSTM2D(filters=100, kernel_size=(3, 3),
                         input_shape=_input_shape,
                         dropout=0.5,
                         # recurrent_dropout=0.5,
                         padding='same', return_sequences=True))
    model.add(BatchNormalization())

    model.add(ConvLSTM2D(filters=30, kernel_size=(3, 3),
                         dropout=0.5,
                         # recurrent_dropout=0.5,
                         padding='same', return_sequences=True))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Conv3D(filters=15, kernel_size=(3, 3, 3),
                     activation='sigmoid',
                     padding='same', data_format='channels_last'))
    model.add(Dropout(0.5))

    model.add(Flatten())
    model.add(Dense(512, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    print('Compiling...')
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model
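
# The resulting network: two stacked ConvLSTM2D blocks (return_sequences=True keeps
# the time dimension), a Conv3D layer over (time, height, width), then a flattened
# dense head that outputs a single sigmoid probability of the 'noisy' class (label 1).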


def main():

    parser = argparse.ArgumentParser(description="Read and compute training of LSTM model")

    parser.add_argument('--train', type=str, help='input train dataset', required=True)
    parser.add_argument('--test', type=str, help='input test dataset', required=True)
    parser.add_argument('--output', type=str, help='output model name', required=True)
    parser.add_argument('--epochs', type=int, help='number of expected epochs', default=30)
    parser.add_argument('--batch_size', type=int, help='expected batch size for training model', default=64)
    parser.add_argument('--seq_norm', type=int, help='normalize sequence data feature-wise', choices=[0, 1], default=0)

    args = parser.parse_args()

    p_train = args.train
    p_test = args.test
    p_output = args.output
    p_epochs = args.epochs
    p_batch_size = args.batch_size
    p_seq_norm = bool(args.seq_norm)

    dataset_train = pd.read_csv(p_train, header=None, sep=';')
    dataset_test = pd.read_csv(p_test, header=None, sep=';')

    # compute class weights over the whole dataset (column 0 is the label: 1 = noisy, 0 = not noisy)
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)
    nb_not_noisy_train = len(not_noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)
    nb_not_noisy_test = len(not_noisy_df_test.index)

    noisy_samples = nb_noisy_train + nb_noisy_test
    not_noisy_samples = nb_not_noisy_train + nb_not_noisy_test
    total_samples = noisy_samples + not_noisy_samples

    print('noisy', noisy_samples)
    print('not_noisy', not_noisy_samples)
    print('total', total_samples)

    class_weight = {
        0: noisy_samples / float(total_samples),
        1: not_noisy_samples / float(total_samples),
    }
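
    # Each class is weighted by the prevalence of the *other* class, so the
    # minority class contributes more to the loss. Worked example: with 300
    # noisy (label 1) and 700 not-noisy (label 0) samples in total,
    # class_weight == {0: 0.3, 1: 0.7}, so each noisy sample weighs 0.7
    # against 0.3 for each not-noisy one.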

    # shuffle data
    final_df_train = sklearn.utils.shuffle(dataset_train)
    final_df_test = sklearn.utils.shuffle(dataset_test)

    # split dataset into X and y; every data cell holds a '::'-joined list of image paths
    X_train_all = final_df_train.loc[:, 1:].apply(lambda x: x.astype(str).str.split('::'))
    X_train_all = build_input(X_train_all, p_seq_norm)
    y_train_all = final_df_train.loc[:, 0].astype('int')

    X_test = final_df_test.loc[:, 1:].apply(lambda x: x.astype(str).str.split('::'))
    X_test = build_input(X_test, p_seq_norm)
    y_test = final_df_test.loc[:, 0].astype('int')

    input_shape = (X_train_all.shape[1], X_train_all.shape[2], X_train_all.shape[3], X_train_all.shape[4])
    print('Training data input shape', input_shape)
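
    # input_shape is the per-sample shape (everything after the batch axis),
    # which the first ConvLSTM2D layer interprets as (timesteps, rows, cols, channels).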

    model = create_model(input_shape)
    model.summary()

    # prepare train and validation datasets (70/30; dataframes were already shuffled)
    X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.3, shuffle=False)

    print("Fitting model with custom class_weight", class_weight)
    history = model.fit(X_train, y_train, batch_size=p_batch_size, epochs=p_epochs,
                        validation_data=(X_val, y_val), verbose=1, shuffle=True,
                        class_weight=class_weight)
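
    # class_weight scales each sample's contribution to the binary cross-entropy
    # loss by the weight of its class; shuffle=True only reshuffles the training
    # batches each epoch, while the validation split stays fixed.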

    # hard class predictions (sigmoid output thresholded at 0.5)
    y_train_predict = model.predict_classes(X_train)
    y_val_predict = model.predict_classes(X_val)
    y_test_predict = model.predict_classes(X_test)

    print(y_train_predict)
    print(y_test_predict)

    auc_train = roc_auc_score(y_train, y_train_predict)
    auc_val = roc_auc_score(y_val, y_val_predict)
    auc_test = roc_auc_score(y_test, y_test_predict)

    acc_train = accuracy_score(y_train, y_train_predict)
    acc_val = accuracy_score(y_val, y_val_predict)
    acc_test = accuracy_score(y_test, y_test_predict)

    print('Train ACC:', acc_train)
    print('Train AUC:', auc_train)
    print('Val ACC:', acc_val)
    print('Val AUC:', auc_val)
    print('Test ACC:', acc_test)
    print('Test AUC:', auc_test)
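
    # Note: computing ROC AUC from hard 0/1 predictions is coarser than using
    # the sigmoid scores themselves; model.predict(X) would give the raw
    # probabilities if a finer-grained AUC is wanted.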

    # save accuracy curves
    if not os.path.exists(cfg.output_results_folder):
        os.makedirs(cfg.output_results_folder)

    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    model_history = os.path.join(cfg.output_results_folder, p_output + '.png')
    plt.savefig(model_history)

    # save model using joblib
    if not os.path.exists(cfg.output_models):
        os.makedirs(cfg.output_models)

    dump(model, os.path.join(cfg.output_models, p_output + '.joblib'))

    # append model results to the shared results file
    results_filename_path = os.path.join(cfg.output_results_folder, cfg.results_filename)

    if not os.path.exists(results_filename_path):
        with open(results_filename_path, 'w') as f:
            f.write('name;train_acc;val_acc;test_acc;train_auc;val_auc;test_auc\n')

    with open(results_filename_path, 'a') as f:
        f.write(p_output + ';' + str(acc_train) + ';' + str(acc_val) + ';' + str(acc_test) + ';'
                + str(auc_train) + ';' + str(auc_val) + ';' + str(auc_test) + '\n')


if __name__ == "__main__":
    main()
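
# Example invocation (hypothetical paths; custom_config must define
# output_models, output_results_folder and results_filename):
#
#   python train_lstm_weighted.py --train data/train.csv --test data/test.csv \
#       --output convlstm_v1 --epochs 30 --batch_size 64 --seq_norm 0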