# train_lstm_weighted.py

# main imports
import argparse
import os

import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt

from ipfml import utils

# dl imports
from keras.layers import Dense, Dropout, LSTM, Embedding, GRU, BatchNormalization, ConvLSTM2D, Conv3D, Flatten
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import backend as K
from sklearn.metrics import roc_auc_score, accuracy_score
import sklearn
import tensorflow as tf
from joblib import dump

import custom_config as cfg
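
# NOTE (editor assumption): this script targets standalone Keras 2.x;
# Sequential.predict_classes (used below) was deprecated and later removed
# from tf.keras, so newer environments would need model.predict instead.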

def build_input(df, seq_norm):
    """Convert dataframe rows into a float numpy array with timesteps

    Arguments:
        df: {pd.DataFrame} -- input dataframe (each cell is a list of image paths)
        seq_norm: {bool} -- whether to normalize sequence input data feature-wise

    Returns:
        {np.ndarray} -- LSTM input data as numpy array
    """
    arr = []

    # for each input line
    for row in df.iterrows():
        seq_arr = []

        # for each sequence data input
        for column in row[1]:
            seq_elems = []

            # for each element in sequence data
            for img_path in column:
                img = Image.open(img_path)
                # seq_elems.append(np.array(img).flatten())
                seq_elems.append(np.array(img))

            # seq_arr.append(np.array(seq_elems).flatten())
            seq_arr.append(np.array(seq_elems))

        arr.append(seq_arr)

    final_arr = np.array(arr, 'float32')
    print(final_arr.shape)

    # check if sequence normalization is used; this only applies to a 3D
    # (samples, timesteps, features) array, i.e. the flattened variants above
    # (the original `ndim > 2` test would crash on 5D image sequences)
    if seq_norm:
        if final_arr.ndim == 3:
            _, _, f = final_arr.shape
            # normalize each feature column over the timesteps of a sequence
            for index, seq in enumerate(final_arr):
                for i in range(f):
                    final_arr[index][:, i] = utils.normalize_arr_with_range(seq[:, i])

    return final_arr
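
# Expected dataset layout (inferred from the parsing in main(), not stated in
# the source): each CSV line is
#   label;img1.png::img2.png::img3.png;...
# so every dataframe cell holds a '::'-separated list of image paths, and
# build_input yields an array shaped roughly
#   (n_samples, n_columns, n_images_per_cell, height, width[, channels])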

def create_model(_input_shape):
    print('Creating model...')
    model = Sequential()

    # earlier experiments (kept for reference):
    # model.add(Conv3D(60, (1, 2, 2), input_shape=_input_shape))
    # model.add(Activation('relu'))
    # model.add(MaxPooling3D(pool_size=(1, 2, 2)))
    # model.add(Embedding(input_dim=1000, output_dim=50, input_length=input_length))
    # model.add(ConvLSTM2D(filters=40, kernel_size=(3, 3), input_shape=_input_shape, activation='sigmoid', recurrent_activation='hard_sigmoid'))
    # model.add(Dropout(0.4))
    # model.add(GRU(units=128, activation='sigmoid', recurrent_activation='hard_sigmoid'))
    # model.add(Dropout(0.4))
    # model.add(Dense(1, activation='sigmoid'))

    model.add(ConvLSTM2D(filters=100, kernel_size=(3, 3),
                         input_shape=_input_shape,
                         padding='same', return_sequences=True))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(ConvLSTM2D(filters=50, kernel_size=(3, 3),
                         padding='same', return_sequences=True))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Conv3D(filters=20, kernel_size=(3, 3, 3),
                     activation='sigmoid',
                     padding='same', data_format='channels_last'))
    model.add(Dropout(0.4))

    model.add(Flatten())
    model.add(Dense(512, activation='sigmoid'))
    model.add(Dropout(0.4))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dropout(0.4))
    model.add(Dense(1, activation='sigmoid'))

    print('Compiling...')
    model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])
    # model.compile(loss='binary_crossentropy',
    #               optimizer='rmsprop',
    #               metrics=['accuracy'])

    return model
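
# Shape flow through create_model (illustrative, assuming an input of
# (timesteps, H, W, C); with padding='same' the spatial dims are preserved):
#   ConvLSTM2D(100, return_sequences=True) -> (timesteps, H, W, 100)
#   ConvLSTM2D(50,  return_sequences=True) -> (timesteps, H, W, 50)
#   Conv3D(20, padding='same')             -> (timesteps, H, W, 20)
#   Flatten -> Dense(512) -> Dense(128) -> Dense(1, sigmoid)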

def main():

    parser = argparse.ArgumentParser(description="Read and compute training of LSTM model")

    parser.add_argument('--train', type=str, help='input train dataset')
    parser.add_argument('--test', type=str, help='input test dataset')
    parser.add_argument('--output', type=str, help='output model name')
    parser.add_argument('--seq_norm', type=int, help='whether to normalize sequence input data by features', choices=[0, 1])

    args = parser.parse_args()

    p_train = args.train
    p_test = args.test
    p_output = args.output
    p_seq_norm = bool(args.seq_norm)

    dataset_train = pd.read_csv(p_train, header=None, sep=';')
    dataset_test = pd.read_csv(p_test, header=None, sep=';')

    # compute class weights over the whole dataset (train + test)
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)
    nb_not_noisy_train = len(not_noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)
    nb_not_noisy_test = len(not_noisy_df_test.index)

    noisy_samples = nb_noisy_test + nb_noisy_train
    not_noisy_samples = nb_not_noisy_test + nb_not_noisy_train
    total_samples = noisy_samples + not_noisy_samples

    print('noisy', noisy_samples)
    print('not_noisy', not_noisy_samples)
    print('total', total_samples)

    # each class is weighted by the frequency of the *other* class, so the
    # minority class gets the larger weight in the loss
    class_weight = {
        0: noisy_samples / float(total_samples),
        1: not_noisy_samples / float(total_samples),
    }
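
    # Illustrative example (hypothetical counts): with 300 noisy and 700
    # not-noisy samples, class_weight = {0: 0.3, 1: 0.7}, so errors on the
    # rarer noisy class (label 1) cost more.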

    # shuffle data
    final_df_train = sklearn.utils.shuffle(dataset_train)
    final_df_test = sklearn.utils.shuffle(dataset_test)

    # split dataset into X_train, y_train, X_test, y_test
    X_train = final_df_train.loc[:, 1:].apply(lambda x: x.astype(str).str.split('::'))
    X_train = build_input(X_train, p_seq_norm)
    y_train = final_df_train.loc[:, 0].astype('int')

    X_test = final_df_test.loc[:, 1:].apply(lambda x: x.astype(str).str.split('::'))
    X_test = build_input(X_test, p_seq_norm)
    y_test = final_df_test.loc[:, 0].astype('int')

    X_all = np.concatenate([X_train, X_test])
    y_all = np.concatenate([y_train, y_test])

    input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3], X_train.shape[4])
    print('Training data input shape', input_shape)

    model = create_model(input_shape)
    model.summary()

    print("Fitting model with custom class_weight", class_weight)
    history = model.fit(X_train, y_train, batch_size=16, epochs=3, validation_split=0.30, verbose=1, shuffle=True, class_weight=class_weight)
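
    # Note: Keras takes the validation split from the *last* samples of
    # X_train before its own shuffling; that is safe here only because the
    # dataframes were already shuffled above.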

    # list all data in history
    # print(history.history.keys())

    # summarize history for loss (kept for reference; the accuracy plot is
    # produced below when results are saved)
    # plt.plot(history.history['loss'])
    # plt.plot(history.history['val_loss'])
    # plt.title('model loss')
    # plt.ylabel('loss')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.show()

    # train_score, train_acc = model.evaluate(X_train, y_train, batch_size=1)
    # print(train_acc)

    y_train_predict = model.predict_classes(X_train)
    y_test_predict = model.predict_classes(X_test)
    y_all_predict = model.predict_classes(X_all)

    print(y_train_predict)
    print(y_test_predict)

    auc_train = roc_auc_score(y_train, y_train_predict)
    auc_test = roc_auc_score(y_test, y_test_predict)
    auc_all = roc_auc_score(y_all, y_all_predict)

    acc_train = accuracy_score(y_train, y_train_predict)
    acc_test = accuracy_score(y_test, y_test_predict)
    acc_all = accuracy_score(y_all, y_all_predict)

    print('Train ACC:', acc_train)
    print('Train AUC:', auc_train)
    print('Test ACC:', acc_test)
    print('Test AUC:', auc_test)
    print('All ACC:', acc_all)
    print('All AUC:', auc_all)
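
    # Note: predict_classes returns hard 0/1 labels, so the AUC values above
    # are computed from thresholded predictions. A threshold-free AUC could
    # instead use the continuous sigmoid scores, e.g.:
    #   auc_test = roc_auc_score(y_test, model.predict(X_test))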

    # save model results
    if not os.path.exists(cfg.output_results_folder):
        os.makedirs(cfg.output_results_folder)

    results_filename = os.path.join(cfg.output_results_folder, cfg.results_filename)

    with open(results_filename, 'a') as f:
        f.write(p_output + ';' + str(acc_train) + ';' + str(auc_train) + ';' + str(acc_test) + ';' + str(auc_test) + '\n')

    # save accuracy curves (the 'accuracy' keys require Keras >= 2.3; older
    # versions name them 'acc'/'val_acc')
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')

    model_history = os.path.join(cfg.output_results_folder, p_output + '.png')
    plt.savefig(model_history)

    # save model using joblib
    if not os.path.exists(cfg.output_models):
        os.makedirs(cfg.output_models)

    dump(model, os.path.join(cfg.output_models, p_output + '.joblib'))


if __name__ == "__main__":
    main()
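
# Example invocation (paths and output name are illustrative, not from the
# source):
#   python train_lstm_weighted.py --train data/train_dataset.csv \
#       --test data/test_dataset.csv --output convlstm_weighted --seq_norm 1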