# train_lstm_weighted.py

# main imports
import argparse, sys
import numpy as np
import pandas as pd
import os

from PIL import Image

import matplotlib.pyplot as plt
from ipfml import utils

# dl imports
from keras.layers import Dense, Dropout, LSTM, Embedding, GRU, BatchNormalization, ConvLSTM2D, Conv3D, Flatten
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.models import load_model
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from sklearn.metrics import roc_auc_score, accuracy_score
import tensorflow as tf
import sklearn
from sklearn.model_selection import train_test_split
from joblib import dump

import custom_config as cfg

# global variables
n_counter = 0
total_samples = 0


def write_progress(progress):
    '''
    Display progress information as a progress bar
    '''
    barWidth = 180

    output_str = "["
    pos = int(barWidth * progress)
    for i in range(barWidth):
        if i < pos:
            output_str = output_str + "="
        elif i == pos:
            output_str = output_str + ">"
        else:
            output_str = output_str + " "

    output_str = output_str + "] " + str(int(progress * 100.0)) + " %\r"
    print(output_str)
    sys.stdout.write("\033[F")


def build_input(df, seq_norm):
    """Convert dataframe to numpy array input with timesteps as float array

    Arguments:
        df: {pd.Dataframe} -- Dataframe input
        seq_norm: {bool} -- normalize or not seq input data by features

    Returns:
        {np.ndarray} -- input LSTM data as numpy array
    """

    global n_counter
    global total_samples

    arr = []

    # for each input line
    for row in df.iterrows():

        seq_arr = []

        # for each sequence data input
        for column in row[1]:

            seq_elems = []

            # for each element in sequence data
            for img_path in column:
                img = Image.open(img_path)
                # seq_elems.append(np.array(img).flatten())
                seq_elems.append(np.array(img) / 255.)

            # seq_arr.append(np.array(seq_elems).flatten())
            seq_arr.append(np.array(seq_elems))

        arr.append(seq_arr)

        # update progress
        n_counter += 1
        write_progress(n_counter / float(total_samples))

    arr = np.array(arr)
    print(arr.shape)

    # final_arr = []
    # for v in arr:
    #     v_data = []
    #     for vv in v:
    #         # scaled_vv = np.array(vv, 'float') - np.mean(np.array(vv, 'float'))
    #         # v_data.append(scaled_vv)
    #         v_data.append(vv)
    #     final_arr.append(v_data)

    final_arr = np.array(arr, 'float32')

    # check if sequence normalization is used
    if seq_norm:

        if final_arr.ndim > 2:
            n, s, f = final_arr.shape
            for index, seq in enumerate(final_arr):
                for i in range(f):
                    final_arr[index][:, i] = utils.normalize_arr_with_range(seq[:, i])

    return final_arr
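

# Example of the data layout build_input expects (a sketch; the file names
# below are hypothetical): after the split on '::' done in main(), every
# dataframe cell holds a list of image paths forming one sequence, e.g.
#   df.iloc[0, 0] == ['zone0/img_00020.png', 'zone0/img_00040.png']
# Each image is loaded, scaled to [0, 1] and stacked, giving one float32
# array per sample (one axis per dataframe column, one per image in the
# cell, plus the image dimensions).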


def create_model(_input_shape):
    print('Creating model...')
    model = Sequential()

    # model.add(Conv3D(60, (1, 2, 2), input_shape=input_shape))
    # model.add(Activation('relu'))
    # model.add(MaxPooling3D(pool_size=(1, 2, 2)))

    # model.add(Embedding(input_dim=1000, output_dim=50, input_length=input_length))

    # model.add(ConvLSTM2D(filters=40, kernel_size=(3, 3), input_shape=input_shape, units=256, activation='sigmoid', recurrent_activation='hard_sigmoid'))
    # model.add(Dropout(0.4))
    # model.add(GRU(units=128, activation='sigmoid', recurrent_activation='hard_sigmoid'))
    # model.add(Dropout(0.4))
    # model.add(Dense(1, activation='sigmoid'))

    model.add(ConvLSTM2D(filters=100, kernel_size=(3, 3),
                         input_shape=_input_shape,
                         dropout=0.5,
                         # recurrent_dropout=0.5,
                         padding='same', return_sequences=True))
    model.add(BatchNormalization())

    model.add(ConvLSTM2D(filters=30, kernel_size=(3, 3),
                         dropout=0.5,
                         # recurrent_dropout=0.5,
                         padding='same', return_sequences=True))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Conv3D(filters=15, kernel_size=(3, 3, 3),
                     activation='sigmoid',
                     padding='same', data_format='channels_last'))
    model.add(Dropout(0.5))

    model.add(Flatten())

    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print('-- Compiling...')

    return model
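

# Example call (hypothetical dimensions): create_model((5, 200, 200, 3))
# would build the network for sequences of five 200x200 RGB frames.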


def main():

    # get this variable as global
    global total_samples

    parser = argparse.ArgumentParser(description="Read data and train the LSTM model")

    parser.add_argument('--train', type=str, help='input train dataset', required=True)
    parser.add_argument('--test', type=str, help='input test dataset', required=True)
    parser.add_argument('--output', type=str, help='output model name', required=True)
    parser.add_argument('--epochs', type=int, help='number of expected epochs', default=30)
    parser.add_argument('--batch_size', type=int, help='expected batch size for training model', default=64)
    parser.add_argument('--seq_norm', type=int, help='normalize or not sequence data by features', choices=[0, 1], default=0)

    args = parser.parse_args()
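
    # Example invocation (a sketch; the dataset paths and model name are
    # placeholders, not files shipped with this script):
    #   python train_lstm_weighted.py --train data/train_dataset.csv \
    #       --test data/test_dataset.csv --output convlstm_weighted \
    #       --epochs 30 --batch_size 64 --seq_norm 1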

    p_train = args.train
    p_test = args.test
    p_output = args.output
    p_epochs = args.epochs
    p_batch_size = args.batch_size
    p_seq_norm = bool(args.seq_norm)

    print('-----------------------------')
    print("----- Preparing data... -----")
    dataset_train = pd.read_csv(p_train, header=None, sep=';')
    dataset_test = pd.read_csv(p_test, header=None, sep=';')

    print("-- Train set size : ", len(dataset_train))
    print("-- Test set size : ", len(dataset_test))

    # compute class weights over the whole dataset
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)
    nb_not_noisy_train = len(not_noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)
    nb_not_noisy_test = len(not_noisy_df_test.index)

    noisy_samples = nb_noisy_test + nb_noisy_train
    not_noisy_samples = nb_not_noisy_test + nb_not_noisy_train
    total_samples = noisy_samples + not_noisy_samples

    print('-----------------------------')
    print('---- Dataset information ----')
    print('-- noisy:', noisy_samples)
    print('-- not_noisy:', not_noisy_samples)
    print('-- total:', total_samples)
    print('-----------------------------')

    class_weight = {
        0: noisy_samples / float(total_samples),
        1: not_noisy_samples / float(total_samples),
    }
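
    # Each class is weighted by the frequency of the other class, so the
    # minority class costs more to misclassify. For instance (illustrative
    # numbers), with 300 noisy and 700 not-noisy samples out of 1000,
    # class 0 (not noisy) gets weight 0.3 and class 1 (noisy) gets weight 0.7.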

    # shuffle data
    final_df_train = sklearn.utils.shuffle(dataset_train)
    final_df_test = sklearn.utils.shuffle(dataset_test)

    print('---- Loading dataset.... ----')
    print('-----------------------------\n')

    # split dataset into X_train, y_train, X_test, y_test
    X_train_all = final_df_train.loc[:, 1:].apply(lambda x: x.astype(str).str.split('::'))
    X_train_all = build_input(X_train_all, p_seq_norm)
    y_train_all = final_df_train.loc[:, 0].astype('int')

    X_test = final_df_test.loc[:, 1:].apply(lambda x: x.astype(str).str.split('::'))
    X_test = build_input(X_test, p_seq_norm)
    y_test = final_df_test.loc[:, 0].astype('int')

    input_shape = (X_train_all.shape[1], X_train_all.shape[2], X_train_all.shape[3], X_train_all.shape[4])

    print('\n-----------------------------')
    print('-- Training data input shape', input_shape)
    print('-----------------------------')
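
    # The first ConvLSTM2D layer (channels_last) consumes per-sample input of
    # shape (timesteps, rows, cols, channels), which is why input_shape keeps
    # the four dimensions after the sample axis of the array built above.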

    # create backup folder for current model
    model_backup_folder = os.path.join(cfg.backup_model_folder, p_output)
    if not os.path.exists(model_backup_folder):
        os.makedirs(model_backup_folder)

    # add checkpoint callback to back up the model at each epoch
    filepath = os.path.join(cfg.backup_model_folder, p_output, p_output + "-_{epoch:03d}.h5")
    checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=0, mode='max')
    callbacks_list = [checkpoint]

    # check if a backup already exists and restore it if so
    initial_epoch = 0
    backups = sorted(os.listdir(model_backup_folder))

    if len(backups) > 0:
        last_backup_file = backups[-1]
        # load the backup using its full path, not only its file name
        model = load_model(os.path.join(model_backup_folder, last_backup_file))

        # get initial epoch from the backup file name
        initial_epoch = int(last_backup_file.split('_')[-1].replace('.h5', ''))
        print('-----------------------------')
        print('-- Restore model from backup...')
        print('-- Restart training @epoch:', initial_epoch)
        print('-----------------------------')
    else:
        model = create_model(input_shape)

    model.summary()

    # prepare train and validation dataset
    X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.3, shuffle=False)

    print("-- Fitting model with custom class_weight", class_weight)
    print('-----------------------------')

    # pass the checkpoint callback and the restored epoch so training can resume
    history = model.fit(X_train, y_train,
                        batch_size=p_batch_size,
                        epochs=p_epochs,
                        initial_epoch=initial_epoch,
                        validation_data=(X_val, y_val),
                        verbose=1,
                        shuffle=True,
                        class_weight=class_weight,
                        callbacks=callbacks_list)

    # list all data in history
    # print(history.history.keys())

    # # summarize history for accuracy
    # plt.plot(history.history['accuracy'])
    # plt.plot(history.history['val_accuracy'])
    # plt.title('model accuracy')
    # plt.ylabel('accuracy')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.show()

    # # summarize history for loss
    # plt.plot(history.history['loss'])
    # plt.plot(history.history['val_loss'])
    # plt.title('model loss')
    # plt.ylabel('loss')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.show()

    # train_score, train_acc = model.evaluate(X_train, y_train, batch_size=1)
    # print(train_acc)

    y_train_predict = model.predict_classes(X_train)
    y_val_predict = model.predict_classes(X_val)
    y_test_predict = model.predict_classes(X_test)

    print(y_train_predict)
    print(y_test_predict)

    auc_train = roc_auc_score(y_train, y_train_predict)
    auc_val = roc_auc_score(y_val, y_val_predict)
    auc_test = roc_auc_score(y_test, y_test_predict)

    acc_train = accuracy_score(y_train, y_train_predict)
    acc_val = accuracy_score(y_val, y_val_predict)
    acc_test = accuracy_score(y_test, y_test_predict)

    print('Train ACC:', acc_train)
    print('Train AUC:', auc_train)
    print('Val ACC:', acc_val)
    print('Val AUC:', auc_val)
    print('Test ACC:', acc_test)
    print('Test AUC:', auc_test)

    # save accuracy plot of the training history
    if not os.path.exists(cfg.output_results_folder):
        os.makedirs(cfg.output_results_folder)

    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')

    model_history = os.path.join(cfg.output_results_folder, p_output + '.png')
    plt.savefig(model_history)

    # save model using joblib
    if not os.path.exists(cfg.output_models):
        os.makedirs(cfg.output_models)

    dump(model, os.path.join(cfg.output_models, p_output + '.joblib'))

    # save model results
    results_filename_path = os.path.join(cfg.output_results_folder, cfg.results_filename)

    if not os.path.exists(results_filename_path):
        with open(results_filename_path, 'w') as f:
            f.write('name;train_acc;val_acc;test_acc;train_auc;val_auc;test_auc\n')

    with open(results_filename_path, 'a') as f:
        f.write(p_output + ';' + str(acc_train) + ';' + str(acc_val) + ';' + str(acc_test) + ';'
                + str(auc_train) + ';' + str(auc_val) + ';' + str(auc_test) + '\n')


if __name__ == "__main__":
    main()