# train_lstm_weighted.py

# main imports
import argparse, sys
import numpy as np
import pandas as pd
import os
import ctypes
from PIL import Image
import cv2

from keras import backend as K
import matplotlib.pyplot as plt
from ipfml import utils

# dl imports
from keras.layers import Dense, Dropout, LSTM, Embedding, GRU, BatchNormalization, ConvLSTM2D, Conv3D, Flatten
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.models import load_model
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import roc_auc_score, accuracy_score
import tensorflow as tf
import sklearn
from sklearn.model_selection import train_test_split
from joblib import dump

import config as cfg

# global variables
n_counter = 0
total_samples = 0
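
# Example invocation (illustrative only; the dataset file names below are assumptions,
# the options themselves are the ones defined in main()):
#   python train_lstm_weighted.py --train data/train_dataset.csv --test data/test_dataset.csv \
#          --output convlstm_noise_model --chanels '1,3,3' --epochs 30 --batch_size 64 --seq_norm 1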

def write_progress(progress):
    '''
    Display progress information as progress bar
    '''
    barWidth = 180

    output_str = "["
    pos = barWidth * progress
    for i in range(barWidth):
        if i < pos:
            output_str = output_str + "="
        elif i == pos:
            output_str = output_str + ">"
        else:
            output_str = output_str + " "

    output_str = output_str + "] " + str(int(progress * 100.0)) + " %\r"
    print(output_str)
    sys.stdout.write("\033[F")

def build_input(df, seq_norm, p_chanels):
    """Convert dataframe to numpy array input with timesteps as float array

    Arguments:
        df: {pd.Dataframe} -- Dataframe input
        seq_norm: {bool} -- normalize or not seq input data by features
        p_chanels: {list} -- number of chanels for each element of the window

    Returns:
        {np.ndarray} -- input LSTM data as numpy array
    """

    global n_counter
    global total_samples

    arr = []

    # for each input line
    for row in df.iterrows():

        seq_arr = []

        # for each sequence data input
        for column in row[1]:

            seq_elems = []

            # for each element in sequence data
            for i, img_path in enumerate(column):

                # seq_elems.append(np.array(img).flatten())
                if p_chanels[i] > 1:
                    img = cv2.imread(img_path)
                else:
                    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

                img = cv2.resize(img, (50, 50))

                # normalization of images
                seq_elems.append(np.array(img, 'float16') / 255.)

            # seq_arr.append(np.array(seq_elems).flatten())
            seq_arr.append(np.array(seq_elems))

        arr.append(seq_arr)

        # update progress
        n_counter += 1
        write_progress(n_counter / float(total_samples))

    arr = np.array(arr)
    print(arr.shape)

    # final_arr = []
    # for v in arr:
    #     v_data = []
    #     for vv in v:
    #         # scaled_vv = np.array(vv, 'float') - np.mean(np.array(vv, 'float'))
    #         # v_data.append(scaled_vv)
    #         v_data.append(vv)
    #     final_arr.append(v_data)
    final_arr = np.array(arr, 'float16')
    # check if sequence normalization is used
    if seq_norm:

        print('Starting data normalization\n')

        if final_arr.ndim > 2:

            n_counter = 0

            n, s, f, h, w = final_arr.shape

            for index, seq in enumerate(final_arr):

                # f is the number of chanels
                for i in range(f):

                    # need to normalize pixel per pixel
                    for x in range(h):
                        for y in range(w):
                            final_arr[index][:, i, x, y] = utils.normalize_arr_with_range(seq[:, i, x, y])

                # update progress
                n_counter += 1
                write_progress(n_counter / float(total_samples))

    return final_arr
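
# Descriptive note (based on the loops above, assuming grayscale window elements,
# i.e. '--chanels 1,...'): build_input returns an array of shape
# (samples, timesteps, window elements, 50, 50); the per-pixel normalization block
# treats the window-element axis as the channel axis (the 'f' loop above).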

def create_model(_input_shape):
    print('Creating model...')

    model = Sequential()

    # model.add(Conv3D(60, (1, 2, 2), input_shape=input_shape))
    # model.add(Activation('relu'))
    # model.add(MaxPooling3D(pool_size=(1, 2, 2)))

    # model.add(Embedding(input_dim=1000, output_dim=50, input_length=input_length))

    # model.add(ConvLSTM2D(filters=40, kernel_size=(3, 3), input_shape=input_shape, units=256, activation='sigmoid', recurrent_activation='hard_sigmoid'))
    # model.add(Dropout(0.4))
    # model.add(GRU(units=128, activation='sigmoid', recurrent_activation='hard_sigmoid'))
    # model.add(Dropout(0.4))
    # model.add(Dense(1, activation='sigmoid'))

    model.add(ConvLSTM2D(filters=100, kernel_size=(3, 3),
                         input_shape=_input_shape,
                         dropout=0.5,
                         # recurrent_dropout=0.5,
                         padding='same', return_sequences=True))
    model.add(BatchNormalization())

    model.add(ConvLSTM2D(filters=30, kernel_size=(3, 3),
                         dropout=0.5,
                         # recurrent_dropout=0.5,
                         padding='same', return_sequences=True))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Conv3D(filters=15, kernel_size=(3, 3, 3),
                     activation='sigmoid',
                     padding='same', data_format='channels_last'))
    model.add(Dropout(0.5))

    model.add(Flatten())

    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(32, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print('-- Compiling...')

    return model
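
# Minimal usage sketch (the shape below is a placeholder, not a value taken from a
# real dataset; main() derives the actual input shape from the loaded training data):
#   model = create_model((24, 3, 50, 50))   # (timesteps, window elements, height, width)
#   model.summary()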

def main():

    # get these variables as global
    global n_counter
    global total_samples

    parser = argparse.ArgumentParser(description="Read and compute training of LSTM model")

    parser.add_argument('--train', type=str, help='input train dataset', required=True)
    parser.add_argument('--test', type=str, help='input test dataset', required=True)
    parser.add_argument('--output', type=str, help='output model name', required=True)
    parser.add_argument('--chanels', type=str, help="given number of ordered chanels (example: '1,3,3') for each element of window", required=True)
    parser.add_argument('--epochs', type=int, help='number of expected epochs', default=30)
    parser.add_argument('--batch_size', type=int, help='expected batch size for training model', default=64)
    parser.add_argument('--seq_norm', type=int, help='normalization sequence by features', choices=[0, 1], default=0)

    args = parser.parse_args()

    p_train = args.train
    p_test = args.test
    p_output = args.output
    p_chanels = list(map(int, args.chanels.split(',')))
    p_epochs = args.epochs
    p_batch_size = args.batch_size
    p_seq_norm = bool(args.seq_norm)

    print('-----------------------------')
    print("----- Preparing data... -----")
    dataset_train = pd.read_csv(p_train, header=None, sep=';')
    dataset_test = pd.read_csv(p_test, header=None, sep=';')

    print("-- Train set size : ", len(dataset_train))
    print("-- Test set size : ", len(dataset_test))

    # getting weighted class over the whole dataset
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)
    nb_not_noisy_train = len(not_noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)
    nb_not_noisy_test = len(not_noisy_df_test.index)

    noisy_samples = nb_noisy_test + nb_noisy_train
    not_noisy_samples = nb_not_noisy_test + nb_not_noisy_train
    total_samples = noisy_samples + not_noisy_samples

    print('-----------------------------')
    print('---- Dataset information ----')
    print('-- noisy:', noisy_samples)
    print('-- not_noisy:', not_noisy_samples)
    print('-- total:', total_samples)
    print('-----------------------------')

    class_weight = {
        0: noisy_samples / float(total_samples),
        1: (not_noisy_samples / float(total_samples)),
    }
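
    # Descriptive note: each class is weighted by the proportion of the opposite class
    # (label 1 = noisy, label 0 = not noisy, as in the splits above), so the minority
    # class receives the larger weight when class_weight is applied by fit() below.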

    # shuffle data
    final_df_train = sklearn.utils.shuffle(dataset_train)
    final_df_test = sklearn.utils.shuffle(dataset_test)

    print('---- Loading dataset.... ----')
    print('-----------------------------\n')

    n_train_samples = len(final_df_train.index)
    n_test_samples = len(final_df_test.index)

    total_samples = n_train_samples
    n_counter = 0

    print('Loading train dataset\n')

    # split dataset into X_train, y_train, X_test, y_test
    X_train_all = final_df_train.loc[:, 1:].apply(lambda x: x.astype(str).str.split('::'))
    X_train_all = build_input(X_train_all, p_seq_norm, p_chanels)
    y_train_all = final_df_train.loc[:, 0].astype('int')

    total_samples = n_test_samples
    n_counter = 0

    X_test = final_df_test.loc[:, 1:].apply(lambda x: x.astype(str).str.split('::'))
    X_test = build_input(X_test, p_seq_norm, p_chanels)
    y_test = final_df_test.loc[:, 0].astype('int')

    input_shape = (X_train_all.shape[1], X_train_all.shape[2], X_train_all.shape[3], X_train_all.shape[4])

    print('\n-----------------------------')
    print('-- Training data input shape', input_shape)
    print('-----------------------------')

    # create backup folder for current model
    model_backup_folder = os.path.join(cfg.backup_model_folder, p_output)
    if not os.path.exists(model_backup_folder):
        os.makedirs(model_backup_folder)

    # add model checkpoint callback
    filepath = os.path.join(cfg.backup_model_folder, p_output, p_output + "-_{epoch:03d}.h5")
    checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=0, mode='max')
    callbacks_list = [checkpoint]
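
    # Descriptive note: checkpoints are written as '<output>-_<epoch>.h5' inside the
    # backup folder; the resume logic below parses that epoch number back out of the
    # latest file name.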

    # check if backup already exists
    backups = sorted(os.listdir(model_backup_folder))

    if len(backups) > 0:

        last_backup_file = backups[-1]
        model = load_model(os.path.join(model_backup_folder, last_backup_file))

        # get initial epoch
        initial_epoch = int(last_backup_file.split('_')[-1].replace('.h5', ''))

        print('-----------------------------')
        print('-- Restore model from backup...')
        print('-- Restart training @epoch:', initial_epoch)
        print('-----------------------------')
    else:
        model = create_model(input_shape)
        model.summary()
        initial_epoch = 0

    # prepare train and validation dataset
    X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.3, shuffle=False)

    print("-- Fitting model with custom class_weight", class_weight)
    print('-----------------------------')

    history = model.fit(X_train, y_train,
                        batch_size=p_batch_size,
                        epochs=p_epochs,
                        initial_epoch=initial_epoch,
                        validation_data=(X_val, y_val),
                        verbose=1,
                        shuffle=True,
                        class_weight=class_weight,
                        callbacks=callbacks_list)

    # list all data in history
    # print(history.history.keys())

    # # summarize history for accuracy
    # plt.plot(history.history['accuracy'])
    # plt.plot(history.history['val_accuracy'])
    # plt.title('model accuracy')
    # plt.ylabel('accuracy')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.show()

    # # summarize history for loss
    # plt.plot(history.history['loss'])
    # plt.plot(history.history['val_loss'])
    # plt.title('model loss')
    # plt.ylabel('loss')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.show()

    # train_score, train_acc = model.evaluate(X_train, y_train, batch_size=1)
    # print(train_acc)

    y_train_predict = model.predict(X_train, batch_size=1, verbose=1)
    y_val_predict = model.predict(X_val, batch_size=1, verbose=1)

    y_train_predict = [1 if l > 0.5 else 0 for l in y_train_predict]
    y_val_predict = [1 if l > 0.5 else 0 for l in y_val_predict]
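
    # Descriptive note: predictions are binarized at a fixed 0.5 threshold before the
    # metrics below, so the reported AUC values are computed on hard labels rather
    # than on the raw sigmoid scores.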

    auc_train = roc_auc_score(y_train, y_train_predict)
    auc_val = roc_auc_score(y_val, y_val_predict)

    acc_train = accuracy_score(y_train, y_train_predict)
    acc_val = accuracy_score(y_val, y_val_predict)

    y_test_predict = model.predict(X_test, batch_size=1, verbose=1)
    y_test_predict = [1 if l > 0.5 else 0 for l in y_test_predict]

    acc_test = accuracy_score(y_test, y_test_predict)
    auc_test = roc_auc_score(y_test, y_test_predict)

    print('Train ACC:', acc_train)
    print('Train AUC:', auc_train)
    print('Val ACC:', acc_val)
    print('Val AUC:', auc_val)
    print('Test ACC:', acc_test)
    print('Test AUC:', auc_test)

    # save acc metric information
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')

    model_history = os.path.join(cfg.output_results_folder, p_output + '.png')

    if not os.path.exists(cfg.output_results_folder):
        os.makedirs(cfg.output_results_folder)

    plt.savefig(model_history)

    # save model using keras API
    if not os.path.exists(cfg.output_models):
        os.makedirs(cfg.output_models)

    model.save(os.path.join(cfg.output_models, p_output + '.h5'))

    # save model results
    if not os.path.exists(cfg.output_results_folder):
        os.makedirs(cfg.output_results_folder)

    results_filename_path = os.path.join(cfg.output_results_folder, cfg.results_filename)

    if not os.path.exists(results_filename_path):
        with open(results_filename_path, 'w') as f:
            f.write('name;train_acc;val_acc;test_acc;train_auc;val_auc;test_auc\n')

    with open(results_filename_path, 'a') as f:
        f.write(p_output + ';' + str(acc_train) + ';' + str(acc_val) + ';' + str(acc_test) + ';'
                + str(auc_train) + ';' + str(auc_val) + ';' + str(auc_test) + '\n')

if __name__ == "__main__":
    main()