save_model_result_in_md_maxwell.py

from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization
from keras.wrappers.scikit_learn import KerasClassifier
from keras import backend as K
from keras.models import model_from_json

import numpy as np
import pandas as pd

from ipfml import processing
from PIL import Image

import sys, os, getopt
import subprocess
import time
import json

from modules.utils import config as cfg

threshold_map_folder = cfg.threshold_map_folder
threshold_map_file_prefix = cfg.threshold_map_folder + "_"

markdowns_folder = cfg.models_information_folder
final_csv_model_comparisons = cfg.csv_model_comparisons_filename
models_name = cfg.models_names_list

zones = cfg.zones_indices

current_dirpath = os.getcwd()
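
# Example invocation (argument values are illustrative, taken from the usage
# strings below):
#   python save_model_result_in_md_maxwell.py --interval "0,20" \
#       --model path/to/xxxx.joblib --mode svdne --metric lab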


def main():

    kind_model = 'keras'
    model_ext = ''

    if len(sys.argv) <= 1:
        print('Run with default parameters...')
        print('python save_model_result_in_md_maxwell.py --interval "0,20" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
        sys.exit(2)
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ht:m:o:l:", ["help", "interval=", "model=", "mode=", "metric="])
    except getopt.GetoptError:
        # print help information and exit:
        print('python save_model_result_in_md_maxwell.py --interval "xx,xx" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
        sys.exit(2)
    for o, a in opts:
        if o in ("-h", "--help"):
            print('python save_model_result_in_md_maxwell.py --interval "xx,xx" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
            sys.exit()
        elif o in ("-t", "--interval"):
            p_interval = list(map(int, a.split(',')))
        elif o in ("-m", "--model"):
            p_model_file = a
        elif o in ("-o", "--mode"):
            p_mode = a

            if p_mode not in ('svd', 'svdn', 'svdne'):
                assert False, "Mode not recognized"
        elif o in ("-l", "--metric"):
            # `-l` maps to --metric since `-m` is already used for --model
            p_metric = a
        else:
            assert False, "unhandled option"

    # call model and get global result in scenes
    begin, end = p_interval

    bash_cmd = "bash testModelByScene_maxwell.sh '" + str(begin) + "' '" + str(end) + "' '" + p_model_file + "' '" + p_mode + "' '" + p_metric + "'"
    print(bash_cmd)

    ## call command and wait for its result ##
    p = subprocess.Popen(bash_cmd, stdout=subprocess.PIPE, shell=True)
    (output, err) = p.communicate()
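
    # The captured stdout of testModelByScene_maxwell.sh becomes the body of
    # the markdown report written below.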

    if not os.path.exists(markdowns_folder):
        os.makedirs(markdowns_folder)

    # get model kind and extension from the model filename
    if '.joblib' in p_model_file:
        kind_model = 'sklearn'
        model_ext = '.joblib'

    if '.json' in p_model_file:
        kind_model = 'keras'
        model_ext = '.json'

    md_model_path = os.path.join(markdowns_folder, p_model_file.split('/')[-1].replace(model_ext, '.md'))

    with open(md_model_path, 'w') as f:
        f.write(output.decode("utf-8"))

        # read each threshold map information if it exists
        model_map_info_path = os.path.join(threshold_map_folder, p_model_file.replace('saved_models/', ''))

        if not os.path.exists(model_map_info_path):
            f.write('\n\n No threshold map information')
        else:
            maps_files = os.listdir(model_map_info_path)

            # get all map information
            for t_map_file in maps_files:

                file_path = os.path.join(model_map_info_path, t_map_file)

                with open(file_path, 'r') as map_file:
                    title_scene = t_map_file.replace(threshold_map_file_prefix, '')
                    f.write('\n\n## ' + title_scene + '\n')
                    content = map_file.readlines()

                    # write each map line into the report
                    for line in content:
                        f.write(line)

    # Keep model information to compare
    current_model_name = p_model_file.split('/')[-1].replace(model_ext, '')

    # Prepare writing in .csv file
    output_final_file_path = os.path.join(markdowns_folder, final_csv_model_comparisons)
    output_final_file = open(output_final_file_path, "a")

    print(current_model_name)

    # reconstruct data filename
    for name in models_name:
        if name in current_model_name:
            data_filename = current_model_name
            current_data_file_path = os.path.join('data', data_filename)

    print("Current data file ")
    print(current_data_file_path)

    model_scores = []
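
    # model_scores collects, in order: the three split sizes, their ratios of
    # the total, the four accuracies (train, val, test, all), then F1, recall
    # and ROC AUC for each of train, val, test and all.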

    ########################
    # 1. Get and prepare data
    ########################
    dataset_train = pd.read_csv(current_data_file_path + '.train', header=None, sep=";")
    dataset_test = pd.read_csv(current_data_file_path + '.test', header=None, sep=";")

    # default first shuffle of data
    dataset_train = shuffle(dataset_train)
    dataset_test = shuffle(dataset_test)

    # get dataset with an equal number of class occurrences
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)

    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
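
    # This balances both classes by undersampling the majority ("not noisy")
    # rows down to the number of noisy rows.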

    # shuffle data another time
    final_df_train = shuffle(final_df_train)
    final_df_test = shuffle(final_df_test)

    final_df_train_size = len(final_df_train.index)
    final_df_test_size = len(final_df_test.index)

    # use the whole dataset for training
    x_dataset_train = final_df_train.iloc[:, 1:]
    x_dataset_test = final_df_test.iloc[:, 1:]

    y_dataset_train = final_df_train.iloc[:, 0]
    y_dataset_test = final_df_test.iloc[:, 0]
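
    # Column 0 holds the class label (1 = noisy, 0 = not noisy); the remaining
    # columns hold the feature values.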

    #######################
    # 2. Getting model
    #######################
    if kind_model == 'keras':
        with open(p_model_file, 'r') as f:
            json_model = json.load(f)
            model = model_from_json(json_model)
            model.load_weights(p_model_file.replace('.json', '.h5'))

            model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])

        # reshape all input data
        x_dataset_train = np.array(x_dataset_train).reshape(len(x_dataset_train), end, 1)
        x_dataset_test = np.array(x_dataset_test).reshape(len(x_dataset_test), end, 1)
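
    # Keras Conv1D layers expect inputs shaped (samples, steps, channels).
    # Using `end` as the step count assumes the stored feature vectors have
    # exactly `end` components (i.e. the interval effectively starts at 0).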

    if kind_model == 'sklearn':
        model = joblib.load(p_model_file)

    #######################
    # 3. Fit model (cross validation gives the sklearn train accuracy)
    #######################
    if kind_model == 'keras':
        model.fit(x_dataset_train, y_dataset_train, validation_split=0.20, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch)

    if kind_model == 'sklearn':
        model.fit(x_dataset_train, y_dataset_train)

        # mean accuracy over the 5 folds (a scalar, like the keras branch)
        train_accuracy = cross_val_score(model, x_dataset_train, y_dataset_train, cv=5).mean()
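
    # cross_val_score fits a fresh clone of the estimator on each of the 5
    # folds, so the model fitted just above is left untouched.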

    ######################
    # 4. Test : validation and test datasets come from the .test dataset
    ######################

    # give each of the validation and test sets one third of the training set size
    val_set_size = int(final_df_train_size / 3)
    test_set_size = val_set_size

    total_validation_size = val_set_size + test_set_size

    if final_df_test_size > total_validation_size:
        x_dataset_test = x_dataset_test[0:total_validation_size]
        y_dataset_test = y_dataset_test[0:total_validation_size]

    X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)
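
    # train_test_split with test_size=0.5 halves the (possibly trimmed) test
    # data into equally sized validation and test sets; the fixed random_state
    # keeps the split reproducible.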

    if kind_model == 'keras':
        y_test_model = model.predict_classes(X_test)
        y_val_model = model.predict_classes(X_val)

        y_train_model = model.predict_classes(x_dataset_train)
        train_accuracy = accuracy_score(y_dataset_train, y_train_model)

    if kind_model == 'sklearn':
        y_test_model = model.predict(X_test)
        y_val_model = model.predict(X_val)

        y_train_model = model.predict(x_dataset_train)

    val_accuracy = accuracy_score(y_val, y_val_model)
    test_accuracy = accuracy_score(y_test, y_test_model)

    train_f1 = f1_score(y_dataset_train, y_train_model)
    train_recall = recall_score(y_dataset_train, y_train_model)
    train_roc_auc = roc_auc_score(y_dataset_train, y_train_model)

    val_f1 = f1_score(y_val, y_val_model)
    val_recall = recall_score(y_val, y_val_model)
    val_roc_auc = roc_auc_score(y_val, y_val_model)

    test_f1 = f1_score(y_test, y_test_model)
    test_recall = recall_score(y_test, y_test_model)
    test_roc_auc = roc_auc_score(y_test, y_test_model)
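
    # With binary labels, f1_score and recall_score report the positive class
    # (label 1, i.e. the noisy samples) by default.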

    if kind_model == 'keras':
        # stats over the whole dataset
        all_x_data = np.concatenate([x_dataset_train, X_test, X_val])
        all_y_data = np.concatenate([y_dataset_train, y_test, y_val])
        all_y_model = model.predict_classes(all_x_data)

    if kind_model == 'sklearn':
        # stats over the whole dataset
        all_x_data = pd.concat([x_dataset_train, X_test, X_val])
        all_y_data = pd.concat([y_dataset_train, y_test, y_val])
        all_y_model = model.predict(all_x_data)

    all_accuracy = accuracy_score(all_y_data, all_y_model)
    all_f1_score = f1_score(all_y_data, all_y_model)
    all_recall_score = recall_score(all_y_data, all_y_model)
    all_roc_auc_score = roc_auc_score(all_y_data, all_y_model)
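
    # These "all" statistics mix training rows with the held-out rows, so they
    # are more optimistic than the validation and test scores alone.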

    # stats of dataset sizes
    total_samples = final_df_train_size + val_set_size + test_set_size

    model_scores.append(final_df_train_size)
    model_scores.append(val_set_size)
    model_scores.append(test_set_size)

    model_scores.append(final_df_train_size / total_samples)
    model_scores.append(val_set_size / total_samples)
    model_scores.append(test_set_size / total_samples)

    # add the scores
    model_scores.append(train_accuracy)
    model_scores.append(val_accuracy)
    model_scores.append(test_accuracy)
    model_scores.append(all_accuracy)

    model_scores.append(train_f1)
    model_scores.append(train_recall)
    model_scores.append(train_roc_auc)

    model_scores.append(val_f1)
    model_scores.append(val_recall)
    model_scores.append(val_roc_auc)

    model_scores.append(test_f1)
    model_scores.append(test_recall)
    model_scores.append(test_roc_auc)

    model_scores.append(all_f1_score)
    model_scores.append(all_recall_score)
    model_scores.append(all_roc_auc_score)
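
    # Each CSV line is: model name; interval size; begin; end; number of
    # zones; metric; mode; followed by every value collected in model_scores.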

    # TODO : improve...
    # check if the number of zones always sits at this position in the filename...
    nb_zones = current_data_file_path.split('_')[7]

    final_file_line = current_model_name + '; ' + str(end - begin) + '; ' + str(begin) + '; ' + str(end) + '; ' + str(nb_zones) + '; ' + p_metric + '; ' + p_mode

    for s in model_scores:
        final_file_line += '; ' + str(s)

    output_final_file.write(final_file_line + '\n')
    output_final_file.close()


if __name__ == "__main__":
    main()