# save_model_result_in_md_maxwell.py

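"""
Evaluate a trained noise-detection model (sklearn `.joblib` or keras
`.json` + `.h5`) on the Maxwell scenes: run `testModelByScene_maxwell.sh`,
save its output as a markdown report (together with any threshold map
information), then append the model's scores to the comparisons `.csv` file.
"""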
from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization
from keras.wrappers.scikit_learn import KerasClassifier
from keras import backend as K
from keras.models import model_from_json

import numpy as np
import pandas as pd

from ipfml import processing
from PIL import Image

import sys, os, getopt
import subprocess
import time
import json

from modules.utils import config as cfg

threshold_map_folder = cfg.threshold_map_folder
threshold_map_file_prefix = cfg.threshold_map_folder + "_"

markdowns_folder = cfg.models_information_folder
final_csv_model_comparisons = cfg.csv_model_comparisons_filename
models_name = cfg.models_names_list

zones = cfg.zones_indices

current_dirpath = os.getcwd()

def main():

    kind_model = 'keras'
    model_ext = ''

    if len(sys.argv) <= 1:
        print('Not enough arguments, expected usage:')
        print('python save_model_result_in_md_maxwell.py --interval "0,20" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
        sys.exit(2)

    try:
        opts, args = getopt.getopt(sys.argv[1:], "ht:m:o:l:", ["help", "interval=", "model=", "mode=", "metric="])
    except getopt.GetoptError:
        # print help information and exit
        print('python save_model_result_in_md_maxwell.py --interval "xx,xx" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
        sys.exit(2)

    for o, a in opts:
        if o in ("-h", "--help"):
            print('python save_model_result_in_md_maxwell.py --interval "xx,xx" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
            sys.exit()
        elif o in ("-t", "--interval"):
            p_interval = list(map(int, a.split(',')))
        elif o in ("-m", "--model"):
            p_model_file = a
        elif o in ("-o", "--mode"):
            p_mode = a

            if p_mode not in ('svd', 'svdn', 'svdne'):
                assert False, "Mode not recognized"
        elif o in ("-l", "--metric"):
            p_metric = a
        else:
            assert False, "unhandled option"
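
    # Example invocation (paths and values are illustrative, taken from the
    # usage strings above):
    #   python save_model_result_in_md_maxwell.py --interval "0,20" \
    #       --model saved_models/xxxx.joblib --mode svdne --metric lab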
    # call model and get global result on scenes
    begin, end = p_interval

    bash_cmd = "bash testModelByScene_maxwell.sh '" + str(begin) + "' '" + str(end) + "' '" + p_model_file + "' '" + p_mode + "' '" + p_metric + "'"
    print(bash_cmd)

    ## call command ##
    p = subprocess.Popen(bash_cmd, stdout=subprocess.PIPE, shell=True)

    (output, err) = p.communicate()

    ## wait for result ##
    p_status = p.wait()

    if not os.path.exists(markdowns_folder):
        os.makedirs(markdowns_folder)
    # infer the model kind from the file extension
    if '.joblib' in p_model_file:
        kind_model = 'sklearn'
        model_ext = '.joblib'

    if '.json' in p_model_file:
        kind_model = 'keras'
        model_ext = '.json'

    md_model_path = os.path.join(markdowns_folder, p_model_file.split('/')[-1].replace(model_ext, '.md'))

    with open(md_model_path, 'w') as f:
        f.write(output.decode("utf-8"))

        # read each threshold map information if it exists
        model_map_info_path = os.path.join(threshold_map_folder, p_model_file.replace('saved_models/', ''))

        if not os.path.exists(model_map_info_path):
            f.write('\n\n No threshold map information')
        else:
            maps_files = os.listdir(model_map_info_path)

            # get all map information
            for t_map_file in maps_files:

                file_path = os.path.join(model_map_info_path, t_map_file)
                with open(file_path, 'r') as map_file:

                    title_scene = t_map_file.replace(threshold_map_file_prefix, '')
                    f.write('\n\n## ' + title_scene + '\n')
                    content = map_file.readlines()

                    # write each map line information
                    for line in content:
                        f.write(line)
    # keep model information to compare
    current_model_name = p_model_file.split('/')[-1].replace(model_ext, '')

    # prepare writing into the .csv comparisons file
    output_final_file_path = os.path.join(markdowns_folder, final_csv_model_comparisons)
    output_final_file = open(output_final_file_path, "a")

    print(current_model_name)

    # reconstruct data filename from the model name
    for name in models_name:
        if name in current_model_name:
            current_data_file_path = os.path.join('data', current_model_name.replace(name, 'data_maxwell'))

    print("Current data file ")
    print(current_data_file_path)

    model_scores = []
    ########################
    # 1. Get and prepare data
    ########################
    dataset_train = pd.read_csv(current_data_file_path + '.train', header=None, sep=";")
    dataset_test = pd.read_csv(current_data_file_path + '.test', header=None, sep=";")

    # default first shuffle of data
    dataset_train = shuffle(dataset_train)
    dataset_test = shuffle(dataset_test)

    # get dataset with an equal number of occurrences per class
    # (column 0 is the label: 1 = noisy, 0 = not noisy)
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)

    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])

    # shuffle data another time
    final_df_train = shuffle(final_df_train)
    final_df_test = shuffle(final_df_test)

    final_df_train_size = len(final_df_train.index)
    final_df_test_size = len(final_df_test.index)

    # use of the whole data set for training
    x_dataset_train = final_df_train.iloc[:, 1:]
    x_dataset_test = final_df_test.iloc[:, 1:]

    y_dataset_train = final_df_train.iloc[:, 0]
    y_dataset_test = final_df_test.iloc[:, 0]
    #######################
    # 2. Getting model
    #######################
    if kind_model == 'keras':
        # load the architecture from the .json file and the weights
        # from the matching .h5 file
        with open(p_model_file, 'r') as f:
            json_model = f.read()

        model = model_from_json(json_model)
        model.load_weights(p_model_file.replace('.json', '.h5'))

        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        # reshape all input data for the Conv1D input layer
        x_dataset_train = np.array(x_dataset_train).reshape(len(x_dataset_train), end, 1)
        x_dataset_test = np.array(x_dataset_test).reshape(len(x_dataset_test), end, 1)

    if kind_model == 'sklearn':
        model = joblib.load(p_model_file)
    #######################
    # 3. Fit model : use of cross validation to fit model
    #######################
    if kind_model == 'keras':
        model.fit(x_dataset_train, y_dataset_train, validation_split=0.20, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch)

    if kind_model == 'sklearn':
        model.fit(x_dataset_train, y_dataset_train)
        # mean accuracy over a 5-fold cross validation
        train_accuracy = cross_val_score(model, x_dataset_train, y_dataset_train, cv=5).mean()
    ######################
    # 4. Test : Validation and test dataset from .test dataset
    ######################

    # validation and test sets are each sized at one third of the train set
    val_set_size = int(final_df_train_size / 3)
    test_set_size = val_set_size

    total_validation_size = val_set_size + test_set_size

    if final_df_test_size > total_validation_size:
        x_dataset_test = x_dataset_test[0:total_validation_size]
        y_dataset_test = y_dataset_test[0:total_validation_size]

    # split the remaining .test data 50/50 into a test set and a validation set
    X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)
    if kind_model == 'keras':
        y_test_model = model.predict_classes(X_test)
        y_val_model = model.predict_classes(X_val)

        y_train_model = model.predict_classes(x_dataset_train)

        train_accuracy = accuracy_score(y_dataset_train, y_train_model)

    if kind_model == 'sklearn':
        y_test_model = model.predict(X_test)
        y_val_model = model.predict(X_val)

        y_train_model = model.predict(x_dataset_train)

    val_accuracy = accuracy_score(y_val, y_val_model)
    test_accuracy = accuracy_score(y_test, y_test_model)

    train_f1 = f1_score(y_dataset_train, y_train_model)
    train_recall = recall_score(y_dataset_train, y_train_model)
    train_roc_auc = roc_auc_score(y_dataset_train, y_train_model)

    val_f1 = f1_score(y_val, y_val_model)
    val_recall = recall_score(y_val, y_val_model)
    val_roc_auc = roc_auc_score(y_val, y_val_model)

    test_f1 = f1_score(y_test, y_test_model)
    test_recall = recall_score(y_test, y_test_model)
    test_roc_auc = roc_auc_score(y_test, y_test_model)
    # stats over the whole dataset (train + val + test)
    if kind_model == 'keras':
        # inputs are 3D numpy arrays here, so concatenate with numpy
        all_x_data = np.concatenate([x_dataset_train, X_test, X_val])
        all_y_data = pd.concat([y_dataset_train, y_test, y_val])
        all_y_model = model.predict_classes(all_x_data)

    if kind_model == 'sklearn':
        all_x_data = pd.concat([x_dataset_train, X_test, X_val])
        all_y_data = pd.concat([y_dataset_train, y_test, y_val])
        all_y_model = model.predict(all_x_data)

    all_accuracy = accuracy_score(all_y_data, all_y_model)
    all_f1_score = f1_score(all_y_data, all_y_model)
    all_recall_score = recall_score(all_y_data, all_y_model)
    all_roc_auc_score = roc_auc_score(all_y_data, all_y_model)
    # stats of dataset sizes
    total_samples = final_df_train_size + val_set_size + test_set_size

    model_scores.append(final_df_train_size)
    model_scores.append(val_set_size)
    model_scores.append(test_set_size)

    model_scores.append(final_df_train_size / total_samples)
    model_scores.append(val_set_size / total_samples)
    model_scores.append(test_set_size / total_samples)

    # add of scores
    model_scores.append(train_accuracy)
    model_scores.append(val_accuracy)
    model_scores.append(test_accuracy)
    model_scores.append(all_accuracy)

    model_scores.append(train_f1)
    model_scores.append(train_recall)
    model_scores.append(train_roc_auc)

    model_scores.append(val_f1)
    model_scores.append(val_recall)
    model_scores.append(val_roc_auc)

    model_scores.append(test_f1)
    model_scores.append(test_recall)
    model_scores.append(test_roc_auc)

    model_scores.append(all_f1_score)
    model_scores.append(all_recall_score)
    model_scores.append(all_roc_auc_score)
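
    # field order of the .csv line built below:
    #   model name; interval size; begin; end; nb zones; metric; mode;
    #   train/val/test sizes and their ratios;
    #   train/val/test/all accuracy;
    #   f1, recall and roc_auc for train, val, test and all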
    # TODO : improve...
    # check if it's always the case...
    nb_zones = current_data_file_path.split('_')[7]

    final_file_line = current_model_name + '; ' + str(end - begin) + '; ' + str(begin) + '; ' + str(end) + '; ' + str(nb_zones) + '; ' + p_metric + '; ' + p_mode

    for s in model_scores:
        final_file_line += '; ' + str(s)

    output_final_file.write(final_file_line + '\n')
    output_final_file.close()

if __name__ == "__main__":
    main()