# save_model_result_in_md_maxwell.py
  1. from sklearn.utils import shuffle
  2. from sklearn.externals import joblib
  3. from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
  4. from sklearn.model_selection import cross_val_score
  5. from sklearn.model_selection import StratifiedKFold
  6. from sklearn.model_selection import train_test_split
  7. from keras.models import Sequential
  8. from keras.layers import Conv1D, MaxPooling1D
  9. from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization
  10. from keras import backend as K
  11. from keras.models import model_from_json
  12. import numpy as np
  13. import pandas as pd
  14. from ipfml import processing
  15. from PIL import Image
  16. import sys, os, getopt
  17. import subprocess
  18. import time
  19. import json
  20. from modules.utils import config as cfg
# Output folder / filename configuration, all taken from the shared
# modules.utils.config module.
threshold_map_folder = cfg.threshold_map_folder  # root folder of per-scene threshold maps
threshold_map_file_prefix = cfg.threshold_map_folder + "_"  # prefix stripped from map filenames to get the scene title
markdowns_folder = cfg.models_information_folder  # where per-model .md reports are written
final_csv_model_comparisons = cfg.csv_model_comparisons_filename  # global comparison CSV appended to by every run
models_name = cfg.models_names_list  # known model-name prefixes, used to rebuild the data file path
zones = cfg.zones_indices  # NOTE(review): unused in this file
current_dirpath = os.getcwd()  # NOTE(review): unused in this file
  28. def main():
  29. kind_model = 'keras'
  30. if len(sys.argv) <= 1:
  31. print('Run with default parameters...')
  32. print('python save_model_result_in_md.py --interval "0,20" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
  33. sys.exit(2)
  34. try:
  35. opts, args = getopt.getopt(sys.argv[1:], "ht:m:o:l", ["help=", "interval=", "model=", "mode=", "metric="])
  36. except getopt.GetoptError:
  37. # print help information and exit:
  38. print('python save_model_result_in_md.py --interval "xx,xx" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
  39. sys.exit(2)
  40. for o, a in opts:
  41. if o == "-h":
  42. print('python save_model_result_in_md.py --interval "xx,xx" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
  43. sys.exit()
  44. elif o in ("-t", "--interval"):
  45. p_interval = list(map(int, a.split(',')))
  46. elif o in ("-m", "--model"):
  47. p_model_file = a
  48. elif o in ("-o", "--mode"):
  49. p_mode = a
  50. if p_mode != 'svdn' and p_mode != 'svdne' and p_mode != 'svd':
  51. assert False, "Mode not recognized"
  52. elif o in ("-m", "--metric"):
  53. p_metric = a
  54. else:
  55. assert False, "unhandled option"
  56. # call model and get global result in scenes
  57. begin, end = p_interval
  58. bash_cmd = "bash testModelByScene_maxwell.sh '" + str(begin) + "' '" + str(end) + "' '" + p_model_file + "' '" + p_mode + "' '" + p_metric + "'"
  59. print(bash_cmd)
  60. ## call command ##
  61. #p = subprocess.Popen(bash_cmd, stdout=subprocess.PIPE, shell=True)
  62. #(output, err) = p.communicate()
  63. ## Wait for result ##
  64. #p_status = p.wait()
  65. if not os.path.exists(markdowns_folder):
  66. os.makedirs(markdowns_folder)
  67. # get model name to construct model
  68. if '.joblib' in p_model_file:
  69. kind_model = 'sklearn'
  70. md_model_path = os.path.join(markdowns_folder, p_model_file.split('/')[-1].replace('.joblib', '.md'))
  71. if '.json' in p_model_file:
  72. kind_model = 'keras'
  73. md_model_path = os.path.join(markdowns_folder, p_model_file.split('/')[-1].replace('.json', '.md'))
  74. with open(md_model_path, 'w') as f:
  75. f.write(output.decode("utf-8"))
  76. # read each threshold_map information if exists
  77. model_map_info_path = os.path.join(threshold_map_folder, p_model_file.replace('saved_models/', ''))
  78. if not os.path.exists(model_map_info_path):
  79. f.write('\n\n No threshold map information')
  80. else:
  81. maps_files = os.listdir(model_map_info_path)
  82. # get all map information
  83. for t_map_file in maps_files:
  84. file_path = os.path.join(model_map_info_path, t_map_file)
  85. with open(file_path, 'r') as map_file:
  86. title_scene = t_map_file.replace(threshold_map_file_prefix, '')
  87. f.write('\n\n## ' + title_scene + '\n')
  88. content = map_file.readlines()
  89. # getting each map line information
  90. for line in content:
  91. f.write(line)
  92. f.close()
  93. # Keep model information to compare
  94. current_model_name = p_model_file.split('/')[-1].replace('.json', '')
  95. # Prepare writing in .csv file
  96. output_final_file_path = os.path.join(markdowns_folder, final_csv_model_comparisons)
  97. output_final_file = open(output_final_file_path, "a")
  98. print(current_model_name)
  99. # reconstruct data filename
  100. for name in models_name:
  101. if name in current_model_name:
  102. current_data_file_path = os.path.join('data', current_model_name.replace(name, 'data_maxwell'))
  103. print("Current data file ")
  104. print(current_data_file_path)
  105. model_scores = []
  106. ########################
  107. # 1. Get and prepare data
  108. ########################
  109. dataset_train = pd.read_csv(current_data_file_path + '.train', header=None, sep=";")
  110. dataset_test = pd.read_csv(current_data_file_path + '.test', header=None, sep=";")
  111. # default first shuffle of data
  112. dataset_train = shuffle(dataset_train)
  113. dataset_test = shuffle(dataset_test)
  114. # get dataset with equal number of classes occurences
  115. noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1]
  116. not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0]
  117. nb_noisy_train = len(noisy_df_train.index)
  118. noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 1]
  119. not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0]
  120. nb_noisy_test = len(noisy_df_test.index)
  121. final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
  122. final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
  123. # shuffle data another time
  124. final_df_train = shuffle(final_df_train)
  125. final_df_test = shuffle(final_df_test)
  126. final_df_train_size = len(final_df_train.index)
  127. final_df_test_size = len(final_df_test.index)
  128. # use of the whole data set for training
  129. x_dataset_train = final_df_train.ix[:,1:]
  130. x_dataset_test = final_df_test.ix[:,1:]
  131. y_dataset_train = final_df_train.ix[:,0]
  132. y_dataset_test = final_df_test.ix[:,0]
  133. #######################
  134. # 2. Getting model
  135. #######################
  136. model = joblib.load(p_model_file)
  137. #######################
  138. # 3. Fit model : use of cross validation to fit model
  139. #######################
  140. model.fit(x_dataset_train, y_dataset_train)
  141. val_scores = cross_val_score(model, x_dataset_train, y_dataset_train, cv=5)
  142. ######################
  143. # 4. Test : Validation and test dataset from .test dataset
  144. ######################
  145. # we need to specify validation size to 20% of whole dataset
  146. val_set_size = int(final_df_train_size/3)
  147. test_set_size = val_set_size
  148. total_validation_size = val_set_size + test_set_size
  149. if final_df_test_size > total_validation_size:
  150. x_dataset_test = x_dataset_test[0:total_validation_size]
  151. y_dataset_test = y_dataset_test[0:total_validation_size]
  152. X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)
  153. y_test_model = model.predict(X_test)
  154. y_val_model = model.predict(X_val)
  155. val_accuracy = accuracy_score(y_val, y_val_model)
  156. test_accuracy = accuracy_score(y_test, y_test_model)
  157. y_train_model = model.predict(x_dataset_train)
  158. train_f1 = f1_score(y_dataset_train, y_train_model)
  159. train_recall = recall_score(y_dataset_train, y_train_model)
  160. train_roc_auc = roc_auc_score(y_dataset_train, y_train_model)
  161. val_f1 = f1_score(y_val, y_val_model)
  162. val_recall = recall_score(y_val, y_val_model)
  163. val_roc_auc = roc_auc_score(y_val, y_val_model)
  164. test_f1 = f1_score(y_test, y_test_model)
  165. test_recall = recall_score(y_test, y_test_model)
  166. test_roc_auc = roc_auc_score(y_test, y_test_model)
  167. # stats of all dataset
  168. all_x_data = pd.concat([x_dataset_train, X_test, X_val])
  169. all_y_data = pd.concat([y_dataset_train, y_test, y_val])
  170. all_y_model = model.predict(all_x_data)
  171. all_accuracy = accuracy_score(all_y_data, all_y_model)
  172. all_f1_score = f1_score(all_y_data, all_y_model)
  173. all_recall_score = recall_score(all_y_data, all_y_model)
  174. all_roc_auc_score = roc_auc_score(all_y_data, all_y_model)
  175. # stats of dataset sizes
  176. total_samples = final_df_train_size + val_set_size + test_set_size
  177. model_scores.append(final_df_train_size)
  178. model_scores.append(val_set_size)
  179. model_scores.append(test_set_size)
  180. model_scores.append(final_df_train_size / total_samples)
  181. model_scores.append(val_set_size / total_samples)
  182. model_scores.append(test_set_size / total_samples)
  183. # add of scores
  184. #model_scores.append(val_scores.mean())
  185. model_scores.append(val_accuracy)
  186. model_scores.append(test_accuracy)
  187. model_scores.append(all_accuracy)
  188. model_scores.append(train_f1)
  189. model_scores.append(train_recall)
  190. model_scores.append(train_roc_auc)
  191. model_scores.append(val_f1)
  192. model_scores.append(val_recall)
  193. model_scores.append(val_roc_auc)
  194. model_scores.append(test_f1)
  195. model_scores.append(test_recall)
  196. model_scores.append(test_roc_auc)
  197. model_scores.append(all_f1_score)
  198. model_scores.append(all_recall_score)
  199. model_scores.append(all_roc_auc_score)
  200. # TODO : improve...
  201. # check if it's always the case...
  202. nb_zones = current_data_file_path.split('_')[7]
  203. final_file_line = current_model_name + '; ' + str(end - begin) + '; ' + str(begin) + '; ' + str(end) + '; ' + str(nb_zones) + '; ' + p_metric + '; ' + p_mode
  204. for s in model_scores:
  205. final_file_line += '; ' + str(s)
  206. output_final_file.write(final_file_line + '\n')
  207. if __name__== "__main__":
  208. main()