save_model_result_in_md_maxwell.py
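
"""
Evaluate a saved (joblib) noise-classification model on the Maxwell scenes:
run testModelByScene_maxwell.sh over a given feature interval, save its
output as a markdown report, and append summary scores (accuracy, F1,
recall, ROC AUC) to the model comparisons .csv file.
"""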

from sklearn.utils import shuffle
import joblib  # replaces the deprecated sklearn.externals.joblib
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

from ipfml import processing
from PIL import Image

import sys, os, getopt
import subprocess
import time

from modules.utils import config as cfg

threshold_map_folder = cfg.threshold_map_folder
threshold_map_file_prefix = cfg.threshold_map_folder + "_"

markdowns_folder = cfg.models_information_folder
final_csv_model_comparisons = cfg.csv_model_comparisons_filename
models_name = cfg.models_names_list
zones = cfg.zones_indices

current_dirpath = os.getcwd()


def main():

    if len(sys.argv) <= 1:
        print('Run with default parameters...')
        print('python save_model_result_in_md.py --interval "0,20" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
        sys.exit(2)

    try:
        # '-l' is the short option for --metric ('-m' is already taken by --model)
        opts, args = getopt.getopt(sys.argv[1:], "ht:m:o:l:", ["help", "interval=", "model=", "mode=", "metric="])
    except getopt.GetoptError:
        # print help information and exit
        print('python save_model_result_in_md.py --interval "xx,xx" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
        sys.exit(2)

    for o, a in opts:
        if o in ("-h", "--help"):
            print('python save_model_result_in_md.py --interval "xx,xx" --model path/to/xxxx.joblib --mode ["svd", "svdn", "svdne"] --metric ["lab", "mscn"]')
            sys.exit()
        elif o in ("-t", "--interval"):
            p_interval = list(map(int, a.split(',')))
        elif o in ("-m", "--model"):
            p_model_file = a
        elif o in ("-o", "--mode"):
            p_mode = a
            if p_mode not in ('svd', 'svdn', 'svdne'):
                assert False, "Mode not recognized"
        elif o in ("-l", "--metric"):
            p_metric = a
        else:
            assert False, "unhandled option"

    # call model and get global result on scenes
    begin, end = p_interval

    bash_cmd = "bash testModelByScene_maxwell.sh '" + str(begin) + "' '" + str(end) + "' '" + p_model_file + "' '" + p_mode + "' '" + p_metric + "'"
    print(bash_cmd)

    ## call command and wait for it to finish ##
    p = subprocess.Popen(bash_cmd, stdout=subprocess.PIPE, shell=True)
    (output, err) = p.communicate()
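
    # the captured stdout becomes the body of the markdown report written below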
    if not os.path.exists(markdowns_folder):
        os.makedirs(markdowns_folder)

    # derive the markdown filename from the model filename
    md_model_path = os.path.join(markdowns_folder, p_model_file.split('/')[-1].replace('.joblib', '.md'))

    with open(md_model_path, 'w') as f:
        f.write(output.decode("utf-8"))

        # append each threshold map information if it exists
        model_map_info_path = os.path.join(threshold_map_folder, p_model_file.replace('saved_models/', ''))

        if not os.path.exists(model_map_info_path):
            f.write('\n\n No threshold map information')
        else:
            maps_files = os.listdir(model_map_info_path)

            # get all map information
            for t_map_file in maps_files:
                file_path = os.path.join(model_map_info_path, t_map_file)

                with open(file_path, 'r') as map_file:
                    title_scene = t_map_file.replace(threshold_map_file_prefix, '')
                    f.write('\n\n## ' + title_scene + '\n')

                    # write each map line information
                    for line in map_file.readlines():
                        f.write(line)

    # keep model information to compare
    current_model_name = p_model_file.split('/')[-1].replace('.joblib', '')

    # prepare writing into the comparisons .csv file
    output_final_file_path = os.path.join(markdowns_folder, final_csv_model_comparisons)
    output_final_file = open(output_final_file_path, "a")

    print(current_model_name)

    # reconstruct data filename
    for name in models_name:
        if name in current_model_name:
            current_data_file_path = os.path.join('data', current_model_name.replace(name, 'data_maxwell'))

            model_scores = []

            ########################
            # 1. Get and prepare data
            ########################
            dataset_train = pd.read_csv(current_data_file_path + '.train', header=None, sep=";")
            dataset_test = pd.read_csv(current_data_file_path + '.test', header=None, sep=";")

            # default first shuffle of data
            dataset_train = shuffle(dataset_train)
            dataset_test = shuffle(dataset_test)

            # balance classes by undersampling: keep as many 'not noisy' (label 0)
            # rows as there are 'noisy' (label 1) rows; column 0 holds the label
            noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
            not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
            nb_noisy_train = len(noisy_df_train.index)

            noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
            not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
            nb_noisy_test = len(noisy_df_test.index)

            final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
            final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])

            # shuffle data another time
            final_df_train = shuffle(final_df_train)
            final_df_test = shuffle(final_df_test)

            final_df_train_size = len(final_df_train.index)
            final_df_test_size = len(final_df_test.index)

            # use the whole balanced training set; column 0 is the label,
            # the remaining columns are the features
            x_dataset_train = final_df_train.iloc[:, 1:]
            x_dataset_test = final_df_test.iloc[:, 1:]

            y_dataset_train = final_df_train.iloc[:, 0]
            y_dataset_test = final_df_test.iloc[:, 0]

            #######################
            # 2. Getting model
            #######################
            model = joblib.load(p_model_file)

            #######################
            # 3. Fit model and estimate it with cross validation
            #######################
            model.fit(x_dataset_train, y_dataset_train)
            val_scores = cross_val_score(model, x_dataset_train, y_dataset_train, cv=5)
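
            # note: cross_val_score clones the model and reports a 5-fold estimate
            # on the training set; its mean is appended to the scores below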

            ######################
            # 4. Test: validation and test sets drawn from the .test dataset
            ######################

            # validation and test sets are each sized to a third of the training set
            val_set_size = int(final_df_train_size / 3)
            test_set_size = val_set_size

            total_validation_size = val_set_size + test_set_size

            if final_df_test_size > total_validation_size:
                x_dataset_test = x_dataset_test[0:total_validation_size]
                y_dataset_test = y_dataset_test[0:total_validation_size]

            X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)

            y_test_model = model.predict(X_test)
            y_val_model = model.predict(X_val)

            val_accuracy = accuracy_score(y_val, y_val_model)
            test_accuracy = accuracy_score(y_test, y_test_model)

            y_train_model = model.predict(x_dataset_train)

            train_f1 = f1_score(y_dataset_train, y_train_model)
            train_recall = recall_score(y_dataset_train, y_train_model)
            train_roc_auc = roc_auc_score(y_dataset_train, y_train_model)

            val_f1 = f1_score(y_val, y_val_model)
            val_recall = recall_score(y_val, y_val_model)
            val_roc_auc = roc_auc_score(y_val, y_val_model)

            test_f1 = f1_score(y_test, y_test_model)
            test_recall = recall_score(y_test, y_test_model)
            test_roc_auc = roc_auc_score(y_test, y_test_model)

            # stats over the whole dataset (train + test + validation)
            all_x_data = pd.concat([x_dataset_train, X_test, X_val])
            all_y_data = pd.concat([y_dataset_train, y_test, y_val])
            all_y_model = model.predict(all_x_data)

            all_accuracy = accuracy_score(all_y_data, all_y_model)
            all_f1_score = f1_score(all_y_data, all_y_model)
            all_recall_score = recall_score(all_y_data, all_y_model)
            all_roc_auc_score = roc_auc_score(all_y_data, all_y_model)

            # stats of dataset sizes
            total_samples = final_df_train_size + val_set_size + test_set_size

            model_scores.append(final_df_train_size)
            model_scores.append(val_set_size)
            model_scores.append(test_set_size)

            model_scores.append(final_df_train_size / total_samples)
            model_scores.append(val_set_size / total_samples)
            model_scores.append(test_set_size / total_samples)

            # add scores
            model_scores.append(val_scores.mean())
            model_scores.append(val_accuracy)
            model_scores.append(test_accuracy)
            model_scores.append(all_accuracy)

            model_scores.append(train_f1)
            model_scores.append(train_recall)
            model_scores.append(train_roc_auc)

            model_scores.append(val_f1)
            model_scores.append(val_recall)
            model_scores.append(val_roc_auc)

            model_scores.append(test_f1)
            model_scores.append(test_recall)
            model_scores.append(test_roc_auc)

            model_scores.append(all_f1_score)
            model_scores.append(all_recall_score)
            model_scores.append(all_roc_auc_score)
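
            # the CSV row below mirrors the append order above: set sizes, size
            # ratios, accuracies (cross-val mean, val, test, all), then F1, recall
            # and ROC AUC for train, val, test and all data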

            # TODO: improve...
            # check if it's always the case...
            nb_zones = current_data_file_path.split('_')[7]

            final_file_line = current_model_name + '; ' + str(end - begin) + '; ' + str(begin) + '; ' + str(end) + '; ' + str(nb_zones) + '; ' + p_metric + '; ' + p_mode

            for s in model_scores:
                final_file_line += '; ' + str(s)

            output_final_file.write(final_file_line + '\n')

    output_final_file.close()


if __name__ == "__main__":
    main()
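
# Example invocation (hypothetical model path):
#   python save_model_result_in_md_maxwell.py --interval "0,20" \
#       --model saved_models/xxxx.joblib --mode svdn --metric lab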