# save_model_result_in_md_maxwell.py
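"""
Run a saved model (sklearn .joblib, or keras .json plus .h5 weights) on the
Maxwell scenes: the per-scene output of testModelByScene_maxwell.sh is saved
as a markdown report, and global scores (accuracy, F1, recall, ROC AUC over
the train, validation, test and full datasets) are appended to a comparison
CSV file.
"""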

from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization
from keras.wrappers.scikit_learn import KerasClassifier
from keras import backend as K
from keras.models import model_from_json

import numpy as np
import pandas as pd

from ipfml import processing
from PIL import Image

import sys, os, argparse
import subprocess
import time
import json

from modules.utils import config as cfg

threshold_map_folder = cfg.threshold_map_folder
threshold_map_file_prefix = cfg.threshold_map_folder + "_"

markdowns_folder = cfg.models_information_folder
final_csv_model_comparisons = cfg.csv_model_comparisons_filename
models_name = cfg.models_names_list
zones = cfg.zones_indices

current_dirpath = os.getcwd()

def main():

    kind_model = 'keras'
    model_ext = ''

    parser = argparse.ArgumentParser(description="Save scene test results of a model into markdown and a comparison CSV")

    # default was '"0, 200"': the embedded quotes would break int() parsing below
    parser.add_argument('--interval', type=str, help='Interval value to keep from svd', default='0, 200')
    parser.add_argument('--model', type=str, help='.joblib or .json file (sklearn or keras model)')
    parser.add_argument('--metric', type=str, help='Metric data choice', choices=cfg.metric_choices)
    parser.add_argument('--mode', type=str, help='Kind of normalization level wished', choices=cfg.normalization_choices)

    args = parser.parse_args()

    p_interval = list(map(int, args.interval.split(',')))
    p_model_file = args.model
    p_metric = args.metric
    p_mode = args.mode

    # call model and get global result in scenes
    begin, end = p_interval

    bash_cmd = "bash testModelByScene_maxwell.sh '" + str(begin) + "' '" + str(end) + "' '" + p_model_file + "' '" + p_mode + "' '" + p_metric + "'"
    print(bash_cmd)

    ## call command ##
    p = subprocess.Popen(bash_cmd, stdout=subprocess.PIPE, shell=True)
    (output, err) = p.communicate()

    ## wait for result ##
    p_status = p.wait()
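    # the captured stdout of testModelByScene_maxwell.sh becomes the body of
    # the markdown report written below; note that communicate() already waits
    # for the process, so wait() here only retrieves the exit status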

    if not os.path.exists(markdowns_folder):
        os.makedirs(markdowns_folder)

    # deduce model kind and extension from the model filename
    if '.joblib' in p_model_file:
        kind_model = 'sklearn'
        model_ext = '.joblib'

    if '.json' in p_model_file:
        kind_model = 'keras'
        model_ext = '.json'

    md_model_path = os.path.join(markdowns_folder, p_model_file.split('/')[-1].replace(model_ext, '.md'))

    with open(md_model_path, 'w') as f:
        f.write(output.decode("utf-8"))

        # read each threshold map information if it exists
        model_map_info_path = os.path.join(threshold_map_folder, p_model_file.replace('saved_models/', ''))

        if not os.path.exists(model_map_info_path):
            f.write('\n\n No threshold map information')
        else:
            maps_files = os.listdir(model_map_info_path)

            # get all map information
            for t_map_file in maps_files:

                file_path = os.path.join(model_map_info_path, t_map_file)
                with open(file_path, 'r') as map_file:

                    title_scene = t_map_file.replace(threshold_map_file_prefix, '')
                    f.write('\n\n## ' + title_scene + '\n')
                    content = map_file.readlines()

                    # copy each map line into the markdown report
                    for line in content:
                        f.write(line)

    # no explicit f.close() needed: the with statement closes the file

    # keep model information to compare
    current_model_name = p_model_file.split('/')[-1].replace(model_ext, '')

    # prepare writing in .csv file
    output_final_file_path = os.path.join(markdowns_folder, final_csv_model_comparisons)
    output_final_file = open(output_final_file_path, "a")

    print(current_model_name)

    # reconstruct data filename from the known model names
    for name in models_name:
        if name in current_model_name:
            data_filename = current_model_name
            current_data_file_path = os.path.join('data', data_filename)

    print("Current data file:", current_data_file_path)

    model_scores = []
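    # model_scores collects, in order: the three split sizes, their proportions,
    # then accuracy / F1 / recall / ROC AUC for train, val, test and all data;
    # the list is flattened into one CSV row at the end of the script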

    ########################
    # 1. Get and prepare data
    ########################
    dataset_train = pd.read_csv(current_data_file_path + '.train', header=None, sep=";")
    dataset_test = pd.read_csv(current_data_file_path + '.test', header=None, sep=";")

    # default first shuffle of data
    dataset_train = shuffle(dataset_train)
    dataset_test = shuffle(dataset_test)

    # balance classes: undersample the majority (not noisy) class so both
    # classes have the same number of occurrences (column 0 holds the label)
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)

    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])

    # shuffle data another time
    final_df_train = shuffle(final_df_train)
    final_df_test = shuffle(final_df_test)

    final_df_train_size = len(final_df_train.index)
    final_df_test_size = len(final_df_test.index)

    # use the whole data set for training (first column is the label)
    x_dataset_train = final_df_train.iloc[:, 1:]
    x_dataset_test = final_df_test.iloc[:, 1:]
    y_dataset_train = final_df_train.iloc[:, 0]
    y_dataset_test = final_df_test.iloc[:, 0]

    #######################
    # 2. Getting model
    #######################
    if kind_model == 'keras':
        with open(p_model_file, 'r') as f:
            json_model = json.load(f)
            model = model_from_json(json_model)
            model.load_weights(p_model_file.replace('.json', '.h5'))

            model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])

        # reshape all input data: Conv1D expects (samples, steps, channels)
        x_dataset_train = np.array(x_dataset_train).reshape(len(x_dataset_train), end, 1)
        x_dataset_test = np.array(x_dataset_test).reshape(len(x_dataset_test), end, 1)

    if kind_model == 'sklearn':
        model = joblib.load(p_model_file)

    #######################
    # 3. Fit model on the balanced training set
    #######################
    if kind_model == 'keras':
        model.fit(x_dataset_train, y_dataset_train, validation_split=0.20, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch)

    if kind_model == 'sklearn':
        model.fit(x_dataset_train, y_dataset_train)

        # cross-validation only estimates train accuracy here, it does not
        # refit the model used below; keep the mean so the CSV field is scalar
        train_accuracy = cross_val_score(model, x_dataset_train, y_dataset_train, cv=5).mean()
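    # note: for the keras model, train_accuracy is instead recomputed further
    # below from predictions on the whole training set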

    ######################
    # 4. Test: validation and test datasets from the .test dataset
    ######################

    # val and test sets each get one third of the train size, i.e. the
    # validation set is 20% of the combined (train + val + test) dataset
    val_set_size = int(final_df_train_size / 3)
    test_set_size = val_set_size
    total_validation_size = val_set_size + test_set_size

    if final_df_test_size > total_validation_size:
        x_dataset_test = x_dataset_test[0:total_validation_size]
        y_dataset_test = y_dataset_test[0:total_validation_size]

    X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)

    if kind_model == 'keras':
        y_test_model = model.predict_classes(X_test)
        y_val_model = model.predict_classes(X_val)

        y_train_model = model.predict_classes(x_dataset_train)
        train_accuracy = accuracy_score(y_dataset_train, y_train_model)

    if kind_model == 'sklearn':
        y_test_model = model.predict(X_test)
        y_val_model = model.predict(X_val)

        y_train_model = model.predict(x_dataset_train)

    val_accuracy = accuracy_score(y_val, y_val_model)
    test_accuracy = accuracy_score(y_test, y_test_model)

    train_f1 = f1_score(y_dataset_train, y_train_model)
    train_recall = recall_score(y_dataset_train, y_train_model)
    train_roc_auc = roc_auc_score(y_dataset_train, y_train_model)

    val_f1 = f1_score(y_val, y_val_model)
    val_recall = recall_score(y_val, y_val_model)
    val_roc_auc = roc_auc_score(y_val, y_val_model)

    test_f1 = f1_score(y_test, y_test_model)
    test_recall = recall_score(y_test, y_test_model)
    test_roc_auc = roc_auc_score(y_test, y_test_model)

    if kind_model == 'keras':
        # stats over the whole dataset
        all_x_data = np.concatenate([x_dataset_train, X_test, X_val])
        all_y_data = np.concatenate([y_dataset_train, y_test, y_val])
        all_y_model = model.predict_classes(all_x_data)

    if kind_model == 'sklearn':
        # stats over the whole dataset
        all_x_data = pd.concat([x_dataset_train, X_test, X_val])
        all_y_data = pd.concat([y_dataset_train, y_test, y_val])
        all_y_model = model.predict(all_x_data)

    all_accuracy = accuracy_score(all_y_data, all_y_model)
    all_f1_score = f1_score(all_y_data, all_y_model)
    all_recall_score = recall_score(all_y_data, all_y_model)
    all_roc_auc_score = roc_auc_score(all_y_data, all_y_model)

    # stats of dataset sizes
    total_samples = final_df_train_size + val_set_size + test_set_size

    model_scores.append(final_df_train_size)
    model_scores.append(val_set_size)
    model_scores.append(test_set_size)

    model_scores.append(final_df_train_size / total_samples)
    model_scores.append(val_set_size / total_samples)
    model_scores.append(test_set_size / total_samples)

    # append all scores
    model_scores.append(train_accuracy)
    model_scores.append(val_accuracy)
    model_scores.append(test_accuracy)
    model_scores.append(all_accuracy)

    model_scores.append(train_f1)
    model_scores.append(train_recall)
    model_scores.append(train_roc_auc)

    model_scores.append(val_f1)
    model_scores.append(val_recall)
    model_scores.append(val_roc_auc)

    model_scores.append(test_f1)
    model_scores.append(test_recall)
    model_scores.append(test_roc_auc)

    model_scores.append(all_f1_score)
    model_scores.append(all_recall_score)
    model_scores.append(all_roc_auc_score)
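    # resulting CSV row layout: model name; interval size; begin; end;
    # nb_zones; metric; mode; followed by every value of model_scores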

    # TODO: improve...
    # check if it is always the case...
    nb_zones = current_data_file_path.split('_')[7]

    final_file_line = current_model_name + '; ' + str(end - begin) + '; ' + str(begin) + '; ' + str(end) + '; ' + str(nb_zones) + '; ' + p_metric + '; ' + p_mode

    for s in model_scores:
        final_file_line += '; ' + str(s)

    output_final_file.write(final_file_line + '\n')
    output_final_file.close()


if __name__ == "__main__":
    main()
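
# Example usage (paths and choice values are illustrative; the valid --metric
# and --mode choices come from modules.utils.config):
#   python save_model_result_in_md_maxwell.py --interval "0, 200" \
#       --model saved_models/my_model.joblib --metric lab --mode svdn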