generate_data_model_corr_random.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generate .train/.test data files for a model, keeping only the features
selected from correlation matrix information computed over the data.

Created on Fri Sep 14 21:02:42 2018

@author: jbuisine
"""
from __future__ import print_function

import sys, os, argparse
import numpy as np
import pandas as pd
import random
import time
import json
import subprocess

from PIL import Image
from ipfml import processing, metrics, utils

from modules.utils import config as cfg
from modules.utils import data as dt

# getting configuration information
config_filename = cfg.config_filename
learned_folder = cfg.learned_zones_folder
min_max_filename = cfg.min_max_filename_extension

# define all scenes values
all_scenes_list = cfg.scenes_names
all_scenes_indices = cfg.scenes_indices

renderer_choices = cfg.renderer_choices
normalization_choices = cfg.normalization_choices
path = cfg.dataset_path
zones = cfg.zones_indices
seuil_expe_filename = cfg.seuil_expe_filename

metric_choices = cfg.metric_choices_labels
output_data_folder = cfg.output_data_folder
custom_min_max_folder = cfg.min_max_custom_folder
min_max_ext = cfg.min_max_filename_extension

generic_output_file_svd = '_random.csv'

min_value_interval = sys.maxsize
max_value_interval = 0


def construct_new_line(path_seuil, indices, line, choice, norm):
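    """Build one dataset line: a 0/1 label obtained by comparing the image
    index of `line` against the learned threshold stored in `path_seuil`,
    followed by the selected (optionally normalized) feature values."""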
    # increase indices values by one to skip the first (image index) column
    shift = lambda x: x + 1
    indices = shift(indices)

    line_data = np.array(line.split(';'))
    seuil = line_data[0]

    # renamed from `metrics` to avoid shadowing the ipfml.metrics import
    metrics_values = line_data[indices].astype('float32')

    # TODO : check if it's always necessary to do that (loss of information for svd)
    if norm:
        if choice == 'svdne':
            metrics_values = utils.normalize_arr_with_range(metrics_values, min_value_interval, max_value_interval)
        if choice == 'svdn':
            metrics_values = utils.normalize_arr(metrics_values)

    with open(path_seuil, "r") as seuil_file:
        seuil_learned = int(seuil_file.readline().strip())

    if seuil_learned > int(seuil):
        line = '1'
    else:
        line = '0'

    for val in metrics_values:
        line += ';' + str(val)
    line += '\n'

    return line


def get_min_max_value_interval(_scenes_list, _indices, _metric):
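    """Scan the raw SVD data files of every zone of the scenes in
    `_scenes_list` and update the global min/max interval over the
    selected feature indices (used later for 'svdne' renormalization)."""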
    global min_value_interval, max_value_interval

    # increase indices values by one to skip the first (image index) column
    shift = lambda x: x + 1
    indices = shift(_indices)

    scenes = os.listdir(path)

    # remove min max file from scenes folder
    scenes = [s for s in scenes if min_max_filename not in s]

    for id_scene, folder_scene in enumerate(scenes):

        # only take care of scenes from the selected list
        if folder_scene in _scenes_list:

            scene_path = os.path.join(path, folder_scene)

            # create zones list
            zones_folder = []
            for index in zones:
                index_str = str(index)
                if len(index_str) < 2:
                    index_str = "0" + index_str
                zones_folder.append("zone" + index_str)

            for id_zone, zone_folder in enumerate(zones_folder):

                zone_path = os.path.join(scene_path, zone_folder)

                # custom normalization relies on svd values which are not already normalized
                data_filename = _metric + "_svd" + generic_output_file_svd
                data_file_path = os.path.join(zone_path, data_filename)

                with open(data_file_path) as data_file:
                    lines = data_file.readlines()

                for line in lines:
                    line_data = np.array(line.split(';'))

                    # use the shifted indices (the unshifted `_indices` would include the index column)
                    metrics_values = [float(m) for m in line_data[indices]]

                    min_value = min(metrics_values)
                    max_value = max(metrics_values)

                    if min_value < min_value_interval:
                        min_value_interval = min_value

                    if max_value > max_value_interval:
                        max_value_interval = max_value


def generate_data_model(_scenes_list, _filename, _interval, _choice, _metric, _scenes, _nb_zones=4, _percent=1, _random=0, _step=1, _custom=False):
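    """Write `_filename`.train and `_filename`.test: for each scene, the
    first `_nb_zones` zones (shuffled first when `_random` is set) of the
    selected scenes feed the training set, all remaining lines feed the
    test set."""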
    output_train_filename = _filename + ".train"
    output_test_filename = _filename + ".test"

    if '/' not in output_train_filename:
        raise Exception("Please select filename with directory path to save data. Example : data/dataset")

    # create path if not exists
    if not os.path.exists(output_data_folder):
        os.makedirs(output_data_folder)

    train_file_data = []
    test_file_data = []

    for id_scene, folder_scene in enumerate(_scenes_list):

        scene_path = os.path.join(path, folder_scene)

        # copy the zones list to avoid shuffling the global configuration in place
        zones_indices = list(zones)

        # shuffle list of zones (=> randomly choose zones), only in random mode
        if _random:
            random.shuffle(zones_indices)

        # store zones learned
        learned_zones_indices = zones_indices[:_nb_zones]

        # write into file
        folder_learned_path = os.path.join(learned_folder, _filename.split('/')[1])

        if not os.path.exists(folder_learned_path):
            os.makedirs(folder_learned_path)

        file_learned_path = os.path.join(folder_learned_path, folder_scene + '.csv')

        with open(file_learned_path, 'w') as f:
            for i in learned_zones_indices:
                f.write(str(i) + ';')

        for id_zone, index_folder in enumerate(zones_indices):

            index_str = str(index_folder)
            if len(index_str) < 2:
                index_str = "0" + index_str
            current_zone_folder = "zone" + index_str

            zone_path = os.path.join(scene_path, current_zone_folder)

            # with custom normalization choices we use svd values which are not already normalized
            if _custom:
                data_filename = _metric + "_svd" + generic_output_file_svd
            else:
                data_filename = _metric + "_" + _choice + generic_output_file_svd

            data_file_path = os.path.join(zone_path, data_filename)

            # get number of lines and read them (randomly shuffled if needed)
            with open(data_file_path) as data_file:
                lines = data_file.readlines()
            num_lines = len(lines)

            if _random:
                random.shuffle(lines)

            path_seuil = os.path.join(zone_path, seuil_expe_filename)

            counter = 0
            # check if user selected current scene and zone to be part of the training data set
            for data in lines:

                percent = counter / num_lines
                image_index = int(data.split(';')[0])

                if image_index % _step == 0:
                    line = construct_new_line(path_seuil, _interval, data, _choice, _custom)

                    if id_zone < _nb_zones and folder_scene in _scenes and percent <= _percent:
                        train_file_data.append(line)
                    else:
                        test_file_data.append(line)

                counter += 1

    with open(output_train_filename, 'w') as train_file:
        for line in train_file_data:
            train_file.write(line)

    with open(output_test_filename, 'w') as test_file:
        for line in test_file_data:
            test_file.write(line)


def main():

    # getting all params
    parser = argparse.ArgumentParser(description="Generate data for model using correlation matrix information from data")

    parser.add_argument('--output', type=str, help='output file name desired (.train and .test)')
    parser.add_argument('--n', type=int, help='number of features wanted')
    parser.add_argument('--highest', type=int, help='specify if highest or lowest values are wished', choices=[0, 1])
    parser.add_argument('--label', type=int, help='specify if label correlation is used or not', choices=[0, 1])
    parser.add_argument('--kind', type=str, help='kind of normalization level wished', choices=normalization_choices)
    parser.add_argument('--metric', type=str, help='metric data choice', choices=metric_choices)
    parser.add_argument('--scenes', type=str, help='list of scenes to use for training data')
    parser.add_argument('--nb_zones', type=int, help='number of zones to use for training data set')
    parser.add_argument('--random', type=int, help='whether data will be randomly filled or not', choices=[0, 1])
    parser.add_argument('--percent', type=float, help='percent of data used for train and test dataset (by default 1)')
    parser.add_argument('--step', type=int, help='photo step to keep for building datasets', default=1)
    parser.add_argument('--renderer', type=str, help='renderer choice in order to limit scenes used', choices=renderer_choices, default='all')
    parser.add_argument('--custom', type=str, help='name of custom min max file if renormalization of data is used', default=False)

    args = parser.parse_args()
    p_filename = args.output
    p_n = args.n
    p_highest = args.highest
    p_label = args.label
    p_kind = args.kind
    p_metric = args.metric
    p_scenes = args.scenes.split(',')
    p_nb_zones = args.nb_zones
    p_random = args.random
    p_percent = args.percent
    p_step = args.step
    p_renderer = args.renderer
    p_custom = args.custom

    # list all possible scenes for the chosen renderer
    scenes_list = dt.get_renderer_scenes_names(p_renderer)
    scenes_indices = dt.get_renderer_scenes_indices(p_renderer)

    # getting scenes from indices of user selection
    scenes_selected = []

    for scene_id in p_scenes:
        index = scenes_indices.index(scene_id.strip())
        scenes_selected.append(scenes_list[index])

    # get indices to keep from correlation information:
    # first compute a temp data file used to obtain the correlation information
    temp_filename = 'temp'
    temp_filename_path = os.path.join(cfg.output_data_folder, temp_filename)

    cmd = ['python', 'generate_data_model_random.py',
           '--output', temp_filename_path,
           '--interval', '0, 200',
           '--kind', p_kind,
           '--metric', p_metric,
           '--scenes', args.scenes,
           '--nb_zones', str(16),
           '--random', str(int(p_random)),
           '--percent', str(p_percent),
           '--step', str(p_step),
           '--each', str(1),
           '--renderer', p_renderer,
           '--custom', temp_filename + min_max_ext]

    subprocess.Popen(cmd).wait()
    temp_data_file_path = temp_filename_path + '.train'
    df = pd.read_csv(temp_data_file_path, sep=';', header=None)

    indices = []

    # compute correlation matrix from whole data scenes of renderer (with or without the label column)
    if p_label:

        # compute pearson correlation between each feature and the label
        corr = df.corr()

        features_corr = []

        for id_row, row in enumerate(corr):
            for id_col, val in enumerate(corr[row]):
                if id_col == 0 and id_row != 0:
                    features_corr.append(abs(val))

    else:
        df = df.drop(df.columns[[0]], axis=1)

        # compute pearson correlation between features only
        corr = df[1:200].corr()

        features_corr = []

        for id_row, row in enumerate(corr):
            correlation_score = 0
            for id_col, val in enumerate(corr[row]):
                if id_col != id_row:
                    correlation_score += abs(val)

            features_corr.append(correlation_score)

    # find the `n` min or max indices to keep
    if p_highest:
        indices = utils.get_indices_of_highest_values(features_corr, p_n)
    else:
        indices = utils.get_indices_of_lowest_values(features_corr, p_n)

    indices = np.sort(indices)

    # save indices found
    if not os.path.exists(cfg.correlation_indices_folder):
        os.makedirs(cfg.correlation_indices_folder)

    indices_file_path = os.path.join(cfg.correlation_indices_folder, p_filename.replace(cfg.output_data_folder + '/', '') + '.csv')

    with open(indices_file_path, 'w') as f:
        for i in indices:
            f.write(str(i) + ';')
    # find min max values if necessary to renormalize data from the `n` indices found
    if p_custom:
        get_min_max_value_interval(scenes_list, indices, p_metric)

        # write new min max file to save
        if not os.path.exists(custom_min_max_folder):
            os.makedirs(custom_min_max_folder)

        min_max_folder_path = os.path.join(os.path.dirname(__file__), custom_min_max_folder)

        min_max_current_filename = p_filename.replace(cfg.output_data_folder + '/', '').replace('deep_keras_', '') + min_max_filename
        min_max_filename_path = os.path.join(min_max_folder_path, min_max_current_filename)

        print(min_max_filename_path)
        with open(min_max_filename_path, 'w') as f:
            f.write(str(min_value_interval) + '\n')
            f.write(str(max_value_interval) + '\n')

    # create .train/.test dataset files using the selected indices (generate first time only)
    generate_data_model(scenes_list, p_filename, indices, p_kind, p_metric, scenes_selected, p_nb_zones, p_percent, p_random, p_step, p_custom)


if __name__ == "__main__":
    main()
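

# Example invocation (illustrative values only; the valid metric names, scene
# indices and normalization kinds are defined in modules.utils.config):
#
#   python generate_data_model_corr_random.py \
#       --output data/deep_keras_corr \
#       --n 30 --highest 1 --label 1 \
#       --kind svdne --metric lab \
#       --scenes "A, D, G" --nb_zones 10 \
#       --random 1 --percent 1 --step 10 \
#       --renderer maxwell --custom min_max_corr_values.csv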