# generate_data_model.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 14 21:02:42 2018
@author: jbuisine
"""
from __future__ import print_function
import sys, os, argparse
import numpy as np
import random
import time
import json
from PIL import Image
from ipfml import processing, metrics, utils
from modules.utils import config as cfg
from modules.utils import data as dt

# getting configuration information
config_filename = cfg.config_filename
learned_folder = cfg.learned_zones_folder
min_max_filename = cfg.min_max_filename_extension

# define all scenes values
scenes_list = cfg.scenes_names
scenes_indexes = cfg.scenes_indices
choices = cfg.normalization_choices
path = cfg.dataset_path
zones = cfg.zones_indices
seuil_expe_filename = cfg.seuil_expe_filename

renderer_choices = cfg.renderer_choices
normalization_choices = cfg.normalization_choices
metric_choices = cfg.metric_choices_labels
output_data_folder = cfg.output_data_folder
custom_min_max_folder = cfg.min_max_custom_folder
min_max_ext = cfg.min_max_filename_extension
zones_indices = cfg.zones_indices

# suffix of the per-zone raw SVD data files
generic_output_file_svd = '_random.csv'

# running bounds for custom renormalization; updated as module globals
# by get_min_max_value_interval(), read by construct_new_line()
min_value_interval = sys.maxsize
max_value_interval = 0
  38. def construct_new_line(path_seuil, interval, line, choice, each, norm):
  39. begin, end = interval
  40. line_data = line.split(';')
  41. seuil = line_data[0]
  42. metrics = line_data[begin+1:end+1]
  43. metrics = [float(m) for id, m in enumerate(metrics) if id % each == 0 ]
  44. if norm:
  45. if choice == 'svdne':
  46. metrics = utils.normalize_arr_with_range(metrics, min_value_interval, max_value_interval)
  47. if choice == 'svdn':
  48. metrics = utils.normalize_arr(metrics)
  49. with open(path_seuil, "r") as seuil_file:
  50. seuil_learned = int(seuil_file.readline().strip())
  51. if seuil_learned > int(seuil):
  52. line = '1'
  53. else:
  54. line = '0'
  55. for idx, val in enumerate(metrics):
  56. line += ';'
  57. line += str(val)
  58. line += '\n'
  59. return line
  60. def get_min_max_value_interval(_scenes_list, _interval, _metric):
  61. global min_value_interval, max_value_interval
  62. scenes = os.listdir(path)
  63. # remove min max file from scenes folder
  64. scenes = [s for s in scenes if min_max_filename not in s]
  65. for id_scene, folder_scene in enumerate(scenes):
  66. # only take care of maxwell scenes
  67. if folder_scene in _scenes_list:
  68. scene_path = os.path.join(path, folder_scene)
  69. zones_folder = []
  70. # create zones list
  71. for index in zones:
  72. index_str = str(index)
  73. if len(index_str) < 2:
  74. index_str = "0" + index_str
  75. zones_folder.append("zone"+index_str)
  76. for id_zone, zone_folder in enumerate(zones_folder):
  77. zone_path = os.path.join(scene_path, zone_folder)
  78. data_filename = _metric + "_svd" + generic_output_file_svd
  79. data_file_path = os.path.join(zone_path, data_filename)
  80. # getting number of line and read randomly lines
  81. f = open(data_file_path)
  82. lines = f.readlines()
  83. # check if user select current scene and zone to be part of training data set
  84. for line in lines:
  85. begin, end = _interval
  86. line_data = line.split(';')
  87. metrics = line_data[begin+1:end+1]
  88. metrics = [float(m) for m in metrics]
  89. min_value = min(metrics)
  90. max_value = max(metrics)
  91. if min_value < min_value_interval:
  92. min_value_interval = min_value
  93. if max_value > max_value_interval:
  94. max_value_interval = max_value
  95. def generate_data_model(_filename, _interval, _choice, _metric, _scenes = scenes_list, _zones = zones_indices, _percent = 1, _step=1, _each=1, _norm=False, _custom=False):
  96. output_train_filename = _filename + ".train"
  97. output_test_filename = _filename + ".test"
  98. if not '/' in output_train_filename:
  99. raise Exception("Please select filename with directory path to save data. Example : data/dataset")
  100. # create path if not exists
  101. if not os.path.exists(output_data_folder):
  102. os.makedirs(output_data_folder)
  103. train_file = open(output_train_filename, 'w')
  104. test_file = open(output_test_filename, 'w')
  105. for id_scene, folder_scene in enumerate(scenes_list):
  106. # only take care of maxwell scenes
  107. scene_path = os.path.join(path, folder_scene)
  108. zones_indices = zones
  109. # write into file
  110. folder_learned_path = os.path.join(learned_folder, _filename.split('/')[1])
  111. if not os.path.exists(folder_learned_path):
  112. os.makedirs(folder_learned_path)
  113. file_learned_path = os.path.join(folder_learned_path, folder_scene + '.csv')
  114. with open(file_learned_path, 'w') as f:
  115. for i in _zones:
  116. f.write(str(i) + ';')
  117. for id_zone, index_folder in enumerate(zones_indices):
  118. index_str = str(index_folder)
  119. if len(index_str) < 2:
  120. index_str = "0" + index_str
  121. current_zone_folder = "zone" + index_str
  122. zone_path = os.path.join(scene_path, current_zone_folder)
  123. # if custom normalization choices then we use svd values not already normalized
  124. if _custom:
  125. data_filename = _metric + "_svd" + generic_output_file_svd
  126. else:
  127. data_filename = _metric + "_" + _choice + generic_output_file_svd
  128. data_file_path = os.path.join(zone_path, data_filename)
  129. # getting number of line and read randomly lines
  130. f = open(data_file_path)
  131. lines = f.readlines()
  132. num_lines = len(lines)
  133. lines_indexes = np.arange(num_lines)
  134. random.shuffle(lines_indexes)
  135. path_seuil = os.path.join(zone_path, seuil_expe_filename)
  136. counter = 0
  137. # check if user select current scene and zone to be part of training data set
  138. for index in lines_indexes:
  139. image_index = int(lines[index].split(';')[0])
  140. percent = counter / num_lines
  141. if image_index % _step == 0:
  142. line = construct_new_line(path_seuil, _interval, lines[index], _choice, _each, _norm)
  143. if id_zone in _zones and folder_scene in _scenes and percent <= _percent:
  144. train_file.write(line)
  145. else:
  146. test_file.write(line)
  147. counter += 1
  148. f.close()
  149. train_file.close()
  150. test_file.close()
  151. def main():
  152. # getting all params
  153. parser = argparse.ArgumentParser(description="Generate data for model using correlation matrix information from data")
  154. parser.add_argument('--output', type=str, help='output file name desired (.train and .test)')
  155. parser.add_argument('--interval', type=str, help='Interval value to keep from svd', default='"0, 200"')
  156. parser.add_argument('--kind', type=str, help='Kind of normalization level wished', choices=normalization_choices)
  157. parser.add_argument('--metric', type=str, help='Metric data choice', choices=metric_choices)
  158. parser.add_argument('--scenes', type=str, help='List of scenes to use for training data')
  159. parser.add_argument('--zones', type=str, help='Zones indices to use for training data set')
  160. parser.add_argument('--percent', type=float, help='Percent of data use for train and test dataset (by default 1)', default=1.0)
  161. parser.add_argument('--step', type=int, help='Photo step to keep for build datasets', default=1)
  162. parser.add_argument('--each', type=int, help='Each features to keep from interval', default=1)
  163. parser.add_argument('--renderer', type=str, help='Renderer choice in order to limit scenes used', choices=renderer_choices, default='all')
  164. parser.add_argument('--custom', type=str, help='Name of custom min max file if use of renormalization of data', default=False)
  165. args = parser.parse_args()
  166. p_filename = args.output
  167. p_interval = list(map(int, args.interval.split(',')))
  168. p_kind = args.kind
  169. p_metric = args.metric
  170. p_scenes = args.scenes.split(',')
  171. p_zones = list(map(int, args.zones.split(',')))
  172. p_percent = args.percent
  173. p_step = args.step
  174. p_each = args.each
  175. p_renderer = args.renderer
  176. p_custom = args.custom
  177. # list all possibles choices of renderer
  178. scenes_list = dt.get_renderer_scenes_names(p_renderer)
  179. scenes_indices = dt.get_renderer_scenes_indices(p_renderer)
  180. # getting scenes from indexes user selection
  181. scenes_selected = []
  182. for scene_id in p_scenes:
  183. index = scenes_indexes.index(scene_id.strip())
  184. scenes_selected.append(scenes_list[index])
  185. # find min max value if necessary to renormalize data
  186. if p_custom:
  187. get_min_max_value_interval(scenes_list, p_interval, p_metric)
  188. # write new file to save
  189. if not os.path.exists(custom_min_max_folder):
  190. os.makedirs(custom_min_max_folder)
  191. min_max_folder_path = os.path.join(os.path.dirname(__file__), custom_min_max_folder)
  192. min_max_filename_path = os.path.join(min_max_folder_path, p_custom)
  193. with open(min_max_filename_path, 'w') as f:
  194. f.write(str(min_value_interval) + '\n')
  195. f.write(str(max_value_interval) + '\n')
  196. # create database using img folder (generate first time only)
  197. generate_data_model(p_filename, p_interval, p_kind, p_metric, scenes_selected, p_zones, p_percent, p_step, p_each, p_custom)
  198. if __name__== "__main__":
  199. main()