generate_data_model.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri Sep 14 21:02:42 2018
  5. @author: jbuisine
  6. """
  7. from __future__ import print_function
  8. import sys, os, getopt
  9. import numpy as np
  10. import random
  11. import time
  12. import json
  13. from PIL import Image
  14. from ipfml import processing, metrics, utils
  15. from modules.utils import config as cfg
  16. from modules.utils import data as dt
# getting configuration information from the shared config module
config_filename = cfg.config_filename
zone_folder = cfg.zone_folder
min_max_filename = cfg.min_max_filename_extension

# define all scenes values (names, CLI indices, normalization kinds)
scenes_list = cfg.scenes_names
scenes_indexes = cfg.scenes_indices
choices = cfg.normalization_choices
path = cfg.dataset_path
zones = cfg.zones_indices
seuil_expe_filename = cfg.seuil_expe_filename

metric_choices = cfg.metric_choices_labels
output_data_folder = cfg.output_data_folder
custom_min_max_folder = cfg.min_max_custom_folder
# NOTE(review): `min_max_ext` duplicates `min_max_filename` and
# `zones_indices` duplicates `zones` above — kept for compatibility
min_max_ext = cfg.min_max_filename_extension
zones_indices = cfg.zones_indices

# suffix of the per-zone SVD data files read below
generic_output_file_svd = '_random.csv'

# global interval bounds, mutated by get_min_max_value_interval()
min_value_interval = sys.maxsize
max_value_interval = 0
  36. def construct_new_line(path_seuil, interval, line, choice, each, norm):
  37. begin, end = interval
  38. line_data = line.split(';')
  39. seuil = line_data[0]
  40. metrics = line_data[begin+1:end+1]
  41. metrics = [float(m) for id, m in enumerate(metrics) if id % each == 0 ]
  42. if norm:
  43. if choice == 'svdne':
  44. metrics = utils.normalize_arr_with_range(metrics, min_value_interval, max_value_interval)
  45. if choice == 'svdn':
  46. metrics = utils.normalize_arr(metrics)
  47. with open(path_seuil, "r") as seuil_file:
  48. seuil_learned = int(seuil_file.readline().strip())
  49. if seuil_learned > int(seuil):
  50. line = '1'
  51. else:
  52. line = '0'
  53. for idx, val in enumerate(metrics):
  54. line += ';'
  55. line += str(val)
  56. line += '\n'
  57. return line
  58. def get_min_max_value_interval(_scenes_list, _interval, _metric):
  59. global min_value_interval, max_value_interval
  60. scenes = os.listdir(path)
  61. # remove min max file from scenes folder
  62. scenes = [s for s in scenes if min_max_filename not in s]
  63. for id_scene, folder_scene in enumerate(scenes):
  64. # only take care of maxwell scenes
  65. if folder_scene in _scenes_list:
  66. scene_path = os.path.join(path, folder_scene)
  67. zones_folder = []
  68. # create zones list
  69. for index in zones:
  70. index_str = str(index)
  71. if len(index_str) < 2:
  72. index_str = "0" + index_str
  73. zones_folder.append("zone"+index_str)
  74. for id_zone, zone_folder in enumerate(zones_folder):
  75. zone_path = os.path.join(scene_path, zone_folder)
  76. data_filename = _metric + "_svd" + generic_output_file_svd
  77. data_file_path = os.path.join(zone_path, data_filename)
  78. # getting number of line and read randomly lines
  79. f = open(data_file_path)
  80. lines = f.readlines()
  81. # check if user select current scene and zone to be part of training data set
  82. for line in lines:
  83. begin, end = _interval
  84. line_data = line.split(';')
  85. metrics = line_data[begin+1:end+1]
  86. metrics = [float(m) for m in metrics]
  87. min_value = min(metrics)
  88. max_value = max(metrics)
  89. if min_value < min_value_interval:
  90. min_value_interval = min_value
  91. if max_value > max_value_interval:
  92. max_value_interval = max_value
  93. def generate_data_model(_filename, _interval, _choice, _metric, _scenes = scenes_list, _zones = zones_indices, _percent = 1, _step=1, _each=1, _norm=False, _custom=False):
  94. output_train_filename = _filename + ".train"
  95. output_test_filename = _filename + ".test"
  96. if not '/' in output_train_filename:
  97. raise Exception("Please select filename with directory path to save data. Example : data/dataset")
  98. # create path if not exists
  99. if not os.path.exists(output_data_folder):
  100. os.makedirs(output_data_folder)
  101. train_file = open(output_train_filename, 'w')
  102. test_file = open(output_test_filename, 'w')
  103. scenes = os.listdir(path)
  104. # remove min max file from scenes folder
  105. scenes = [s for s in scenes if min_max_filename not in s]
  106. for id_scene, folder_scene in enumerate(scenes):
  107. # only take care of maxwell scenes
  108. if folder_scene in scenes_list:
  109. scene_path = os.path.join(path, folder_scene)
  110. zones_folder = []
  111. # create zones list
  112. for index in zones:
  113. index_str = str(index)
  114. if len(index_str) < 2:
  115. index_str = "0" + index_str
  116. zones_folder.append("zone"+index_str)
  117. for id_zone, zone_folder in enumerate(zones_folder):
  118. zone_path = os.path.join(scene_path, zone_folder)
  119. # if custom normalization choices then we use svd values not already normalized
  120. if _custom:
  121. data_filename = _metric + "_svd" + generic_output_file_svd
  122. else:
  123. data_filename = _metric + "_" + _choice + generic_output_file_svd
  124. data_file_path = os.path.join(zone_path, data_filename)
  125. # getting number of line and read randomly lines
  126. f = open(data_file_path)
  127. lines = f.readlines()
  128. num_lines = len(lines)
  129. lines_indexes = np.arange(num_lines)
  130. random.shuffle(lines_indexes)
  131. path_seuil = os.path.join(zone_path, seuil_expe_filename)
  132. counter = 0
  133. # check if user select current scene and zone to be part of training data set
  134. for index in lines_indexes:
  135. image_index = int(lines[index].split(';')[0])
  136. percent = counter / num_lines
  137. if image_index % _step == 0:
  138. line = construct_new_line(path_seuil, _interval, lines[index], _choice, _each, _norm)
  139. if id_zone in _zones and folder_scene in _scenes and percent <= _percent:
  140. train_file.write(line)
  141. else:
  142. test_file.write(line)
  143. counter += 1
  144. f.close()
  145. train_file.close()
  146. test_file.close()
  147. def main():
  148. p_custom = False
  149. if len(sys.argv) <= 1:
  150. print('python generate_data_model.py --output xxxx --interval 0,20 --kind svdne --metric lab --scenes "A, B, D" --zones "1, 2, 3, 4" --percent 0.7 --renderer all --step 10 --each 1 --custom min_max_filename')
  151. sys.exit(2)
  152. try:
  153. opts, args = getopt.getopt(sys.argv[1:], "ho:i:k:s:z:p:r:c", ["help=", "output=", "interval=", "kind=", "metric=","scenes=", "zones=", "percent=", "renderer=", "step=", "each=", "custom="])
  154. except getopt.GetoptError:
  155. # print help information and exit:
  156. print('python generate_data_model.py --output xxxx --interval 0,20 --kind svdne --metric lab --scenes "A, B, D" --zones "1, 2, 3, 4" --percent 0.7 --renderer all --step 10 --each 1 --custom min_max_filename')
  157. sys.exit(2)
  158. for o, a in opts:
  159. if o == "-h":
  160. print('python generate_data_model.py --output xxxx --interval 0,20 --kind svdne --metric lab --scenes "A, B, D" --zones "1, 2, 3, 4" --percent 0.7 --renderer all --step 10 --each 1 --custom min_max_filename')
  161. sys.exit()
  162. elif o in ("-o", "--output"):
  163. p_filename = a
  164. elif o in ("-i", "--interval"):
  165. p_interval = list(map(int, a.split(',')))
  166. elif o in ("-k", "--kind"):
  167. p_kind = a
  168. elif o in ("-m", "--metric"):
  169. p_metric = a
  170. elif o in ("-s", "--scenes"):
  171. p_scenes = a.split(',')
  172. elif o in ("-z", "--zones"):
  173. if ',' in a:
  174. p_zones = list(map(int, a.split(',')))
  175. else:
  176. p_zones = [a.strip()]
  177. elif o in ("-p", "--percent"):
  178. p_percent = float(a)
  179. elif o in ("-s", "--step"):
  180. p_step = int(a)
  181. elif o in ("-e", "--each"):
  182. p_each = int(a)
  183. elif o in ("-r", "--renderer"):
  184. p_renderer = a
  185. if p_renderer not in cfg.renderer_choices:
  186. assert False, "Unknown renderer choice, %s" % cfg.renderer_choices
  187. elif o in ("-c", "--custom"):
  188. p_custom = a
  189. else:
  190. assert False, "unhandled option"
  191. # list all possibles choices of renderer
  192. scenes_list = dt.get_renderer_scenes_names(p_renderer)
  193. scenes_indices = dt.get_renderer_scenes_indices(p_renderer)
  194. # getting scenes from indexes user selection
  195. scenes_selected = []
  196. for scene_id in p_scenes:
  197. index = scenes_indexes.index(scene_id.strip())
  198. scenes_selected.append(scenes_list[index])
  199. # find min max value if necessary to renormalize data
  200. if p_custom:
  201. get_min_max_value_interval(scenes_list, p_interval, p_metric)
  202. # write new file to save
  203. if not os.path.exists(custom_min_max_folder):
  204. os.makedirs(custom_min_max_folder)
  205. min_max_folder_path = os.path.join(os.path.dirname(__file__), custom_min_max_folder)
  206. min_max_filename_path = os.path.join(min_max_folder_path, p_custom)
  207. with open(min_max_filename_path, 'w') as f:
  208. f.write(str(min_value_interval) + '\n')
  209. f.write(str(max_value_interval) + '\n')
  210. # create database using img folder (generate first time only)
  211. generate_data_model(p_filename, p_interval, p_kind, p_metric, scenes_selected, p_zones, p_percent, p_step, p_each, p_custom)
  212. if __name__== "__main__":
  213. main()