generate_data_model.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri Sep 14 21:02:42 2018
  5. @author: jbuisine
  6. """
  7. from __future__ import print_function
  8. import sys, os, getopt
  9. import numpy as np
  10. import random
  11. import time
  12. import json
  13. from PIL import Image
  14. from ipfml import processing, metrics, utils
  15. from modules.utils import config as cfg
  16. # getting configuration information
  17. config_filename = cfg.config_filename
  18. zone_folder = cfg.zone_folder
  19. min_max_filename = cfg.min_max_filename_extension
  20. # define all scenes values
  21. scenes_list = cfg.scenes_names
  22. scenes_indexes = cfg.scenes_indices
  23. choices = cfg.normalization_choices
  24. path = cfg.dataset_path
  25. zones = cfg.zones_indices
  26. seuil_expe_filename = cfg.seuil_expe_filename
  27. metric_choices = cfg.metric_choices_labels
  28. output_data_folder = cfg.output_data_folder
  29. custom_min_max_folder = cfg.min_max_custom_folder
  30. min_max_ext = cfg.min_max_filename_extension
  31. zones_indices = cfg.zones_indices
  32. generic_output_file_svd = '_random.csv'
  33. min_value_interval = sys.maxsize
  34. max_value_interval = 0
  35. def construct_new_line(path_seuil, interval, line, norm, sep, index):
  36. begin, end = interval
  37. line_data = line.split(';')
  38. seuil = line_data[0]
  39. metrics = line_data[begin+1:end+1]
  40. metrics = [float(m) for m in metrics]
  41. # TODO : check if it's always necessary to do that (loss of information for svd)
  42. if norm:
  43. metrics = utils.normalize_arr_with_range(metrics, min_value_interval, max_value_interval)
  44. with open(path_seuil, "r") as seuil_file:
  45. seuil_learned = int(seuil_file.readline().strip())
  46. if seuil_learned > int(seuil):
  47. line = '1'
  48. else:
  49. line = '0'
  50. for idx, val in enumerate(metrics):
  51. if index:
  52. line += " " + str(idx + 1)
  53. line += sep
  54. line += str(val)
  55. line += '\n'
  56. return line
  57. def get_min_max_value_interval(_filename, _interval, _choice, _metric):
  58. global min_value_interval, max_value_interval
  59. scenes = os.listdir(path)
  60. # remove min max file from scenes folder
  61. scenes = [s for s in scenes if min_max_filename not in s]
  62. for id_scene, folder_scene in enumerate(scenes):
  63. # only take care of maxwell scenes
  64. if folder_scene in scenes_list:
  65. scene_path = os.path.join(path, folder_scene)
  66. zones_folder = []
  67. # create zones list
  68. for index in zones:
  69. index_str = str(index)
  70. if len(index_str) < 2:
  71. index_str = "0" + index_str
  72. zones_folder.append("zone"+index_str)
  73. # shuffle list of zones (=> randomly choose zones)
  74. random.shuffle(zones_folder)
  75. for id_zone, zone_folder in enumerate(zones_folder):
  76. zone_path = os.path.join(scene_path, zone_folder)
  77. data_filename = _metric + "_" + _choice + generic_output_file_svd
  78. data_file_path = os.path.join(zone_path, data_filename)
  79. # getting number of line and read randomly lines
  80. f = open(data_file_path)
  81. lines = f.readlines()
  82. counter = 0
  83. # check if user select current scene and zone to be part of training data set
  84. for line in lines:
  85. begin, end = _interval
  86. line_data = line.split(';')
  87. metrics = line_data[begin+1:end+1]
  88. metrics = [float(m) for m in metrics]
  89. min_value = min(metrics)
  90. max_value = max(metrics)
  91. if min_value < min_value_interval:
  92. min_value_interval = min_value
  93. if max_value > max_value_interval:
  94. max_value_interval = max_value
  95. counter += 1
  96. def generate_data_model(_filename, _interval, _choice, _metric, _scenes = scenes_list, _zones = zones_indices, _percent = 1, _norm = False, _sep=':', _index=True):
  97. output_train_filename = _filename + ".train"
  98. output_test_filename = _filename + ".test"
  99. if not '/' in output_train_filename:
  100. raise Exception("Please select filename with directory path to save data. Example : data/dataset")
  101. # create path if not exists
  102. if not os.path.exists(output_data_folder):
  103. os.makedirs(output_data_folder)
  104. train_file = open(output_train_filename, 'w')
  105. test_file = open(output_test_filename, 'w')
  106. scenes = os.listdir(path)
  107. # remove min max file from scenes folder
  108. scenes = [s for s in scenes if min_max_filename not in s]
  109. for id_scene, folder_scene in enumerate(scenes):
  110. # only take care of maxwell scenes
  111. if folder_scene in scenes_list:
  112. scene_path = os.path.join(path, folder_scene)
  113. zones_folder = []
  114. # create zones list
  115. for index in zones:
  116. index_str = str(index)
  117. if len(index_str) < 2:
  118. index_str = "0" + index_str
  119. zones_folder.append("zone"+index_str)
  120. for id_zone, zone_folder in enumerate(zones_folder):
  121. zone_path = os.path.join(scene_path, zone_folder)
  122. data_filename = _metric + "_" + _choice + generic_output_file_svd
  123. data_file_path = os.path.join(zone_path, data_filename)
  124. # getting number of line and read randomly lines
  125. f = open(data_file_path)
  126. lines = f.readlines()
  127. num_lines = len(lines)
  128. lines_indexes = np.arange(num_lines)
  129. random.shuffle(lines_indexes)
  130. path_seuil = os.path.join(zone_path, seuil_expe_filename)
  131. counter = 0
  132. # check if user select current scene and zone to be part of training data set
  133. for index in lines_indexes:
  134. line = construct_new_line(path_seuil, _interval, lines[index], _norm, _sep, _index)
  135. percent = counter / num_lines
  136. if id_zone in _zones and folder_scene in _scenes and percent <= _percent:
  137. train_file.write(line)
  138. else:
  139. test_file.write(line)
  140. counter += 1
  141. f.close()
  142. train_file.close()
  143. test_file.close()
  144. def main():
  145. p_custom = False
  146. if len(sys.argv) <= 1:
  147. print('python generate_data_model.py --output xxxx --interval 0,20 --kind svdne --metric lab --scenes "A, B, D" --zones "1, 2, 3, 4" --percent 0.7 --sep : --rowindex 1 --custom min_max_filename')
  148. sys.exit(2)
  149. try:
  150. opts, args = getopt.getopt(sys.argv[1:], "ho:i:k:s:z:p:r:c", ["help=", "output=", "interval=", "kind=", "metric=","scenes=", "zones=", "percent=", "sep=", "rowindex=", "custom="])
  151. except getopt.GetoptError:
  152. # print help information and exit:
  153. print('python generate_data_model.py --output xxxx --interval 0,20 --kind svdne --metric lab --scenes "A, B, D" --zones "1, 2, 3, 4" --percent 0.7 --sep : --rowindex 1 --custom min_max_filename')
  154. sys.exit(2)
  155. for o, a in opts:
  156. if o == "-h":
  157. print('python generate_data_model.py --output xxxx --interval 0,20 --kind svdne --metric lab --scenes "A, B, D" --zones "1, 2, 3, 4" --percent 0.7 --sep : --rowindex 1 --custom min_max_filename')
  158. sys.exit()
  159. elif o in ("-o", "--output"):
  160. p_filename = a
  161. elif o in ("-i", "--interval"):
  162. p_interval = list(map(int, a.split(',')))
  163. elif o in ("-k", "--kind"):
  164. p_kind = a
  165. elif o in ("-m", "--metric"):
  166. p_metric = a
  167. elif o in ("-s", "--scenes"):
  168. p_scenes = a.split(',')
  169. elif o in ("-z", "--zones"):
  170. if ',' in a:
  171. p_zones = list(map(int, a.split(',')))
  172. else:
  173. p_zones = [a.strip()]
  174. elif o in ("-p", "--percent"):
  175. p_percent = float(a)
  176. elif o in ("-s", "--sep"):
  177. p_sep = a
  178. elif o in ("-r", "--rowindex"):
  179. if int(a) == 1:
  180. p_rowindex = True
  181. else:
  182. p_rowindex = False
  183. elif o in ("-c", "--custom"):
  184. p_custom = a
  185. else:
  186. assert False, "unhandled option"
  187. # getting scenes from indexes user selection
  188. scenes_selected = []
  189. for scene_id in p_scenes:
  190. index = scenes_indexes.index(scene_id.strip())
  191. scenes_selected.append(scenes_list[index])
  192. # find min max value if necessary to renormalize data
  193. if p_custom:
  194. get_min_max_value_interval(p_filename, p_interval, p_kind, p_metric)
  195. # write new file to save
  196. if not os.path.exists(custom_min_max_folder):
  197. os.makedirs(custom_min_max_folder)
  198. min_max_folder_path = os.path.join(os.path.dirname(__file__), custom_min_max_folder)
  199. min_max_filename_path = os.path.join(min_max_folder_path, p_custom)
  200. with open(min_max_filename_path, 'w') as f:
  201. f.write(str(min_value_interval) + '\n')
  202. f.write(str(max_value_interval) + '\n')
  203. # create database using img folder (generate first time only)
  204. generate_data_model(p_filename, p_interval, p_kind, p_metric, scenes_selected, p_zones, p_percent, p_custom, p_sep, p_rowindex)
  205. if __name__== "__main__":
  206. main()