6 years ago · 9d3f09fdf0
--- a/analysis/svd_reconstruction_analysis.ipynb
+++ b/analysis/svd_reconstruction_analysis.ipynb
--- a/analysis/svd_zones_analysis.ipynb
+++ b/analysis/svd_zones_analysis.ipynb
--- a/deep_network_keras_svd.py
+++ b/deep_network_keras_svd.py
@@ -4,7 +4,6 @@ from keras.layers import Conv1D, MaxPooling1D
 
				 from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization
			
 
				 from keras.wrappers.scikit_learn import KerasClassifier
			
 
				 from keras import backend as K
			
 
				-import matplotlib.pyplot as plt
			
 
				 
			
 
				 from sklearn.utils import shuffle
			
 
				 from sklearn.metrics import roc_auc_score
			
--- a/generateAndTrain_maxwell_custom_split.sh
+++ b/generateAndTrain_maxwell_custom_split.sh
@@ -0,0 +1,74 @@
 
				#!/bin/bash
				#
				# Generate data sets and train every model variant for one vector size and
				# one metric, on the four selected maxwell scenes.
				#
				# Usage: bash generateAndTrain_maxwell_custom_split.sh <vector_size> <metric>
				#
				# Five windows of <vector_size> components are taken inside the
				# VECTOR_SIZE-long vector (first one at the start, the following ones centred
				# on 50, 100, 150, the last one clamped to the end). For each window the
				# script loops over the number of learned zones, the normalisation mode and
				# the model kind.

				if [ -z "$1" ]
				  then
				    echo "No argument supplied"
				    echo "Need of vector size"
				    exit 1
				fi

				if [ -z "$2" ]
				  then
				    echo "No argument supplied"
				    echo "Need of metric information"
				    exit 1
				fi

				result_filename="models_info/models_comparisons.csv"
				VECTOR_SIZE=200
				size=$1
				metric=$2

				# selection of four scenes (only maxwell)
				scenes="A, D, G, H"

				half=$((size / 2))
				start=$((-half))
				for counter in {0..4}; do
				    end=$((start + size))

				    # keep the window inside [0, VECTOR_SIZE]
				    if [ "$end" -gt "$VECTOR_SIZE" ]; then
				        start=$((VECTOR_SIZE - size))
				        end=$((VECTOR_SIZE))
				    fi

				    if [ "$start" -lt "0" ]; then
				        start=$((0))
				        end=$((size))
				    fi

				    for nb_zones in 4 6 8 10 12; do

				        echo "${start} ${end}"

				        for mode in svd svdn svdne; do
				            for model in svm_model ensemble_model ensemble_model_v2; do

				                FILENAME="data/${model}_N${size}_B${start}_E${end}_nb_zones_${nb_zones}_${metric}_${mode}"
				                MODEL_NAME="${model}_N${size}_B${start}_E${end}_nb_zones_${nb_zones}_${metric}_${mode}"
				                CUSTOM_MIN_MAX_FILENAME="N${size}_B${start}_E${end}_nb_zones_${nb_zones}_${metric}_${mode}_min_max"

				                echo "${FILENAME}"

				                # only compute if necessary (perhaps server will fall.. Just in case)
				                # -w: whole-word match, otherwise "..._svd" is found inside
				                #     the "..._svdn" / "..._svdne" rows and wrongly skipped
				                # -F: the name is a literal string, not a pattern
				                # -s: stay quiet when the results file does not exist yet
				                if grep -qswF -- "${MODEL_NAME}" "${result_filename}"; then

				                    echo "${MODEL_NAME} results already generated..."
				                else
				                    python generate_data_model_random_split.py --output "${FILENAME}" --interval "${start},${end}" --kind "${mode}" --metric "${metric}" --scenes "${scenes}" --nb_zones "${nb_zones}" --percent 1 --renderer "maxwell" --step 40 --random 1 --custom "${CUSTOM_MIN_MAX_FILENAME}"
				                    python train_model.py --data "${FILENAME}" --output "${MODEL_NAME}" --choice "${model}"

				                    #python predict_seuil_expe_maxwell.py --interval "${start},${end}" --model "saved_models/${MODEL_NAME}.joblib" --mode "${mode}" --metric ${metric} --limit_detection '2' --custom ${CUSTOM_MIN_MAX_FILENAME}
				                    python save_model_result_in_md_maxwell.py --interval "${start},${end}" --model "saved_models/${MODEL_NAME}.joblib" --mode "${mode}" --metric "${metric}"
				                fi
				            done
				        done
				    done

				    # the first window starts at 0; the next ones are centred on multiples of 50
				    if [ "$counter" -eq "0" ]; then
				        start=$((start + 50 - half))
				    else
				        start=$((start + 50))
				    fi

				done
			
--- a/generate_data_model_random_split.py
+++ b/generate_data_model_random_split.py
@@ -0,0 +1,313 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on Fri Sep 14 21:02:42 2018
			
 
				+
			
 
				+@author: jbuisine
			
 
				+"""
			
 
				+
			
 
				+from __future__ import print_function
			
 
				+import sys, os, argparse
			
 
				+import numpy as np
			
 
				+import random
			
 
				+import time
			
 
				+import json
			
 
				+
			
 
				+from PIL import Image
			
 
				+from ipfml import processing, metrics, utils
			
 
				+
			
 
				+from modules.utils import config as cfg
			
 
				+from modules.utils import data as dt
			
 
				+
			
 
				+# getting configuration information
			
 
				# --- settings read from the project configuration module ---
				config_filename = cfg.config_filename
				learned_folder = cfg.learned_zones_folder
				min_max_filename = cfg.min_max_filename_extension

				# scene names and indices declared in the configuration
				all_scenes_list = cfg.scenes_names
				all_scenes_indices = cfg.scenes_indices

				normalization_choices = cfg.normalization_choices
				path = cfg.dataset_path
				zones = cfg.zones_indices
				seuil_expe_filename = cfg.seuil_expe_filename

				# allowed values for the --renderer / --metric command line options
				renderer_choices = cfg.renderer_choices
				metric_choices = cfg.metric_choices_labels
				output_data_folder = cfg.output_data_folder
				custom_min_max_folder = cfg.min_max_custom_folder
				min_max_ext = cfg.min_max_filename_extension

				# suffix shared by every per-zone data file read by this script
				generic_output_file_svd = '_random.csv'

				# running extremes, updated by get_min_max_value_interval()
				min_value_interval = sys.maxsize
				max_value_interval = 0

				# a line is kept by generate_data_model() only when its image index is
				# further than this from the zone threshold
				abs_gap_data = 100
			
 
				+
			
 
				+
			
 
				def construct_new_line(seuil_learned, interval, line, choice, each, norm):
				    """Build one labelled data set row from a raw ';'-separated data line.

				    Args:
				        seuil_learned: integer threshold compared with the first field of
				            `line`.
				        interval: `(begin, end)` pair; the values kept are fields
				            `begin + 1` to `end` of the line (field 0 is not a feature).
				        line: raw row, first field followed by the feature values.
				        choice: normalisation kind; only 'svdne' and 'svdn' trigger a
				            renormalisation, and only when `norm` is truthy.
				        each: keep one value out of `each` inside the interval.
				        norm: truthy when the values must be renormalised here.

				    Returns:
				        The row '<label>;<v1>;...;<vn>' terminated by a newline, where the
				        label is '1' when `seuil_learned` is greater than the first field
				        of the line and '0' otherwise.
				    """
				    begin, end = interval

				    fields = line.split(';')
				    seuil = fields[0]

				    # keep only if modulo result is 0 (keep only each wanted values)
				    values = [
				        float(raw)
				        for idx, raw in enumerate(fields[begin + 1:end + 1])
				        if idx % each == 0
				    ]

				    # TODO : check if it's always necessary to do that (loss of information for svd)
				    if norm:
				        if choice == 'svdne':
				            values = utils.normalize_arr_with_range(values, min_value_interval, max_value_interval)
				        elif choice == 'svdn':
				            values = utils.normalize_arr(values)

				    label = '1' if seuil_learned > int(seuil) else '0'

				    return ';'.join([label] + [str(val) for val in values]) + '\n'
			
 
				+
			
 
				def get_min_max_value_interval(_scenes_list, _interval, _metric):
				    """Widen the module-level min/max with the values found in an interval.

				    Every folder of `path` that is listed in `_scenes_list` is visited; for
				    each zone of `zones` the '<metric>_svd_random.csv' file is read and the
				    smallest / largest value of the interval updates the globals
				    `min_value_interval` and `max_value_interval`.

				    Args:
				        _scenes_list: names of the scene folders to take into account.
				        _interval: `(begin, end)` pair; fields `begin + 1` to `end` of each
				            line are inspected.
				        _metric: metric name used as prefix of the data file name.

				    Returns:
				        None; the result is left in the two module-level variables.
				    """
				    global min_value_interval, max_value_interval

				    begin, end = _interval

				    # the not yet normalised svd values are always the ones read here
				    data_filename = _metric + "_svd" + generic_output_file_svd

				    # remove min max file from scenes folder
				    scenes = [s for s in os.listdir(path) if min_max_filename not in s]

				    for folder_scene in scenes:

				        # only take care of the requested scenes
				        if folder_scene not in _scenes_list:
				            continue

				        scene_path = os.path.join(path, folder_scene)

				        for index in zones:

				            # zone folders are named with a two digit index: zone00, zone01, ...
				            zone_path = os.path.join(scene_path, "zone" + str(index).zfill(2))
				            data_file_path = os.path.join(zone_path, data_filename)

				            # the context manager closes the file, which was left open before
				            with open(data_file_path) as data_file:
				                for line in data_file:

				                    values = [float(m) for m in line.split(';')[begin + 1:end + 1]]

				                    # a line without any value in the interval has no extremes
				                    if not values:
				                        continue

				                    min_value_interval = min(min_value_interval, min(values))
				                    max_value_interval = max(max_value_interval, max(values))
			
 
				+
			
 
				+
			
 
				def generate_data_model(_scenes_list, _filename, _interval, _choice, _metric, _scenes, _nb_zones=4, _percent=1, _random=0, _step=1, _each=1, _custom=False):
				    """Write the '<_filename>.train' and '<_filename>.test' data set files.

				    For every scene of `_scenes_list` and every zone of `zones`, the zone
				    data file is read and each kept line is turned into a labelled row by
				    `construct_new_line`. The zones learned for each scene are also saved
				    in '<learned_folder>/<name>/<scene>.csv'.

				    Args:
				        _scenes_list: scene folders to read.
				        _filename: output path without extension; must contain a '/'.
				        _interval: `(begin, end)` interval of values kept from each line.
				        _choice: normalisation kind, used in the data file name when
				            `_custom` is falsy and forwarded to `construct_new_line`.
				        _metric: metric name, prefix of the data file name.
				        _scenes: scenes whose learned zones feed the train file.
				        _nb_zones: number of zones (first ones of the, possibly shuffled,
				            zone list) used for training.
				        _percent: a line goes to the train file only while its position in
				            the file divided by the number of lines is <= this value.
				        _random: truthy to shuffle the zones and the lines of each file.
				        _step: only lines whose image index is a multiple of it are kept.
				        _each: keep one value out of `_each` inside the interval.
				        _custom: truthy to read the raw '_svd' files and renormalise them.

				    Raises:
				        Exception: when `_filename` has no directory part.
				    """
				    output_train_filename = _filename + ".train"
				    output_test_filename = _filename + ".test"

				    if '/' not in output_train_filename:
				        raise Exception("Please select filename with directory path to save data. Example : data/dataset")

				    # create path if not exists
				    if not os.path.exists(output_data_folder):
				        os.makedirs(output_data_folder)

				    train_file_data = []
				    test_file_data = []

				    folder_learned_path = os.path.join(learned_folder, _filename.split('/')[1])

				    for folder_scene in _scenes_list:

				        scene_path = os.path.join(path, folder_scene)

				        # work on a copy: shuffling `zones` itself would reorder the list
				        # shared with the configuration module
				        zones_indices = list(zones)

				        # shuffle list of zones (=> randomly choose zones)
				        # only in random mode
				        if _random:
				            random.shuffle(zones_indices)

				        # store zones learned
				        learned_zones_indices = zones_indices[:_nb_zones]

				        if not os.path.exists(folder_learned_path):
				            os.makedirs(folder_learned_path)

				        file_learned_path = os.path.join(folder_learned_path, folder_scene + '.csv')

				        with open(file_learned_path, 'w') as learned_file:
				            for i in learned_zones_indices:
				                learned_file.write(str(i) + ';')

				        for id_zone, index_folder in enumerate(zones_indices):

				            # zone folders are named with a two digit index: zone00, zone01, ...
				            zone_path = os.path.join(scene_path, "zone" + str(index_folder).zfill(2))

				            # if custom normalization choices then we use svd values not already normalized
				            if _custom:
				                data_filename = _metric + "_svd" + generic_output_file_svd
				            else:
				                data_filename = _metric + "_" + _choice + generic_output_file_svd

				            data_file_path = os.path.join(zone_path, data_filename)

				            # closed by the context manager even when a later step fails
				            with open(data_file_path) as data_file:
				                lines = data_file.readlines()

				            num_lines = len(lines)

				            # randomly shuffle image
				            if _random:
				                random.shuffle(lines)

				            # the threshold is the same for the whole zone: read it once
				            path_seuil = os.path.join(zone_path, seuil_expe_filename)

				            with open(path_seuil, "r") as seuil_file:
				                seuil_learned = int(seuil_file.readline().strip())

				            # check if user select current scene and zone to be part of training data set
				            is_learned_zone = id_zone < _nb_zones and folder_scene in _scenes

				            for counter, data in enumerate(lines):

				                percent = counter / num_lines
				                image_index = int(data.split(';')[0])

				                if image_index % _step != 0:
				                    continue

				                # images too close to the threshold are left out
				                if abs(seuil_learned - image_index) <= abs_gap_data:
				                    continue

				                line = construct_new_line(seuil_learned, _interval, data, _choice, _each, _custom)

				                if is_learned_zone and percent <= _percent:
				                    train_file_data.append(line)
				                else:
				                    test_file_data.append(line)

				    with open(output_train_filename, 'w') as train_file:
				        train_file.writelines(train_file_data)

				    with open(output_test_filename, 'w') as test_file:
				        test_file.writelines(test_file_data)
			
 
				+
			
 
				+
			
 
				def main():
				    """Parse the command line, then build the train and test data set files.

				    When --custom is given, the min / max values of the interval are first
				    computed over all the scenes of the renderer and saved, one per line,
				    in the custom min max folder located next to this script.
				    """
				    # getting all params
				    parser = argparse.ArgumentParser(description="Generate data for model using correlation matrix information from data")

				    # options without any usable fallback are required, so that a missing
				    # one is reported by argparse instead of failing later on None
				    parser.add_argument('--output', type=str, help='output file name desired (.train and .test)', required=True)
				    # the default must not embed quote characters: int('"0') raises
				    parser.add_argument('--interval', type=str, help='Interval value to keep from svd', default='0,200')
				    parser.add_argument('--kind', type=str, help='Kind of normalization level wished', choices=normalization_choices, required=True)
				    parser.add_argument('--metric', type=str, help='Metric data choice', choices=metric_choices, required=True)
				    parser.add_argument('--scenes', type=str, help='List of scenes to use for training data', required=True)
				    # same defaults as the generate_data_model() signature
				    parser.add_argument('--nb_zones', type=int, help='Number of zones to use for training data set', default=4)
				    parser.add_argument('--random', type=int, help='Data will be randomly filled or not', choices=[0, 1], default=0)
				    parser.add_argument('--percent', type=float, help='Percent of data use for train and test dataset (by default 1)', default=1)
				    parser.add_argument('--step', type=int, help='Photo step to keep for build datasets', default=1)
				    parser.add_argument('--each', type=int, help='Each features to keep from interval', default=1)
				    parser.add_argument('--renderer', type=str, help='Renderer choice in order to limit scenes used', choices=renderer_choices, default='all')
				    parser.add_argument('--custom', type=str, help='Name of custom min max file if use of renormalization of data', default=False)

				    args = parser.parse_args()

				    p_filename = args.output
				    p_interval = list(map(int, args.interval.split(',')))
				    p_kind = args.kind
				    p_metric = args.metric
				    p_scenes = args.scenes.split(',')
				    p_nb_zones = args.nb_zones
				    p_random = args.random
				    p_percent = args.percent
				    p_step = args.step
				    p_each = args.each
				    p_renderer = args.renderer
				    p_custom = args.custom

				    # list all possibles choices of renderer
				    scenes_list = dt.get_renderer_scenes_names(p_renderer)
				    scenes_indices = dt.get_renderer_scenes_indices(p_renderer)

				    # getting scenes from indexes user selection
				    scenes_selected = [
				        scenes_list[scenes_indices.index(scene_id.strip())]
				        for scene_id in p_scenes
				    ]

				    # find min max value if necessary to renormalize data
				    if p_custom:
				        get_min_max_value_interval(scenes_list, p_interval, p_metric)

				        # create the very folder the file is written into; checking the
				        # bare folder name only worked when run from the script directory
				        min_max_folder_path = os.path.join(os.path.dirname(__file__), custom_min_max_folder)

				        if not os.path.exists(min_max_folder_path):
				            os.makedirs(min_max_folder_path)

				        min_max_filename_path = os.path.join(min_max_folder_path, p_custom)

				        # write new file to save
				        with open(min_max_filename_path, 'w') as f:
				            f.write(str(min_value_interval) + '\n')
				            f.write(str(max_value_interval) + '\n')

				    # create database using img folder (generate first time only)
				    generate_data_model(scenes_list, p_filename, p_interval, p_kind, p_metric, scenes_selected, p_nb_zones, p_percent, p_random, p_step, p_each, p_custom)


				if __name__ == "__main__":
				    main()
			
--- a/runAll_maxwell_custom_split.sh
+++ b/runAll_maxwell_custom_split.sh
@@ -0,0 +1,24 @@
 
				#!/bin/bash
				#
				# Run generateAndTrain_maxwell_custom_split.sh for every vector size and
				# every metric.
				#
				# Usage: bash runAll_maxwell_custom_split.sh [Y]
				#   Y : erase "models_info/models_comparisons.csv" and write a new header

				file_path='models_info/models_comparisons.csv'

				erased=$1

				if [ "${erased}" = "Y" ]; then
				    echo "Previous data file erased..."
				    # -f: no error when there is no previous file to remove
				    rm -f "${file_path}"
				    mkdir -p models_info
				    touch "${file_path}"

				    # add of header
				    echo 'model_name; vector_size; start; end; nb_zones; metric; mode; tran_size; val_size; test_size; train_pct_size; val_pct_size; test_pct_size; train_acc; val_acc; test_acc; all_acc; F1_train; recall_train; roc_auc_train; F1_val; recall_val; roc_auc_val; F1_test; recall_test; roc_auc_test; F1_all; recall_all; roc_auc_all;' >> "${file_path}"

				fi

				for size in 4 8 16 26 32 40; do

				    for metric in lab mscn low_bits_2 low_bits_3 low_bits_4 low_bits_5 low_bits_6 low_bits_4_shifted_2; do
				        bash generateAndTrain_maxwell_custom_split.sh "${size}" "${metric}"
				    done
				done