Browse Source

update prediction script for new dataset structure

Jérôme BUISINE 2 months ago
parent
commit
c409191b1c

+ 1 - 1
custom_config.py

@@ -22,7 +22,7 @@ results_information_folder      = os.path.join(output_data_folder, 'results')
 ## correlation_indices_folder      = 'corr_indices'
 
 # variables
-features_choices_labels                 = features_choices_labels + ['filters_statistics']
+features_choices_labels                 = features_choices_labels + ['filters_statistics', 'statistics_extended']
 optimization_filters_result_filename    = 'optimization_comparisons_filters.csv'
 optimization_attributes_result_filename = 'optimization_comparisons_attributes.csv'
 

+ 7 - 4
data_attributes.py

@@ -99,15 +99,18 @@ def get_image_features(data_type, block):
         bytes_data = np.array(block).tobytes()
         compress_data = gzip.compress(bytes_data)
 
-        data.append(data, sys.getsizeof(compress_data))
+        data = np.append(data, sys.getsizeof(compress_data))
+
+        lab_img = transform.get_LAB_L(block)
+        arr = np.array(lab_img)
 
         # add sobel complexity (kernel size of 5)
-        sobelx = cv2.Sobel(lab_img, cv2.CV_64F, 1, 0, ksize=5)
-        sobely = cv2.Sobel(lab_img, cv2.CV_64F, 0, 1,ksize=5)
+        sobelx = cv2.Sobel(arr, cv2.CV_64F, 1, 0, ksize=5)
+        sobely = cv2.Sobel(arr, cv2.CV_64F, 0, 1,ksize=5)
 
         sobel_mag = np.array(np.hypot(sobelx, sobely), 'uint8')  # magnitude
 
-        data.append(data, np.std(sobel_mag))
+        data = np.append(data, np.std(sobel_mag))
 
     if 'lab' in data_type:
 

+ 0 - 58
data_processing/generateAndTrain_maxwell_custom.sh

@@ -1,58 +0,0 @@
-#! bin/bash
-
-if [ -z "$1" ]
-  then
-    echo "No argument supplied"
-    echo "Need of vector size"
-    exit 1
-fi
-
-if [ -z "$2" ]
-  then
-    echo "No argument supplied"
-    echo "Need of feature information"
-    exit 1
-fi
-
-if [ -z "$3" ]
-  then
-    echo "No argument supplied"
-    echo "Need of kind of data to use"
-    exit 1
-fi
-
-size=$1
-feature=$2
-data=$3
-
-# selection of four scenes (only maxwell)
-scenes="A, D, G, H"
-
-start=0
-end=$size
-
-for nb_zones in {4,6,8,10,11,12}; do
-
-    for mode in {"svd","svdn","svdne"}; do
-        for model in {"svm_model","ensemble_model","ensemble_model_v2"}; do
-
-            FILENAME="data/${model}_N${size}_B${start}_E${end}_nb_zones_${nb_zones}_${feature}_${mode}_${data}"
-            MODEL_NAME="${model}_N${size}_B${start}_E${end}_nb_zones_${nb_zones}_${feature}_${mode}_${data}"
-            CUSTOM_MIN_MAX_FILENAME="N${size}_B${start}_E${end}_nb_zones_${nb_zones}_${feature}_${mode}_${data}_min_max"
-
-            echo $FILENAME
-
-            # only compute if necessary (perhaps server will fall.. Just in case)
-            if grep -q "${MODEL_NAME}" "${result_filename}"; then
-
-                echo "${MODEL_NAME} results already generated..."
-            else
-                python generate/generate_data_model_random_${data}.py --output ${FILENAME} --interval "${start},${end}" --kind ${mode} --feature ${feature} --scenes "${scenes}" --nb_zones "${nb_zones}" --percent 1 --renderer "maxwell" --step 10 --random 1 --custom ${CUSTOM_MIN_MAX_FILENAME}
-                #python train_model.py --data ${FILENAME} --output ${MODEL_NAME} --choice ${model}
-
-                #python prediction/predict_seuil_expe_maxwell.py --interval "${start},${end}" --model "saved_models/${MODEL_NAME}.joblib" --mode "${mode}" --feature ${feature} --limit_detection '2' --custom ${CUSTOM_MIN_MAX_FILENAME}
-                #python others/save_model_result_in_md_maxwell.py --interval "${start},${end}" --model "saved_models/${MODEL_NAME}.joblib" --mode "${mode}" --feature ${feature}
-            fi
-        done
-    done
-done

+ 0 - 68
data_processing/generateAndTrain_maxwell_custom_optimization.sh

@@ -1,68 +0,0 @@
-#! bin/bash
-
-if [ -z "$1" ]
-  then
-    echo "No argument supplied"
-    echo "Need of vector size"
-    exit 1
-fi
-
-if [ -z "$2" ]
-  then
-    echo "No argument supplied"
-    echo "Need of feature information"
-    exit 1
-fi
-
-if [ -z "$3" ]
-  then
-    echo "No argument supplied"
-    echo "Need of kind of data to use"
-    exit 1
-fi
-
-if [ -z "$4" ]
-  then
-    echo "No argument supplied"
-    echo "Use of filters or attributes"
-    exit 1
-fi
-
-
-size=$1
-feature=$2
-data=$3
-filter=$4
-
-
-# selection of four scenes (only maxwell)
-scenes="A, D, G, H"
-result_filename="results/optimization_comparisons_${filter}.csv"
-start=0
-end=$size
-
-#for nb_zones in {4,6,8,10,12}; do
-for nb_zones in {10,12}; do
-
-    for mode in {"svd","svdn","svdne"}; do
-        for model in {"svm_model","ensemble_model","ensemble_model_v2"}; do
-      
-            FILENAME="data/${model}_N${size}_B${start}_E${end}_nb_zones_${nb_zones}_${feature}_${mode}_${data}_${filter}"
-            MODEL_NAME="${model}_N${size}_B${start}_E${end}_nb_zones_${nb_zones}_${feature}_${mode}_${data}_${filter}"
-            CUSTOM_MIN_MAX_FILENAME="N${size}_B${start}_E${end}_nb_zones_${nb_zones}_${feature}_${mode}_${data}_${filter}_min_max"
-
-            echo $FILENAME
-
-            # only compute if necessary (perhaps server will fall.. Just in case)
-            if grep -q "${MODEL_NAME}" "${result_filename}"; then
-
-                echo "${MODEL_NAME} results already generated..."
-            else
-                python generate/generate_data_model_random_${data}.py --output ${FILENAME} --interval "${start},${end}" --kind ${mode} --feature ${feature} --scenes "${scenes}" --nb_zones "${nb_zones}" --percent 1 --renderer "maxwell" --step 40 --random 1 --custom ${CUSTOM_MIN_MAX_FILENAME}
-                
-                echo "Train ${MODEL_NAME}"
-                #python find_best_${filter}.py --data ${FILENAME} --choice ${model} &
-            fi
-        done
-    done
-done

+ 5 - 5
find_best_attributes.py

@@ -13,9 +13,9 @@ from sklearn.model_selection import GridSearchCV
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier, VotingClassifier
 
+import joblib
 import sklearn.svm as svm
 from sklearn.utils import shuffle
-from sklearn.externals import joblib
 from sklearn.metrics import roc_auc_score
 from sklearn.model_selection import cross_val_score
 
@@ -39,8 +39,8 @@ from optimization.checkpoints.BasicCheckpoint import BasicCheckpoint
 # variables and parameters
 models_list         = cfg.models_names_list
 number_of_values    = 26
-ils_iteration       = 1000
-ls_iteration        = 20
+ils_iteration       = 10
+ls_iteration        = 5
 
 # default validator
 def validator(solution):
@@ -52,7 +52,7 @@ def validator(solution):
 
 # init solution (26 attributes)
 def init():
-    return BinarySolution([], number_of_values).random(validator)
+    return BinarySolution([], 26).random(validator)
 
 def loadDataset(filename):
 
@@ -117,7 +117,7 @@ def main():
     if not os.path.exists(cfg.output_logs_folder):
         os.makedirs(cfg.output_logs_folder)
 
-    logging.basicConfig(format='%(asctime)s %(message)s', filename='logs/%s.log' % p_data_file.split('/')[-1], level=logging.DEBUG)
+    logging.basicConfig(format='%(asctime)s %(message)s', filename='data/logs/%s.log' % p_data_file.split('/')[-1], level=logging.DEBUG)
 
     # define evaluate function here (need of data information)
     def evaluate(solution):

+ 6 - 1
generate/generate_all_data_file.py

@@ -55,6 +55,7 @@ def generate_data_svd(data_type, mode, dataset, output):
 
         print(folder_scene)
         scene_path = os.path.join(dataset, folder_scene)
+        output_scene_path = os.path.join(output_data_folder, output, folder_scene)
 
         # getting output filename
         output_svd_filename = data_type + "_" + mode + generic_output_file_svd
@@ -72,7 +73,11 @@ def generate_data_svd(data_type, mode, dataset, output):
             current_zone = "zone"+index_str
             zones_folder.append(current_zone)
 
-            zone_path = os.path.join(scene_path, current_zone)
+            zone_path = os.path.join(output_scene_path, current_zone)
+
+            if not os.path.exists(zone_path):
+                os.makedirs(zone_path)
+
             svd_file_path = os.path.join(zone_path, output_svd_filename)
 
             # add writer into list

+ 5 - 4
models.py

@@ -10,13 +10,14 @@ import sklearn.svm as svm
 
 def _get_best_model(X_train, y_train):
 
-    Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
-    #Cs = [1, 2, 4, 8, 16, 32]
-    gammas = [0.001, 0.01, 0.1, 1, 5, 10, 100]
+    #Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
+    Cs = [1, 2, 4, 8, 16, 32]
+    # gammas = [0.001, 0.01, 0.1, 1, 5, 10, 100]
+    gammas = [0.001, 0.1, 1, 10, 100]
     param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
 
     svc = svm.SVC(probability=True)
-    clf = GridSearchCV(svc, param_grid, cv=10, scoring='accuracy', verbose=2)
+    clf = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', verbose=2)
 
     clf.fit(X_train, y_train)
 

+ 169 - 0
prediction/estimate_thresholds.py

@@ -0,0 +1,169 @@
+# main imports
+import numpy as np
+import pandas as pd
+import sys, os, argparse
+
+# image processing
+from PIL import Image
+from ipfml import utils
+from ipfml.processing import transform, segmentation
+
+import matplotlib.pyplot as plt
+
+# model imports
+import joblib
+
+# modules and config imports
+sys.path.insert(0, '') # trick to enable import of main folder module
+
+import custom_config as cfg
+from modules.utils import data as dt
+
+from data_attributes import get_image_features
+
+zones_indices  = cfg.zones_indices
+
+def write_progress(progress):
+    barWidth = 180
+
+    output_str = "["
+    pos = barWidth * progress
+    for i in range(barWidth):
+        if i < pos:
+           output_str = output_str + "="
+        elif i == pos:
+           output_str = output_str + ">"
+        else:
+            output_str = output_str + " "
+
+    output_str = output_str + "] " + str(int(progress * 100.0)) + " %\r"
+    print(output_str)
+    sys.stdout.write("\033[F")
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Read and compute model on scene in order to make predictions")
+
+    parser.add_argument('--folder', type=str, help='folder where scene data are stored', required=True)
+    parser.add_argument('--model', type=str, help='model file', required=True)
+    parser.add_argument('--solution', type=str, help='Data of solution to specify filters to use', required=True)
+    parser.add_argument('--method', type=str, help='method name to used', choices=cfg.features_choices_labels, default=cfg.features_choices_labels[0], required=True)
+    parser.add_argument('--kind', type=str, help='Kind of normalization level wished', choices=cfg.normalization_choices, required=True)
+    parser.add_argument('--n_stop', type=int, help='n consecutive prediction to stop', default=1)
+    parser.add_argument('--custom', type=str, help='Name of custom min max file if use of renormalization of data', default='')
+    parser.add_argument('--save', type=str, help='filename where to save input data', required=True)
+    parser.add_argument('--label', type=str, help='label to use when saving thresholds', required=True)
+
+    args = parser.parse_args()
+
+    p_model    = args.model
+    p_solution   = list(map(int, args.solution.split(' ')))
+    p_method   = args.method
+    p_n_stop   = args.n_stop
+    p_folder   = args.folder
+    p_mode     = args.kind
+    p_custom   = args.custom
+    p_save     = args.save
+    p_label    = args.label
+
+    if len(p_custom) > 0:
+        # need to read min_max_file
+        with open(p_custom, 'r') as f:
+            min_val = float(f.readline().replace('\n', ''))
+            max_val = float(f.readline().replace('\n', ''))
+
+    # 1. get scene name
+    scene_path = p_folder
+
+    # 2. load model and compile it
+
+    # TODO : check kind of model
+    model = joblib.load(p_model)
+    # model.compile(loss='binary_crossentropy',
+    #               optimizer='rmsprop',
+    #               metrics=['accuracy'])
+
+    # 3. get indices kept by solution
+    # get indices of attributes data to use (attributes selection from solution)
+    indices = []
+
+    for index, value in enumerate(p_solution): 
+        if value == 1: 
+            indices.append(index)
+
+    # 4. prepare scene to predict
+    estimated_thresholds = []
+    n_estimated_thresholds = []
+    zones_list = np.arange(16)
+
+    # 4. get estimated thresholds using model and specific method
+    images_path = sorted([os.path.join(scene_path, img) for img in os.listdir(scene_path) if cfg.scene_image_extension in img])
+    number_of_images = len(images_path)
+    image_indices = [ dt.get_scene_image_quality(img_path) for img_path in images_path ]
+
+    image_counter = 0
+
+
+    # append empty list
+    for _ in zones_list:
+        estimated_thresholds.append(None)
+        n_estimated_thresholds.append(0)
+
+    for img_i, img_path in enumerate(images_path):
+
+        blocks = segmentation.divide_in_blocks(Image.open(img_path), (200, 200))
+
+        for index, block in enumerate(blocks):
+            
+            if estimated_thresholds[index] is None:
+                
+                # check if prediction is possible
+                data = np.array(get_image_features(p_method, np.array(block)))
+
+                if p_mode == 'svdn':
+                    data = utils.normalize_arr_with_range(data)
+
+                if p_mode == 'svdne':
+                    data = utils.normalize_arr_with_range(data, min_val, max_val)
+
+                data = np.array(data)[indices]
+
+                #data = np.expand_dims(data, axis=0)
+                #print(data.shape)
+                
+                prob = model.predict(np.array(data).reshape(1, -1))[0]
+                #print(index, ':', image_indices[img_i], '=>', prob)
+
+                if prob < 0.5:
+                    n_estimated_thresholds[index] += 1
+
+                    # if same number of detection is attempted
+                    if n_estimated_thresholds[index] >= p_n_stop:
+                        estimated_thresholds[index] = image_indices[img_i]
+                else:
+                    n_estimated_thresholds[index] = 0
+
+        # write progress bar
+        write_progress((image_counter + 1) / number_of_images)
+        
+        image_counter = image_counter + 1
+    
+    # default label
+    for i, _ in enumerate(zones_list):
+        if estimated_thresholds[i] == None:
+            estimated_thresholds[i] = image_indices[-1]
+
+    # 6. save estimated thresholds into specific file
+    print(estimated_thresholds)
+    print(p_save)
+    if p_save is not None:
+        with open(p_save, 'a') as f:
+            f.write(p_label + ';')
+
+            for t in estimated_thresholds:
+                f.write(str(t) + ';')
+            f.write('\n')
+    
+
+if __name__== "__main__":  # run main() only when executed as a script, not on import
+    main()

+ 1 - 1
requirements.txt

@@ -10,4 +10,4 @@ matplotlib
 path.py
 pandas
 opencv-python
-gzip
+joblib

+ 1 - 1
train_model.py

@@ -23,7 +23,7 @@ import custom_config as cfg
 import models as mdl
 
 # variables and parameters
-saved_models_folder = cfg.saved_models_folder
+saved_models_folder = cfg.output_models
 models_list         = cfg.models_names_list
 
 current_dirpath     = os.getcwd()

+ 11 - 11
train_model_attributes.py

@@ -9,9 +9,9 @@ from sklearn.model_selection import GridSearchCV
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier, VotingClassifier
 
+import joblib
 import sklearn.svm as svm
 from sklearn.utils import shuffle
-from sklearn.externals import joblib
 from sklearn.metrics import accuracy_score, f1_score
 from sklearn.model_selection import cross_val_score
 
@@ -22,7 +22,7 @@ import custom_config as cfg
 import models as mdl
 
 # variables and parameters
-saved_models_folder = cfg.saved_models_folder
+saved_models_folder = cfg.output_models
 models_list         = cfg.models_names_list
 
 current_dirpath     = os.getcwd()
@@ -33,7 +33,7 @@ def main():
 
     parser = argparse.ArgumentParser(description="Train SKLearn model and save it into .joblib file")
 
-    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)')
+    parser.add_argument('--data', type=str, help='dataset filename prefiloc (without .train and .test)')
     parser.add_argument('--output', type=str, help='output file name desired for model (without .joblib extension)')
     parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list)
     parser.add_argument('--solution', type=str, help='Data of solution to specify filters to use')
@@ -59,12 +59,12 @@ def main():
     dataset_test = shuffle(dataset_test)
 
     # get dataset with equal number of classes occurences
-    noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1]
-    not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0]
+    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
+    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
     nb_noisy_train = len(noisy_df_train.index)
 
-    noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 1]
-    not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0]
+    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
+    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
     nb_noisy_test = len(noisy_df_test.index)
 
     final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
@@ -78,11 +78,11 @@ def main():
     final_df_test_size = len(final_df_test.index)
 
     # use of the whole data set for training
-    x_dataset_train = final_df_train.ix[:,1:]
-    x_dataset_test = final_df_test.ix[:,1:]
+    x_dataset_train = final_df_train.iloc[:,1:]
+    x_dataset_test = final_df_test.iloc[:,1:]
 
-    y_dataset_train = final_df_train.ix[:,0]
-    y_dataset_test = final_df_test.ix[:,0]
+    y_dataset_train = final_df_train.iloc[:,0]
+    y_dataset_test = final_df_test.iloc[:,0]
 
     # get indices of filters data to use (filters selection from solution)
     indices = []

+ 2 - 2
train_model_filters.py

@@ -9,9 +9,9 @@ from sklearn.model_selection import GridSearchCV
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier, VotingClassifier
 
+import joblib
 import sklearn.svm as svm
 from sklearn.utils import shuffle
-from sklearn.externals import joblib
 from sklearn.metrics import accuracy_score, f1_score
 from sklearn.model_selection import cross_val_score
 
@@ -22,7 +22,7 @@ import custom_config as cfg
 import models as mdl
 
 # variables and parameters
-saved_models_folder = cfg.saved_models_folder
+saved_models_folder = cfg.output_models
 models_list         = cfg.models_names_list
 
 current_dirpath     = os.getcwd()