Browse Source

use of thresholds file when displaying data

Jérôme BUISINE 4 years ago
parent
commit
32872476cf

File diff suppressed because it is too large
+ 92 - 21
analysis/svd_reconstruction_analysis.ipynb


File diff suppressed because it is too large
+ 10 - 3
custom_config.py


+ 37 - 4
data_attributes.py

@@ -298,12 +298,12 @@ def get_image_features(data_type, block):
             
         sv_array = np.array(sv_vector)
         
-        _, len = sv_array.shape
+        _, length = sv_array.shape
         
         sv_std = []
         
         # normalize each SV vectors and compute standard deviation for each sub vectors
-        for i in range(len):
+        for i in range(length):
             sv_array[:, i] = utils.normalize_arr(sv_array[:, i])
             sv_std.append(np.std(sv_array[:, i]))
         
@@ -340,12 +340,12 @@ def get_image_features(data_type, block):
             
         sv_array = np.array(sv_vector)
         
-        _, len = sv_array.shape
+        _, length = sv_array.shape
         
         sv_std = []
         
         # normalize each SV vectors and compute standard deviation for each sub vectors
-        for i in range(len):
+        for i in range(length):
             sv_array[:, i] = utils.normalize_arr(sv_array[:, i])
             sv_std.append(np.std(sv_array[:, i]))
         
@@ -672,6 +672,39 @@ def get_image_features(data_type, block):
 
         data = s
 
+    if data_type == 'svd_entropy':
+        l_img = transform.get_LAB_L(block)
+
+        blocks = segmentation.divide_in_blocks(l_img, (20, 20))
+
+        values = []
+        for b in blocks:
+            sv = compression.get_SVD_s(b)
+            values.append(utils.get_entropy(sv))
+        data = np.array(values)
+
+    if data_type == 'svd_entropy_20':
+        l_img = transform.get_LAB_L(block)
+
+        blocks = segmentation.divide_in_blocks(l_img, (20, 20))
+
+        values = []
+        for b in blocks:
+            sv = compression.get_SVD_s(b)
+            values.append(utils.get_entropy(sv))
+        data = np.array(values)
+
+    if data_type == 'svd_entropy_noise_20':
+        l_img = transform.get_LAB_L(block)
+
+        blocks = segmentation.divide_in_blocks(l_img, (20, 20))
+
+        values = []
+        for b in blocks:
+            sv = compression.get_SVD_s(b)
+            sv_size = len(sv)
+            values.append(utils.get_entropy(sv[int(sv_size / 4):]))
+        data = np.array(values)
         
     return data
 

+ 0 - 231
display/display_svd_data_scene.py

@@ -1,231 +0,0 @@
-# main imports
-import sys, os, argparse
-import numpy as np
-import math
-
-# image processing imports
-from PIL import Image
-import matplotlib.pyplot as plt
-
-import ipfml.iqa.fr as fr_iqa
-from ipfml import utils
-
-# modules and config imports
-sys.path.insert(0, '') # trick to enable import of main folder module
-
-import custom_config as cfg
-from modules.utils import data as dt
-from data_attributes import get_image_features
-
-# getting configuration information
-zone_folder         = cfg.zone_folder
-min_max_filename    = cfg.min_max_filename_extension
-
-# define all scenes values
-scenes_list         = cfg.scenes_names
-scenes_indices      = cfg.scenes_indices
-choices             = cfg.normalization_choices
-path                = cfg.dataset_path
-zones               = cfg.zones_indices
-seuil_expe_filename = cfg.seuil_expe_filename
-
-features_choices    = cfg.features_choices_labels
-
-max_nb_bits         = 8
-display_error       = False
-
-
-def display_svd_values(p_scene, p_interval, p_indices, p_feature, p_mode, p_step, p_norm, p_ylim):
-    """
-    @brief Method which gives information about svd curves from zone of picture
-    @param p_scene, scene expected to show svd values
-    @param p_interval, interval [begin, end] of svd data to display
-    @param p_interval, interval [begin, end] of samples or minutes from render generation engine
-    @param p_feature, feature computed to show
-    @param p_mode, normalization's mode
-    @param p_norm, normalization or not of selected svd data
-    @param p_ylim, ylim choice to better display of data
-    @return nothing
-    """
-
-    max_value_svd = 0
-    min_value_svd = sys.maxsize
-
-    scenes = os.listdir(path)
-    # remove min max file from scenes folder
-    scenes = [s for s in scenes if min_max_filename not in s]
-
-    begin_data, end_data = p_interval
-    begin_index, end_index = p_indices
-
-    # go ahead each scenes
-    for folder_scene in scenes:
-
-        if p_scene == folder_scene:
-            scene_path = os.path.join(path, folder_scene)
-
-            # construct each zones folder name
-            zones_folder = []
-
-            # get zones list info
-            for index in zones:
-                index_str = str(index)
-                if len(index_str) < 2:
-                    index_str = "0" + index_str
-
-                current_zone = "zone"+index_str
-                zones_folder.append(current_zone)
-
-            images_data = []
-            images_indices = []
-
-            threshold_learned_zones = []
-    
-            # get all images of folder
-            scene_images = sorted([os.path.join(scene_path, img) for img in os.listdir(scene_path) if cfg.scene_image_extension in img])
-            number_scene_image = len(scene_images)
-            
-            for id, zone_folder in enumerate(zones_folder):
-
-                # get threshold information
-                zone_path = os.path.join(scene_path, zone_folder)
-                path_seuil = os.path.join(zone_path, seuil_expe_filename)
-
-                # open treshold path and get this information
-                with open(path_seuil, "r") as seuil_file:
-                    threshold_learned = int(seuil_file.readline().strip())
-                    threshold_learned_zones.append(threshold_learned)
-
-            threshold_mean = np.mean(np.asarray(threshold_learned_zones))
-            threshold_image_found = False
-
-            svd_data = []
-
-
-            # for each images
-            for id_img, img_path in enumerate(scene_images):
-                
-                current_quality_image = dt.get_scene_image_quality(img_path)
-
-                img = Image.open(img_path)
-
-                svd_values = get_image_features(p_feature, img)
-
-                if p_norm:
-                    svd_values = svd_values[begin_data:end_data]
-
-                #svd_values = np.asarray([math.log(x) for x in svd_values])
-
-                # update min max values
-                min_value = svd_values.min()
-                max_value = svd_values.max()
-
-                if min_value < min_value_svd:
-                    min_value_svd = min_value
-
-                if max_value > min_value_svd:
-                    max_value_svd = max_value
-
-                # keep in memory used data
-                if current_quality_image % p_step == 0:
-                    if current_quality_image >= begin_index and current_quality_image <= end_index:
-
-                        images_indices.append(dt.get_scene_image_postfix(img_path))
-                        svd_data.append(svd_values)
-
-                if threshold_mean < current_quality_image and not threshold_image_found:
-
-                    threshold_image_found = True
-                    threshold_image_zone = current_quality_image
-
-                    print("Quality mean : ", current_quality_image, "\n")
-                    
-                    if dt.get_scene_image_postfix(img_path) not in images_indices:
-                        images_indices.append(dt.get_scene_image_postfix(img_path))
-
-                print('%.2f%%' % ((id_img + 1) / number_scene_image * 100))
-                sys.stdout.write("\033[F")
-
-
-            # all indices of picture to plot
-            print(images_indices)
-
-            for id, data in enumerate(svd_data):
-
-                # current_data = [ math.log10(d + 1.) for d in data ]
-                # print(current_data)
-
-                current_data = data
-
-                if not p_norm:
-                    current_data = current_data[begin_data:end_data]
-
-                if p_mode == 'svdn':
-                    current_data = utils.normalize_arr(current_data)
-
-                if p_mode == 'svdne':
-                    current_data = utils.normalize_arr_with_range(current_data, min_value_svd, max_value_svd)
-
-                images_data.append(current_data)
-
-
-            # display all data using matplotlib (configure plt)
-            fig, ax = plt.subplots(figsize=(30, 22))
-            ax.set_facecolor('#FFFFFF')
-            #fig.patch.set_facecolor('#F9F9F9')
-
-            ax.tick_params(labelsize=26)
-            #plt.rc('xtick', labelsize=22)
-            #plt.rc('ytick', labelsize=22)
-
-            #plt.title(p_scene + ' scene interval information SVD['+ str(begin_data) +', '+ str(end_data) +'], from scenes indices [' + str(begin_index) + ', '+ str(end_index) + '], ' + p_feature + ' feature, ' + p_mode + ', with step of ' + str(p_step) + ', svd norm ' + str(p_norm), fontsize=24)
-            ax.set_ylabel('Component values', fontsize=36)
-            ax.set_xlabel('Singular value component indices', fontsize=36)
-
-            for id, data in enumerate(images_data):
-
-                #p_label = p_scene + "_" + images_indices[id]
-                p_label = images_indices[id] + " samples"
-
-                if int(images_indices[id]) == int(threshold_image_zone):
-                    ax.plot(data, label=p_label + " (threshold mean)", lw=6, color='red')
-                else:
-                    ax.plot(data, label=p_label)
-
-            plt.legend(bbox_to_anchor=(0.60, 0.98), loc=2, borderaxespad=0.2, fontsize=32)
-
-            start_ylim, end_ylim = p_ylim
-            ax.set_ylim(start_ylim, end_ylim)
-
-            plot_name = p_scene + '_' + p_feature + '_' + str(p_step) + '_' + p_mode + '_' + str(p_norm) + '.png'
-            plt.title('Tend of Singular values at different samples of Flat scene', fontsize=40)
-            plt.savefig(plot_name, transparent=True)
-
-def main():
-
-    parser = argparse.ArgumentParser(description="Display SVD data of scene")
-
-    parser.add_argument('--scene', type=str, help='scene index to use', choices=cfg.scenes_indices)
-    parser.add_argument('--interval', type=str, help='Interval value to keep from svd', default='"0, 200"')
-    parser.add_argument('--indices', type=str, help='Samples interval to display', default='"0, 900"')
-    parser.add_argument('--feature', type=str, help='feature data choice', choices=features_choices)
-    parser.add_argument('--mode', type=str, help='Kind of normalization level wished', choices=cfg.normalization_choices)
-    parser.add_argument('--step', type=int, help='Each step samples to display', default=10)
-    parser.add_argument('--norm', type=int, help='If values will be normalized or not', choices=[0, 1])
-    parser.add_argument('--ylim', type=str, help='ylim interval to use', default='0,1')
-
-    args = parser.parse_args()
-
-    p_scene    = scenes_list[scenes_indices.index(args.scene)]
-    p_indices  = list(map(int, args.indices.split(',')))
-    p_interval = list(map(int, args.interval.split(',')))
-    p_feature  = args.feature
-    p_mode     = args.mode
-    p_step     = args.step
-    p_norm     = args.norm
-    p_ylim     = list(map(float, args.ylim.split(',')))
-
-    display_svd_values(p_scene, p_interval, p_indices, p_feature, p_mode, p_step, p_norm, p_ylim)
-
-if __name__== "__main__":
-    main()

+ 236 - 0
display/display_svd_data_scene_file.py

@@ -0,0 +1,236 @@
+# main imports
+import sys, os, argparse
+import numpy as np
+import math
+
+# image processing imports
+from PIL import Image
+import matplotlib.pyplot as plt
+
+import ipfml.iqa.fr as fr_iqa
+from ipfml import utils
+
+# modules and config imports
+sys.path.insert(0, '') # trick to enable import of main folder module
+
+import custom_config as cfg
+from modules.utils import data as dt
+from data_attributes import get_image_features
+
+# getting configuration information
+zone_folder         = cfg.zone_folder
+min_max_filename    = cfg.min_max_filename_extension
+
+# define all scenes values
+scenes_list         = cfg.scenes_names
+scenes_indices      = cfg.scenes_indices
+choices             = cfg.normalization_choices
+zones               = cfg.zones_indices
+seuil_expe_filename = cfg.seuil_expe_filename
+
+features_choices    = cfg.features_choices_labels
+
+max_nb_bits         = 8
+display_error       = False
+
+
+def display_svd_values(p_scene, p_thresholds, p_interval, p_indices, p_feature, p_mode, p_step, p_norm, p_ylim, p_label):
+    """
+    @brief Method which plots the feature curves of the images of a scene and saves the figure as a PNG file
+    @param p_scene, path of the scene folder which contains the images
+    @param p_thresholds, dictionary which gives for each scene name its list of thresholds
+    @param p_interval, interval [begin, end] of svd data to display
+    @param p_indices, interval [begin, end] of samples or minutes from render generation engine
+    @param p_feature, feature computed to show
+    @param p_mode, normalization's mode
+    @param p_step, only images whose quality index is a multiple of this step are displayed
+    @param p_norm, if set, the interval is extracted before the min/max update instead of just before normalization
+    @param p_ylim, ylim choice to better display of data
+    @param p_label, scene label inserted into the figure title
+    @return nothing
+    """
+
+    # global extrema over all images, only consumed by the 'svdne' mode
+    max_value_svd = float('-inf')
+    min_value_svd = float('inf')
+
+    begin_data, end_data = p_interval
+    begin_index, end_index = p_indices
+
+    # go ahead selected scene
+    scene_path = p_scene
+
+    # normpath removes a trailing separator, otherwise the scene name would be empty
+    scene_name = os.path.basename(os.path.normpath(p_scene))
+
+    # get all images of folder
+    scene_images = sorted([os.path.join(scene_path, img) for img in os.listdir(scene_path) if cfg.scene_image_extension in img])
+    number_scene_image = len(scene_images)
+
+    threshold_mean = np.mean(np.asarray(p_thresholds[scene_name]))
+
+    # quality index of the first image above the mean threshold (None while not found)
+    threshold_image_zone = None
+
+    # both lists are always filled together so that labels and curves stay aligned
+    images_indices = []
+    svd_data = []
+
+    # for each images
+    for id_img, img_path in enumerate(scene_images):
+
+        current_quality_image = dt.get_scene_image_quality(img_path)
+
+        # context manager releases the image file handle once features are computed
+        with Image.open(img_path) as img:
+            svd_values = get_image_features(p_feature, img)
+
+        if p_norm:
+            svd_values = svd_values[begin_data:end_data]
+
+        # update min max values
+        min_value = svd_values.min()
+        max_value = svd_values.max()
+
+        if min_value < min_value_svd:
+            min_value_svd = min_value
+
+        if max_value > max_value_svd:
+            max_value_svd = max_value
+
+        # keep in memory used data
+        if current_quality_image % p_step == 0 and begin_index <= current_quality_image <= end_index:
+            images_indices.append(dt.get_scene_image_postfix(img_path))
+            svd_data.append(svd_values)
+
+        if threshold_image_zone is None and threshold_mean < current_quality_image:
+
+            threshold_image_zone = current_quality_image
+
+            print("Quality mean : ", current_quality_image, "\n")
+
+            # the threshold image is always displayed, even if the step skipped it
+            threshold_postfix = dt.get_scene_image_postfix(img_path)
+            if threshold_postfix not in images_indices:
+                images_indices.append(threshold_postfix)
+                svd_data.append(svd_values)
+
+        print('%.2f%%' % ((id_img + 1) / number_scene_image * 100))
+        sys.stdout.write("\033[F")
+
+    # all indices of picture to plot
+    print(images_indices)
+
+    images_data = []
+
+    for data in svd_data:
+
+        current_data = data
+
+        if not p_norm:
+            current_data = current_data[begin_data:end_data]
+
+        if p_mode == 'svdn':
+            current_data = utils.normalize_arr(current_data)
+
+        if p_mode == 'svdne':
+            current_data = utils.normalize_arr_with_range(current_data, min_value_svd, max_value_svd)
+
+        images_data.append(current_data)
+
+    # display all data using matplotlib (configure plt)
+    fig, ax = plt.subplots(figsize=(30, 22))
+    ax.set_facecolor('#FFFFFF')
+
+    ax.tick_params(labelsize=26)
+
+    ax.set_ylabel('Component values', fontsize=36)
+    ax.set_xlabel('Singular value component indices', fontsize=36)
+
+    for image_index, data in zip(images_indices, images_data):
+
+        # local name distinct from `p_label`, which is reserved for the title
+        curve_label = image_index + " samples"
+
+        is_threshold_image = threshold_image_zone is not None and int(image_index) == int(threshold_image_zone)
+
+        if is_threshold_image:
+            ax.plot(data, label=curve_label + " (threshold mean)", lw=8, color='red')
+        else:
+            ax.plot(data, label=curve_label, lw=4)
+
+    plt.legend(bbox_to_anchor=(0.60, 0.98), loc=2, borderaxespad=0.2, fontsize=32)
+
+    start_ylim, end_ylim = p_ylim
+    ax.set_ylim(start_ylim, end_ylim)
+
+    plot_name = scene_name + '_' + p_feature + '_' + str(p_step) + '_' + p_mode + '_' + str(p_norm) + '.png'
+    plt.title('Tend of Singular values at different samples of ' + p_label + ' scene', fontsize=40)
+    plt.savefig(plot_name, transparent=True)
+
+def main():
+    """
+    @brief Parses command line arguments, loads the thresholds file and displays svd data of the scene
+    @return nothing
+    """
+
+    parser = argparse.ArgumentParser(description="Display SVD data of scene")
+
+    parser.add_argument('--scene', type=str, help='scene folder to use', required=True)
+    parser.add_argument('--thresholds', type=str, help='expected thresholds file', required=True)
+    # defaults must be plain "begin,end" strings: embedded quotes cannot be parsed by int()
+    parser.add_argument('--interval', type=str, help='Interval value to keep from svd', default='0, 200')
+    parser.add_argument('--indices', type=str, help='Samples interval to display', default='0, 900')
+    # feature and mode are used to build the output file name, hence required
+    parser.add_argument('--feature', type=str, help='feature data choice', choices=features_choices, required=True)
+    parser.add_argument('--mode', type=str, help='Kind of normalization level wished', choices=cfg.normalization_choices, required=True)
+    parser.add_argument('--step', type=int, help='Each step samples to display', default=10)
+    parser.add_argument('--norm', type=int, help='If values will be normalized or not', choices=[0, 1])
+    parser.add_argument('--ylim', type=str, help='ylim interval to use', default='0,1')
+    parser.add_argument('--label', type=str, help='output label name', default="")
+
+    args = parser.parse_args()
+
+    p_scene    = args.scene
+    p_thresholds = args.thresholds
+    p_indices  = list(map(int, args.indices.split(',')))
+    p_interval = list(map(int, args.interval.split(',')))
+    p_feature  = args.feature
+    p_mode     = args.mode
+    p_step     = args.step
+    p_norm     = args.norm
+    p_ylim     = list(map(float, args.ylim.split(',')))
+    p_label    = args.label
+
+    # 1. retrieve human_thresholds
+    human_thresholds = {}
+
+    # extract thresholds, each line being `scene_name;threshold;threshold;...`
+    with open(p_thresholds) as f:
+
+        for line in f:
+            # dropping empty fields handles both the optional trailing `;` and blank lines
+            data = [ field for field in line.strip().split(';') if field ]
+
+            if not data:
+                continue
+
+            current_scene = data[0]
+            thresholds_scene = data[1:]
+
+            # TODO : check if really necessary
+            if current_scene != '50_shades_of_grey':
+                human_thresholds[current_scene] = [ int(threshold) for threshold in thresholds_scene ]
+
+    display_svd_values(p_scene, human_thresholds, p_interval, p_indices, p_feature, p_mode, p_step, p_norm, p_ylim, p_label)
+
+if __name__== "__main__":
+    main()

+ 72 - 26
train_keras_svd.py

@@ -4,6 +4,7 @@ import argparse
 import json
 import numpy as np
 import pandas as pd
+import logging
 
 # models imports
 from keras.preprocessing.image import ImageDataGenerator
@@ -12,9 +13,10 @@ from keras.layers import Conv1D, MaxPooling1D
 from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization
 from keras.wrappers.scikit_learn import KerasClassifier
 from keras import backend as K
+from keras.callbacks import Callback
 
 from sklearn.utils import shuffle
-from sklearn.metrics import roc_auc_score
+from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
 
 # modules and config imports
 import custom_config as cfg
@@ -50,6 +52,29 @@ def f1(y_true, y_pred):
     recall = recall(y_true, y_pred)
     return 2*((precision*recall)/(precision+recall+K.epsilon()))
 
+
+class IntervalEvaluation(Callback):
+    """
+    Keras callback which prints ROC AUC, accuracy and F1 score computed on a
+    given dataset at the end of every `interval` epochs.
+    """
+
+    def __init__(self, validation_data=(), interval=1):
+        """
+        @param validation_data, tuple (X, y) of the data used for the evaluation
+        @param interval, number of epochs between two evaluations
+        """
+        # start the MRO lookup from this class so that Callback.__init__ is executed
+        super(IntervalEvaluation, self).__init__()
+
+        self.interval = interval
+        self.X_val, self.y_val = validation_data
+
+    def on_epoch_end(self, epoch, logs=None):
+        """
+        @param epoch, index of the epoch which just ended (starts at 0)
+        @param logs, metrics dictionary passed by keras (unused)
+        """
+        if epoch % self.interval != 0:
+            return
+
+        # flatten predictions to get one score per sample
+        y_score = np.asarray(self.model.predict_proba(self.X_val, verbose=0)).ravel()
+        y_pred = [ 0 if y < 0.5 else 1 for y in y_score ]
+
+        # ROC AUC is a ranking metric: it requires scores, not thresholded labels
+        auc_score = roc_auc_score(self.y_val, y_score)
+        acc_score = accuracy_score(self.y_val, y_pred)
+        f1_test_score = f1_score(self.y_val, y_pred)
+
+        print("------------------------------")
+        print("[test dataset] for epoch {:d}".format(epoch + 1))
+        print("ROC AUC : {:.6f}".format(auc_score))
+        print("ACCURACY: {:.6f}".format(acc_score))
+        print("F1 score: {:.6f}".format(f1_test_score))
+        print("------------------------------")
+
 def generate_model(input_shape):
 
     model = Sequential()
@@ -86,41 +111,41 @@ def generate_model(input_shape):
 
     model.add(Flatten(input_shape=input_shape))
 
-    model.add(Dense(2048))
-    model.add(Activation('relu'))
-    model.add(BatchNormalization())
-    model.add(Dropout(0.2))
+    # model.add(Dense(2048))
+    # model.add(Activation('relu'))
+    # model.add(BatchNormalization())
+    # model.add(Dropout(0.2))
 
     model.add(Dense(1024))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
-    model.add(Dropout(0.2))
+    model.add(Dropout(0.4))
 
     model.add(Dense(512))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
-    model.add(Dropout(0.3))
+    model.add(Dropout(0.4))
 
     model.add(Dense(256))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
-    model.add(Dropout(0.3))
+    model.add(Dropout(0.4))
 
     model.add(Dense(128))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
-    model.add(Dropout(0.3))
+    model.add(Dropout(0.4))
 
     model.add(Dense(20))
     model.add(Activation('relu'))
     model.add(BatchNormalization())
-    model.add(Dropout(0.3))
+    model.add(Dropout(0.4))
 
     model.add(Dense(1))
     model.add(Activation('sigmoid'))
 
     model.compile(loss='binary_crossentropy',
-                  optimizer='adam',
+                  optimizer='rmsprop',
                   metrics=['accuracy', f1])
 
     return model
@@ -155,30 +180,46 @@ def main():
     dataset_test = shuffle(dataset_test)
 
     # get dataset with equal number of classes occurences
-    noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1]
-    not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0]
+    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
+    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
     nb_noisy_train = len(noisy_df_train.index)
+    nb_not_noisy_train = len(not_noisy_df_train.index)
 
-    noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 1]
-    not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0]
+    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
+    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
     nb_noisy_test = len(noisy_df_test.index)
+    nb_not_noisy_test = len(not_noisy_df_test.index)
 
-    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
-    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
+    final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
+    final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])
 
     # shuffle data another time
     final_df_train = shuffle(final_df_train)
     final_df_test = shuffle(final_df_test)
 
-    final_df_train_size = len(final_df_train.index)
-    final_df_test_size = len(final_df_test.index)
-
     # use of the whole data set for training
-    x_dataset_train = final_df_train.ix[:,1:]
-    x_dataset_test = final_df_test.ix[:,1:]
+    x_dataset_train = final_df_train.iloc[:,1:]
+    x_dataset_test = final_df_test.iloc[:,1:]
+
+    y_dataset_train = final_df_train.iloc[:,0]
+    y_dataset_test = final_df_test.iloc[:,0]
+
+    noisy_samples = nb_noisy_test + nb_noisy_train
+    not_noisy_samples = nb_not_noisy_test + nb_not_noisy_train
+
+    total_samples = noisy_samples + not_noisy_samples
+
+    print('noisy', noisy_samples)
+    print('not_noisy', not_noisy_samples)
+    print('total', total_samples)
+
+    class_weight = {
+        0: noisy_samples / float(total_samples),
+        1: not_noisy_samples / float(total_samples)
+    }
+
+    print(class_weight)
 
-    y_dataset_train = final_df_train.ix[:,0]
-    y_dataset_test = final_df_test.ix[:,0]
 
     #######################
     # 2. Getting model
@@ -196,7 +237,9 @@ def main():
     x_dataset_train = np.array(x_dataset_train).reshape(len(x_dataset_train), p_vector_size, 1)
     x_dataset_test = np.array(x_dataset_test).reshape(len(x_dataset_test), p_vector_size, 1)
 
-    model.fit(x_dataset_train, y_dataset_train, validation_split=0.20, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch)
+    ival = IntervalEvaluation(validation_data=(x_dataset_test, y_dataset_test), interval=1)
+
+    model.fit(x_dataset_train, y_dataset_train, validation_split=0.20, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch, callbacks=[ival], class_weight=class_weight)
 
     score = model.evaluate(x_dataset_test, y_dataset_test, batch_size=batch_size)
 
@@ -215,8 +258,11 @@ def main():
 
     # Save results obtained from model
     y_test_prediction = model.predict(x_dataset_test)
+    y_test_prediction = [ 0 if y < 0.5 else 1 for y in y_test_prediction ]
+
     print("Metrics : ", model.metrics_names)
-    print("Prediction : ", score)
+    print("ACC score : ", accuracy_score(y_dataset_test, y_test_prediction))
+    print("F1 score : ", f1_score(y_dataset_test, y_test_prediction))
     print("ROC AUC : ", roc_auc_score(y_dataset_test, y_test_prediction))
 
 

+ 9 - 9
train_model.py

@@ -9,9 +9,9 @@ from sklearn.model_selection import GridSearchCV
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier, VotingClassifier
 
+import joblib
 import sklearn.svm as svm
 from sklearn.utils import shuffle
-from sklearn.externals import joblib
 from sklearn.metrics import accuracy_score, f1_score
 from sklearn.model_selection import cross_val_score
 
@@ -57,12 +57,12 @@ def main():
     dataset_test = shuffle(dataset_test)
 
     # get dataset with equal number of classes occurences
-    noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1]
-    not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0]
+    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
+    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
     nb_noisy_train = len(noisy_df_train.index)
 
-    noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 1]
-    not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0]
+    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
+    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
     nb_noisy_test = len(noisy_df_test.index)
 
     final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
@@ -76,11 +76,11 @@ def main():
     final_df_test_size = len(final_df_test.index)
 
     # use of the whole data set for training
-    x_dataset_train = final_df_train.ix[:,1:]
-    x_dataset_test = final_df_test.ix[:,1:]
+    x_dataset_train = final_df_train.iloc[:,1:]
+    x_dataset_test = final_df_test.iloc[:,1:]
 
-    y_dataset_train = final_df_train.ix[:,0]
-    y_dataset_test = final_df_test.ix[:,0]
+    y_dataset_train = final_df_train.iloc[:,0]
+    y_dataset_test = final_df_test.iloc[:,0]
 
     #######################
     # 2. Construction of the model : Ensemble model structure