Parcourir la source

First project version; add dataset generation; add run script

Jérôme BUISINE il y a 5 ans
commit
6b7c5d854b

+ 5 - 0
.gitignore

@@ -0,0 +1,5 @@
+.vscode
+*.png
+
+data
+saved_models

+ 187 - 0
analyse.R

@@ -0,0 +1,187 @@
+#!/usr/bin/Rscript
+
+# Usage: analyse.R <data_file>
+#
+# Reads a table from <data_file>, keeps at most the first 100 values of its
+# first column, runs a Kolmogorov-Smirnov test of those values against a
+# normal distribution parameterised with the sample mean and standard
+# deviation, then draws a density histogram with a kernel density overlay
+# into "fig.pdf".
+
+args <- commandArgs(TRUE)
+if (length(args) < 1) {
+    stop("usage: analyse.R <data_file>")
+}
+
+# named `samples` rather than `dist` so that stats::dist is not masked
+samples <- read.table(args[1])
+
+# alternative raster output:
+#png("fig.png", width = 800, height = 400)
+pdf("fig.pdf")
+
+# head() keeps at most 100 values; indexing with [1:100] would pad inputs
+# shorter than 100 rows with NA and make mean()/sd() return NA
+vdist <- head(samples$V1, 100)
+v <- ks.test(vdist, pnorm, mean(vdist), sd(vdist))
+print(v)
+hist(vdist, col = "blue", breaks = 50, freq = FALSE)
+den <- density(vdist)
+lines(den, col = "red")
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# close the graphics device opened with pdf("fig.pdf") earlier in the script
+dev.off()

+ 99 - 0
make_dataset.py

@@ -0,0 +1,99 @@
+import numpy as np
+import pandas as pd
+
+import os, sys, argparse
+
+import modules.config as cfg
+
+def compute_files(_n, _each_row, _each_column):
+    """
+    Read all folders and files of scenes in order to compute output dataset
+    """
+    
+    output_dataset_filename = cfg.output_file_prefix + _n + '_column_' + _each_column + '_row_' + _each_row + '.csv'
+    
+    output_dataset_filename = os.path.join(cfg.output_data_folder, output_dataset_filename)
+
+    if not os.path.exists(cfg.output_data_folder):
+        os.makedirs(cfg.output_data_folder)
+
+    output_file = open(output_dataset_filename, 'w')
+
+    print('Preparing to store data into ', output_dataset_filename)
+
+    scenes = os.listdir(cfg.folder_scenes_path)
+
+    # remove min max file from scenes folder
+    scenes = [s for s in scenes if s not in cfg.folder_and_files_filtered]
+    scenes = [s for s in scenes if '.csv' not in s] # do not keep generated .csv file
+
+    print(scenes)
+
+    counter = 0
+    number_of_elements = len(scenes) * cfg.number_of_rows * cfg.number_of_columns
+    print(number_of_elements, ' to manage')
+
+    for scene in scenes:
+
+        scene_path = os.path.join(cfg.folder_scenes_path, scene)
+        columns_folder = os.listdir(scene_path)
+
+        for id_column, column in enumerate(columns_folder):
+            
+            if id_column % int(_each_column) == 0 :
+
+                folder_path = os.path.join(scene_path, column)
+
+                pixel_files_list = os.listdir(folder_path)
+
+                for id_row, pixel_file in enumerate(pixel_files_list):
+                    
+                    if id_row % int(_each_row) == 0:
+                        pixel_file_path = os.path.join(folder_path, pixel_file)
+
+                        saved_row = ''
+
+                        # for each file read content, keep `n` first values and compute mean
+                        with open(pixel_file_path, 'r') as f:
+                            lines = [float(l)/255. for l in f.readlines()]
+
+                            pixel_values = lines[0:int(_n)]
+                            mean = sum(lines) / float(len(lines))
+
+                            saved_row += str(mean)
+
+                            for val in pixel_values:
+                                saved_row += ';' + str(val)
+                            
+                            saved_row += '\n'
+
+                        # store mean and pixel values into .csv row
+                        output_file.write(saved_row)
+
+                    counter = counter + 1
+            else:
+                counter += cfg.number_of_rows
+
+            print("{0:.2f}%".format(counter / number_of_elements * 100))
+            sys.stdout.write("\033[F")
+
+    print('\n')
+    output_file.close()
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Compute .csv dataset file")
+
+    parser.add_argument('--n', type=str, help='Number of pixel values approximated to keep')
+    parser.add_argument('--each_row', type=str, help='Keep only values from specific row', default=1)
+    parser.add_argument('--each_column', type=str, help='Keep only values from specific column', default=1)
+    args = parser.parse_args()
+
+    param_n = args.n
+    param_each_row = args.each_row
+    param_each_column = args.each_column
+
+    compute_files(param_n, param_each_row, param_each_column)
+
+if __name__== "__main__":
+    main()

+ 1 - 0
models_info/models_comparisons.csv

@@ -0,0 +1 @@
+10_column_8_row_7_SGD.joblib;0.9708138110211402;

+ 0 - 0
modules/__init__.py


BIN
modules/__pycache__/__init__.cpython-36.pyc


BIN
modules/__pycache__/config.cpython-36.pyc


BIN
modules/__pycache__/metrics.cpython-36.pyc


+ 14 - 0
modules/config.py

@@ -0,0 +1,14 @@
+output_data_folder              = "data"
+folder_scenes_path              = ".."
+models_information_folder       = 'models_info'
+saved_models_folder             = 'saved_models'
+
+output_file_prefix              = "dataset_"
+folder_and_files_filtered       = ["analyse", "make_dataset.py", ".vscode"]
+
+number_of_rows                  = 512
+number_of_columns               = 512
+
+kind_of_models                  = ["SGD", "Ridge", "SVR"]
+
+global_result_filepath          = "models_info/models_comparisons.csv"

+ 17 - 0
modules/metrics.py

@@ -0,0 +1,17 @@
+import numpy as np
+
+def coefficient_of_determination(_y, _predicted):
+    
+    y = np.asarray(_y)
+    predicted = np.asarray(_predicted)
+
+    y_mean = y.mean()
+
+    numerator_sum = 0
+    denominator_sum = 0
+
+    for id_val, val in enumerate(y):
+        numerator_sum += (predicted[id_val] - y_mean) * (predicted[id_val] - y_mean)
+        denominator_sum += (val - y_mean) * (val - y_mean)
+    
+    return numerator_sum / denominator_sum

+ 27 - 0
run.sh

@@ -0,0 +1,27 @@
+# erase "models_info/models_comparisons.csv" file and write new header
+file_path='models_info/models_comparisons.csv'
+
+erased=$1
+
+if [ "${erased}" == "Y" ]; then
+    echo "Previous data file erased..."
+    rm ${file_path}
+    mkdir -p models_info
+    touch ${file_path}
+
+    # add of header
+    echo 'model_name; coeff_of_determination;' >> ${file_path}
+fi
+
+for model in {"SGD","Ridge","SVR"}; do
+    for row in {7,8,9,10}; do
+        for column in {7,8,9,10}; do
+
+            # Run creation of dataset and train model
+            DATASET_NAME="data/dataset_10_column_${column}_row_${row}.csv"
+
+            python make_dataset.py --n 10 --each_row ${row} --each_column ${column}
+            python train_model.py --data ${DATASET_NAME} --model ${model}
+        done
+    done
+done

+ 81 - 0
train_model.py

@@ -0,0 +1,81 @@
+import numpy as np
+import pandas as pd
+
+import os, sys, argparse
+
+from sklearn import linear_model
+from sklearn import svm
+from sklearn.utils import shuffle
+
+import modules.config as cfg
+import modules.metrics as metrics
+
+from joblib import dump, load
+
+def get_model_choice(_model_name):
+    """
+    Bind choose model using String information
+    """
+
+    if _model_name == "SGD":
+        clf = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)
+
+    if _model_name == "Ridge":
+        clf = linear_model.Ridge(alpha=1.)
+
+    if _model_name == "SVR":
+        clf = svm.SVR()
+
+    return clf
+
+def train(_data_file, _model_name):
+
+    # prepare data
+    dataset = pd.read_csv(_data_file, header=None, sep=";")
+    dataset = shuffle(dataset)
+
+    y = dataset.ix[:,0]
+    X = dataset.ix[:,1:]
+
+    clf = get_model_choice(_model_name)
+    clf.fit(X, y)
+
+    y_predicted = clf.predict(X)
+
+    coeff = metrics.coefficient_of_determination(y, y_predicted)
+
+    print("Predicted coefficient of determination for ", _model_name, " : ", coeff)
+
+    # save the trained model, so check if saved folder exists 
+    if not os.path.exists(cfg.saved_models_folder):
+        os.makedirs(cfg.saved_models_folder)
+
+    # compute model filename
+    model_filename = _data_file.split('/')[-1].replace(cfg.output_file_prefix, '').replace('.csv', '') 
+    model_filename = model_filename + '_' + _model_name + '.joblib'
+
+    model_file_path = os.path.join(cfg.saved_models_folder, model_filename)
+    print("Model will be save into `", model_file_path, '`')
+    
+    dump(clf, model_file_path)
+
+    # save score into global_result.csv file
+    with open(cfg.global_result_filepath, "w") as f:
+       f.write(model_filename + ';' + str(coeff) + ';\n')
+
+def main():
+
+    parser = argparse.ArgumentParser(description="Train model and saved it")
+
+    parser.add_argument('--data', type=str, help='Filename of dataset')
+    parser.add_argument('--model', type=str, help='Kind of model expected', choices=cfg.kind_of_models)
+    
+    args = parser.parse_args()
+
+    param_data_file = args.data
+    param_model = args.model
+
+    train(param_data_file, param_model)
+
+if __name__== "__main__":
+    main()