il y a 5 ans · 6b7c5d854b
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,5 @@
 
				+.vscode
			
 
				+*.png
			
 
				+
			
 
				+data
			
 
				+saved_models
			
--- a/analyse.R
+++ b/analyse.R
@@ -0,0 +1,187 @@
 
				+#!/usr/bin/Rscript
			
 
				+
			
 
				+args <- commandArgs(TRUE)
			
 
				+dist <- read.table(args[1])
			
 
				+
			
 
				+#png("fig.png", width = 800, height = 400) 
			
 
				+pdf("fig.pdf") 
			
 
				+
			
 
				+#x11()
			
 
				+
			
 
				+#options(devices="X11")
			
 
				+
			
 
				+vdist = dist$V1[1:100]
			
 
				+v = ks.test(vdist,pnorm,mean(vdist),sd(vdist))
			
 
				+print(v)
			
 
				+hist(vdist,col = "blue",breaks = 50,freq = FALSE)
			
 
				+den <- density(vdist)
			
 
				+lines(den, col = "red")
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+dev.off()
			
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -0,0 +1,99 @@
 
				+import numpy as np
			
 
				+import pandas as pd
			
 
				+
			
 
				+import os, sys, argparse
			
 
				+
			
 
				+import modules.config as cfg
			
 
				+
			
 
				+def compute_files(_n, _each_row, _each_column):
			
 
				+    """
			
 
				+    Read all folders and files of scenes in order to compute output dataset
			
 
				+    """
			
 
				+    
			
 
				+    output_dataset_filename = cfg.output_file_prefix + _n + '_column_' + _each_column + '_row_' + _each_row + '.csv'
			
 
				+    
			
 
				+    output_dataset_filename = os.path.join(cfg.output_data_folder, output_dataset_filename)
			
 
				+
			
 
				+    if not os.path.exists(cfg.output_data_folder):
			
 
				+        os.makedirs(cfg.output_data_folder)
			
 
				+
			
 
				+    output_file = open(output_dataset_filename, 'w')
			
 
				+
			
 
				+    print('Preparing to store data into ', output_dataset_filename)
			
 
				+
			
 
				+    scenes = os.listdir(cfg.folder_scenes_path)
			
 
				+
			
 
				+    # remove min max file from scenes folder
			
 
				+    scenes = [s for s in scenes if s not in cfg.folder_and_files_filtered]
			
 
				+    scenes = [s for s in scenes if '.csv' not in s] # do not keep generated .csv file
			
 
				+
			
 
				+    print(scenes)
			
 
				+
			
 
				+    counter = 0
			
 
				+    number_of_elements = len(scenes) * cfg.number_of_rows * cfg.number_of_columns
			
 
				+    print(number_of_elements, ' to manage')
			
 
				+
			
 
				+    for scene in scenes:
			
 
				+
			
 
				+        scene_path = os.path.join(cfg.folder_scenes_path, scene)
			
 
				+        columns_folder = os.listdir(scene_path)
			
 
				+
			
 
				+        for id_column, column in enumerate(columns_folder):
			
 
				+            
			
 
				+            if id_column % int(_each_column) == 0 :
			
 
				+
			
 
				+                folder_path = os.path.join(scene_path, column)
			
 
				+
			
 
				+                pixel_files_list = os.listdir(folder_path)
			
 
				+
			
 
				+                for id_row, pixel_file in enumerate(pixel_files_list):
			
 
				+                    
			
 
				+                    if id_row % int(_each_row) == 0:
			
 
				+                        pixel_file_path = os.path.join(folder_path, pixel_file)
			
 
				+
			
 
				+                        saved_row = ''
			
 
				+
			
 
				+                        # for each file read content, keep `n` first values and compute mean
			
 
				+                        with open(pixel_file_path, 'r') as f:
			
 
				+                            lines = [float(l)/255. for l in f.readlines()]
			
 
				+
			
 
				+                            pixel_values = lines[0:int(_n)]
			
 
				+                            mean = sum(lines) / float(len(lines))
			
 
				+
			
 
				+                            saved_row += str(mean)
			
 
				+
			
 
				+                            for val in pixel_values:
			
 
				+                                saved_row += ';' + str(val)
			
 
				+                            
			
 
				+                            saved_row += '\n'
			
 
				+
			
 
				+                        # store mean and pixel values into .csv row
			
 
				+                        output_file.write(saved_row)
			
 
				+
			
 
				+                    counter = counter + 1
			
 
				+            else:
			
 
				+                counter += cfg.number_of_rows
			
 
				+
			
 
				+            print("{0:.2f}%".format(counter / number_of_elements * 100))
			
 
				+            sys.stdout.write("\033[F")
			
 
				+
			
 
				+    print('\n')
			
 
				+    output_file.close()
			
 
				+
			
 
				+def main():
			
 
				+
			
 
				+    parser = argparse.ArgumentParser(description="Compute .csv dataset file")
			
 
				+
			
 
				+    parser.add_argument('--n', type=str, help='Number of pixel values approximated to keep')
			
 
				+    parser.add_argument('--each_row', type=str, help='Keep only values from specific row', default=1)
			
 
				+    parser.add_argument('--each_column', type=str, help='Keep only values from specific column', default=1)
			
 
				+    args = parser.parse_args()
			
 
				+
			
 
				+    param_n = args.n
			
 
				+    param_each_row = args.each_row
			
 
				+    param_each_column = args.each_column
			
 
				+
			
 
				+    compute_files(param_n, param_each_row, param_each_column)
			
 
				+
			
 
				+if __name__== "__main__":
			
 
				+    main()
			
--- a/models_info/models_comparisons.csv
+++ b/models_info/models_comparisons.csv
@@ -0,0 +1 @@
 
				+10_column_8_row_7_SGD.joblib;0.9708138110211402;
			
--- a/modules/__init__.py
+++ b/modules/__init__.py
--- a/modules/__pycache__/__init__.cpython-36.pyc
+++ b/modules/__pycache__/__init__.cpython-36.pyc
--- a/modules/__pycache__/config.cpython-36.pyc
+++ b/modules/__pycache__/config.cpython-36.pyc
--- a/modules/__pycache__/metrics.cpython-36.pyc
+++ b/modules/__pycache__/metrics.cpython-36.pyc
--- a/modules/config.py
+++ b/modules/config.py
@@ -0,0 +1,14 @@
 
				+output_data_folder              = "data"
			
 
				+folder_scenes_path              = ".."
			
 
				+models_information_folder       = 'models_info'
			
 
				+saved_models_folder             = 'saved_models'
			
 
				+
			
 
				+output_file_prefix              = "dataset_"
			
 
				+folder_and_files_filtered       = ["analyse", "make_dataset.py", ".vscode"]
			
 
				+
			
 
				+number_of_rows                  = 512
			
 
				+number_of_columns               = 512
			
 
				+
			
 
				+kind_of_models                  = ["SGD", "Ridge", "SVR"]
			
 
				+
			
 
				+global_result_filepath          = "models_info/models_comparisons.csv"
			
--- a/modules/metrics.py
+++ b/modules/metrics.py
@@ -0,0 +1,17 @@
 
				+import numpy as np
			
 
				+
			
 
				+def coefficient_of_determination(_y, _predicted):
			
 
				+    
			
 
				+    y = np.asarray(_y)
			
 
				+    predicted = np.asarray(_predicted)
			
 
				+
			
 
				+    y_mean = y.mean()
			
 
				+
			
 
				+    numerator_sum = 0
			
 
				+    denominator_sum = 0
			
 
				+
			
 
				+    for id_val, val in enumerate(y):
			
 
				+        numerator_sum += (predicted[id_val] - y_mean) * (predicted[id_val] - y_mean)
			
 
				+        denominator_sum += (val - y_mean) * (val - y_mean)
			
 
				+    
			
 
				+    return numerator_sum / denominator_sum
			
--- a/run.sh
+++ b/run.sh
@@ -0,0 +1,27 @@
 
				+# erase "models_info/models_comparisons.csv" file and write new header
			
 
				+file_path='models_info/models_comparisons.csv'
			
 
				+
			
 
				+erased=$1
			
 
				+
			
 
				+if [ "${erased}" == "Y" ]; then
			
 
				+    echo "Previous data file erased..."
			
 
				+    rm ${file_path}
			
 
				+    mkdir -p models_info
			
 
				+    touch ${file_path}
			
 
				+
			
 
				+    # add of header
			
 
				+    echo 'model_name; coeff_of_determination;' >> ${file_path}
			
 
				+fi
			
 
				+
			
 
				+for model in {"SGD","Ridge","SVR"}; do
			
 
				+    for row in {7,8,9,10}; do
			
 
				+        for column in {7,8,9,10}; do
			
 
				+
			
 
				+            # Run creation of dataset and train model
			
 
				+            DATASET_NAME="data/dataset_10_column_${column}_row_${row}.csv"
			
 
				+
			
 
				+            python make_dataset.py --n 10 --each_row ${row} --each_column ${column}
			
 
				+            python train_model.py --data ${DATASET_NAME} --model ${model}
			
 
				+        done
			
 
				+    done
			
 
				+done
			
--- a/train_model.py
+++ b/train_model.py
@@ -0,0 +1,81 @@
 
				+import numpy as np
			
 
				+import pandas as pd
			
 
				+
			
 
				+import os, sys, argparse
			
 
				+
			
 
				+from sklearn import linear_model
			
 
				+from sklearn import svm
			
 
				+from sklearn.utils import shuffle
			
 
				+
			
 
				+import modules.config as cfg
			
 
				+import modules.metrics as metrics
			
 
				+
			
 
				+from joblib import dump, load
			
 
				+
			
 
				+def get_model_choice(_model_name):
			
 
				+    """
			
 
				+    Bind choose model using String information
			
 
				+    """
			
 
				+
			
 
				+    if _model_name == "SGD":
			
 
				+        clf = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)
			
 
				+
			
 
				+    if _model_name == "Ridge":
			
 
				+        clf = linear_model.Ridge(alpha=1.)
			
 
				+
			
 
				+    if _model_name == "SVR":
			
 
				+        clf = svm.SVR()
			
 
				+
			
 
				+    return clf
			
 
				+
			
 
				+def train(_data_file, _model_name):
			
 
				+
			
 
				+    # prepare data
			
 
				+    dataset = pd.read_csv(_data_file, header=None, sep=";")
			
 
				+    dataset = shuffle(dataset)
			
 
				+
			
 
				+    y = dataset.ix[:,0]
			
 
				+    X = dataset.ix[:,1:]
			
 
				+
			
 
				+    clf = get_model_choice(_model_name)
			
 
				+    clf.fit(X, y)
			
 
				+
			
 
				+    y_predicted = clf.predict(X)
			
 
				+
			
 
				+    coeff = metrics.coefficient_of_determination(y, y_predicted)
			
 
				+
			
 
				+    print("Predicted coefficient of determination for ", _model_name, " : ", coeff)
			
 
				+
			
 
				+    # save the trained model, so check if saved folder exists 
			
 
				+    if not os.path.exists(cfg.saved_models_folder):
			
 
				+        os.makedirs(cfg.saved_models_folder)
			
 
				+
			
 
				+    # compute model filename
			
 
				+    model_filename = _data_file.split('/')[-1].replace(cfg.output_file_prefix, '').replace('.csv', '') 
			
 
				+    model_filename = model_filename + '_' + _model_name + '.joblib'
			
 
				+
			
 
				+    model_file_path = os.path.join(cfg.saved_models_folder, model_filename)
			
 
				+    print("Model will be save into `", model_file_path, '`')
			
 
				+    
			
 
				+    dump(clf, model_file_path)
			
 
				+
			
 
				+    # save score into global_result.csv file
			
 
				+    with open(cfg.global_result_filepath, "w") as f:
			
 
				+       f.write(model_filename + ';' + str(coeff) + ';\n')
			
 
				+
			
 
				+def main():
			
 
				+
			
 
				+    parser = argparse.ArgumentParser(description="Train model and saved it")
			
 
				+
			
 
				+    parser.add_argument('--data', type=str, help='Filename of dataset')
			
 
				+    parser.add_argument('--model', type=str, help='Kind of model expected', choices=cfg.kind_of_models)
			
 
				+    
			
 
				+    args = parser.parse_args()
			
 
				+
			
 
				+    param_data_file = args.data
			
 
				+    param_model = args.model
			
 
				+
			
 
				+    train(param_data_file, param_model)
			
 
				+
			
 
				+if __name__== "__main__":
			
 
				+    main()
		`@@ -0,0 +1 @@`
		`+10_column_8_row_7_SGD.joblib;0.9708138110211402;`