Parcourir la source

Use balanced class weights in SVC and train on the full (unbalanced) dataset

Jérôme BUISINE il y a 3 ans
Parent
commit
b73b27ab44
5 fichiers modifiés avec 15 ajouts et 14 suppressions
  1. 5 4
      find_best_attributes.py
  2. 4 4
      find_best_filters.py
  3. 2 2
      models.py
  4. 2 2
      train_model_attributes.py
  5. 2 2
      train_model_filters.py

+ 5 - 4
find_best_attributes.py

@@ -69,14 +69,15 @@ def loadDataset(filename):
     # get dataset with equal number of classes occurrences
     noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
     not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
-    nb_noisy_train = len(noisy_df_train.index)
+    #nb_noisy_train = len(noisy_df_train.index)
 
     noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
     not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
-    nb_noisy_test = len(noisy_df_test.index)
+    #nb_noisy_test = len(noisy_df_test.index)
 
-    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
-    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
+    # use of all data
+    final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
+    final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])
 
     # shuffle data another time
     final_df_train = shuffle(final_df_train)

+ 4 - 4
find_best_filters.py

@@ -68,14 +68,14 @@ def loadDataset(filename):
     # get dataset with equal number of classes occurrences
     noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
     not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
-    nb_noisy_train = len(noisy_df_train.index)
+    #nb_noisy_train = len(noisy_df_train.index)
 
     noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
     not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
-    nb_noisy_test = len(noisy_df_test.index)
+    #nb_noisy_test = len(noisy_df_test.index)
 
-    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
-    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
+    final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
+    final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])
 
     # shuffle data another time
     final_df_train = shuffle(final_df_train)

+ 2 - 2
models.py

@@ -27,7 +27,7 @@ def _get_best_model(X_train, y_train):
     gammas = [0.001, 0.1, 1, 10, 100]
     param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
 
-    svc = svm.SVC(probability=True)
+    svc = svm.SVC(probability=True, class_weight='balanced')
     clf = GridSearchCV(svc, param_grid, cv=10, verbose=1, scoring=my_accuracy_scorer)
 
     clf.fit(X_train, y_train)
@@ -47,7 +47,7 @@ def _get_best_gpu_model(X_train, y_train):
     gammas = [0.001, 0.01, 0.1, 1, 2, 5, 10, 100]
     param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
 
-    svc = SVC(probability=True)
+    svc = SVC(probability=True, class_weight='balanced')
     clf = GridSearchCV(svc, param_grid, cv=10, verbose=1, scoring=my_accuracy_scorer)
 
     clf.fit(X_train, y_train)

+ 2 - 2
train_model_attributes.py

@@ -67,8 +67,8 @@ def main():
     not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
     nb_noisy_test = len(noisy_df_test.index)
 
-    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
-    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
+    final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
+    final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])
 
     # shuffle data another time
     final_df_train = shuffle(final_df_train)

+ 2 - 2
train_model_filters.py

@@ -67,8 +67,8 @@ def main():
     not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0]
     nb_noisy_test = len(noisy_df_test.index)
 
-    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
-    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
+    final_df_train = pd.concat([not_noisy_df_train, noisy_df_train])
+    final_df_test = pd.concat([not_noisy_df_test, noisy_df_test])
 
     # shuffle data another time
     final_df_train = shuffle(final_df_train)