train_model.py

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, f1_score

# sklearn.externals.joblib was removed in scikit-learn 0.23; use the
# standalone joblib package instead
import joblib

import pandas as pd
import os, argparse

from modules.utils import config as cfg
from modules import models as mdl

saved_models_folder = cfg.saved_models_folder
models_list = cfg.models_names_list

current_dirpath = os.getcwd()
output_model_folder = os.path.join(current_dirpath, saved_models_folder)

def main():

    parser = argparse.ArgumentParser(description="Train SKLearn model and save it into .joblib file")
    parser.add_argument('--data', type=str, help='dataset filename prefix (without .train and .test)')
    parser.add_argument('--output', type=str, help='output file name desired for model (without .joblib extension)')
    parser.add_argument('--choice', type=str, help='model choice from list of choices', choices=models_list)

    args = parser.parse_args()

    p_data_file = args.data
    p_output = args.output
    p_choice = args.choice

    if not os.path.exists(output_model_folder):
        os.makedirs(output_model_folder)
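
    # Example invocation (hypothetical file and model names; the valid
    # --choice values come from cfg.models_names_list):
    #   python train_model.py --data data/my_dataset --output my_model --choice svm_model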

    ########################
    # 1. Get and prepare data
    ########################
    dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
    dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")

    # default first shuffle of data
    dataset_train = shuffle(dataset_train)
    dataset_test = shuffle(dataset_test)

    # build datasets with an equal number of occurrences per class
    # (.ix has been removed from pandas; use positional .iloc instead)
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)

    final_df_train = pd.concat([not_noisy_df_train.iloc[0:nb_noisy_train], noisy_df_train])
    final_df_test = pd.concat([not_noisy_df_test.iloc[0:nb_noisy_test], noisy_df_test])
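
    # Note: this balancing is plain under-sampling of the majority class. It
    # assumes the noisy class (label 1) is the minority and truncates the
    # not-noisy frames to the same number of rows before concatenating.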

    # shuffle data another time
    final_df_train = shuffle(final_df_train)
    final_df_test = shuffle(final_df_test)

    final_df_train_size = len(final_df_train.index)
    final_df_test_size = len(final_df_test.index)

    # use the whole balanced set for training
    # (column 0 holds the label, the remaining columns the features)
    x_dataset_train = final_df_train.iloc[:, 1:]
    x_dataset_test = final_df_test.iloc[:, 1:]

    y_dataset_train = final_df_train.iloc[:, 0]
    y_dataset_test = final_df_test.iloc[:, 0]

    #######################
    # 2. Construction of the model : Ensemble model structure
    #######################
    print("-------------------------------------------")
    print("Train dataset size: ", final_df_train_size)

    model = mdl.get_trained_model(p_choice, x_dataset_train, y_dataset_train)
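
    # mdl.get_trained_model is project code (modules/models.py); presumably it
    # maps the --choice name to a scikit-learn estimator and returns it already
    # fitted on the training data, which is why no explicit .fit() appears here.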

    #######################
    # 3. Cross-validation : estimate model performance on the training set
    #######################
    val_scores = cross_val_score(model, x_dataset_train, y_dataset_train, cv=5)
    print("Accuracy: %0.2f (+/- %0.2f)" % (val_scores.mean(), val_scores.std() * 2))
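
    # cross_val_score fits a fresh clone of the estimator on each of the 5
    # folds; the already-trained `model` is left untouched, and the scores are
    # an estimate of how this model family generalises on the training data.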

    ######################
    # 4. Test : validation and test datasets built from the .test dataset
    ######################

    # validation and test sets each get one third of the training set size
    val_set_size = int(final_df_train_size / 3)
    test_set_size = val_set_size
    total_validation_size = val_set_size + test_set_size

    # keep only as many test rows as needed
    if final_df_test_size > total_validation_size:
        x_dataset_test = x_dataset_test.iloc[0:total_validation_size]
        y_dataset_test = y_dataset_test.iloc[0:total_validation_size]

    # split the kept rows 50/50 into test and validation sets
    X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)

    y_test_model = model.predict(X_test)
    y_val_model = model.predict(X_val)

    val_accuracy = accuracy_score(y_val, y_val_model)
    test_accuracy = accuracy_score(y_test, y_test_model)

    val_f1 = f1_score(y_val, y_val_model)
    test_f1 = f1_score(y_test, y_test_model)

    ###################
    # 5. Output : print all scores
    ###################
    print("Validation dataset size ", val_set_size)
    print("Validation accuracy: ", val_accuracy)
    print("Validation F1: ", val_f1)
    print("Test dataset size ", test_set_size)
    print("Test accuracy: ", test_accuracy)
    print("Test F1: ", test_f1)

    ##################
    # 6. Save model : create path if it does not exist
    ##################
    # make sure the output folder (the absolute path built above) still exists
    if not os.path.exists(output_model_folder):
        os.makedirs(output_model_folder)

    joblib.dump(model, os.path.join(output_model_folder, p_output + '.joblib'))
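
    # The saved model can later be restored with:
    #   model = joblib.load(os.path.join(output_model_folder, p_output + '.joblib'))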

if __name__ == "__main__":
    main()