train_model.py

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, f1_score

import joblib
import pandas as pd
import sys, os, getopt

from modules.utils import config as cfg
from modules import models as mdl

saved_models_folder = cfg.saved_models_folder
models_list = cfg.models_names_list

current_dirpath = os.getcwd()
output_model_folder = os.path.join(current_dirpath, saved_models_folder)


def main():

    # TODO : use argparse (a commented sketch follows the option loop below)
    if len(sys.argv) <= 2:
        print('python train_model.py --data xxxx --output xxxx --choice svm_model')
        sys.exit(2)

    try:
        # "-c" takes an argument, hence "c:"; "--help" is a flag, hence no "="
        opts, args = getopt.getopt(sys.argv[1:], "hd:o:c:", ["help", "data=", "output=", "choice="])
    except getopt.GetoptError:
        # print help information and exit
        print('python train_model.py --data xxxx --output xxxx --choice svm_model')
        sys.exit(2)

    for o, a in opts:
        if o in ("-h", "--help"):
            print('python train_model.py --data xxxx --output xxxx --choice svm_model')
            sys.exit()
        elif o in ("-d", "--data"):
            p_data_file = a
        elif o in ("-o", "--output"):
            p_output = a
        elif o in ("-c", "--choice"):
            p_choice = a
            if p_choice not in models_list:
                assert False, "Unknown model choice"
        else:
            assert False, "unhandled option"
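    # A possible argparse replacement for the getopt block above (a minimal,
    # unwired sketch; option names are taken from the help string):
    #
    #   parser = argparse.ArgumentParser(description="Train and save a model")
    #   parser.add_argument('--data', required=True, help="prefix of the .train/.test files")
    #   parser.add_argument('--output', required=True, help="name of the saved model file")
    #   parser.add_argument('--choice', required=True, choices=models_list)
    #   args = parser.parse_args()
    #   p_data_file, p_output, p_choice = args.data, args.output, args.choice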
    if not os.path.exists(output_model_folder):
        os.makedirs(output_model_folder)

    ########################
    # 1. Get and prepare data
    ########################
    dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
    dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")
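    # Assumed layout of the ';'-separated files, based on the reads above:
    # no header row, class label (0 or 1) in column 0, features afterwards, e.g.
    #   1;0.12;0.53;0.07;...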

    # default first shuffle of data
    dataset_train = shuffle(dataset_train)
    dataset_test = shuffle(dataset_test)

    # balance classes: keep only as many not-noisy rows as there are noisy rows
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)

    final_df_train = pd.concat([not_noisy_df_train.iloc[0:nb_noisy_train], noisy_df_train])
    final_df_test = pd.concat([not_noisy_df_test.iloc[0:nb_noisy_test], noisy_df_test])

    # shuffle data another time
    final_df_train = shuffle(final_df_train)
    final_df_test = shuffle(final_df_test)

    final_df_train_size = len(final_df_train.index)
    final_df_test_size = len(final_df_test.index)

    # use the whole balanced dataset for training: column 0 is the label,
    # the remaining columns are the features
    x_dataset_train = final_df_train.iloc[:, 1:]
    x_dataset_test = final_df_test.iloc[:, 1:]

    y_dataset_train = final_df_train.iloc[:, 0]
    y_dataset_test = final_df_test.iloc[:, 0]

    #######################
    # 2. Construction of the model: ensemble model structure
    #######################
    print("-------------------------------------------")
    print("Train dataset size: ", final_df_train_size)

    model = mdl.get_trained_model(p_choice, x_dataset_train, y_dataset_train)
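    # get_trained_model is assumed to look up the estimator registered under
    # p_choice in modules/models.py and return it already fitted on the
    # training set (an assumption; only the call signature is visible here)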

    #######################
    # 3. Cross-validation: estimate accuracy of the chosen model on the training set
    #######################
    val_scores = cross_val_score(model, x_dataset_train, y_dataset_train, cv=5)
    print("Accuracy: %0.2f (+/- %0.2f)" % (val_scores.mean(), val_scores.std() * 2))
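    # with a classifier and an integer cv, cross_val_score refits a clone of the
    # model on 5 stratified folds; mean +/- 2 * std is a rough 95% interval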

    ######################
    # 4. Test: build validation and test sets from the .test dataset
    ######################
    # validation and test sets are each sized to one third of the training set
    val_set_size = int(final_df_train_size / 3)
    test_set_size = val_set_size
    total_validation_size = val_set_size + test_set_size

    if final_df_test_size > total_validation_size:
        x_dataset_test = x_dataset_test.iloc[0:total_validation_size]
        y_dataset_test = y_dataset_test.iloc[0:total_validation_size]

    # split the (possibly truncated) test data 50/50 into test and validation sets
    X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)

    y_test_model = model.predict(X_test)
    y_val_model = model.predict(X_val)

    val_accuracy = accuracy_score(y_val, y_val_model)
    test_accuracy = accuracy_score(y_test, y_test_model)

    val_f1 = f1_score(y_val, y_val_model)
    test_f1 = f1_score(y_test, y_test_model)

    ###################
    # 5. Output: print validation and test scores
    ###################
    print("Validation dataset size ", val_set_size)
    print("Validation: ", val_accuracy)
    print("Validation F1: ", val_f1)
    print("Test dataset size ", test_set_size)
    print("Test: ", test_accuracy)
    print("Test F1: ", test_f1)

    ##################
    # 6. Save model: create the output path if it does not exist
    ##################
    if not os.path.exists(output_model_folder):
        os.makedirs(output_model_folder)

    joblib.dump(model, os.path.join(output_model_folder, p_output + '.joblib'))
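    # the saved estimator can be restored later with, e.g.:
    #   model = joblib.load(os.path.join(output_model_folder, p_output + '.joblib'))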


if __name__ == "__main__":
    main()
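
# Example invocation (hypothetical paths; --data is the shared prefix of the
# .train and .test files produced beforehand):
#   python train_model.py --data data/my_dataset --output my_model --choice svm_model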