train_model.py

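"""Train a binary noisy / not-noisy classifier and save it with joblib.

Usage (as printed by the script's own help message):
    python train_model.py --data xxxx --output xxxx --choice svm_model
"""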
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import sklearn.svm as svm
from sklearn.utils import shuffle
# sklearn.externals.joblib was removed in scikit-learn 0.23; import joblib directly
import joblib
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score

import numpy as np
import pandas as pd

import sys, os, getopt

from modules.utils import config as cfg
from modules import models as mdl
saved_models_folder = cfg.saved_models_folder
models_list = cfg.models_names_list

current_dirpath = os.getcwd()
output_model_folder = os.path.join(current_dirpath, saved_models_folder)

def main():

    if len(sys.argv) <= 2:
        print('python train_model.py --data xxxx --output xxxx --choice svm_model')
        sys.exit(2)

    try:
        # '-c' takes an argument, so it needs a trailing ':' in the short-option
        # string; '--help' takes none, so it must not end with '='
        opts, args = getopt.getopt(sys.argv[1:], "hd:o:c:", ["help", "data=", "output=", "choice="])
    except getopt.GetoptError:
        # print help information and exit
        print('python train_model.py --data xxxx --output xxxx --choice svm_model')
        sys.exit(2)

    for o, a in opts:
        if o == "-h":
            print('python train_model.py --data xxxx --output xxxx --choice svm_model')
            sys.exit()
        elif o in ("-d", "--data"):
            p_data_file = a
        elif o in ("-o", "--output"):
            p_output = a
        elif o in ("-c", "--choice"):
            p_choice = a

            if p_choice not in models_list:
                assert False, "Unknown model choice"
        else:
            assert False, "unhandled option"

    if not os.path.exists(output_model_folder):
        os.makedirs(output_model_folder)
    ########################
    # 1. Get and prepare data
    ########################
    dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
    dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")

    # default first shuffle of data
    dataset_train = shuffle(dataset_train)
    dataset_test = shuffle(dataset_test)

    # build balanced datasets with an equal number of occurrences of each class
    # (column 0 holds the label: 1 = noisy, 0 = not noisy; DataFrame.ix was
    # removed from pandas, so positional indexing uses .iloc)
    noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 1]
    not_noisy_df_train = dataset_train[dataset_train.iloc[:, 0] == 0]
    nb_noisy_train = len(noisy_df_train.index)

    noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 1]
    not_noisy_df_test = dataset_test[dataset_test.iloc[:, 0] == 0]
    nb_noisy_test = len(noisy_df_test.index)

    final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
    final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])

    # shuffle data another time
    final_df_train = shuffle(final_df_train)
    final_df_test = shuffle(final_df_test)

    final_df_train_size = len(final_df_train.index)
    final_df_test_size = len(final_df_test.index)

    # use the whole balanced set for training: column 0 is the label,
    # the remaining columns are the features
    x_dataset_train = final_df_train.iloc[:, 1:]
    x_dataset_test = final_df_test.iloc[:, 1:]

    y_dataset_train = final_df_train.iloc[:, 0]
    y_dataset_test = final_df_test.iloc[:, 0]
    #######################
    # 2. Construction of the model: ensemble model structure
    #######################
    print("-------------------------------------------")
    print("Train dataset size: ", final_df_train_size)
    model = mdl.get_trained_model(p_choice, x_dataset_train, y_dataset_train)

    #######################
    # 3. Cross-validation: estimate the model's accuracy on the training set
    #######################
    val_scores = cross_val_score(model, x_dataset_train, y_dataset_train, cv=5)
    print("Accuracy: %0.2f (+/- %0.2f)" % (val_scores.mean(), val_scores.std() * 2))
    ######################
    # 4. Test: build validation and test sets from the .test dataset
    ######################
    # validation and test sets are each sized at one third of the training set
    val_set_size = int(final_df_train_size / 3)
    test_set_size = val_set_size
    total_validation_size = val_set_size + test_set_size

    if final_df_test_size > total_validation_size:
        x_dataset_test = x_dataset_test[0:total_validation_size]
        y_dataset_test = y_dataset_test[0:total_validation_size]

    X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)

    y_test_model = model.predict(X_test)
    y_val_model = model.predict(X_val)

    val_accuracy = accuracy_score(y_val, y_val_model)
    test_accuracy = accuracy_score(y_test, y_test_model)

    val_f1 = f1_score(y_val, y_val_model)
    test_f1 = f1_score(y_test, y_test_model)
    ###################
    # 5. Output: print and write all information in csv
    ###################
    print("Validation dataset size ", val_set_size)
    print("Validation: ", val_accuracy)
    print("Validation F1: ", val_f1)
    print("Test dataset size ", test_set_size)
    print("Test: ", test_accuracy)
    print("Test F1: ", test_f1)
    ##################
    # 6. Save model: create path if it does not exist
    ##################
    if not os.path.exists(output_model_folder):
        os.makedirs(output_model_folder)

    joblib.dump(model, os.path.join(output_model_folder, p_output + '.joblib'))
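    # the saved estimator can later be restored with joblib.load()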


if __name__ == "__main__":
    main()
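
The script depends on two project-local modules, modules.utils.config and modules.models, that are not shown here. A rough sketch of the interface train_model.py assumes follows; the model names, estimators, and hyperparameters are illustrative guesses (consistent with the scikit-learn classifiers imported at the top of the script), not the project's actual code.

# modules/models.py (hypothetical sketch)
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


def get_trained_model(choice, x_train, y_train):
    """Return an estimator matching `choice`, fitted on the training data."""
    if choice == 'svm_model':
        model = SVC(gamma='scale')
    elif choice == 'ensemble_model':
        # soft-voting ensemble over heterogeneous base learners
        model = VotingClassifier(
            estimators=[
                ('svm', SVC(gamma='scale', probability=True)),
                ('lr', LogisticRegression(solver='liblinear')),
                ('rf', RandomForestClassifier(n_estimators=100)),
            ],
            voting='soft')
    else:
        raise ValueError('Unknown model choice: %s' % choice)

    return model.fit(x_train, y_train)


# modules/utils/config.py (hypothetical sketch)
saved_models_folder = 'saved_models'
models_names_list = ['svm_model', 'ensemble_model']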