# ensemble_model_train_v2.py
# Train a soft-voting ensemble (SVC / KNN / GBC / LR / RF) on a noisy-vs-clean
# binary classification dataset and persist the fitted model with joblib.
import sys, os, getopt

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import sklearn.svm as svm
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, f1_score

# sklearn.externals.joblib was deprecated in 0.21 and removed in 0.23;
# fall back to the standalone joblib package on modern scikit-learn.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
  15. saved_models_folder = 'saved_models'
  16. current_dirpath = os.getcwd()
  17. output_model_folder = os.path.join(current_dirpath, saved_models_folder)
  18. def get_best_model(X_train, y_train):
  19. Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
  20. gammas = [0.001, 0.01, 0.1, 1, 5, 10, 100]
  21. param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
  22. svc = svm.SVC(probability=True)
  23. clf = GridSearchCV(svc, param_grid, cv=10, scoring='accuracy', verbose=10)
  24. clf.fit(X_train, y_train)
  25. model = clf.best_estimator_
  26. return model
  27. def main():
  28. if len(sys.argv) <= 1:
  29. print('Run with default parameters...')
  30. print('python ensemble_model_train_v2.py --data xxxx --output xxxx')
  31. sys.exit(2)
  32. try:
  33. opts, args = getopt.getopt(sys.argv[1:], "hd:o", ["help=", "data=", "output="])
  34. except getopt.GetoptError:
  35. # print help information and exit:
  36. print('python ensemble_model_train_v2.py --data xxxx --output xxxx')
  37. sys.exit(2)
  38. for o, a in opts:
  39. if o == "-h":
  40. print('python ensemble_model_train_v2.py --data xxxx --output xxxx')
  41. sys.exit()
  42. elif o in ("-d", "--data"):
  43. p_data_file = a
  44. elif o in ("-o", "--output"):
  45. p_output = a
  46. else:
  47. assert False, "unhandled option"
  48. if not os.path.exists(output_model_folder):
  49. os.makedirs(output_model_folder)
  50. # 1. Get and prepare data
  51. dataset_train = pd.read_csv(p_data_file + '.train', header=None, sep=";")
  52. dataset_test = pd.read_csv(p_data_file + '.test', header=None, sep=";")
  53. # default first shuffle of data
  54. dataset_train = shuffle(dataset_train)
  55. dataset_test = shuffle(dataset_test)
  56. # get dataset with equal number of classes occurences
  57. noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1]
  58. not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0]
  59. nb_noisy_train = len(noisy_df_train.index)
  60. noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 1]
  61. not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0]
  62. nb_noisy_test = len(noisy_df_test.index)
  63. final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train])
  64. final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test])
  65. # shuffle data another time
  66. final_df_train = shuffle(final_df_train)
  67. final_df_test = shuffle(final_df_test)
  68. final_df_train_size = len(final_df_train.index)
  69. final_df_test_size = len(final_df_test.index)
  70. # use of the whole data set for training
  71. x_dataset_train = final_df_train.ix[:,1:]
  72. x_dataset_test = final_df_test.ix[:,1:]
  73. y_dataset_train = final_df_train.ix[:,0]
  74. y_dataset_test = final_df_test.ix[:,0]
  75. #######################
  76. # 2. Construction of the model : Ensemble model structure
  77. #######################
  78. svm_model = get_best_model(y_dataset_train, y_dataset_train)
  79. knc_model = KNeighborsClassifier(n_neighbors=2)
  80. gbc_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
  81. lr_model = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=1)
  82. rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
  83. ensemble_model = VotingClassifier(estimators=[
  84. ('lr', lr_model),
  85. ('knc', knc_model),
  86. ('gbc', gbc_model),
  87. ('svm', svm_model),
  88. ('rf', rf_model)],
  89. voting='soft', weights=[1, 1, 1, 1, 1])
  90. #######################
  91. # 3. Fit model : use of cross validation to fit model
  92. #######################
  93. print("-------------------------------------------")
  94. print("Train dataset size: ", final_df_train_size)
  95. ensemble_model.fit(x_dataset_train, y_dataset_train)
  96. val_scores = cross_val_score(ensemble_model, x_dataset_train, y_dataset_train, cv=5)
  97. print("Accuracy: %0.2f (+/- %0.2f)" % (val_scores.mean(), val_scores.std() * 2))
  98. ######################
  99. # 4. Test : Validation and test dataset from .test dataset
  100. ######################
  101. # we need to specify validation size to 20% of whole dataset
  102. val_set_size = int(final_df_train_size/3)
  103. test_set_size = val_set_size
  104. total_validation_size = val_set_size + test_set_size
  105. if final_df_test_size > total_validation_size:
  106. x_dataset_test = x_dataset_test[0:total_validation_size]
  107. y_dataset_test = y_dataset_test[0:total_validation_size]
  108. X_test, X_val, y_test, y_val = train_test_split(x_dataset_test, y_dataset_test, test_size=0.5, random_state=1)
  109. y_test_model = ensemble_model.predict(X_test)
  110. y_val_model = ensemble_model.predict(X_val)
  111. val_accuracy = accuracy_score(y_val, y_val_model)
  112. test_accuracy = accuracy_score(y_test, y_test_model)
  113. val_f1 = f1_score(y_val, y_val_model)
  114. test_f1 = f1_score(y_test, y_test_model)
  115. ###################
  116. # 5. Output : Print and write all information in csv
  117. ###################
  118. print("Validation dataset size ", val_set_size)
  119. print("Validation: ", val_accuracy)
  120. print("Validation F1: ", val_f1)
  121. print("Test dataset size ", test_set_size)
  122. print("Test: ", val_accuracy)
  123. print("Test F1: ", test_f1)
  124. ##################
  125. # 6. Save model : create path if not exists
  126. ##################
  127. # create path if not exists
  128. if not os.path.exists(saved_models_folder):
  129. os.makedirs(saved_models_folder)
  130. joblib.dump(ensemble_model, output_model_folder + '/' + p_output + '.joblib')
  131. if __name__== "__main__":
  132. main()