# ensemble_model_train_v2.py

  1. from sklearn.model_selection import train_test_split
  2. from sklearn.model_selection import GridSearchCV
  3. from sklearn.linear_model import LogisticRegression
  4. from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  5. from sklearn.neighbors import KNeighborsClassifier
  6. from sklearn.ensemble import GradientBoostingClassifier
  7. import sklearn.svm as svm
  8. from sklearn.utils import shuffle
  9. from sklearn.externals import joblib
  10. import numpy as np
  11. import pandas as pd
  12. from sklearn.metrics import accuracy_score
  13. import sys, os, getopt
  14. saved_models_folder = 'saved_models'
  15. current_dirpath = os.getcwd()
  16. output_model_folder = os.path.join(current_dirpath, saved_models_folder)
  17. def get_best_model(X_train, y_train):
  18. Cs = [0.001, 0.01, 0.1, 1, 10, 20, 30]
  19. gammas = [0.001, 0.01, 0.1, 1, 5, 10]
  20. param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
  21. parameters = {'kernel':['rbf'], 'C': np.arange(1, 20)}
  22. svc = svm.SVC(gamma="scale", probability=True, max_iter=10000)
  23. clf = GridSearchCV(svc, parameters, cv=5, scoring='accuracy', verbose=10)
  24. clf.fit(X_train, y_train)
  25. model = clf.best_estimator_
  26. return model
  27. def main():
  28. if len(sys.argv) <= 1:
  29. print('Run with default parameters...')
  30. print('python ensemble_model_train_v2.py --data xxxx --output xxxx')
  31. sys.exit(2)
  32. try:
  33. opts, args = getopt.getopt(sys.argv[1:], "hd:o", ["help=", "data=", "output="])
  34. except getopt.GetoptError:
  35. # print help information and exit:
  36. print('python ensemble_model_train_v2.py --data xxxx --output xxxx')
  37. sys.exit(2)
  38. for o, a in opts:
  39. if o == "-h":
  40. print('python ensemble_model_train_v2.py --data xxxx --output xxxx')
  41. sys.exit()
  42. elif o in ("-d", "--data"):
  43. p_data_file = a
  44. elif o in ("-o", "--output"):
  45. p_output = a
  46. else:
  47. assert False, "unhandled option"
  48. if not os.path.exists(output_model_folder):
  49. os.makedirs(output_model_folder)
  50. # get and split data
  51. dataset = pd.read_csv(p_data_file, header=None, sep=";")
  52. # default first shuffle of data
  53. dataset = shuffle(dataset)
  54. # get dataset with equal number of classes occurences
  55. noisy_df = dataset[dataset.ix[:, 0] == 1]
  56. not_noisy_df = dataset[dataset.ix[:, 0] == 0]
  57. nb_noisy = len(noisy_df.index)
  58. final_df = pd.concat([not_noisy_df[0:nb_noisy], noisy_df[:]])
  59. #final_df = pd.concat([not_noisy_df, noisy_df])
  60. # shuffle data another time
  61. final_df = shuffle(final_df)
  62. print(len(final_df.index))
  63. y_dataset = final_df.ix[:,0]
  64. x_dataset = final_df.ix[:,1:]
  65. X_train, X_test, y_train, y_test = train_test_split(x_dataset, y_dataset, test_size=0.5, random_state=42)
  66. svm_model = get_best_model(X_train, y_train)
  67. knc_model = KNeighborsClassifier(n_neighbors=2)
  68. gbc_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
  69. lr_model = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=1)
  70. rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
  71. ensemble_model = VotingClassifier(estimators=[
  72. ('lr', lr_model),
  73. ('knc', knc_model),
  74. ('gbc', gbc_model),
  75. ('svm', svm_model),
  76. ('rf', rf_model)],
  77. voting='soft', weights=[1, 1, 1, 1, 1])
  78. ensemble_model.fit(X_train, y_train)
  79. y_train_model = ensemble_model.predict(X_train)
  80. print("**Train :** " + str(accuracy_score(y_train, y_train_model)))
  81. y_pred = ensemble_model.predict(X_test)
  82. print("**Test :** " + str(accuracy_score(y_test, y_pred)))
  83. # create path if not exists
  84. if not os.path.exists(saved_models_folder):
  85. os.makedirs(saved_models_folder)
  86. joblib.dump(ensemble_model, output_model_folder + '/' + p_output + '.joblib')
  87. if __name__== "__main__":
  88. main()