# ensemble_model_train.py
  1. from sklearn.model_selection import train_test_split
  2. from sklearn.model_selection import GridSearchCV
  3. from sklearn.linear_model import LogisticRegression
  4. from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  5. import sklearn.svm as svm
  6. from sklearn.utils import shuffle
  7. from sklearn.externals import joblib
  8. import numpy as np
  9. import pandas as pd
  10. from sklearn.metrics import accuracy_score
  11. import sys, os, getopt
  12. current_dirpath = os.getcwd()
  13. output_model_folder = os.path.join(current_dirpath, 'saved_models')
  14. def get_best_model(X_train, y_train):
  15. Cs = [0.001, 0.01, 0.1, 1, 10]
  16. gammas = [0.001, 0.01, 0.1, 1]
  17. param_grid = {'kernel':['rbf'], 'C': Cs, 'gamma' : gammas}
  18. parameters = {'kernel':['rbf'], 'C': np.arange(1, 20)}
  19. svc = svm.SVC(gamma="scale", probability=True)
  20. clf = GridSearchCV(svc, parameters, cv=5, scoring='accuracy', verbose=10)
  21. clf.fit(X_train, y_train)
  22. model = clf.best_estimator_
  23. return model
  24. def main():
  25. if len(sys.argv) <= 1:
  26. print('Run with default parameters...')
  27. print('python smv_model_train.py --data xxxx --output xxxx')
  28. sys.exit(2)
  29. try:
  30. opts, args = getopt.getopt(sys.argv[1:], "hd:o", ["help=", "data=", "output="])
  31. except getopt.GetoptError:
  32. # print help information and exit:
  33. print('python smv_model_train.py --data xxxx --output xxxx')
  34. sys.exit(2)
  35. for o, a in opts:
  36. if o == "-h":
  37. print('python smv_model_train.py --data xxxx --output xxxx')
  38. sys.exit()
  39. elif o in ("-d", "--data"):
  40. p_data_file = a
  41. elif o in ("-o", "--output"):
  42. p_output = a
  43. else:
  44. assert False, "unhandled option"
  45. if not os.path.exists(output_model_folder):
  46. os.makedirs(output_model_folder)
  47. # get and split data
  48. dataset = pd.read_csv(p_data_file, header=None, sep=";")
  49. # default first shuffle of data
  50. dataset = shuffle(dataset)
  51. # get dataset with equal number of classes occurences
  52. noisy_df = dataset[dataset.ix[:, 0] == 1]
  53. not_noisy_df = dataset[dataset.ix[:, 0] == 0]
  54. nb_noisy = len(noisy_df.index)
  55. final_df = pd.concat([not_noisy_df[0:nb_noisy], noisy_df])
  56. #final_df = pd.concat([not_noisy_df, noisy_df])
  57. # shuffle data another time
  58. final_df = shuffle(final_df)
  59. print(len(final_df.index))
  60. y_dataset = final_df.ix[:,0]
  61. x_dataset = final_df.ix[:,1:]
  62. X_train, X_test, y_train, y_test = train_test_split(x_dataset, y_dataset, test_size=0.4, random_state=42)
  63. svm_model = get_best_model(X_train, y_train)
  64. lr_model = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=1)
  65. rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
  66. ensemble_model = VotingClassifier(estimators=[
  67. ('svm', svm_model), ('lr', lr_model), ('rf', rf_model)],
  68. voting='soft', weights=[1,1,1])
  69. ensemble_model.fit(X_train, y_train)
  70. y_train_model = ensemble_model.predict(X_train)
  71. print("**Train :** " + str(accuracy_score(y_train, y_train_model)))
  72. y_pred = ensemble_model.predict(X_test)
  73. print("**Test :** " + str(accuracy_score(y_test, y_pred)))
  74. joblib.dump(ensemble_model, output_model_folder + '/' + p_output + '.joblib')
  75. if __name__== "__main__":
  76. main()