# svm_model_train.py
  1. from sklearn.model_selection import train_test_split
  2. from sklearn.model_selection import GridSearchCV
  3. from sklearn.utils import shuffle
  4. import sklearn.svm as svm
  5. from sklearn.externals import joblib
  6. import numpy as np
  7. import pandas as pd
  8. from sklearn.metrics import accuracy_score
  9. import sys, os, getopt
  10. current_dirpath = os.getcwd()
  11. output_model_folder = os.path.join(current_dirpath, 'saved_models')
  12. def get_best_model(X_train, y_train):
  13. parameters = {'kernel':['rbf'], 'C': np.arange(1, 20)}
  14. svc = svm.SVC(gamma="scale")
  15. clf = GridSearchCV(svc, parameters, cv=5, scoring='accuracy', verbose=10)
  16. clf.fit(X_train, y_train)
  17. model = clf.best_estimator_
  18. return model
  19. def main():
  20. if len(sys.argv) <= 1:
  21. print('Run with default parameters...')
  22. print('python svm_model_train.py --data xxxx --output xxxx')
  23. sys.exit(2)
  24. try:
  25. opts, args = getopt.getopt(sys.argv[1:], "hd:o", ["help=", "data=", "output="])
  26. except getopt.GetoptError:
  27. # print help information and exit:
  28. print('python svm_model_train.py --data xxxx --output xxxx')
  29. sys.exit(2)
  30. for o, a in opts:
  31. if o == "-h":
  32. print('python svm_model_train.py --data xxxx --output xxxx')
  33. sys.exit()
  34. elif o in ("-d", "--data"):
  35. p_data_file = a
  36. elif o in ("-o", "--output"):
  37. p_output = a
  38. else:
  39. assert False, "unhandled option"
  40. if not os.path.exists(output_model_folder):
  41. os.makedirs(output_model_folder)
  42. dataset = pd.read_csv(p_data_file, header=None, sep=";")
  43. # default first shuffle of data
  44. dataset = shuffle(dataset)
  45. # get dataset with equal number of classes occurences
  46. noisy_df = dataset[dataset.ix[:, 0] == 1]
  47. not_noisy_df = dataset[dataset.ix[:, 0] == 0]
  48. nb_noisy = len(noisy_df.index)
  49. final_df = pd.concat([not_noisy_df[0:nb_noisy], noisy_df])
  50. #final_df = pd.concat([not_noisy_df, noisy_df])
  51. # shuffle data another time
  52. final_df = shuffle(final_df)
  53. y_dataset = final_df.ix[:,0]
  54. x_dataset = final_df.ix[:,1:]
  55. X_train, X_test, y_train, y_test = train_test_split(x_dataset, y_dataset, test_size=0.4, random_state=42)
  56. svm_model = get_best_model(X_train, y_train)
  57. y_train_model = svm_model.predict(X_train)
  58. print("**Train :** " + str(accuracy_score(y_train, y_train_model)))
  59. y_pred = svm_model.predict(X_test)
  60. print("**Test :** " + str(accuracy_score(y_test, y_pred)))
  61. joblib.dump(svm_model, output_model_folder + '/' + p_output + '.joblib')
# Entry point: run training only when executed as a script, so the module
# can be imported (e.g. for reuse of get_best_model) without side effects.
if __name__== "__main__":
    main()