# svm_model_train.py
  1. from sklearn.model_selection import train_test_split
  2. from sklearn.model_selection import GridSearchCV
  3. from sklearn.utils import shuffle
  4. import sklearn.svm as svm
  5. from sklearn.externals import joblib
  6. import numpy as np
  7. import pandas as pd
  8. from sklearn.metrics import accuracy_score
  9. import sys, os, getopt
  10. output_model_folder = './saved_models/'
  11. def get_best_model(X_train, y_train):
  12. parameters = {'kernel':['rbf'], 'C': np.arange(1, 20)}
  13. svc = svm.SVC(gamma="scale")
  14. clf = GridSearchCV(svc, parameters, cv=5, scoring='accuracy', verbose=10)
  15. clf.fit(X_train, y_train)
  16. model = clf.best_estimator_
  17. return model
  18. def main():
  19. if len(sys.argv) <= 1:
  20. print('Run with default parameters...')
  21. print('python svm_model_train.py --data xxxx --output xxxx')
  22. sys.exit(2)
  23. try:
  24. opts, args = getopt.getopt(sys.argv[1:], "hd:o", ["help=", "data=", "output="])
  25. except getopt.GetoptError:
  26. # print help information and exit:
  27. print('python svm_model_train.py --data xxxx --output xxxx')
  28. sys.exit(2)
  29. for o, a in opts:
  30. if o == "-h":
  31. print('python svm_model_train.py --data xxxx --output xxxx')
  32. sys.exit()
  33. elif o in ("-d", "--data"):
  34. p_data_file = a
  35. elif o in ("-o", "--output"):
  36. p_output = a
  37. else:
  38. assert False, "unhandled option"
  39. if not os.path.exists(output_model_folder):
  40. os.makedirs(output_model_folder)
  41. dataset = pd.read_csv(p_data_file, header=None, sep=";")
  42. # default first shuffle of data
  43. dataset = shuffle(dataset)
  44. # get dataset with equal number of classes occurences
  45. noisy_df = dataset[dataset.ix[:, 0] == 1]
  46. not_noisy_df = dataset[dataset.ix[:, 0] == 0]
  47. nb_noisy = len(noisy_df.index)
  48. final_df = pd.concat([not_noisy_df[0:nb_noisy], noisy_df])
  49. #final_df = pd.concat([not_noisy_df, noisy_df])
  50. # shuffle data another time
  51. final_df = shuffle(final_df)
  52. y_dataset = final_df.ix[:,0]
  53. x_dataset = final_df.ix[:,1:]
  54. X_train, X_test, y_train, y_test = train_test_split(x_dataset, y_dataset, test_size=0.4, random_state=42)
  55. svm_model = get_best_model(X_train, y_train)
  56. y_train_model = svm_model.predict(X_train)
  57. print("**Train :** " + str(accuracy_score(y_train, y_train_model)))
  58. y_pred = svm_model.predict(X_test)
  59. print("**Test :** " + str(accuracy_score(y_test, y_pred)))
  60. joblib.dump(svm_model, output_model_folder + p_output + '.joblib')
  61. if __name__== "__main__":
  62. main()