# svm_model_train.py
import getopt
import os
import sys

import numpy as np
import pandas as pd
import sklearn.svm as svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.utils import shuffle

try:
    from sklearn.externals import joblib  # scikit-learn < 0.21 vendored joblib
except ImportError:
    import joblib  # the vendored copy was removed in scikit-learn 0.23+
# Name of the folder (relative to the working directory) holding trained models.
saved_models_folder = 'saved_models'
# Working directory captured once at import time.
current_dirpath = os.getcwd()
# Absolute path where models are dumped; created in main() if missing.
output_model_folder = os.path.join(current_dirpath, saved_models_folder)
  13. def get_best_model(X_train, y_train):
  14. parameters = {'kernel':['rbf'], 'C': np.arange(1, 20)}
  15. svc = svm.SVC(gamma="scale")
  16. clf = GridSearchCV(svc, parameters, cv=5, scoring='accuracy', verbose=10)
  17. clf.fit(X_train, y_train)
  18. model = clf.best_estimator_
  19. return model
  20. def main():
  21. if len(sys.argv) <= 1:
  22. print('Run with default parameters...')
  23. print('python svm_model_train.py --data xxxx --output xxxx')
  24. sys.exit(2)
  25. try:
  26. opts, args = getopt.getopt(sys.argv[1:], "hd:o", ["help=", "data=", "output="])
  27. except getopt.GetoptError:
  28. # print help information and exit:
  29. print('python svm_model_train.py --data xxxx --output xxxx')
  30. sys.exit(2)
  31. for o, a in opts:
  32. if o == "-h":
  33. print('python svm_model_train.py --data xxxx --output xxxx')
  34. sys.exit()
  35. elif o in ("-d", "--data"):
  36. p_data_file = a
  37. elif o in ("-o", "--output"):
  38. p_output = a
  39. else:
  40. assert False, "unhandled option"
  41. if not os.path.exists(output_model_folder):
  42. os.makedirs(output_model_folder)
  43. dataset = pd.read_csv(p_data_file, header=None, sep=";")
  44. # default first shuffle of data
  45. dataset = shuffle(dataset)
  46. # get dataset with equal number of classes occurences
  47. noisy_df = dataset[dataset.ix[:, 0] == 1]
  48. not_noisy_df = dataset[dataset.ix[:, 0] == 0]
  49. nb_noisy = len(noisy_df.index)
  50. final_df = pd.concat([not_noisy_df[0:nb_noisy], noisy_df])
  51. #final_df = pd.concat([not_noisy_df, noisy_df])
  52. # shuffle data another time
  53. final_df = shuffle(final_df)
  54. y_dataset = final_df.ix[:,0]
  55. x_dataset = final_df.ix[:,1:]
  56. # use of the whole data set for training
  57. X_train, X_test, y_train, y_test = train_test_split(x_dataset, y_dataset, test_size=0., random_state=42)
  58. svm_model = get_best_model(X_train, y_train)
  59. y_train_model = svm_model.predict(X_train)
  60. print("**Train :** " + str(accuracy_score(y_train, y_train_model)))
  61. #y_pred = svm_model.predict(X_test)
  62. #print("**Test :** " + str(accuracy_score(y_test, y_pred)))
  63. # create path if not exists
  64. if not os.path.exists(saved_models_folder):
  65. os.makedirs(saved_models_folder)
  66. joblib.dump(svm_model, output_model_folder + '/' + p_output + '.joblib')
  67. if __name__== "__main__":
  68. main()