# main imports import sys, os import argparse import json import numpy as np import pandas as pd # models imports from keras.preprocessing.image import ImageDataGenerator from keras.models import Sequential from keras.layers import Conv1D, MaxPooling1D from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization from keras.wrappers.scikit_learn import KerasClassifier from keras import backend as K from sklearn.utils import shuffle from sklearn.metrics import roc_auc_score # modules and config imports import custom_config as cfg def f1(y_true, y_pred): def recall(y_true, y_pred): """Recall metric. Only computes a batch-wise average of recall. Computes the recall, a metric for multi-label classification of how many relevant items are selected. """ true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) recall = true_positives / (possible_positives + K.epsilon()) return recall def precision(y_true, y_pred): """Precision metric. Only computes a batch-wise average of precision. Computes the precision, a metric for multi-label classification of how many selected items are relevant. """ true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) precision = true_positives / (predicted_positives + K.epsilon()) return precision precision = precision(y_true, y_pred) recall = recall(y_true, y_pred) return 2*((precision*recall)/(precision+recall+K.epsilon())) def generate_model(input_shape): model = Sequential() #model.add(Conv1D(128, (10), input_shape=input_shape)) #model.add(Activation('relu')) #model.add(Conv1D(128, (10))) #model.add(Activation('relu')) #model.add(Conv1D(128, (10))) #model.add(Activation('relu')) #model.add(MaxPooling1D(pool_size=(2))) #model.add(Conv1D(64, (10))) #model.add(Activation('relu')) #model.add(Conv1D(64, (10))) #model.add(Activation('relu')) #model.add(Conv1D(64, (10))) #model.add(Activation('relu')) #model.add(MaxPooling1D(pool_size=(2))) #model.add(Conv1D(32, (10))) #model.add(Activation('relu')) #model.add(Conv1D(32, (10))) #model.add(Activation('relu')) #model.add(Conv1D(32, (10))) #model.add(Activation('relu')) #model.add(MaxPooling1D(pool_size=(2))) model.add(Flatten(input_shape=input_shape)) model.add(Dense(2048)) model.add(Activation('relu')) model.add(BatchNormalization()) model.add(Dropout(0.2)) model.add(Dense(1024)) model.add(Activation('relu')) model.add(BatchNormalization()) model.add(Dropout(0.2)) model.add(Dense(512)) model.add(Activation('relu')) model.add(BatchNormalization()) model.add(Dropout(0.3)) model.add(Dense(256)) model.add(Activation('relu')) model.add(BatchNormalization()) model.add(Dropout(0.3)) model.add(Dense(128)) model.add(Activation('relu')) model.add(BatchNormalization()) model.add(Dropout(0.3)) model.add(Dense(20)) model.add(Activation('relu')) model.add(BatchNormalization()) model.add(Dropout(0.3)) model.add(Dense(1)) model.add(Activation('sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1]) return model def main(): parser = argparse.ArgumentParser(description="Process deep_network_keras_svd.py parameters") parser.add_argument('--data', type=str, help='Data filename prefix to access train and test dataset') parser.add_argument('--output', type=str, help='Name of filename to save model into') parser.add_argument('--size', type=int, help='Size of input data vector') args = parser.parse_args() p_datafile = args.data p_output_filename = args.output p_vector_size = args.size epochs = 10 batch_size = cfg.keras_batch input_shape = (p_vector_size, 1) ########################### # 1. Get and prepare data ########################### dataset_train = pd.read_csv(p_datafile + '.train', header=None, sep=";") dataset_test = pd.read_csv(p_datafile + '.test', header=None, sep=";") # default first shuffle of data dataset_train = shuffle(dataset_train) dataset_test = shuffle(dataset_test) # get dataset with equal number of classes occurences noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 1] not_noisy_df_train = dataset_train[dataset_train.ix[:, 0] == 0] nb_noisy_train = len(noisy_df_train.index) noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 1] not_noisy_df_test = dataset_test[dataset_test.ix[:, 0] == 0] nb_noisy_test = len(noisy_df_test.index) final_df_train = pd.concat([not_noisy_df_train[0:nb_noisy_train], noisy_df_train]) final_df_test = pd.concat([not_noisy_df_test[0:nb_noisy_test], noisy_df_test]) # shuffle data another time final_df_train = shuffle(final_df_train) final_df_test = shuffle(final_df_test) final_df_train_size = len(final_df_train.index) final_df_test_size = len(final_df_test.index) # use of the whole data set for training x_dataset_train = final_df_train.ix[:,1:] x_dataset_test = final_df_test.ix[:,1:] y_dataset_train = final_df_train.ix[:,0] y_dataset_test = final_df_test.ix[:,0] ####################### # 2. Getting model ####################### model = generate_model(input_shape) model.summary() #model = KerasClassifier(build_fn=model, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch, verbose=0) ####################### # 3. Fit model : use of cross validation to fit model ####################### # reshape input data x_dataset_train = np.array(x_dataset_train).reshape(len(x_dataset_train), p_vector_size, 1) x_dataset_test = np.array(x_dataset_test).reshape(len(x_dataset_test), p_vector_size, 1) model.fit(x_dataset_train, y_dataset_train, validation_split=0.20, epochs=cfg.keras_epochs, batch_size=cfg.keras_batch) score = model.evaluate(x_dataset_test, y_dataset_test, batch_size=batch_size) if not os.path.exists(cfg.saved_models_folder): os.makedirs(cfg.saved_models_folder) # save the model into HDF5 file model_output_path = os.path.join(cfg.saved_models_folder, p_output_filename + '.json') json_model_content = model.to_json() with open(model_output_path, 'w') as f: print("Model saved into ", model_output_path) json.dump(json_model_content, f, indent=4) model.save_weights(model_output_path.replace('.json', '.h5')) # Save results obtained from model y_test_prediction = model.predict(x_dataset_test) print("Metrics : ", model.metrics_names) print("Prediction : ", score) print("ROC AUC : ", roc_auc_score(y_dataset_test, y_test_prediction)) if __name__== "__main__": main()