From ed9e6b64cb5d015ebc9700cdac0b6139ea21ebda Mon Sep 17 00:00:00 2001
From: tomrink <rink@ssec.wisc.edu>
Date: Wed, 1 May 2024 09:53:46 -0500
Subject: [PATCH] snapshot...

---
 modules/machine_learning/classification.py | 121 ++++++++++++---------
 1 file changed, 67 insertions(+), 54 deletions(-)

diff --git a/modules/machine_learning/classification.py b/modules/machine_learning/classification.py
index 263ebc8c..c588ceb2 100644
--- a/modules/machine_learning/classification.py
+++ b/modules/machine_learning/classification.py
@@ -15,6 +15,17 @@ import sklearn.tree as tree
 from sklearn.tree import export_graphviz
 
 
+def metrics(y_true, y_pred, y_pred_prob=None):
+    print(confusion_matrix(y_true, y_pred, labels=[1,0]))
+    print('Accuracy: ', "{:.4f}".format(accuracy_score(y_true, y_pred)))
+    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_true, y_pred)))
+    print('Precision: ', "{:.4f}".format(precision_score(y_true, y_pred)))
+    print('Recall: ', "{:.4f}".format(recall_score(y_true, y_pred)))
+    print('F1: ', "{:.4f}".format(f1_score(y_true, y_pred)))
+    if y_pred_prob is not None:
+        print('AUC: ', "{:.4f}".format(roc_auc_score(y_true, y_pred_prob[:, 1])))
+
+
 def analyze(dataFrame):
     no_icing_df = dataFrame[dataFrame['icing_intensity'] == -1]
     icing_df = dataFrame[dataFrame['icing_intensity'] >= 1]
@@ -99,6 +110,7 @@ def get_feature_target_data(data_frame, standardize=True):
     print('num obs, features: ', x.shape)
     if standardize:
         x = preprocessing.StandardScaler().fit(x).transform(x)
+        x = np.where(np.isnan(x), 0, x)
 
     # The dependent variable (target) --------------------------------------------
     y = np.asarray(icing_df['icing_intensity'])
@@ -111,12 +123,8 @@ def get_feature_target_data(data_frame, standardize=True):
     return x, y
 
 
-def logistic_regression(x, y, x_test=None, y_test=None):
-    if x_test is None:
-        x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
-    else:
-        x_train = x
-        y_train = y
+def logistic_regression(x_train, y_train, x_test, y_test):
+
 
     print('Train set:', x_train.shape, y_train.shape)
     print('Test set:', x_test.shape, y_test.shape)
@@ -129,17 +137,19 @@ def logistic_regression(x, y, x_test=None, y_test=None):
     yhat = LR.predict(x_test)
     yhat_prob = LR.predict_proba(x_test)
 
-    print(confusion_matrix(y_test, yhat, labels=[1,0]))
-    print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)
+    # print(confusion_matrix(y_test, yhat, labels=[1,0]))
+    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
+    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
+    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
+    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+
+
+def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
 
 
-def k_nearest_neighbors(x, y, k=4):
-    x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
 
     print('Train set:', x_train.shape, y_train.shape)
     print('Test set:', x_test.shape, y_test.shape)
@@ -152,12 +162,14 @@ def k_nearest_neighbors(x, y, k=4):
     yhat = KN_C.predict(x_test)
     yhat_prob = KN_C.predict_proba(x_test)
 
-    print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)
+
+    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
+    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
+    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
+    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
 
 def k_nearest_neighbors_all(x, y, k_s=10):
@@ -178,12 +190,13 @@ def k_nearest_neighbors_all(x, y, k_s=10):
         KN_C = KNeighborsClassifier(n_neighbors=n).fit(x_train, y_train)
         yhat = KN_C.predict(x_test)
         yhat_prob = KN_C.predict_proba(x_test)
-        print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-        print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-        print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-        print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-        print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-        print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+        metrics(y_test, yhat, y_pred_prob=yhat_prob)
+        # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+        # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+        # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
+        # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
+        # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
+        # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
         mean_acc[n - 1] = accuracy_score(y_test, yhat)
         std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
@@ -201,7 +214,7 @@ def k_nearest_neighbors_all(x, y, k_s=10):
 
 
 def decision_tree(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):
-    # x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
+
     print('Train set:', x_train.shape, y_train.shape)
     print('Test set:', x_test.shape, y_test.shape)
 
@@ -214,21 +227,22 @@ def decision_tree(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
     yhat = DT.predict(x_test)
     yhat_prob = DT.predict_proba(x_test)
 
-    print(confusion_matrix(y_test, yhat, labels=[1, 0]))
-    print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)
+    # print(confusion_matrix(y_test, yhat, labels=[1, 0]))
+    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
+    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
+    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
+    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
     return DT
     # export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'iwc_dcomp'])
     # !dot -Tpng tree.dot -o tree.png
 
 
-def SVM(x, y, kernel='rbf'):
-    x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
+def SVM(x_train, y_train, x_test, y_test, kernel='rbf'):
+
 
     print('Train set:', x_train.shape, y_train.shape)
     print('Test set:', x_test.shape, y_test.shape)
@@ -241,19 +255,17 @@ def SVM(x, y, kernel='rbf'):
     clf = clf.fit(x_train, y_train)
     yhat = clf.predict(x_test)
 
-    print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
+    metrics(y_test, yhat)
+    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
+    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
+    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
+
+
+def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):
 
 
-def random_forest(x, y, x_test=None, y_test=None, criterion='entropy', max_depth=4):
-    if x_test is None:
-        x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
-    else:
-        x_train = x
-        y_train = y
 
     print('Train set:', x_train.shape, y_train.shape)
     print('Test set:', x_test.shape, y_test.shape)
@@ -266,9 +278,10 @@ def random_forest(x, y, x_test=None, y_test=None, criterion='entropy', max_depth
     yhat = rnd_clf.predict(x_test)
     yhat_prob = rnd_clf.predict_proba(x_test)
 
-    print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)
+    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
+    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
+    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
+    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
-- 
GitLab