From ed9e6b64cb5d015ebc9700cdac0b6139ea21ebda Mon Sep 17 00:00:00 2001
From: tomrink <rink@ssec.wisc.edu>
Date: Wed, 1 May 2024 09:53:46 -0500
Subject: [PATCH] snapshot: add metrics() helper; classifiers take explicit train/test splits

---
 modules/machine_learning/classification.py | 121 ++++++++++++---------
 1 file changed, 67 insertions(+), 54 deletions(-)

diff --git a/modules/machine_learning/classification.py b/modules/machine_learning/classification.py
index 263ebc8c..c588ceb2 100644
--- a/modules/machine_learning/classification.py
+++ b/modules/machine_learning/classification.py
@@ -15,6 +15,17 @@ import sklearn.tree as tree
 from sklearn.tree import export_graphviz
 
 
+def metrics(y_true, y_pred, y_pred_prob=None):
+    """Print confusion matrix and scores for y_pred vs y_true; AUC only if y_pred_prob is given."""
+    print(confusion_matrix(y_true, y_pred, labels=[1, 0]))
+    for label, scorer in (('Accuracy:    ', accuracy_score), ('Jaccard Idx: ', jaccard_score),
+                          ('Precision:   ', precision_score), ('Recall:      ', recall_score),
+                          ('F1:          ', f1_score)):
+        print(label, format(scorer(y_true, y_pred), '.4f'))
+    if y_pred_prob is not None:  # column 1 of the probability array is used as the AUC score
+        print('AUC:         ', format(roc_auc_score(y_true, y_pred_prob[:, 1]), '.4f'))
+
+
 def analyze(dataFrame):
     no_icing_df = dataFrame[dataFrame['icing_intensity'] == -1]
     icing_df = dataFrame[dataFrame['icing_intensity'] >= 1]
@@ -99,6 +110,7 @@ def get_feature_target_data(data_frame, standardize=True):
     print('num obs, features: ', x.shape)
     if standardize:
         x = preprocessing.StandardScaler().fit(x).transform(x)
+        x = np.where(np.isnan(x), 0, x)
 
     # The dependent variable (target) --------------------------------------------
     y = np.asarray(icing_df['icing_intensity'])
@@ -111,12 +123,8 @@ def get_feature_target_data(data_frame, standardize=True):
     return x, y
 
 
-def logistic_regression(x, y, x_test=None, y_test=None):
-    if x_test is None:
-        x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
-    else:
-        x_train = x
-        y_train = y
+def logistic_regression(x_train, y_train, x_test, y_test):
+
     print('Train set:', x_train.shape,  y_train.shape)
     print('Test set:', x_test.shape,  y_test.shape)
 
@@ -129,17 +137,19 @@ def logistic_regression(x, y, x_test=None, y_test=None):
     yhat = LR.predict(x_test)
     yhat_prob = LR.predict_proba(x_test)
 
-    print(confusion_matrix(y_test, yhat, labels=[1,0]))
-    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
-    print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-    print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
-    print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)
 
+    # print(confusion_matrix(y_test, yhat, labels=[1,0]))
+    # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+    # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
+    # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
+    # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
+    # print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+
+
+def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
 
-def k_nearest_neighbors(x, y, k=4):
-    x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
     print('Train set:', x_train.shape,  y_train.shape)
     print('Test set:', x_test.shape,  y_test.shape)
 
@@ -152,12 +162,14 @@ def k_nearest_neighbors(x, y, k=4):
     yhat = KN_C.predict(x_test)
     yhat_prob = KN_C.predict_proba(x_test)
 
-    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
-    print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-    print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
-    print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)
+
+    # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+    # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
+    # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
+    # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
+    # print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
 
 def k_nearest_neighbors_all(x, y, k_s=10):
@@ -178,12 +190,13 @@ def k_nearest_neighbors_all(x, y, k_s=10):
         KN_C = KNeighborsClassifier(n_neighbors=n).fit(x_train, y_train)
         yhat = KN_C.predict(x_test)
         yhat_prob = KN_C.predict_proba(x_test)
-        print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-        print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-        print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
-        print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-        print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
-        print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+        metrics(y_test, yhat, y_pred_prob=yhat_prob)
+        # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+        # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+        # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
+        # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
+        # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
+        # print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
         mean_acc[n - 1] = accuracy_score(y_test, yhat)
         std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
@@ -201,7 +214,7 @@ def k_nearest_neighbors_all(x, y, k_s=10):
 
 
 def decision_tree(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):
-    # x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
+
     print('Train set:', x_train.shape,  y_train.shape)
     print('Test set:', x_test.shape,  y_test.shape)
 
@@ -214,21 +227,22 @@ def decision_tree(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
     yhat = DT.predict(x_test)
     yhat_prob = DT.predict_proba(x_test)
 
-    print(confusion_matrix(y_test, yhat, labels=[1, 0]))
-    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
-    print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-    print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
-    print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)
+    # print(confusion_matrix(y_test, yhat, labels=[1, 0]))
+    # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+    # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
+    # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
+    # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
+    # print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
     return DT
 # export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'iwc_dcomp'])
 # !dot -Tpng tree.dot -o tree.png
 
 
-def SVM(x, y, kernel='rbf'):
-    x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
+def SVM(x_train, y_train, x_test, y_test, kernel='rbf'):
+
     print('Train set:', x_train.shape,  y_train.shape)
     print('Test set:', x_test.shape,  y_test.shape)
 
@@ -241,19 +255,17 @@ def SVM(x, y, kernel='rbf'):
     clf = clf.fit(x_train, y_train)
     yhat = clf.predict(x_test)
 
-    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
-    print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-    print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
+    metrics(y_test, yhat)
 
+    # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+    # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
+    # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
+    # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
+
+
+def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):
 
-def random_forest(x, y, x_test=None, y_test=None, criterion='entropy', max_depth=4):
-    if x_test is None:
-        x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
-    else:
-        x_train = x
-        y_train = y
     print('Train set:', x_train.shape,  y_train.shape)
     print('Test set:', x_test.shape,  y_test.shape)
 
@@ -266,9 +278,10 @@ def random_forest(x, y, x_test=None, y_test=None, criterion='entropy', max_depth
     yhat = rnd_clf.predict(x_test)
     yhat_prob = rnd_clf.predict_proba(x_test)
 
-    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
-    print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-    print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
-    print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)
+    # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
+    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
+    # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
+    # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
+    # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
+    # print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
-- 
GitLab