From 2c4f9cf8036ec95d17ec66c79d3deb6e10823544 Mon Sep 17 00:00:00 2001
From: tomrink <rink@ssec.wisc.edu>
Date: Thu, 2 May 2024 09:52:02 -0500
Subject: [PATCH] snapshot...

---
 modules/machine_learning/classification.py | 84 +++++-----------------
 1 file changed, 18 insertions(+), 66 deletions(-)

diff --git a/modules/machine_learning/classification.py b/modules/machine_learning/classification.py
index 79813e32..d5fedef2 100644
--- a/modules/machine_learning/classification.py
+++ b/modules/machine_learning/classification.py
@@ -9,11 +9,15 @@ from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LogisticRegression
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
 import itertools
 import sklearn.tree as tree
 from sklearn.tree import export_graphviz
 
+# The independent variables (features) we want to use:
+params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
+          'cld_opd_dcomp', 'cld_cwp_dcomp']
+
 
 def metrics(y_true, y_pred, y_pred_prob=None):
     print(confusion_matrix(y_true, y_pred, labels=[1,0]))
@@ -67,30 +71,12 @@ def plot_confusion_matrix(cm, classes,
     plt.xlabel('Predicted label')
 
 
-def get_csv_as_dataframe(csv_file, reduce_frac=1.0, random_state=42):
+def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True):
     icing_df = pd.read_csv(csv_file)
 
     # Random selection of reduce_frac of the rows
     icing_df = icing_df.sample(axis=0, frac=reduce_frac, random_state=random_state)
 
-    # # remove approximately half of rows where column_name equals to column_value
-    # column_name = 'icing_intensity'
-    # column_value = -1
-    # if column_name in icing_df.columns:
-    #     df_to_reduce = icing_df[icing_df[column_name] == column_value]
-    #     icing_df = icing_df[icing_df[column_name] != column_value]
-    #
-    #     if reduce_frac is not None:
-    #         df_to_reduce = df_to_reduce.sample(axis=0, frac=0.5, random_state=random_state)
-    #
-    #     icing_df = pd.concat([icing_df, df_to_reduce])
-
-    return icing_df
-
-
-def get_feature_target_data(data_frame, standardize=True):
-    icing_df = data_frame
-
     # Remove these, more than half seem to be NaN
     icing_df = icing_df.drop('lwc_dcomp', axis=1)
     icing_df = icing_df.drop('iwc_dcomp', axis=1)
@@ -98,10 +84,6 @@ def get_feature_target_data(data_frame, standardize=True):
 
     # Remove this column for now.
     icing_df = icing_df.drop('cld_geo_thick', axis=1)
 
-    # The independent variables (features) we want to use:
-    params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
-              'cld_opd_dcomp', 'cld_cwp_dcomp']
-
     # Remove rows with NaN values
     # icing_df = icing_df.dropna()
@@ -138,14 +120,6 @@ def logistic_regression(x_train, y_train, x_test, y_test):
 
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
 
-    # print(confusion_matrix(y_test, yhat, labels=[1,0]))
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
-
 
 
 def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
@@ -163,13 +137,6 @@ def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
 
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
 
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
-
 
 def k_nearest_neighbors_all(x, y, k_s=10):
     x_train, x_test, y_train, y_test = train_test_split(
         x, y, test_size=0.2, random_state=4)
@@ -181,7 +148,6 @@ def k_nearest_neighbors_all(x, y, k_s=10):
     print('num no icing test: ', np.sum(y_test == 0))
     print('num icing test: ', np.sum(y_test == 1))
 
-    k_s = 10
     mean_acc = np.zeros((k_s - 1))
     std_acc = np.zeros((k_s - 1))
 
@@ -190,12 +156,6 @@ def k_nearest_neighbors_all(x, y, k_s=10):
         yhat = KN_C.predict(x_test)
         yhat_prob = KN_C.predict_proba(x_test)
         metrics(y_test, yhat, y_pred_prob=yhat_prob)
-        # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-        # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-        # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-        # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-        # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-        # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
         mean_acc[n - 1] = accuracy_score(y_test, yhat)
         std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
@@ -227,16 +187,10 @@ def decision_tree(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
     yhat_prob = DT.predict_proba(x_test)
 
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
-    # print(confusion_matrix(y_test, yhat, labels=[1, 0]))
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
     return DT
 
-# export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'iwc_dcomp'])
+# Use this to plot the tree -----------------------------------------------------------
+# export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=params)
 # !dot -Tpng tree.dot -o tree.png
@@ -256,12 +210,6 @@ def SVM(x_train, y_train, x_test, y_test, kernel='rbf'):
 
     metrics(y_test, yhat)
 
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-
 
 def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):
 
@@ -278,9 +226,13 @@ def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
     yhat_prob = rnd_clf.predict_proba(x_test)
 
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+
+
+def gradient_boosting(x_train, y_train, x_test, y_test, n_estimators=100, max_depth=3, learning_rate=0.1):
+
+    gbm = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
+    gbm.fit(x_train, y_train)
+    yhat = gbm.predict(x_test)
+    yhat_prob = gbm.predict_proba(x_test)
+
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)
-- 
GitLab
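Note (not part of the patch): a minimal usage sketch of the new gradient_boosting() helper,
under two stated assumptions: get_feature_target_data() returns a (features, target) pair,
which the hunks above do not show, and 'icing_train.csv' is a placeholder path.

    from sklearn.model_selection import train_test_split
    from modules.machine_learning.classification import get_feature_target_data, gradient_boosting

    # Load and standardize the feature columns listed in `params` (hypothetical CSV path).
    x, y = get_feature_target_data('icing_train.csv', reduce_frac=0.5, standardize=True)

    # Same 80/20 split used elsewhere in the module.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

    # Hyperparameters are forwarded to GradientBoostingClassifier; metrics are printed inside.
    gradient_boosting(x_train, y_train, x_test, y_test, n_estimators=200, max_depth=3, learning_rate=0.05)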