From 99573c5494efcd10b07ab2a6b0e90b40623cecbd Mon Sep 17 00:00:00 2001
From: tomrink <rink@ssec.wisc.edu>
Date: Thu, 16 May 2024 09:48:56 -0500
Subject: [PATCH] snapshot...

---
 modules/machine_learning/classification.py | 27 ++++++++++++++--------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/modules/machine_learning/classification.py b/modules/machine_learning/classification.py
index 8b8cfcc6..18040b7a 100644
--- a/modules/machine_learning/classification.py
+++ b/modules/machine_learning/classification.py
@@ -15,8 +15,14 @@ import sklearn.tree as tree
 from sklearn.tree import export_graphviz
 
 # The independent variables (features) we want to use:
-# params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'cld_cwp_dcomp']
-params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp']
+# feature_params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'cld_cwp_dcomp']
+# feature_params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp']
+feature_params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_acha', 'cld_opd_acha']
+
+# The dependent variable (target) we want to predict:
+target_param = 'icing_intensity'
+
+params = feature_params + [target_param]
 
 
 def metrics(y_true, y_pred, y_pred_prob=None):
@@ -74,15 +80,16 @@ def plot_confusion_matrix(cm, classes,
 def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True, remove_nan=False):
     icing_df = pd.read_csv(csv_file)
 
+    icing_df = icing_df[params]
+
     # Random selection of reduce_frac of the rows
     icing_df = icing_df.sample(axis=0, frac=reduce_frac, random_state=random_state)
 
-    # Remove these, more than half seem to be NaN
-    icing_df = icing_df.drop('lwc_dcomp', axis=1)
-    icing_df = icing_df.drop('iwc_dcomp', axis=1)
-
-    # Remove this column for now.
-    icing_df = icing_df.drop('cld_geo_thick', axis=1)
+    # # Obsolete now that only the columns in params are kept above. Formerly removed because more than half seem to be NaN:
+    # icing_df = icing_df.drop('lwc_dcomp', axis=1)
+    # icing_df = icing_df.drop('iwc_dcomp', axis=1)
+    # # Remove this column for now.
+    # icing_df = icing_df.drop('cld_geo_thick', axis=1)
 
     print('num obs, features: ', icing_df.shape)
     if remove_nan:
@@ -91,7 +98,7 @@ def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standard
 
     # icing_df = icing_df[icing_df.cld_temp_acha < 273.10]
 
-    x = np.asarray(icing_df[params])
+    x = np.asarray(icing_df[feature_params])
     if standardize:
         stdSclr = preprocessing.StandardScaler()
         stdSclr.fit(x)
@@ -100,7 +107,7 @@ def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standard
         joblib.dump(stdSclr, '/Users/tomrink/stdSclr_4.pkl')
 
     # The dependent variable (target) --------------------------------------------
-    y = np.asarray(icing_df['icing_intensity'])
+    y = np.asarray(icing_df[target_param])
     y = np.where(y == -1, 0, y)
     y = np.where(y >= 1, 1, y)
 
-- 
GitLab