From 0cd1d076c7f3a98a4c4ed9f5127fdb922d2b910e Mon Sep 17 00:00:00 2001
From: tomrink <rink@ssec.wisc.edu>
Date: Thu, 2 May 2024 11:06:45 -0500
Subject: [PATCH] snapshot: add optional remove_nan flag to get_feature_target_data

---
 modules/machine_learning/classification.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/machine_learning/classification.py b/modules/machine_learning/classification.py
index 3ebf3b8a..c5f97d62 100644
--- a/modules/machine_learning/classification.py
+++ b/modules/machine_learning/classification.py
@@ -17,7 +17,6 @@ from sklearn.tree import export_graphviz
 # The independent variables (features) we want to use:
 params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
           'cld_opd_dcomp', 'cld_cwp_dcomp']
-# params = ['supercooled_cloud_fraction', 'cld_temp_acha']
 
 
 def metrics(y_true, y_pred, y_pred_prob=None):
@@ -72,7 +71,7 @@ def plot_confusion_matrix(cm, classes,
     plt.xlabel('Predicted label')
 
 
-def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True):
+def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True, remove_nan=False):
     icing_df = pd.read_csv(csv_file)
 
     # Random selection of reduce_frac of the rows
@@ -85,11 +84,12 @@ def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standard
     # Remove this column for now.
     icing_df = icing_df.drop('cld_geo_thick', axis=1)
 
-    # Remove rows with NaN values
-    # icing_df = icing_df.dropna()
+    print('num obs, features: ', icing_df.shape)
+    if remove_nan:
+        icing_df = icing_df.dropna()
+        print('NaN removed num obs, features: ', icing_df.shape)
 
     x = np.asarray(icing_df[params])
-    print('num obs, features: ', x.shape)
     if standardize:
         x = preprocessing.StandardScaler().fit(x).transform(x)
         x = np.where(np.isnan(x), 0, x)
-- 
GitLab