snapshot...

0cd1d076 · tomrink · f0360e57 · 0cd1d076
Commit 0cd1d076 authored 1 year ago by tomrink
--- a/modules/machine_learning/classification.py
+++ b/modules/machine_learning/classification.py
@@ -17,7 +17,6 @@ from sklearn.tree import export_graphviz
 # The independent variables (features) we want to use:
 params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
          'cld_opd_dcomp', 'cld_cwp_dcomp']
-# params = ['supercooled_cloud_fraction', 'cld_temp_acha']


 def metrics(y_true, y_pred, y_pred_prob=None):
@@ -72,7 +71,7 @@ def plot_confusion_matrix(cm, classes,
    plt.xlabel('Predicted label')


-def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True):
+def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True, remove_nan=False):
    icing_df = pd.read_csv(csv_file)

    # Random selection of reduce_frac of the rows
@@ -85,11 +84,12 @@ def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standard
    # Remove this column for now.
    icing_df = icing_df.drop('cld_geo_thick', axis=1)

-    # Remove rows with NaN values
-    # icing_df = icing_df.dropna()
+    print('num obs, features: ', icing_df.shape)
+    if remove_nan:
+        icing_df = icing_df.dropna()
+        print('NaN removed num obs, features: ', icing_df.shape)

    x = np.asarray(icing_df[params])
-    print('num obs, features: ', x.shape)
    if standardize:
        x = preprocessing.StandardScaler().fit(x).transform(x)
        x = np.where(np.isnan(x), 0, x)