From 99573c5494efcd10b07ab2a6b0e90b40623cecbd Mon Sep 17 00:00:00 2001 From: tomrink <rink@ssec.wisc.edu> Date: Thu, 16 May 2024 09:48:56 -0500 Subject: [PATCH] snapshot... --- modules/machine_learning/classification.py | 27 ++++++++++++++-------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/modules/machine_learning/classification.py b/modules/machine_learning/classification.py index 8b8cfcc6..18040b7a 100644 --- a/modules/machine_learning/classification.py +++ b/modules/machine_learning/classification.py @@ -15,8 +15,14 @@ import sklearn.tree as tree from sklearn.tree import export_graphviz # The independent variables (features) we want to use: -# params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'cld_cwp_dcomp'] -params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp'] +# feature_params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'cld_cwp_dcomp'] +# feature_params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp'] +feature_params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_acha', 'cld_opd_acha'] + +# The dependent variable (target) +target_param = 'icing_intensity' + +params = feature_params + [target_param] def metrics(y_true, y_pred, y_pred_prob=None): @@ -74,15 +80,16 @@ def plot_confusion_matrix(cm, classes, def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True, remove_nan=False): icing_df = pd.read_csv(csv_file) + icing_df = icing_df[params] + # Random selection of reduce_frac of the rows icing_df = icing_df.sample(axis=0, frac=reduce_frac, random_state=random_state) - # Remove these, more than half seem to be NaN - icing_df = icing_df.drop('lwc_dcomp', axis=1) - icing_df = icing_df.drop('iwc_dcomp', axis=1) - - # Remove this column for now. - icing_df = icing_df.drop('cld_geo_thick', axis=1) + # # Remove these, more than half seem to be NaN + # icing_df = icing_df.drop('lwc_dcomp', axis=1) + # icing_df = icing_df.drop('iwc_dcomp', axis=1) + # # Remove this column for now. + # icing_df = icing_df.drop('cld_geo_thick', axis=1) print('num obs, features: ', icing_df.shape) if remove_nan: @@ -91,7 +98,7 @@ def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standard # icing_df = icing_df[icing_df.cld_temp_acha < 273.10] - x = np.asarray(icing_df[params]) + x = np.asarray(icing_df[feature_params]) if standardize: stdSclr = preprocessing.StandardScaler() stdSclr.fit(x) @@ -100,7 +107,7 @@ def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standard joblib.dump(stdSclr, '/Users/tomrink/stdSclr_4.pkl') # The dependent variable (target) -------------------------------------------- - y = np.asarray(icing_df['icing_intensity']) + y = np.asarray(icing_df[target_param]) y = np.where(y == -1, 0, y) y = np.where(y >= 1, 1, y) -- GitLab