diff --git a/modules/machine_learning/classification.py b/modules/machine_learning/classification.py index 72f2227eb1cfdf2eff246c69b316d1f73ee69f67..263ebc8c5c7a512a7fd6eed5d426e3894ef88e72 100644 --- a/modules/machine_learning/classification.py +++ b/modules/machine_learning/classification.py @@ -81,14 +81,17 @@ def get_csv_as_dataframe(csv_file, reduce_frac=None, random_state=42): def get_feature_target_data(data_frame, standardize=True): icing_df = data_frame - # The independent variables (features) we want to use: - params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', - 'cld_opd_dcomp', 'iwc_dcomp', 'cld_cwp_dcomp'] - # Remove this column + # Remove these, more than half seem to be NaN icing_df = icing_df.drop('lwc_dcomp', axis=1) - # Remove this column + icing_df = icing_df.drop('iwc_dcomp', axis=1) + + # Remove this column for now. icing_df = icing_df.drop('cld_geo_thick', axis=1) + # The independent variables (features) we want to use: + params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', + 'cld_opd_dcomp', 'cld_cwp_dcomp'] + # Remove rows with NaN values # icing_df = icing_df.dropna() @@ -97,7 +100,7 @@ def get_feature_target_data(data_frame, standardize=True): if standardize: x = preprocessing.StandardScaler().fit(x).transform(x) - # The dependent variable (target) ------------------------------ + # The dependent variable (target) -------------------------------------------- y = np.asarray(icing_df['icing_intensity']) y = np.where(y == -1, 0, y) y = np.where(y >= 1, 1, y)