diff --git a/modules/machine_learning/classification.py b/modules/machine_learning/classification.py index 9ab5ed3171333eecbfbbc891699c2a153899209a..814af787a6a38b53110691c41780326f9c6cc097 100644 --- a/modules/machine_learning/classification.py +++ b/modules/machine_learning/classification.py @@ -60,9 +60,10 @@ def get_csv_as_dataframe(csv_file, reduce_frac=None): return icing_df -def get_train_test_data(data_frame, standardize=True): +def get_feature_target_data(data_frame, standardize=True): icing_df = data_frame - # The independent variables we want to use: + + # The independent variables (features) we want to use: params = ['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'iwc_dcomp'] # Remove this column @@ -71,17 +72,17 @@ def get_train_test_data(data_frame, standardize=True): # Remove rows with NaN values # icing_df = icing_df.dropna() - print(icing_df.shape) - # icing_df = icing_df.dropna() - print(icing_df.shape) + print('num obs, features: ', icing_df.shape) x = np.asarray(icing_df[params]) if standardize: x = preprocessing.StandardScaler().fit(x).transform(x) + + # The dependent variable (target) ------------------------------ y = np.asarray(icing_df['icing_intensity']) y = np.where(y == -1, 0, y) y = np.where(y >= 1, 1, y) - print(x.shape, y.shape) + print('num no icing: ', np.sum(y == 0)) print('num icing: ', np.sum(y == 1))