diff --git a/modules/machine_learning/classification.py b/modules/machine_learning/classification.py index 3ebf3b8a7a6055feaf0bc53d4361fc04d4151646..c5f97d628e56ece39f0634e1ac8e78aea6eef977 100644 --- a/modules/machine_learning/classification.py +++ b/modules/machine_learning/classification.py @@ -17,7 +17,6 @@ from sklearn.tree import export_graphviz # The independent variables (features) we want to use: params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'cld_cwp_dcomp'] -# params = ['supercooled_cloud_fraction', 'cld_temp_acha'] def metrics(y_true, y_pred, y_pred_prob=None): @@ -72,7 +71,7 @@ def plot_confusion_matrix(cm, classes, plt.xlabel('Predicted label') -def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True): +def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True, remove_nan=False): icing_df = pd.read_csv(csv_file) # Random selection of reduce_frac of the rows @@ -85,11 +84,12 @@ def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standard # Remove this column for now. icing_df = icing_df.drop('cld_geo_thick', axis=1) - # Remove rows with NaN values - # icing_df = icing_df.dropna() + print('num obs, features: ', icing_df.shape) + if remove_nan: + icing_df = icing_df.dropna() + print('NaN removed num obs, features: ', icing_df.shape) x = np.asarray(icing_df[params]) - print('num obs, features: ', x.shape) if standardize: x = preprocessing.StandardScaler().fit(x).transform(x) x = np.where(np.isnan(x), 0, x)