From 2c4f9cf8036ec95d17ec66c79d3deb6e10823544 Mon Sep 17 00:00:00 2001
From: tomrink <rink@ssec.wisc.edu>
Date: Thu, 2 May 2024 09:52:02 -0500
Subject: [PATCH] snapshot...

---
 modules/machine_learning/classification.py | 84 +++++-----------------
 1 file changed, 18 insertions(+), 66 deletions(-)

diff --git a/modules/machine_learning/classification.py b/modules/machine_learning/classification.py
index 79813e32..d5fedef2 100644
--- a/modules/machine_learning/classification.py
+++ b/modules/machine_learning/classification.py
@@ -9,11 +9,15 @@ from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LogisticRegression
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
 import itertools
 import sklearn.tree as tree
 from sklearn.tree import export_graphviz
 
+# The independent variables (features) we want to use:
+params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
+          'cld_opd_dcomp', 'cld_cwp_dcomp']
+
 
 def metrics(y_true, y_pred, y_pred_prob=None):
     print(confusion_matrix(y_true, y_pred, labels=[1,0]))
@@ -67,30 +71,12 @@ def plot_confusion_matrix(cm, classes,
     plt.xlabel('Predicted label')
 
 
-def get_csv_as_dataframe(csv_file, reduce_frac=1.0, random_state=42):
+def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True):
     icing_df = pd.read_csv(csv_file)
 
     # Random selection of reduce_frac of the rows
     icing_df = icing_df.sample(axis=0, frac=reduce_frac, random_state=random_state)
 
-    # # remove approximately half of rows where column_name equals to column_value
-    # column_name = 'icing_intensity'
-    # column_value = -1
-    # if column_name in icing_df.columns:
-    #     df_to_reduce = icing_df[icing_df[column_name] == column_value]
-    #     icing_df = icing_df[icing_df[column_name] != column_value]
-    #
-    #     if reduce_frac is not None:
-    #         df_to_reduce = df_to_reduce.sample(axis=0, frac=0.5, random_state=random_state)
-    #
-    #     icing_df = pd.concat([icing_df, df_to_reduce])
-
-    return icing_df
-
-
-def get_feature_target_data(data_frame, standardize=True):
-    icing_df = data_frame
-
     # Remove these, more than half seem to be NaN
     icing_df = icing_df.drop('lwc_dcomp', axis=1)
     icing_df = icing_df.drop('iwc_dcomp', axis=1)
@@ -98,10 +84,6 @@ def get_feature_target_data(data_frame, standardize=True):
 
     # Remove this column for now.
     icing_df = icing_df.drop('cld_geo_thick', axis=1)
 
-    # The independent variables (features) we want to use:
-    params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
-              'cld_opd_dcomp', 'cld_cwp_dcomp']
-
     # Remove rows with NaN values
     # icing_df = icing_df.dropna()
@@ -138,14 +120,6 @@ def logistic_regression(x_train, y_train, x_test, y_test):
 
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
 
-    # print(confusion_matrix(y_test, yhat, labels=[1,0]))
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
-
 
 
 def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
@@ -163,13 +137,6 @@ def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
 
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
 
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
-
 
 def k_nearest_neighbors_all(x, y, k_s=10):
     x_train, x_test, y_train, y_test = train_test_split(
         x, y, test_size=0.2, random_state=4)
@@ -181,7 +148,6 @@ def k_nearest_neighbors_all(x, y, k_s=10):
     print('num no icing test: ', np.sum(y_test == 0))
     print('num icing test: ', np.sum(y_test == 1))
 
-    k_s = 10
     mean_acc = np.zeros((k_s - 1))
     std_acc = np.zeros((k_s - 1))
 
@@ -190,12 +156,6 @@ def k_nearest_neighbors_all(x, y, k_s=10):
         yhat = KN_C.predict(x_test)
         yhat_prob = KN_C.predict_proba(x_test)
         metrics(y_test, yhat, y_pred_prob=yhat_prob)
-        # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-        # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-        # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-        # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-        # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-        # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
         mean_acc[n - 1] = accuracy_score(y_test, yhat)
         std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
@@ -227,16 +187,10 @@ def decision_tree(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
     yhat_prob = DT.predict_proba(x_test)
 
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
-    # print(confusion_matrix(y_test, yhat, labels=[1, 0]))
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
     return DT
 
-# export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'iwc_dcomp'])
+# Use this to plot the tree -----------------------------------------------------------
+# export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=params)
 # !dot -Tpng tree.dot -o tree.png
@@ -256,12 +210,6 @@ def SVM(x_train, y_train, x_test, y_test, kernel='rbf'):
 
     metrics(y_test, yhat)
 
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-
 
 def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):
 
@@ -278,9 +226,13 @@ def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
     yhat_prob = rnd_clf.predict_proba(x_test)
 
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+
+
+def gradient_boosting(x_train, y_train, x_test, y_test, n_estimators=100, max_depth=3, learning_rate=0.1):
+
+    gbm = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
+    gbm.fit(x_train, y_train)
+    yhat = gbm.predict(x_test)
+    yhat_prob = gbm.predict_proba(x_test)
+
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)
-- 
GitLab
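Note (not part of the patch): a minimal usage sketch of the new gradient_boosting() helper,
under two stated assumptions: get_feature_target_data() returns a (features, target) pair,
which the hunks above do not show, and 'icing_train.csv' is a placeholder path.

    from sklearn.model_selection import train_test_split
    from modules.machine_learning.classification import get_feature_target_data, gradient_boosting

    # Load and standardize the feature columns listed in `params` (hypothetical CSV path).
    x, y = get_feature_target_data('icing_train.csv', reduce_frac=0.5, standardize=True)

    # Same 80/20 split used elsewhere in the module.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

    # Hyperparameters are forwarded to GradientBoostingClassifier; metrics are printed inside.
    gradient_boosting(x_train, y_train, x_test, y_test, n_estimators=200, max_depth=3, learning_rate=0.05)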