snapshot...

2c4f9cf8 · tomrink · f648bd58 · 2c4f9cf8
Commit 2c4f9cf8 authored 1 year ago by tomrink
--- a/modules/machine_learning/classification.py
+++ b/modules/machine_learning/classification.py
@@ -9,11 +9,15 @@ from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LogisticRegression
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
 import itertools
 import sklearn.tree as tree
 from sklearn.tree import export_graphviz
+# The independent variables (features) we want to use:
+params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
+          'cld_opd_dcomp', 'cld_cwp_dcomp']
 def metrics(y_true, y_pred, y_pred_prob=None):
    print(confusion_matrix(y_true, y_pred, labels=[1,0]))
@@ -67,30 +71,12 @@ def plot_confusion_matrix(cm, classes,
    plt.xlabel('Predicted label')
-def get_csv_as_dataframe(csv_file, reduce_frac=1.0, random_state=42):
+def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True):
    icing_df = pd.read_csv(csv_file)
    # Random selection of reduce_frac of the rows
    icing_df = icing_df.sample(axis=0, frac=reduce_frac, random_state=random_state)
-    # # remove approximately half of rows where column_name equals to column_value
-    # column_name = 'icing_intensity'
-    # column_value = -1
-    # if column_name in icing_df.columns:
-    #     df_to_reduce = icing_df[icing_df[column_name] == column_value]
-    #     icing_df = icing_df[icing_df[column_name] != column_value]
-    #
-    #     if reduce_frac is not None:
-    #         df_to_reduce = df_to_reduce.sample(axis=0, frac=0.5, random_state=random_state)
-    #
-    #     icing_df = pd.concat([icing_df, df_to_reduce])
-    return icing_df
-def get_feature_target_data(data_frame, standardize=True):
-    icing_df = data_frame
    # Remove these, more than half seem to be NaN
    icing_df = icing_df.drop('lwc_dcomp', axis=1)
    icing_df = icing_df.drop('iwc_dcomp', axis=1)
@@ -98,10 +84,6 @@ def get_feature_target_data(data_frame, standardize=True):
    # Remove this column for now.
    icing_df = icing_df.drop('cld_geo_thick', axis=1)
-    # The independent variables (features) we want to use:
-    params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
-              'cld_opd_dcomp', 'cld_cwp_dcomp']
    # Remove rows with NaN values
    # icing_df = icing_df.dropna()
@@ -138,14 +120,6 @@ def logistic_regression(x_train, y_train, x_test, y_test):
    metrics(y_test, yhat, y_pred_prob=yhat_prob)
-    # print(confusion_matrix(y_test, yhat, labels=[1,0]))
-    # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
@@ -163,13 +137,6 @@ def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
    metrics(y_test, yhat, y_pred_prob=yhat_prob)
-    # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 def k_nearest_neighbors_all(x, y, k_s=10):
    x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
@@ -181,7 +148,6 @@ def k_nearest_neighbors_all(x, y, k_s=10):
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))
-    k_s = 10
    mean_acc = np.zeros((k_s - 1))
    std_acc = np.zeros((k_s - 1))
@@ -190,12 +156,6 @@ def k_nearest_neighbors_all(x, y, k_s=10):
        yhat = KN_C.predict(x_test)
        yhat_prob = KN_C.predict_proba(x_test)
        metrics(y_test, yhat, y_pred_prob=yhat_prob)
-        # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-        # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-        # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
-        # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-        # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
-        # print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
        mean_acc[n - 1] = accuracy_score(y_test, yhat)
        std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
@@ -227,16 +187,10 @@ def decision_tree(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
    yhat_prob = DT.predict_proba(x_test)
    metrics(y_test, yhat, y_pred_prob=yhat_prob)
-    # print(confusion_matrix(y_test, yhat, labels=[1, 0]))
-    # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
    return DT
-# export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'iwc_dcomp'])
+# Use this to plot the tree  -----------------------------------------------------------
+# export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=params)
 # !dot -Tpng tree.dot -o tree.png
@@ -256,12 +210,6 @@ def SVM(x_train, y_train, x_test, y_test, kernel='rbf'):
    metrics(y_test, yhat)
-    # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
 def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):
@@ -278,9 +226,13 @@ def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
    yhat_prob = rnd_clf.predict_proba(x_test)
    metrics(y_test, yhat, y_pred_prob=yhat_prob)
-    # print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
+def gradient_boosting(x_train, y_train, x_test, y_test, n_estimators=100, max_depth=3, learning_rate=0.1):
-    # print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
+    """Fit a gradient boosting classifier and report its metrics on the test split.
+
+    The model is trained on (x_train, y_train), then its predicted labels and
+    class probabilities for x_test are handed to metrics() together with y_test.
+
+    n_estimators, max_depth and learning_rate are forwarded to
+    GradientBoostingClassifier; the defaults match the values that were
+    previously hard-coded, so existing callers see no change.
+    """
+    gbm = GradientBoostingClassifier(n_estimators=n_estimators,
+                                     learning_rate=learning_rate,
+                                     max_depth=max_depth)
-    # print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+    gbm.fit(x_train, y_train)
+    yhat = gbm.predict(x_test)
+    yhat_prob = gbm.predict_proba(x_test)
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)