Commit 2c4f9cf8 authored by tomrink

snapshot...

parent f648bd58
@@ -9,11 +9,15 @@ from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LogisticRegression
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
 import itertools
 import sklearn.tree as tree
 from sklearn.tree import export_graphviz
+
+# The independent variables (features) we want to use:
+params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
+          'cld_opd_dcomp', 'cld_cwp_dcomp']
 
 
 def metrics(y_true, y_pred, y_pred_prob=None):
     print(confusion_matrix(y_true, y_pred, labels=[1,0]))
@@ -67,30 +71,12 @@ def plot_confusion_matrix(cm, classes,
     plt.xlabel('Predicted label')
 
 
-def get_csv_as_dataframe(csv_file, reduce_frac=1.0, random_state=42):
+def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True):
     icing_df = pd.read_csv(csv_file)
 
     # Random selection of reduce_frac of the rows
     icing_df = icing_df.sample(axis=0, frac=reduce_frac, random_state=random_state)
 
-    # # remove approximately half of rows where column_name equals to column_value
-    # column_name = 'icing_intensity'
-    # column_value = -1
-    # if column_name in icing_df.columns:
-    #     df_to_reduce = icing_df[icing_df[column_name] == column_value]
-    #     icing_df = icing_df[icing_df[column_name] != column_value]
-    #
-    #     if reduce_frac is not None:
-    #         df_to_reduce = df_to_reduce.sample(axis=0, frac=0.5, random_state=random_state)
-    #
-    #     icing_df = pd.concat([icing_df, df_to_reduce])
-
-    return icing_df
-
-
-def get_feature_target_data(data_frame, standardize=True):
-    icing_df = data_frame
-
     # Remove these, more than half seem to be NaN
     icing_df = icing_df.drop('lwc_dcomp', axis=1)
     icing_df = icing_df.drop('iwc_dcomp', axis=1)
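[Editor's note: a minimal usage sketch of the refactored loader, not part of this commit. The module name and CSV path are hypothetical, and it assumes the function returns (features, targets) arrays as the train/test helpers below expect.]

    # Hypothetical module name and file path; assumes an (x, y) return value.
    from icing_ml import get_feature_target_data

    x, y = get_feature_target_data('icing_training.csv', reduce_frac=0.5, standardize=True)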
@@ -98,10 +84,6 @@ def get_feature_target_data(data_frame, standardize=True):
     # Remove this column for now.
     icing_df = icing_df.drop('cld_geo_thick', axis=1)
 
-    # The independent variables (features) we want to use:
-    params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
-              'cld_opd_dcomp', 'cld_cwp_dcomp']
-
     # Remove rows with NaN values
     # icing_df = icing_df.dropna()
@@ -138,14 +120,6 @@ def logistic_regression(x_train, y_train, x_test, y_test):
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
 
-    # print(confusion_matrix(y_test, yhat, labels=[1,0]))
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
 
 def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
@@ -163,13 +137,6 @@ def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
 
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
 
 def k_nearest_neighbors_all(x, y, k_s=10):
     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
@@ -181,7 +148,6 @@ def k_nearest_neighbors_all(x, y, k_s=10):
     print('num no icing test: ', np.sum(y_test == 0))
     print('num icing test: ', np.sum(y_test == 1))
 
-    k_s = 10
 
     mean_acc = np.zeros((k_s - 1))
     std_acc = np.zeros((k_s - 1))
@@ -190,12 +156,6 @@ def k_nearest_neighbors_all(x, y, k_s=10):
         yhat = KN_C.predict(x_test)
         yhat_prob = KN_C.predict_proba(x_test)
         metrics(y_test, yhat, y_pred_prob=yhat_prob)
-        # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-        # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-        # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-        # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-        # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-        # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
 
         mean_acc[n - 1] = accuracy_score(y_test, yhat)
         std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
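[Editor's note: the loop above records held-out accuracy per k in mean_acc; a sketch of selecting the best k from those arrays, not part of this commit.]

    # Sketch only: mean_acc[n - 1] holds the accuracy for k = n,
    # so shift the argmax by one to recover the best k.
    best_k = int(np.argmax(mean_acc)) + 1
    print('best k: ', best_k, ' accuracy: ', mean_acc[best_k - 1])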
@@ -227,16 +187,10 @@ def decision_tree(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
     yhat_prob = DT.predict_proba(x_test)
 
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
 
-    # print(confusion_matrix(y_test, yhat, labels=[1, 0]))
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
-
     return DT
 
-# export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'iwc_dcomp'])
+# Use this to plot the tree -----------------------------------------------------------
+# export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=params)
 # !dot -Tpng tree.dot -o tree.png
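[Editor's note: as an alternative to the dot shell step above, scikit-learn can render the fitted tree directly; a sketch, not part of this commit.]

    # Sketch: plot the fitted DecisionTreeClassifier without graphviz,
    # using sklearn.tree.plot_tree (scikit-learn >= 0.21).
    import matplotlib.pyplot as plt
    from sklearn.tree import plot_tree

    fig, ax = plt.subplots(figsize=(16, 8))
    plot_tree(DT, feature_names=params, filled=True, ax=ax)  # DT returned by decision_tree(...)
    fig.savefig('tree.png')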
@@ -256,12 +210,6 @@ def SVM(x_train, y_train, x_test, y_test, kernel='rbf'):
     metrics(y_test, yhat)
 
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-
 
 def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):
@@ -278,9 +226,13 @@ def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
     yhat_prob = rnd_clf.predict_proba(x_test)
     metrics(y_test, yhat, y_pred_prob=yhat_prob)
 
-    # print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
-    # print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
-    # print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
-    # print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
-    # print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
-    # print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
+
+
+def gradient_boosting(x_train, y_train, x_test, y_test, n_estimators=100, max_depth=3, learning_rate=0.1):
+    # Pass the keyword arguments through rather than hardcoding them
+    gbm = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
+    gbm.fit(x_train, y_train)
+
+    yhat = gbm.predict(x_test)
+    yhat_prob = gbm.predict_proba(x_test)
+
+    metrics(y_test, yhat, y_pred_prob=yhat_prob)
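[Editor's note: an end-to-end sketch of driving the new function, not part of this commit. The CSV path is hypothetical and the (x, y) return shape of get_feature_target_data is an assumption; the split mirrors the random_state=4 split used elsewhere in this module.]

    # Sketch only: load features/targets, split, and evaluate the new model.
    from sklearn.model_selection import train_test_split

    x, y = get_feature_target_data('icing_training.csv')  # hypothetical path
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
    gradient_boosting(x_train, y_train, x_test, y_test, n_estimators=200, max_depth=3)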