Commit 2c4f9cf8 authored by tomrink

snapshot...

parent f648bd58
@@ -9,11 +9,15 @@ from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
import itertools
import sklearn.tree as tree
from sklearn.tree import export_graphviz
# The independent variables (features) we want to use:
params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
'cld_opd_dcomp', 'cld_cwp_dcomp']
def metrics(y_true, y_pred, y_pred_prob=None):
print(confusion_matrix(y_true, y_pred, labels=[1,0]))
@@ -67,30 +71,12 @@ def plot_confusion_matrix(cm, classes,
plt.xlabel('Predicted label')
def get_csv_as_dataframe(csv_file, reduce_frac=1.0, random_state=42):
def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True):
icing_df = pd.read_csv(csv_file)
# Random selection of reduce_frac of the rows
icing_df = icing_df.sample(axis=0, frac=reduce_frac, random_state=random_state)
# # remove approximately half of rows where column_name equals to column_value
# column_name = 'icing_intensity'
# column_value = -1
# if column_name in icing_df.columns:
# df_to_reduce = icing_df[icing_df[column_name] == column_value]
# icing_df = icing_df[icing_df[column_name] != column_value]
#
# if reduce_frac is not None:
# df_to_reduce = df_to_reduce.sample(axis=0, frac=0.5, random_state=random_state)
#
# icing_df = pd.concat([icing_df, df_to_reduce])
return icing_df
def get_feature_target_data(data_frame, standardize=True):
icing_df = data_frame
# Remove these, more than half seem to be NaN
icing_df = icing_df.drop('lwc_dcomp', axis=1)
icing_df = icing_df.drop('iwc_dcomp', axis=1)
@@ -98,10 +84,6 @@ def get_feature_target_data(data_frame, standardize=True):
# Remove this column for now.
icing_df = icing_df.drop('cld_geo_thick', axis=1)
# The independent variables (features) we want to use:
params = ['cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
'cld_opd_dcomp', 'cld_cwp_dcomp']
# Remove rows with NaN values
# icing_df = icing_df.dropna()
@@ -138,14 +120,6 @@ def logistic_regression(x_train, y_train, x_test, y_test):
metrics(y_test, yhat, y_pred_prob=yhat_prob)
# print(confusion_matrix(y_test, yhat, labels=[1,0]))
# print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
# print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
# print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
# print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
# print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
# print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
@@ -163,13 +137,6 @@ def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
metrics(y_test, yhat, y_pred_prob=yhat_prob)
# print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
# print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
# print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
# print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
# print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
# print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
def k_nearest_neighbors_all(x, y, k_s=10):
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
@@ -181,7 +148,6 @@ def k_nearest_neighbors_all(x, y, k_s=10):
print('num no icing test: ', np.sum(y_test == 0))
print('num icing test: ', np.sum(y_test == 1))
k_s = 10
mean_acc = np.zeros((k_s - 1))
std_acc = np.zeros((k_s - 1))
@@ -190,12 +156,6 @@ def k_nearest_neighbors_all(x, y, k_s=10):
yhat = KN_C.predict(x_test)
yhat_prob = KN_C.predict_proba(x_test)
metrics(y_test, yhat, y_pred_prob=yhat_prob)
# print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
# print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
# print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
# print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
# print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
# print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
mean_acc[n - 1] = accuracy_score(y_test, yhat)
std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
@@ -227,16 +187,10 @@ def decision_tree(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
yhat_prob = DT.predict_proba(x_test)
metrics(y_test, yhat, y_pred_prob=yhat_prob)
# print(confusion_matrix(y_test, yhat, labels=[1, 0]))
# print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
# print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
# print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
# print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
# print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
# print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
return DT
# export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'iwc_dcomp'])
# Use this to plot the tree -----------------------------------------------------------
# export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=params)
# !dot -Tpng tree.dot -o tree.png
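For reference, the two commented lines above can also be run outside an IPython session, roughly as in this sketch. Assumptions: the Graphviz `dot` executable is on the PATH, and `DT` is the fitted classifier returned by decision_tree with `params` as its feature list.

    # Sketch only, not part of this commit: export the fitted tree and render it with Graphviz.
    import subprocess
    from sklearn.tree import export_graphviz

    export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=params)
    subprocess.run(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png'], check=True)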
@@ -256,12 +210,6 @@ def SVM(x_train, y_train, x_test, y_test, kernel='rbf'):
metrics(y_test, yhat)
# print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
# print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
# print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
# print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
# print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):
@@ -278,9 +226,13 @@ def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_dep
yhat_prob = rnd_clf.predict_proba(x_test)
metrics(y_test, yhat, y_pred_prob=yhat_prob)
# print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
# print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
# print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
# print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
# print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
# print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
def gradient_boosting(x_train, y_train, x_test, y_test, n_estimators=100, max_depth=3, learning_rate=0.1):
    # Pass the keyword arguments through rather than hard-coding the hyperparameters
    gbm = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    gbm.fit(x_train, y_train)

    yhat = gbm.predict(x_test)
    yhat_prob = gbm.predict_proba(x_test)

    metrics(y_test, yhat, y_pred_prob=yhat_prob)
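Taken together, the pieces added in this commit might be exercised roughly as follows. This is a usage sketch only: the CSV path 'icing.csv' is hypothetical, and it assumes the new get_feature_target_data(csv_file, ...) returns the feature matrix and target vector in that order (its full body is not shown in this diff).

    # Sketch: load features/targets, split, and evaluate the new gradient-boosting model.
    from sklearn.model_selection import train_test_split

    x, y = get_feature_target_data('icing.csv', reduce_frac=0.5, random_state=42, standardize=True)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

    gradient_boosting(x_train, y_train, x_test, y_test, n_estimators=100, max_depth=3, learning_rate=0.1)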