classification.py 8.71 KiB
import pandas as pd
import pylab as pl
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing, svm
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, jaccard_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
import itertools
import sklearn.tree as tree
from sklearn.tree import export_graphviz
# The independent variables (features) we want to use:
params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'cld_cwp_dcomp']
def metrics(y_true, y_pred, y_pred_prob=None):
print(confusion_matrix(y_true, y_pred, labels=[1,0]))
print('Accuracy: ', "{:.4f}".format(accuracy_score(y_true, y_pred)))
print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_true, y_pred)))
print('Precision: ', "{:.4f}".format(precision_score(y_true, y_pred)))
print('Recall: ', "{:.4f}".format(recall_score(y_true, y_pred)))
print('F1: ', "{:.4f}".format(f1_score(y_true, y_pred)))
if y_pred_prob is not None:
print('AUC: ', "{:.4f}".format(roc_auc_score(y_true, y_pred_prob[:, 1])))
def analyze(dataFrame):
no_icing_df = dataFrame[dataFrame['icing_intensity'] == -1]
icing_df = dataFrame[dataFrame['icing_intensity'] >= 1]
return no_icing_df, icing_df
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True, remove_nan=False):
icing_df = pd.read_csv(csv_file)
# Random selection of reduce_frac of the rows
icing_df = icing_df.sample(axis=0, frac=reduce_frac, random_state=random_state)
# Remove these, more than half seem to be NaN
icing_df = icing_df.drop('lwc_dcomp', axis=1)
icing_df = icing_df.drop('iwc_dcomp', axis=1)
# Remove this column for now.
icing_df = icing_df.drop('cld_geo_thick', axis=1)
print('num obs, features: ', icing_df.shape)
if remove_nan:
icing_df = icing_df.dropna()
print('NaN removed num obs, features: ', icing_df.shape)
# icing_df = icing_df[icing_df.cld_temp_acha < 273.5]
x = np.asarray(icing_df[params])
if standardize:
x = preprocessing.StandardScaler().fit(x).transform(x)
x = np.where(np.isnan(x), 0, x)
# The dependent variable (target) --------------------------------------------
y = np.asarray(icing_df['icing_intensity'])
y = np.where(y == -1, 0, y)
y = np.where(y >= 1, 1, y)
print('num no icing: ', np.sum(y == 0))
print('num icing: ', np.sum(y == 1))
return x, y
def logistic_regression(x_train, y_train, x_test, y_test):
print('Train set:', x_train.shape, y_train.shape)
print('Test set:', x_test.shape, y_test.shape)
x_train = np.where(np.isnan(x_train), 0, x_train)
x_test = np.where(np.isnan(x_test), 0, x_test)
print('num no icing test: ', np.sum(y_test == 0))
print('num icing test: ', np.sum(y_test == 1))
LR = LogisticRegression(C=0.01, solver='liblinear').fit(x_train, y_train)
yhat = LR.predict(x_test)
yhat_prob = LR.predict_proba(x_test)
metrics(y_test, yhat, y_pred_prob=yhat_prob)
def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):
print('Train set:', x_train.shape, y_train.shape)
print('Test set:', x_test.shape, y_test.shape)
x_train = np.where(np.isnan(x_train), 0, x_train)
x_test = np.where(np.isnan(x_test), 0, x_test)
print('num no icing test: ', np.sum(y_test == 0))
print('num icing test: ', np.sum(y_test == 1))
KN_C = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
yhat = KN_C.predict(x_test)
yhat_prob = KN_C.predict_proba(x_test)
metrics(y_test, yhat, y_pred_prob=yhat_prob)
def k_nearest_neighbors_all(x, y, k_s=10):
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
print('Train set:', x_train.shape, y_train.shape)
print('Test set:', x_test.shape, y_test.shape)
x_train = np.where(np.isnan(x_train), 0, x_train)
x_test = np.where(np.isnan(x_test), 0, x_test)
print('num no icing test: ', np.sum(y_test == 0))
print('num icing test: ', np.sum(y_test == 1))
mean_acc = np.zeros((k_s - 1))
std_acc = np.zeros((k_s - 1))
for n in range(1, k_s):
KN_C = KNeighborsClassifier(n_neighbors=n).fit(x_train, y_train)
yhat = KN_C.predict(x_test)
yhat_prob = KN_C.predict_proba(x_test)
metrics(y_test, yhat, y_pred_prob=yhat_prob)
mean_acc[n - 1] = accuracy_score(y_test, yhat)
std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
print("The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax() + 1)
plt.plot(range(1, k_s), mean_acc, 'g')
plt.fill_between(range(1, k_s), mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10)
plt.fill_between(range(1, k_s), mean_acc - 3 * std_acc, mean_acc + 3 * std_acc, alpha=0.10, color="green")
plt.legend(('Accuracy ', '+/- 1xstd', '+/- 3xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()
def decision_tree(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):
print('Train set:', x_train.shape, y_train.shape)
print('Test set:', x_test.shape, y_test.shape)
x_train = np.where(np.isnan(x_train), 0, x_train)
x_test = np.where(np.isnan(x_test), 0, x_test)
print('num no icing test: ', np.sum(y_test == 0))
print('num icing test: ', np.sum(y_test == 1))
DT = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth).fit(x_train, y_train)
yhat = DT.predict(x_test)
yhat_prob = DT.predict_proba(x_test)
metrics(y_test, yhat, y_pred_prob=yhat_prob)
return DT
# Use this to plot the tree -----------------------------------------------------------
# export_graphviz(DT, out_file='tree.dot', filled=True, class_names=['no icing', 'icing'], feature_names=params)
# !dot -Tpng tree.dot -o tree.png
def SVM(x_train, y_train, x_test, y_test, kernel='rbf'):
print('Train set:', x_train.shape, y_train.shape)
print('Test set:', x_test.shape, y_test.shape)
x_train = np.where(np.isnan(x_train), 0, x_train)
x_test = np.where(np.isnan(x_test), 0, x_test)
print('num no icing test: ', np.sum(y_test == 0))
print('num icing test: ', np.sum(y_test == 1))
clf = svm.SVC(kernel=kernel)
clf = clf.fit(x_train, y_train)
yhat = clf.predict(x_test)
metrics(y_test, yhat)
def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):
print('Train set:', x_train.shape, y_train.shape)
print('Test set:', x_test.shape, y_test.shape)
x_train = np.where(np.isnan(x_train), 0, x_train)
x_test = np.where(np.isnan(x_test), 0, x_test)
print('num no icing test: ', np.sum(y_test == 0))
print('num icing test: ', np.sum(y_test == 1))
rnd_clf = RandomForestClassifier(criterion=criterion, max_depth=max_depth).fit(x_train, y_train)
yhat = rnd_clf.predict(x_test)
yhat_prob = rnd_clf.predict_proba(x_test)
metrics(y_test, yhat, y_pred_prob=yhat_prob)
def gradient_boosting(x_train, y_train, x_test, y_test, n_estimators=100, max_depth=3, learning_rate=0.1):
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
gbm.fit(x_train, y_train)
yhat = gbm.predict(x_test)
yhat_prob = gbm.predict_proba(x_test)
metrics(y_test, yhat, y_pred_prob=yhat_prob)