# classification.py
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             jaccard_score, precision_score, recall_score,
                             roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# The independent variables (features) we want to use:
params = ['cld_temp_acha', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'cld_cwp_dcomp']


def metrics(y_true, y_pred, y_pred_prob=None):
    """Print a confusion matrix and common binary classification scores.

    The confusion matrix is printed with the positive (icing) class first.
    AUC is reported only when predicted probabilities are supplied.
    """
    print(confusion_matrix(y_true, y_pred, labels=[1, 0]))
    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_true, y_pred)))
    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_true, y_pred)))
    print('Precision:   ', "{:.4f}".format(precision_score(y_true, y_pred)))
    print('Recall:      ', "{:.4f}".format(recall_score(y_true, y_pred)))
    print('F1:          ', "{:.4f}".format(f1_score(y_true, y_pred)))
    if y_pred_prob is not None:
        # Column 1 of predict_proba holds the probability of the positive class.
        print('AUC:         ', "{:.4f}".format(roc_auc_score(y_true, y_pred_prob[:, 1])))


def analyze(dataFrame):
    """Split a frame into its no-icing (-1) and icing (>= 1) rows."""
    no_icing_df = dataFrame[dataFrame['icing_intensity'] == -1]
    icing_df = dataFrame[dataFrame['icing_intensity'] >= 1]
    return no_icing_df, icing_df


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def get_feature_target_data(csv_file, reduce_frac=1.0, random_state=42, standardize=True, remove_nan=False):
    icing_df = pd.read_csv(csv_file)

    # Random selection of reduce_frac of the rows
    icing_df = icing_df.sample(axis=0, frac=reduce_frac, random_state=random_state)

    # Drop these two columns; more than half of their values appear to be NaN.
    icing_df = icing_df.drop('lwc_dcomp', axis=1)
    icing_df = icing_df.drop('iwc_dcomp', axis=1)

    # Remove this column for now.
    icing_df = icing_df.drop('cld_geo_thick', axis=1)

    print('num obs, features: ', icing_df.shape)
    if remove_nan:
        icing_df = icing_df.dropna()
        print('NaN removed num obs, features: ', icing_df.shape)

    # icing_df = icing_df[icing_df.cld_temp_acha < 273.5]

    x = np.asarray(icing_df[params])
    if standardize:
        x = preprocessing.StandardScaler().fit_transform(x)
        # After standardization each feature has zero mean, so replacing NaNs
        # with 0 amounts to mean imputation.
        x = np.where(np.isnan(x), 0, x)

    # The dependent variable (target) --------------------------------------------
    y = np.asarray(icing_df['icing_intensity'])
    y = np.where(y == -1, 0, y)
    y = np.where(y >= 1, 1, y)

    print('num no icing: ', np.sum(y == 0))
    print('num icing: ', np.sum(y == 1))

    return x, y
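
# Usage sketch (assumed, not part of the original file): 'icing_obs.csv' is a
# placeholder path. Any CSV with the feature columns in `params`, the target
# column 'icing_intensity', and the three columns dropped above would work.
#
#     x, y = get_feature_target_data('icing_obs.csv', reduce_frac=0.5)
#     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)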


def logistic_regression(x_train, y_train, x_test, y_test):

    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    LR = LogisticRegression(C=0.01, solver='liblinear').fit(x_train, y_train)
    yhat = LR.predict(x_test)
    yhat_prob = LR.predict_proba(x_test)

    metrics(y_test, yhat, y_pred_prob=yhat_prob)


def k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4):

    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    KN_C = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    yhat = KN_C.predict(x_test)
    yhat_prob = KN_C.predict_proba(x_test)

    metrics(y_test, yhat, y_pred_prob=yhat_prob)


def k_nearest_neighbors_all(x, y, k_s=10):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    mean_acc = np.zeros((k_s - 1))
    std_acc = np.zeros((k_s - 1))

    for n in range(1, k_s):
        KN_C = KNeighborsClassifier(n_neighbors=n).fit(x_train, y_train)
        yhat = KN_C.predict(x_test)
        yhat_prob = KN_C.predict_proba(x_test)
        metrics(y_test, yhat, y_pred_prob=yhat_prob)

        mean_acc[n - 1] = accuracy_score(y_test, yhat)
        std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])

    print("The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax() + 1)

    plt.plot(range(1, k_s), mean_acc, 'g')
    plt.fill_between(range(1, k_s), mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10)
    plt.fill_between(range(1, k_s), mean_acc - 3 * std_acc, mean_acc + 3 * std_acc, alpha=0.10, color="green")
    plt.legend(('Accuracy', '+/- 1 std', '+/- 3 std'))
    plt.ylabel('Accuracy')
    plt.xlabel('Number of Neighbors (K)')
    plt.tight_layout()
    plt.show()


def decision_tree(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):

    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    DT = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth).fit(x_train, y_train)
    yhat = DT.predict(x_test)
    yhat_prob = DT.predict_proba(x_test)

    metrics(y_test, yhat, y_pred_prob=yhat_prob)

    return DT
# Use this to plot the tree  -----------------------------------------------------------
# export_graphviz(DT, out_file='tree.dot', filled=True, class_names=['no icing', 'icing'], feature_names=params)
# !dot -Tpng tree.dot -o tree.png


def SVM(x_train, y_train, x_test, y_test, kernel='rbf'):

    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    # SVC does not return class probabilities by default (enabling
    # probability=True is expensive), so AUC is omitted here.
    clf = svm.SVC(kernel=kernel)
    clf = clf.fit(x_train, y_train)
    yhat = clf.predict(x_test)

    metrics(y_test, yhat)


def random_forest(x_train, y_train, x_test, y_test, criterion='entropy', max_depth=4):

    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    rnd_clf = RandomForestClassifier(criterion=criterion, max_depth=max_depth).fit(x_train, y_train)
    yhat = rnd_clf.predict(x_test)
    yhat_prob = rnd_clf.predict_proba(x_test)

    metrics(y_test, yhat, y_pred_prob=yhat_prob)


def gradient_boosting(x_train, y_train, x_test, y_test, n_estimators=100, max_depth=3, learning_rate=0.1):

    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)

    # Use the keyword arguments rather than the previously hard-coded values.
    gbm = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    gbm.fit(x_train, y_train)
    yhat = gbm.predict(x_test)
    yhat_prob = gbm.predict_proba(x_test)

    metrics(y_test, yhat, y_pred_prob=yhat_prob)
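

# Hedged example driver (not in the original file): wires the helpers above
# together under assumed defaults. The default CSV path is a placeholder.
if __name__ == '__main__':
    import sys

    csv_file = sys.argv[1] if len(sys.argv) > 1 else 'icing_data.csv'  # placeholder path
    x, y = get_feature_target_data(csv_file, standardize=True, remove_nan=False)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    print('--- Logistic Regression ---')
    logistic_regression(x_train, y_train, x_test, y_test)

    print('--- K Nearest Neighbors (k=4) ---')
    k_nearest_neighbors(x_train, y_train, x_test, y_test, k=4)

    print('--- Decision Tree ---')
    decision_tree(x_train, y_train, x_test, y_test)

    print('--- SVM (rbf kernel) ---')
    SVM(x_train, y_train, x_test, y_test)

    print('--- Random Forest ---')
    random_forest(x_train, y_train, x_test, y_test)

    print('--- Gradient Boosting ---')
    gradient_boosting(x_train, y_train, x_test, y_test)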