# classification.py
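"""Binary icing-intensity classification experiments.

Loads cloud-property features from a CSV file and trains/evaluates three
scikit-learn classifiers: logistic regression, k-nearest neighbors, and a
decision tree.
"""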
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import (confusion_matrix, accuracy_score, jaccard_score,
                             f1_score, precision_score, recall_score,
                             roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


def get_csv_as_dataframe(csv_file, reduce_frac=None):
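    """Load a CSV into a DataFrame, optionally keeping only a random fraction of rows."""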
    icing_df = pd.read_csv(csv_file)
    # Optionally keep only a random fraction of the rows
    if reduce_frac is not None:
        icing_df = icing_df.sample(frac=reduce_frac)
    print(icing_df.describe())
    print(icing_df.shape)
    return icing_df


def get_train_test_data(data_frame, standardize=True):
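    """Return the feature matrix x and binary icing label vector y.

    Note: despite the name, the train/test split itself is performed
    inside each model function below.
    """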
    icing_df = data_frame
    # The independent variables we want to use:
    params = ['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
              'cld_opd_dcomp', 'iwc_dcomp']
    # Drop the liquid water content column; it is not one of the predictors
    icing_df = icing_df.drop('lwc_dcomp', axis=1)

    # Rows with NaN values are kept here; each model function zero-fills them
    # after the train/test split. To drop them instead: icing_df = icing_df.dropna()
    print(icing_df.shape)

    x = np.asarray(icing_df[params])
    if standardize:
        # Scale features to zero mean and unit variance (StandardScaler
        # disregards NaNs when fitting and leaves them in place on transform)
        x = preprocessing.StandardScaler().fit(x).transform(x)
    y = np.asarray(icing_df['icing_intensity'])
    # Collapse to binary labels: -1 (no icing) -> 0, any intensity >= 1 -> 1
    y = np.where(y == -1, 0, y)
    y = np.where(y >= 1, 1, y)
    print(x.shape, y.shape)
    print('num no icing: ', np.sum(y == 0))
    print('num icing: ', np.sum(y == 1))

    return x, y


def logistic_regression(x, y):
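    """Fit a regularized logistic regression model and print test-set metrics."""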
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    # Zero-fill NaNs; with standardized features this amounts to mean imputation
    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    # Small C means strong L2 regularization; liblinear suits binary problems
    LR = LogisticRegression(C=0.01, solver='liblinear').fit(x_train, y_train)
    yhat = LR.predict(x_test)
    yhat_prob = LR.predict_proba(x_test)

    # Confusion matrix with the positive (icing) class listed first
    print(confusion_matrix(y_test, yhat, labels=[1, 0]))
    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
    print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
    print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
    print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
    print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))


def k_nearest_neighbors(x, y, k=4):
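    """Fit a k-nearest-neighbors classifier and print test-set metrics."""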
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    # Zero-fill NaNs (mean imputation on standardized features)
    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    KN_C = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    yhat = KN_C.predict(x_test)
    yhat_prob = KN_C.predict_proba(x_test)

    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
    print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
    print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
    print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
    print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))


def decision_tree(x, y, max_depth=4):
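    """Fit a decision tree classifier and print test-set metrics."""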
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    # Zero-fill NaNs (mean imputation on standardized features)
    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    # The entropy criterion selects splits by information gain
    DT = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth).fit(x_train, y_train)
    yhat = DT.predict(x_test)
    yhat_prob = DT.predict_proba(x_test)

    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
    print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
    print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
    print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
    print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
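

# A minimal usage sketch. Assumptions: 'icing_data.csv' is a hypothetical
# placeholder path, and the file contains the predictor columns above plus
# 'lwc_dcomp' and 'icing_intensity'.
if __name__ == '__main__':
    df = get_csv_as_dataframe('icing_data.csv', reduce_frac=0.1)
    x, y = get_train_test_data(df, standardize=True)
    logistic_regression(x, y)
    k_nearest_neighbors(x, y, k=4)
    decision_tree(x, y, max_depth=4)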