# classification.py
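"""Binary icing-intensity classification experiments.

Loads cloud-property features from a CSV file and trains/evaluates three
scikit-learn classifiers: logistic regression, k-nearest neighbors, and a
decision tree.
"""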
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import (confusion_matrix, accuracy_score, jaccard_score,
                             f1_score, precision_score, recall_score,
                             roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


def get_csv_as_dataframe(csv_file, reduce_frac=None):
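    """Load a CSV into a DataFrame, optionally keeping only a random fraction of rows."""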
    icing_df = pd.read_csv(csv_file)
    # Optionally keep only a random fraction of the rows
    if reduce_frac is not None:
        icing_df = icing_df.sample(frac=reduce_frac)
    print(icing_df.describe())
    print(icing_df.shape)
    return icing_df


def get_train_test_data(data_frame, standardize=True):
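    """Return the feature matrix x and binary icing label vector y.

    Note: despite the name, the train/test split itself is performed
    inside each model function below.
    """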
    icing_df = data_frame
    # The independent variables we want to use:
    params = ['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
              'cld_opd_dcomp', 'iwc_dcomp']
    # Drop the liquid water content column; it is not one of the predictors
    icing_df = icing_df.drop('lwc_dcomp', axis=1)

    # Rows with NaN values are kept here; each model function zero-fills them
    # after the train/test split. To drop them instead: icing_df = icing_df.dropna()
    print(icing_df.shape)

    x = np.asarray(icing_df[params])
    if standardize:
        # Scale features to zero mean and unit variance (StandardScaler
        # disregards NaNs when fitting and leaves them in place on transform)
        x = preprocessing.StandardScaler().fit(x).transform(x)
    y = np.asarray(icing_df['icing_intensity'])
    # Collapse to binary labels: -1 (no icing) -> 0, any intensity >= 1 -> 1
    y = np.where(y == -1, 0, y)
    y = np.where(y >= 1, 1, y)
    print(x.shape, y.shape)
    print('num no icing: ', np.sum(y == 0))
    print('num icing: ', np.sum(y == 1))

    return x, y


def logistic_regression(x, y):
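    """Fit a regularized logistic regression model and print test-set metrics."""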
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    # Zero-fill NaNs; with standardized features this amounts to mean imputation
    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    # Small C means strong L2 regularization; liblinear suits binary problems
    LR = LogisticRegression(C=0.01, solver='liblinear').fit(x_train, y_train)
    yhat = LR.predict(x_test)
    yhat_prob = LR.predict_proba(x_test)

    # Confusion matrix with the positive (icing) class listed first
    print(confusion_matrix(y_test, yhat, labels=[1, 0]))
    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
    print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
    print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
    print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
    print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))


def k_nearest_neighbors(x, y, k=4):
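    """Fit a k-nearest-neighbors classifier and print test-set metrics."""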
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    # Zero-fill NaNs (mean imputation on standardized features)
    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    KN_C = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    yhat = KN_C.predict(x_test)
    yhat_prob = KN_C.predict_proba(x_test)

    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
    print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
    print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
    print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
    print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))


def decision_tree(x, y, max_depth=4):
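    """Fit a decision tree classifier and print test-set metrics."""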
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
    print('Train set:', x_train.shape,  y_train.shape)
    print('Test set:', x_test.shape,  y_test.shape)

    # Zero-fill NaNs (mean imputation on standardized features)
    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)
    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    # The entropy criterion selects splits by information gain
    DT = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth).fit(x_train, y_train)
    yhat = DT.predict(x_test)
    yhat_prob = DT.predict_proba(x_test)

    print('Accuracy:    ', "{:.4f}".format(accuracy_score(y_test, yhat)))
    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
    print('Precision:   ', "{:.4f}".format(precision_score(y_test, yhat)))
    print('Recall:      ', "{:.4f}".format(recall_score(y_test, yhat)))
    print('F1:          ', "{:.4f}".format(f1_score(y_test, yhat)))
    print('AUC:         ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))
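

# A minimal usage sketch. Assumptions: 'icing_data.csv' is a hypothetical
# placeholder path, and the file contains the predictor columns above plus
# 'lwc_dcomp' and 'icing_intensity'.
if __name__ == '__main__':
    df = get_csv_as_dataframe('icing_data.csv', reduce_frac=0.1)
    x, y = get_train_test_data(df, standardize=True)
    logistic_regression(x, y)
    k_nearest_neighbors(x, y, k=4)
    decision_tree(x, y, max_depth=4)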