# classification.py (4.82 KiB)
import pandas as pd
import pylab as pl
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, jaccard_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree as tree
def get_csv_as_dataframe(csv_file, reduce_frac=None):
    """Load a CSV file into a DataFrame, optionally keeping a random subset.

    Args:
        csv_file: Path or file-like object handed straight to
            ``pandas.read_csv``.
        reduce_frac: When not ``None``, the fraction of rows to keep. Rows
            are drawn with ``DataFrame.sample(frac=...)`` and no seed is
            passed, so the selection can differ between runs.

    Returns:
        The loaded (and possibly subsampled) DataFrame.

    Side effects:
        Prints the ``describe()`` summary and the shape of the returned frame.
    """
    table = pd.read_csv(csv_file)

    # Optional random down-sampling to a fraction of the rows.
    if reduce_frac is not None:
        table = table.sample(frac=reduce_frac)

    for summary in (table.describe(), table.shape):
        print(summary)

    return table
def get_train_test_data(data_frame, standardize=True):
    """Build the feature matrix and binary icing labels from a DataFrame.

    Despite the name, no train/test split is performed here; the function
    only extracts the predictors and the target. The classifier functions in
    this file do their own split on the returned arrays.

    Args:
        data_frame: DataFrame holding every column named in ``params`` plus
            an ``icing_intensity`` column. The frame is not modified.
        standardize: When true, the feature matrix is passed through
            ``StandardScaler``. NOTE(review): the scaler is fit on all rows,
            i.e. before any train/test split, so test-set statistics
            influence the scaling -- confirm this is acceptable.

    Returns:
        A tuple ``(x, y)``. ``x`` is an array with one column per entry of
        ``params``. ``y`` is the ``icing_intensity`` column with ``-1``
        mapped to ``0`` and every value ``>= 1`` mapped to ``1``.

    Raises:
        KeyError: If a predictor column or ``icing_intensity`` is missing.
    """
    # The independent variables we want to use:
    params = ['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp',
              'cld_opd_dcomp', 'iwc_dcomp']

    # 'lwc_dcomp' is never used below. errors='ignore' keeps the function
    # usable on frames that do not carry the column, which previously raised
    # KeyError.
    icing_df = data_frame.drop(columns='lwc_dcomp', errors='ignore')

    # Rows containing NaN are kept on purpose; the classifier functions in
    # this file replace NaN features with 0 after splitting.
    print(icing_df.shape)

    x = np.asarray(icing_df[params])
    if standardize:
        x = preprocessing.StandardScaler().fit_transform(x)

    y = np.asarray(icing_df['icing_intensity'])
    # Collapse the intensity scale to a binary target: -1 -> 0, >= 1 -> 1.
    y = np.where(y == -1, 0, y)
    y = np.where(y >= 1, 1, y)

    print(x.shape, y.shape)
    print('num no icing: ', np.sum(y == 0))
    print('num icing: ', np.sum(y == 1))

    return x, y
def logistic_regression(x, y):
    """Fit a logistic-regression classifier and print held-out metrics.

    The data is split 80/20 with a fixed ``random_state`` of 4, NaN features
    are replaced by 0, and a ``LogisticRegression(C=0.01,
    solver='liblinear')`` model is fit on the training part.

    Args:
        x: Feature matrix, one row per sample.
        y: Labels aligned with ``x``. The unparameterised metric calls
            presumably expect binary 0/1 labels -- verify against caller.

    Returns:
        None. The split shapes, test-set class counts, the confusion matrix
        (label order ``[1, 0]``), accuracy, Jaccard index, precision, recall,
        F1 and ROC AUC are printed.
    """
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=4)
    print('Train set:', train_x.shape, train_y.shape)
    print('Test set:', test_x.shape, test_y.shape)

    # Missing feature values become 0 in both partitions.
    train_x, test_x = (np.where(np.isnan(part), 0, part) for part in (train_x, test_x))

    for caption, label in (('num no icing test: ', 0), ('num icing test: ', 1)):
        print(caption, np.sum(test_y == label))

    model = LogisticRegression(C=0.01, solver='liblinear')
    model.fit(train_x, train_y)
    predicted = model.predict(test_x)
    probabilities = model.predict_proba(test_x)

    print(confusion_matrix(test_y, predicted, labels=[1, 0]))

    # Metrics are evaluated lazily, one per printed line, in the listed order.
    label_metrics = (
        ('Accuracy: ', accuracy_score),
        ('Jaccard Idx: ', jaccard_score),
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F1: ', f1_score),
    )
    for caption, metric in label_metrics:
        print(caption, "{:.4f}".format(metric(test_y, predicted)))

    # AUC is scored on the second probability column rather than hard labels.
    print('AUC: ', "{:.4f}".format(roc_auc_score(test_y, probabilities[:, 1])))
def k_nearest_neighbors(x, y, k=4):
    """Fit a k-nearest-neighbours classifier and print held-out metrics.

    The data is split 80/20 with a fixed ``random_state`` of 4, NaN features
    are replaced by 0, and a ``KNeighborsClassifier`` is fit on the training
    part.

    Args:
        x: Feature matrix, one row per sample.
        y: Labels aligned with ``x``. The unparameterised metric calls
            presumably expect binary 0/1 labels -- verify against caller.
        k: Number of neighbours, passed as ``n_neighbors``.

    Returns:
        None. The split shapes, test-set class counts, accuracy, Jaccard
        index, precision, recall, F1 and ROC AUC are printed.
    """
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=4)
    print('Train set:', train_x.shape, train_y.shape)
    print('Test set:', test_x.shape, test_y.shape)

    # Missing feature values become 0 in both partitions.
    train_x, test_x = (np.where(np.isnan(part), 0, part) for part in (train_x, test_x))

    for caption, label in (('num no icing test: ', 0), ('num icing test: ', 1)):
        print(caption, np.sum(test_y == label))

    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(train_x, train_y)
    predicted = model.predict(test_x)
    probabilities = model.predict_proba(test_x)

    # Metrics are evaluated lazily, one per printed line, in the listed order.
    label_metrics = (
        ('Accuracy: ', accuracy_score),
        ('Jaccard Idx: ', jaccard_score),
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F1: ', f1_score),
    )
    for caption, metric in label_metrics:
        print(caption, "{:.4f}".format(metric(test_y, predicted)))

    # AUC is scored on the second probability column rather than hard labels.
    print('AUC: ', "{:.4f}".format(roc_auc_score(test_y, probabilities[:, 1])))
def decision_tree(x, y, max_depth=4):
    """Fit a decision-tree classifier and print held-out metrics.

    The data is split 80/20 with a fixed ``random_state`` of 4, NaN features
    are replaced by 0, and a ``DecisionTreeClassifier`` using the entropy
    criterion is fit on the training part. No ``random_state`` is passed to
    the tree itself.

    Args:
        x: Feature matrix, one row per sample.
        y: Labels aligned with ``x``. The unparameterised metric calls
            presumably expect binary 0/1 labels -- verify against caller.
        max_depth: Maximum tree depth, forwarded to the classifier.

    Returns:
        None. The split shapes, test-set class counts, accuracy, Jaccard
        index, precision, recall, F1 and ROC AUC are printed.
    """
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=4)
    print('Train set:', train_x.shape, train_y.shape)
    print('Test set:', test_x.shape, test_y.shape)

    # Missing feature values become 0 in both partitions.
    train_x, test_x = (np.where(np.isnan(part), 0, part) for part in (train_x, test_x))

    for caption, label in (('num no icing test: ', 0), ('num icing test: ', 1)):
        print(caption, np.sum(test_y == label))

    model = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth)
    model.fit(train_x, train_y)
    predicted = model.predict(test_x)
    probabilities = model.predict_proba(test_x)

    # Metrics are evaluated lazily, one per printed line, in the listed order.
    label_metrics = (
        ('Accuracy: ', accuracy_score),
        ('Jaccard Idx: ', jaccard_score),
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F1: ', f1_score),
    )
    for caption, metric in label_metrics:
        print(caption, "{:.4f}".format(metric(test_y, predicted)))

    # AUC is scored on the second probability column rather than hard labels.
    print('AUC: ', "{:.4f}".format(roc_auc_score(test_y, probabilities[:, 1])))