import pandas as pd
import pylab as pl
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, jaccard_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree as tree

# Hold-out split settings shared by every classifier in this module, so that
# all three models are evaluated on the same train/test partition.
_TEST_SIZE = 0.2
_SPLIT_RANDOM_STATE = 4


def get_csv_as_dataframe(csv_file, reduce_frac=None):
    """Load a CSV file into a DataFrame, optionally keeping a random subset.

    Prints the ``describe()`` summary and the shape of the resulting frame.

    Args:
        csv_file: Path or buffer accepted by ``pandas.read_csv``.
        reduce_frac: If not ``None``, the fraction of rows to keep; the rows
            are drawn with ``DataFrame.sample(frac=reduce_frac)`` (no seed is
            set, so the selection differs between runs).

    Returns:
        The loaded (and possibly sub-sampled) DataFrame.
    """
    icing_df = pd.read_csv(csv_file)

    # Random selection of reduce_frac of the rows
    if reduce_frac is not None:
        icing_df = icing_df.sample(frac=reduce_frac)

    print(icing_df.describe())
    print(icing_df.shape)

    return icing_df


def get_train_test_data(data_frame, standardize=True):
    """Build the feature matrix and binary label vector from *data_frame*.

    Args:
        data_frame: DataFrame containing the feature columns listed in
            ``params`` below, plus ``'lwc_dcomp'`` and ``'icing_intensity'``.
        standardize: When true, the features are passed through
            ``StandardScaler`` (fitted on all rows of *data_frame*).

    Returns:
        Tuple ``(x, y)`` of NumPy arrays. ``y`` is ``'icing_intensity'`` with
        ``-1`` mapped to ``0`` and every value ``>= 1`` mapped to ``1``.

    Raises:
        KeyError: If a required column is missing from *data_frame*.
    """
    icing_df = data_frame

    # The independent variables we want to use:
    params = ['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction',
              'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp',
              'iwc_dcomp']

    # Remove this column
    icing_df = icing_df.drop('lwc_dcomp', axis=1)

    # NOTE: rows containing NaN are intentionally NOT dropped here; the
    # classifier functions below replace NaN feature values with 0 after the
    # train/test split. The shape is printed twice to keep the log output
    # identical to earlier versions of this script.
    print(icing_df.shape)
    print(icing_df.shape)

    x = np.asarray(icing_df[params])
    if standardize:
        # NOTE(review): the scaler is fitted on every row, i.e. before the
        # train/test split done by the classifier functions - confirm that
        # this is acceptable for the intended evaluation.
        x = preprocessing.StandardScaler().fit_transform(x)

    # Collapse the label to two classes: -1 -> 0, anything >= 1 -> 1.
    y = np.asarray(icing_df['icing_intensity'])
    y = np.where(y == -1, 0, y)
    y = np.where(y >= 1, 1, y)

    print(x.shape, y.shape)
    print('num no icing: ', np.sum(y == 0))
    print('num icing: ', np.sum(y == 1))

    return x, y


def _split_and_fill_nan(x, y):
    """Split into train/test sets, zero-fill NaN features, and log the sizes."""
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=_TEST_SIZE, random_state=_SPLIT_RANDOM_STATE)
    print('Train set:', x_train.shape, y_train.shape)
    print('Test set:', x_test.shape, y_test.shape)

    # Replace missing feature values with 0 so the estimators accept them.
    x_train = np.where(np.isnan(x_train), 0, x_train)
    x_test = np.where(np.isnan(x_test), 0, x_test)

    print('num no icing test: ', np.sum(y_test == 0))
    print('num icing test: ', np.sum(y_test == 1))

    return x_train, x_test, y_train, y_test


def _print_binary_scores(y_test, yhat, yhat_prob):
    """Print accuracy, Jaccard, precision, recall, F1 and ROC AUC for a test set."""
    # Each score is computed right before it is printed so that, should a
    # later metric raise, the earlier ones have already been reported.
    print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat)))
    print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat)))
    print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat)))
    print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat)))
    print('F1: ', "{:.4f}".format(f1_score(y_test, yhat)))
    # AUC uses the predicted probability of the second class (column 1).
    print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))


def logistic_regression(x, y):
    """Fit a logistic-regression classifier and print its test-set scores.

    The data are split 80/20 with a fixed random state, NaN features are
    replaced by 0, and ``LogisticRegression(C=0.01, solver='liblinear')`` is
    fitted on the training part. The confusion matrix (label order
    ``[1, 0]``) is printed, followed by the standard score report.

    Args:
        x: Feature matrix.
        y: Binary label vector.

    Returns:
        None. All results are printed.
    """
    x_train, x_test, y_train, y_test = _split_and_fill_nan(x, y)

    model = LogisticRegression(C=0.01, solver='liblinear').fit(x_train, y_train)
    yhat = model.predict(x_test)
    yhat_prob = model.predict_proba(x_test)

    print(confusion_matrix(y_test, yhat, labels=[1, 0]))
    _print_binary_scores(y_test, yhat, yhat_prob)


def k_nearest_neighbors(x, y, k=4):
    """Fit a k-nearest-neighbours classifier and print its test-set scores.

    The data are split 80/20 with a fixed random state and NaN features are
    replaced by 0 before fitting ``KNeighborsClassifier(n_neighbors=k)``.

    Args:
        x: Feature matrix.
        y: Binary label vector.
        k: Number of neighbours passed to the classifier.

    Returns:
        None. All results are printed.
    """
    x_train, x_test, y_train, y_test = _split_and_fill_nan(x, y)

    model = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    yhat = model.predict(x_test)
    yhat_prob = model.predict_proba(x_test)

    _print_binary_scores(y_test, yhat, yhat_prob)


def decision_tree(x, y, max_depth=4):
    """Fit an entropy-based decision tree and print its test-set scores.

    The data are split 80/20 with a fixed random state and NaN features are
    replaced by 0 before fitting
    ``DecisionTreeClassifier(criterion="entropy", max_depth=max_depth)``.

    Args:
        x: Feature matrix.
        y: Binary label vector.
        max_depth: Maximum tree depth passed to the classifier.

    Returns:
        None. All results are printed.
    """
    x_train, x_test, y_train, y_test = _split_and_fill_nan(x, y)

    model = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth).fit(x_train, y_train)
    yhat = model.predict(x_test)
    yhat_prob = model.predict_proba(x_test)

    _print_binary_scores(y_test, yhat, yhat_prob)