import pandas as pd import pylab as pl import numpy as np import scipy.optimize as opt from sklearn import preprocessing, svm import matplotlib.pyplot as plt from sklearn.metrics import confusion_matrix, accuracy_score, jaccard_score, f1_score, precision_score, recall_score, roc_auc_score from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier import sklearn.tree as tree from sklearn.tree import export_graphviz def get_csv_as_dataframe(csv_file, reduce_frac=None): icing_df = pd.read_csv(csv_file) # Random selection of reduce_frac of the rows if reduce_frac is not None: icing_df = icing_df.sample(frac=reduce_frac) print(icing_df.describe()) print(icing_df.shape) return icing_df def get_train_test_data(data_frame, standardize=True): icing_df = data_frame # The independent variables we want to use: params = ['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'iwc_dcomp'] # Remove this column icing_df = icing_df.drop('lwc_dcomp', axis=1) # Remove rows with NaN values # icing_df = icing_df.dropna() print(icing_df.shape) # icing_df = icing_df.dropna() print(icing_df.shape) x = np.asarray(icing_df[params]) if standardize: x = preprocessing.StandardScaler().fit(x).transform(x) y = np.asarray(icing_df['icing_intensity']) y = np.where(y == -1, 0, y) y = np.where(y >= 1, 1, y) print(x.shape, y.shape) print('num no icing: ', np.sum(y == 0)) print('num icing: ', np.sum(y == 1)) return x, y def logistic_regression(x, y): x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4) print('Train set:', x_train.shape, y_train.shape) print('Test set:', x_test.shape, y_test.shape) x_train = np.where(np.isnan(x_train), 0, x_train) x_test = np.where(np.isnan(x_test), 0, x_test) print('num no icing test: ', np.sum(y_test == 0)) print('num icing test: ', np.sum(y_test == 1)) LR = LogisticRegression(C=0.01, solver='liblinear').fit(x_train, y_train) yhat = LR.predict(x_test) yhat_prob = LR.predict_proba(x_test) print(confusion_matrix(y_test, yhat, labels=[1,0])) print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat))) print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat))) print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat))) print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat))) print('F1: ', "{:.4f}".format(f1_score(y_test, yhat))) print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1]))) def k_nearest_neighbors(x, y, k=4): x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4) print('Train set:', x_train.shape, y_train.shape) print('Test set:', x_test.shape, y_test.shape) x_train = np.where(np.isnan(x_train), 0, x_train) x_test = np.where(np.isnan(x_test), 0, x_test) print('num no icing test: ', np.sum(y_test == 0)) print('num icing test: ', np.sum(y_test == 1)) KN_C = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train) yhat = KN_C.predict(x_test) yhat_prob = KN_C.predict_proba(x_test) print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat))) print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat))) print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat))) print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat))) print('F1: ', "{:.4f}".format(f1_score(y_test, yhat))) print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1]))) def k_nearest_neighbors_all(x, y, k_s=10): x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4) print('Train set:', x_train.shape, y_train.shape) print('Test set:', x_test.shape, y_test.shape) x_train = np.where(np.isnan(x_train), 0, x_train) x_test = np.where(np.isnan(x_test), 0, x_test) print('num no icing test: ', np.sum(y_test == 0)) print('num icing test: ', np.sum(y_test == 1)) k_s = 10 mean_acc = np.zeros((k_s - 1)) std_acc = np.zeros((k_s - 1)) for n in range(1, k_s): KN_C = KNeighborsClassifier(n_neighbors=n).fit(x_train, y_train) yhat = KN_C.predict(x_test) yhat_prob = KN_C.predict_proba(x_test) print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat))) print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat))) print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat))) print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat))) print('F1: ', "{:.4f}".format(f1_score(y_test, yhat))) print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1]))) mean_acc[n - 1] = accuracy_score(y_test, yhat) std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0]) print("The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax() + 1) plt.plot(range(1, k_s), mean_acc, 'g') plt.fill_between(range(1, k_s), mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10) plt.fill_between(range(1, k_s), mean_acc - 3 * std_acc, mean_acc + 3 * std_acc, alpha=0.10, color="green") plt.legend(('Accuracy ', '+/- 1xstd', '+/- 3xstd')) plt.ylabel('Accuracy ') plt.xlabel('Number of Neighbors (K)') plt.tight_layout() plt.show() def decision_tree(x, y, criterion='entropy', max_depth=4): x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4) print('Train set:', x_train.shape, y_train.shape) print('Test set:', x_test.shape, y_test.shape) x_train = np.where(np.isnan(x_train), 0, x_train) x_test = np.where(np.isnan(x_test), 0, x_test) print('num no icing test: ', np.sum(y_test == 0)) print('num icing test: ', np.sum(y_test == 1)) DT = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth).fit(x_train, y_train) yhat = DT.predict(x_test) yhat_prob = DT.predict_proba(x_test) print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat))) print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat))) print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat))) print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat))) print('F1: ', "{:.4f}".format(f1_score(y_test, yhat))) print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1]))) return DT # export_graphviz(DT, out_file='tree.dot', filled=True, feature_names=['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'iwc_dcomp']) # !dot -Tpng tree.dot -o tree.png def SVM(x, y, kernel='rbf'): x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4) print('Train set:', x_train.shape, y_train.shape) print('Test set:', x_test.shape, y_test.shape) x_train = np.where(np.isnan(x_train), 0, x_train) x_test = np.where(np.isnan(x_test), 0, x_test) print('num no icing test: ', np.sum(y_test == 0)) print('num icing test: ', np.sum(y_test == 1)) clf = svm.SVC(kernel=kernel) clf = clf.fit(x_train, y_train) yhat = clf.predict(x_test) print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat))) print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat))) print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat))) print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat))) print('F1: ', "{:.4f}".format(f1_score(y_test, yhat))) def random_forest(x, y, criterion='entropy', max_depth=4): x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4) print('Train set:', x_train.shape, y_train.shape) print('Test set:', x_test.shape, y_test.shape) x_train = np.where(np.isnan(x_train), 0, x_train) x_test = np.where(np.isnan(x_test), 0, x_test) print('num no icing test: ', np.sum(y_test == 0)) print('num icing test: ', np.sum(y_test == 1)) rnd_clf = RandomForestClassifier(criterion=criterion, max_depth=max_depth).fit(x_train, y_train) yhat = rnd_clf.predict(x_test) yhat_prob = rnd_clf.predict_proba(x_test) print('Accuracy: ', "{:.4f}".format(accuracy_score(y_test, yhat))) print('Jaccard Idx: ', "{:.4f}".format(jaccard_score(y_test, yhat))) print('Precision: ', "{:.4f}".format(precision_score(y_test, yhat))) print('Recall: ', "{:.4f}".format(recall_score(y_test, yhat))) print('F1: ', "{:.4f}".format(f1_score(y_test, yhat))) print('AUC: ', "{:.4f}".format(roc_auc_score(y_test, yhat_prob[:, 1])))