"""Logistic-regression icing classifier (modules/machine_learning/logistic_regression.py).

Flat exploratory script. It loads the icing training table from CSV, prints a
few diagnostics, builds a feature matrix from the columns named in ``params``,
fits a scikit-learn ``LogisticRegression`` on an 80/20 train/test split and
prints the confusion matrix for the held-out rows.

The input file defaults to the original hard-coded location and can be
overridden with the ``ICING_TRAIN_CSV`` environment variable.
"""
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
import scipy.optimize as opt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Location of the training table; the environment variable wins when it is set.
DEFAULT_TRAIN_CSV = '/Users/tomrink/train_L2_DAY_1D.csv'
TRAIN_CSV = os.environ.get('ICING_TRAIN_CSV', DEFAULT_TRAIN_CSV)

# Zero-based position of the second sample row printed for inspection.
SAMPLE_ROW = 200

icing_df = pd.read_csv(TRAIN_CSV)
print(icing_df.shape)

# Access rows
print(icing_df.iloc[0])  # first row of the DataFrame
print('--------------------------------------')
if len(icing_df) > SAMPLE_ROW:
    print(icing_df.iloc[SAMPLE_ROW])  # row at position 200, i.e. the 201st row
else:
    # A short table must not abort the run just because of a diagnostic print.
    print('row', SAMPLE_ROW, 'not available: table has', len(icing_df), 'rows')

# Access a column
print(icing_df['lwc_dcomp'])

# Remove that column; it is not one of the features listed in `params` below.
icing_df = icing_df.drop('lwc_dcomp', axis=1)
print(icing_df.shape)

# Feature columns fed to the classifier.
params = [
    'cld_geo_thick',
    'cld_temp_acha',
    'conv_cloud_fraction',
    'supercooled_cloud_fraction',
    'cld_reff_dcomp',
    'cld_opd_dcomp',
    'iwc_dcomp',
]
X = np.asarray(icing_df[params])
# NOTE(review): features are used unscaled. If standardisation is wanted, fit
# preprocessing.StandardScaler on the training split only and apply it to both
# splits -- confirm with the author whether leaving it off is intentional.
y = np.asarray(icing_df['icing_intensity'])
# Fold the -1 label into class 0, which the counts below report as "no icing".
y = np.where(y == -1, 0, y)
print(X.shape, y.shape)
print('num no icing: ', np.sum(y == 0))
print('num icing: ', np.sum(y == 1))
# NOTE(review): any label other than -1/0/1 is passed to the model unchanged
# and is not counted above -- confirm the value range of 'icing_intensity'.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

# Replace missing feature values with 0 so the solver never sees NaN.
X_train = np.where(np.isnan(X_train), 0, X_train)
X_test = np.where(np.isnan(X_test), 0, X_test)
print('num no icing test: ', np.sum(y_test == 0))
print('num icing test: ', np.sum(y_test == 1))

# Small C means strong regularisation.
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train, y_train)
yhat = LR.predict(X_test)
yhat_prob = LR.predict_proba(X_test)  # class probabilities, kept for interactive inspection
# Rows are true labels, columns are predictions, both ordered [1, 0].
print(confusion_matrix(y_test, yhat, labels=[1, 0]))