From 7d7f4e5a5ade341cc41c7e8368ecd4992188b928 Mon Sep 17 00:00:00 2001
From: tomrink <rink@ssec.wisc.edu>
Date: Tue, 23 Apr 2024 16:28:54 -0500
Subject: [PATCH] snapshot...

---
 modules/machine_learning/__init__.py          |  0
 .../machine_learning/logistic_regression.py   | 55 +++++++++++++++++++
 2 files changed, 55 insertions(+)
 create mode 100644 modules/machine_learning/__init__.py
 create mode 100644 modules/machine_learning/logistic_regression.py

diff --git a/modules/machine_learning/__init__.py b/modules/machine_learning/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modules/machine_learning/logistic_regression.py b/modules/machine_learning/logistic_regression.py
new file mode 100644
index 00000000..2660f4aa
--- /dev/null
+++ b/modules/machine_learning/logistic_regression.py
@@ -0,0 +1,55 @@
+import pandas as pd
+import pylab as pl
+import numpy as np
+import scipy.optimize as opt
+from sklearn import preprocessing
+import matplotlib.pyplot as plt
+from sklearn.metrics import confusion_matrix
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+# Script: fit a logistic-regression icing classifier on a CSV table and print its test-set confusion matrix.
+icing_df = pd.read_csv('/Users/tomrink/train_L2_DAY_1D.csv')  # NOTE(review): hard-coded absolute path
+#print(icing_df.head(20))
+#print(icing_df.describe())
+print(icing_df.shape)
+
+# Optional step, currently disabled: remove rows with NaN values
+# icing_df = icing_df.dropna()
+
+#Access rows by integer position
+print(icing_df.iloc[0]) #First row of DataFrame
+print('--------------------------------------')
+print(icing_df.iloc[200]) #Row at position 200 (the 201st row); raises IndexError if the table is shorter
+
+#Access a column by name
+print(icing_df['lwc_dcomp']) #Show the 'lwc_dcomp' column before it is dropped below
+
+#Remove the 'lwc_dcomp' column (it is not among the model features listed in params)
+icing_df = icing_df.drop('lwc_dcomp', axis=1)
+
+print(icing_df.shape)
+# icing_df = icing_df.dropna()  # disabled: NaN rows are kept, so the shape printed next equals the one above
+print(icing_df.shape)
+# Build the feature matrix X (one column per entry of params) and the label vector y.
+params = ['cld_geo_thick', 'cld_temp_acha', 'conv_cloud_fraction', 'supercooled_cloud_fraction', 'cld_reff_dcomp', 'cld_opd_dcomp', 'iwc_dcomp']
+X = np.asarray(icing_df[params])
+# X = preprocessing.StandardScaler().fit(X).transform(X)
+y = np.asarray(icing_df['icing_intensity'])
+y = np.where(y == -1, 0, y)  # relabel -1 as 0; every other label value is left unchanged
+print(X.shape, y.shape)
+print('num no icing: ', np.sum(y == 0))
+print('num icing: ', np.sum(y == 1))  # NOTE(review): only labels equal to 1 are counted - confirm no other values occur
+# Hold out 20% of the samples for testing; random_state fixes the shuffle so the split is repeatable.
+X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
+print ('Train set:', X_train.shape,  y_train.shape)
+print ('Test set:', X_test.shape,  y_test.shape)
+X_train = np.where(np.isnan(X_train), 0, X_train)  # replace NaN feature values with 0 so the fit accepts them
+X_test = np.where(np.isnan(X_test), 0, X_test)  # same NaN -> 0 replacement for the held-out features
+print('num no icing test: ', np.sum(y_test == 0))
+print('num icing test: ', np.sum(y_test == 1))
+
+# Fit on the training split, predict the test split, and print the confusion matrix.
+LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)  # C is the inverse regularization strength
+yhat = LR.predict(X_test)
+yhat_prob = LR.predict_proba(X_test)  # per-class probabilities; not used further in the code shown here
+print(confusion_matrix(y_test, yhat, labels=[1,0]))  # labels=[1,0] puts class 1 in the first row/column
\ No newline at end of file
-- 
GitLab