From 6acaf5c248e6c56565b049815c145b4b2ca37f0b Mon Sep 17 00:00:00 2001
From: Coda Phillips <cphillips@sean.ssec.wisc.edu>
Date: Mon, 11 Jul 2016 11:29:02 -0500
Subject: [PATCH] Update bb outlier detection algorithm

---
 electronic_checks.py | 49 +++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/electronic_checks.py b/electronic_checks.py
index b5b5bc1..b954bbd 100644
--- a/electronic_checks.py
+++ b/electronic_checks.py
@@ -6,11 +6,16 @@ def find_bb_outliers(frame, parameters, bb):
     if not np.in1d(['{}bottomTemp'.format(bb),'{}apexTemp'.format(bb),'{}topTemp'.format(bb)], frame.columns).all():
         return frame
 
-    window_length = parameters.get('window_length', 100)
+    window_length = parameters.get('window_length', 50)
 
-    bbb_outliers = _find_6sigma_outliers(frame['{}bottomTemp'.format(bb)], window_length)
-    bba_outliers = _find_6sigma_outliers(frame['{}apexTemp'.format(bb)], window_length)
-    bbt_outliers = _find_6sigma_outliers(frame['{}topTemp'.format(bb)], window_length)
+    if bb == 'HBB':
+        bbb_outliers = _find_6sigma_outliers(frame['{}bottomTemp'.format(bb)], window_length)
+        bba_outliers = _find_6sigma_outliers(frame['{}apexTemp'.format(bb)], window_length)
+        bbt_outliers = _find_6sigma_outliers(frame['{}topTemp'.format(bb)], window_length)
+    else:
+        bbb_outliers = _find_6sigma_outliers(frame['{}bottomTemp'.format(bb)], window_length, estimation_func=_compute_robust_rate_zscore)
+        bba_outliers = _find_6sigma_outliers(frame['{}apexTemp'.format(bb)], window_length, estimation_func=_compute_robust_rate_zscore)
+        bbt_outliers = _find_6sigma_outliers(frame['{}topTemp'.format(bb)], window_length, estimation_func=_compute_robust_rate_zscore)
 
     variable_qcs = pd.DataFrame({
         'qc_{}bottomTemp'.format(bb) : bbb_outliers * 1,
@@ -37,9 +42,9 @@ def calibrationambienttemp_outlier_check(frame, parameters):
     if 'calibrationAmbientTemp' not in frame.columns:
         return frame
     
-    window_length = parameters.get('window_length', 100)
+    window_length = parameters.get('window_length', 50)
 
-    temp_outliers = _find_6sigma_outliers(frame['calibrationAmbientTemp'], window_length, use_mean=True)
+    temp_outliers = _find_6sigma_outliers(frame['calibrationAmbientTemp'], window_length)
     frame['calibrationambienttemp_outlier_check'] = temp_outliers * 1
     frame = update_variable_qc(frame, pd.DataFrame({'qc_calibrationAmbientTemp':temp_outliers*1}, index=frame.index))
     annotate_all(frame, temp_outliers, 'calibrationAmbientTemp outlier')
@@ -48,23 +53,25 @@ def calibrationambienttemp_outlier_check(frame, parameters):
 class CheckList(BaseCheckList):
     checks = [ hbb_temp_outlier_check , abb_temp_outlier_check, calibrationambienttemp_outlier_check ]
 
-def _compute_robust_zscore(frame, window_length, use_mean=False):
-    use_mean = False
-    
-    if use_mean:
-        robust_rolling_std = frame.rolling(window=window_length, center=True, min_periods=1).std()
-        return abs((frame - frame.rolling(window=window_length, center=True, min_periods=1).mean()) / robust_rolling_std)
-    else:
-        # Compute a centered rolling MAD over window_length
-        rolling_mad = abs(frame - frame.rolling(window=window_length, center=True, min_periods=1).median()
-                     ).rolling(window=window_length, center=True, min_periods=1).median()
-        # standard deviation is proportional to median absolute deviation I'm told
-        robust_rolling_std = rolling_mad * 1.48
-        return abs((frame - frame.rolling(window=window_length, center=True, min_periods=1).median()) / robust_rolling_std)
+def _compute_robust_zscore(frame, window_length):
+    median_values = frame.rolling(window=window_length, center=True, min_periods=1).median()
+    # Compute the MAD
+    mad = abs(frame - median_values).median()
+    # standard deviation is proportional to median absolute deviation I'm told
+    robust_std = mad * 1.48
+    # compute the Mahalanobis distance from rolling median
+    return abs((frame - median_values) / robust_std)
+
+def _compute_robust_rate_zscore(frame, window_length=None):
+    time_diffs = pd.Series((frame.index.values[1:] - frame.index.values[:-1]).astype(np.int64),
+                           index=frame.index[1:])
+    changes = frame.diff() / time_diffs
+    mad_diff = abs(changes - changes.median()).median() * 1.48
+    return abs(frame.diff() / time_diffs)
     
-def _find_6sigma_outliers(frame, window_length, use_mean=False):
+def _find_6sigma_outliers(frame, window_length, estimation_func=_compute_robust_zscore):
     # Find outliers with deviation greater than 6 sigma
-    outlier_mask = _compute_robust_zscore(frame, window_length, use_mean) > 6
+    outlier_mask = estimation_func(frame, window_length) > 6
     return outlier_mask
 
 
-- 
GitLab